------------------------------------------------------------
-- Copyright: 2010 Integrated Systems Laboratory, ETH Zurich
-- http://www.iis.ee.ethz.ch/~sha3
------------------------------------------------------------
-------------------------------------------------------------------------------
-- Title      : ECHO 20 Gb/s implementation
-- Project    :
-------------------------------------------------------------------------------
-- File       : echo_fast.vhd
-- Author     : Frank/Luca account
-- Company    : Integrated Systems Laboratory, ETH Zurich
-- Created    : 2010-02-01
-- Last update: 2010-02-05
-- Platform   : ModelSim (simulation), Synopsys (synthesis)
-- Standard   : VHDL'87
-------------------------------------------------------------------------------
-- Description: This is a fast ECHO datapath with 4 double AES rounds in
--              parallel
-------------------------------------------------------------------------------
-- Copyright (c) 2010 Integrated Systems Laboratory, ETH Zurich
-------------------------------------------------------------------------------
-- Revisions  :
-- Date        Version  Author  Description
-- 2010-02-01  1.0      sha3    Created
-------------------------------------------------------------------------------

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

entity echo is
  port (
    DataInxDI  : in  std_logic_vector(1535 downto 0);  -- All inputs parallel
    LastxSI    : in  std_logic;         -- 1: this data will be the last round
                                        -- 0: continue chaining
    InENxSI    : in  std_logic;
    DataOutxDO : out std_logic_vector(255 downto 0);
    OutEnxSO   : out std_logic;
    CLKxCI     : in  std_logic;
    RSTxRBI    : in  std_logic);
end echo;

architecture fast of echo is

  -----------------------------------------------------------------------------
  -- Constants
  -----------------------------------------------------------------------------

  -- initialization string for VxDP according to section 2.1, on pg 7 of the
  -- ECHO description
  constant VINIT : std_logic_vector(511 downto 0) := (
    496    => '1',
    368    => '1',
    240    => '1',
    112    => '1',
    others => '0');

  -- The message length is currently fixed to 1376 (presumably bits -- TODO
  -- confirm). It is not difficult to make it work with any length, but at
  -- the moment this is the easiest alternative.
  constant CINIT : std_logic_vector(61 downto 0) := X"000000000000056" & "00";

  -- this has to be one less than CINIT(4 downto 0);
  constant CSTOP : std_logic_vector(4 downto 0) := "10111";

  -----------------------------------------------------------------------------
  -- Types
  -----------------------------------------------------------------------------
  type s_type is array (0 to 15) of std_logic_vector(127 downto 0);

  -- The name is Big Mixcolumns Input and BigMixcolumnsOutput
  type bm_type is array (0 to 15) of std_logic_vector(31 downto 0);

  type states_type is (init, run, chain, last);

  -----------------------------------------------------------------------------
  -- Signals (xDP/xSP: register outputs, xDN/xSN: next-state values)
  -----------------------------------------------------------------------------

  -- input block after the 128-bit word reordering
  signal DataShufflexD : std_logic_vector(1535 downto 0);

  -- chaining vector V and output register O
  signal VxDN, VxDP : std_logic_vector(511 downto 0);
  signal OxDN, OxDP : std_logic_vector(255 downto 0);

  -- state S, both as one flat vector and as sixteen 128-bit words
  signal SxDN, SxDP, SxD : std_logic_vector(2047 downto 0);
  signal SNxD, SPxD      : s_type;

  -- temporary registers used while S is rewritten
  signal T1xDP, T2xDP, T3xDP, T4xDP : std_logic_vector(127 downto 0);
  signal T1xDN, T2xDN, T3xDN, T4xDN : std_logic_vector(127 downto 0);

  -- double-AES inputs (A), double-AES outputs (B), Big MixColumns outputs (BM)
  signal A1xD, A2xD, A3xD, A4xD     : std_logic_vector(127 downto 0);
  signal B1xD, B2xD, B3xD, B4xD     : std_logic_vector(127 downto 0);
  signal BM1xD, BM2xD, BM3xD, BM4xD : std_logic_vector(127 downto 0);
  signal BMIxD, BMOxD               : bm_type;

  -- Big Final Signals
  signal BF1xD, BF2xD : std_logic_vector(511 downto 0);

  -- round keys before (K) and after (KS) the byte shuffle
  signal K1xD, K2xD, K3xD, K4xD     : std_logic_vector(63 downto 0);
  signal K1SxD, K2SxD, K3SxD, K4SxD : std_logic_vector(63 downto 0);

  -- control
  signal CntxSP, CntxSN     : std_logic_vector(61 downto 0);
  signal LastxSP, LastxSN   : std_logic;
  signal OutEnxSN, OutEnxSP : std_logic;
  signal PartCntxS          : std_logic_vector(1 downto 0);
  signal StatexDP, StatexDN : states_type;

  -----------------------------------------------------------------------------
  -- Components
  -----------------------------------------------------------------------------
  component doubleAES
    port (
      InxDI       : in  std_logic_vector(127 downto 0);
      OutxDO      : out std_logic_vector(127 downto 0);
      RoundKeyxDI : in  std_logic_vector(63 downto 0));
  end component;

  component mixcolumn
    port (
      InpxDI : in  std_logic_vector(31 downto 0);
      OupxDO : out std_logic_vector(31 downto 0));
  end component;

begin  -- fast

  -- this is to make typing slightly easier
  -- perl -e 'for $i (0..15){ print "SP",$i,"xDSxDP(",128*($i+1)-1," downto ",128*$i,");\n"}'
  --
  -- SPxD(w) is word w of the registered state; the flat vector SxD is built
  -- from the next-state words SNxD(w).
  gen_smap : for w in 0 to 15 generate
    SPxD(w)                       <= SxDP(128*w + 127 downto 128*w);
    SxD(128*w + 127 downto 128*w) <= SNxD(w);
  end generate gen_smap;

  -- shows which cycle we are in
  -- The total calculation is four clock cycles
  PartCntxS <= CntxSP(1 downto 0);

  -----------------------------------------------------------------------------
  -- Select input for the four AESblocks
  -- This essentially also implements the Big ShiftRows on S
  -----------------------------------------------------------------------------
  p_asel : process (PartCntxS, SPxD)
  begin  -- process p_asel
    case PartCntxS is
      when "00" =>
        A1xD <= SPxD(0);
        A2xD <= SPxD(5);
        A3xD <= SPxD(10);
        A4xD <= SPxD(15);

      when "01" =>
        A1xD <= SPxD(4);
        A2xD <= SPxD(9);
        A3xD <= SPxD(14);
        A4xD <= SPxD(3);

      when "10" =>
        A1xD <= SPxD(8);
        A2xD <= SPxD(13);
        A3xD <= SPxD(2);
        A4xD <= SPxD(7);

      when others =>
        A1xD <= SPxD(12);
        A2xD <= SPxD(1);
        A3xD <= SPxD(6);
        A4xD <= SPxD(11);
    end case;
  end process p_asel;

  -----------------------------------------------------------------------------
  -- Calculate the Roundkeys
  --
  -- This is now more complex as we do not pick the S'es in order but we pick
  -- them post-BigShiftRows. The low four key bits equal the index of the S
  -- word that the corresponding AES block receives in this cycle.
  -----------------------------------------------------------------------------
  p_keys : process (PartCntxS, CntxSP)
    variable BlkCnt : std_logic_vector(59 downto 0);
  begin  -- process p_keys
    -- counter without the two cycle-select bits
    BlkCnt := CntxSP(61 downto 2);

    case PartCntxS is
      when "00" =>
        K1xD <= BlkCnt & "0000";
        K2xD <= BlkCnt & "0101";
        K3xD <= BlkCnt & "1010";
        K4xD <= BlkCnt & "1111";

      when "01" =>
        K1xD <= BlkCnt & "0100";
        K2xD <= BlkCnt & "1001";
        K3xD <= BlkCnt & "1110";
        K4xD <= BlkCnt & "0011";

      when "10" =>
        K1xD <= BlkCnt & "1000";
        K2xD <= BlkCnt & "1101";
        K3xD <= BlkCnt & "0010";
        K4xD <= BlkCnt & "0111";

      when others =>
        K1xD <= BlkCnt & "1100";
        K2xD <= BlkCnt & "0001";
        K3xD <= BlkCnt & "0110";
        K4xD <= BlkCnt & "1011";
    end case;
  end process p_keys;
-------------------------------------------------------------------------------
-- Shuffle since ECHO has a strange way of representing data
-------------------------------------------------------------------------------
  -- Byte reversal of the four 64-bit round keys: byte b of the shuffled key
  -- is byte (7-b) of the computed key.
  gen_keyshuffle : for b in 0 to 7 generate
    K1SxD(8*b + 7 downto 8*b) <= K1xD(63 - 8*b downto 56 - 8*b);
    K2SxD(8*b + 7 downto 8*b) <= K2xD(63 - 8*b downto 56 - 8*b);
    K3SxD(8*b + 7 downto 8*b) <= K3xD(63 - 8*b downto 56 - 8*b);
    K4SxD(8*b + 7 downto 8*b) <= K4xD(63 - 8*b downto 56 - 8*b);
  end generate gen_keyshuffle;

-------------------------------------------------------------------------------
-- Instantiate four double AES
-------------------------------------------------------------------------------
  i_doubleAES1 : doubleAES
    port map (
      InxDI       => A1xD,
      RoundKeyxDI => K1SxD,
      OutxDO      => B1xD);

  i_doubleAES2 : doubleAES
    port map (
      InxDI       => A2xD,
      RoundKeyxDI => K2SxD,
      OutxDO      => B2xD);

  i_doubleAES3 : doubleAES
    port map (
      InxDI       => A3xD,
      RoundKeyxDI => K3SxD,
      OutxDO      => B3xD);

  i_doubleAES4 : doubleAES
    port map (
      InxDI       => A4xD,
      RoundKeyxDI => K4SxD,
      OutxDO      => B4xD);

-------------------------------------------------------------------------------
-- Now comes the Big Mixed Column
-------------------------------------------------------------------------------
  -- One mixcolumn instance per byte position: the 32-bit input is formed by
  -- the same byte of the four AES outputs (B1 in the top byte, B4 in the
  -- bottom byte) and the result is distributed back in the same order.
  gen_bigmixcols : for b in 0 to 15 generate

    -- map inputs
    BMIxD(b) <= B1xD(8*b + 7 downto 8*b) &
                B2xD(8*b + 7 downto 8*b) &
                B3xD(8*b + 7 downto 8*b) &
                B4xD(8*b + 7 downto 8*b);

    -- instantiate the mixcolumns
    i_mixcolumn : mixcolumn
      port map (
        InpxDI => BMIxD(b),
        OupxDO => BMOxD(b));

    -- map back to four signals
    BM1xD(8*b + 7 downto 8*b) <= BMOxD(b)(31 downto 24);
    BM2xD(8*b + 7 downto 8*b) <= BMOxD(b)(23 downto 16);
    BM3xD(8*b + 7 downto 8*b) <= BMOxD(b)(15 downto 8);
    BM4xD(8*b + 7 downto 8*b) <= BMOxD(b)(7 downto 0);

  end generate gen_bigmixcols;

-------------------------------------------------------------------------------
-- This process determines the next state for the S and Temp
-- registers
-------------------------------------------------------------------------------
  -- Combinational next-state logic for the sixteen S words and the four
  -- temporaries T1..T4. In every cycle the four Big MixColumns results
  -- BM1..BM4 are either written straight into S or parked in a temporary and
  -- copied into S in a later cycle. The trailing comments give the S index
  -- that a value belongs to.
  p_write : process (PartCntxS, SPxD,
                     BM1xD, BM2xD, BM3xD, BM4xD,
                     T1xDP, T2xDP, T3xDP, T4xDP)
  begin  -- process p_write

    -- defaults: everything keeps its value
    SNxD  <= SPxD;
    T1xDN <= T1xDP;
    T2xDN <= T2xDP;
    T3xDN <= T3xDP;
    T4xDN <= T4xDP;

    case PartCntxS is

      when "00" =>
        -- direct write
        SNxD(0) <= BM1xD;               -- 0
        -- parked results
        T1xDN   <= BM2xD;               -- T1 = 1
        T2xDN   <= BM3xD;               -- T2 = 2
        T3xDN   <= BM4xD;               -- T3 = 3

      when "01" =>
        -- direct writes
        SNxD(4) <= BM1xD;               -- 4
        SNxD(5) <= BM2xD;               -- 5
        -- release of a parked result
        SNxD(3) <= T3xDP;               -- 3
        -- parked results
        T3xDN   <= BM3xD;               -- T3 = 6
        T4xDN   <= BM4xD;               -- T4 = 7

      when "10" =>
        -- direct writes
        SNxD(8)  <= BM1xD;              -- 8
        SNxD(9)  <= BM2xD;              -- 9
        SNxD(10) <= BM3xD;              -- 10
        -- release of parked results
        SNxD(2)  <= T2xDP;              -- 2
        SNxD(7)  <= T4xDP;              -- 7
        -- parked result
        T2xDN    <= BM4xD;              -- T2 = 11

      when others =>
        -- direct writes
        SNxD(12) <= BM1xD;              -- 12
        SNxD(13) <= BM2xD;              -- 13
        SNxD(14) <= BM3xD;              -- 14
        SNxD(15) <= BM4xD;              -- 15
        -- release of the remaining parked results
        SNxD(1)  <= T1xDP;              -- 1
        SNxD(6)  <= T3xDP;              -- 6
        SNxD(11) <= T2xDP;              -- 11

    end case;
  end process p_write;

-------------------------------------------------------------------------------
-- Big Final
-- This is divided into two steps. Once the new data is here it is immediately
-- XOR'ed and added to V (BF1)
-- At the end of calculation 31st clock the State S is XOR'ed with V again
-- (BF2)
-- Technically the next V is VxDP xor BF1xD xor BF2xD. However BF1 is
-- available at the very beginning and BF2 is available at the very end.
-------------------------------------------------------------------------------
-- this is the State XOR'ed.
-- This is the next state
  --
  -- Both folds are built row by row. Row r (0..3) lands in the 128-bit slice
  -- 128*(4-r)-1 downto 128*(3-r), i.e. row 0 in the topmost slice.
  gen_bigfinal : for r in 0 to 3 generate

    -- state fold: XOR of the four next-state words r, r+4, r+8, r+12
    BF2xD(128*(4-r) - 1 downto 128*(3-r)) <=
      SNxD(r) xor SNxD(r + 4) xor SNxD(r + 8) xor SNxD(r + 12);

    -- this one is the message XOR'ed
    -- Note that we need the shuffled data in for this as well:
    -- XOR of the shuffled input words r, r+4 and r+8
    BF1xD(128*(4-r) - 1 downto 128*(3-r)) <=
      DataShufflexD(128*r + 127 downto 128*r) xor
      DataShufflexD(128*(r + 4) + 127 downto 128*(r + 4)) xor
      DataShufflexD(128*(r + 8) + 127 downto 128*(r + 8));

  end generate gen_bigfinal;

-------------------------------------------------------------------------------
-- State Machines
--
-- There are (at the moment) three state machines:
--  p_s:   controls what happens with the STATE (SxDN)
--  p_v:   controls the VECTOR (VxDN) and at the same time to allow fast working
--         also the OUTPUT (OxDN)
--  p_fsm: is for the main control flow, Counters, Last Flag, OutEn signal
--
-- The first two should produce larger multiplexer structures, the last one is
-- the 'real' FSM of the system.
-------------------------------------------------------------------------------
-- reshuffling, strange way of writing in data..
-- at least for me
  -- The twelve 128-bit input words are taken in reverse order: word i of
  -- DataShufflexD is word (11-i) of DataInxDI.
  gen_shuffle : for i in 0 to 11 generate
    DataShufflexD((i+1)*128 - 1 downto i*128) <=
      DataInxDI((12-i)*128 - 1 downto (11-i)*128);
  end generate gen_shuffle;

  -----------------------------------------------------------------------------
  -- Next state for S.
  -- By default S takes the value computed by the datapath (SxD). A new block
  -- is loaded in state init, or in state run on the CSTOP cycle, whenever
  -- InENxSI is set: the upper 1536 bits get the shuffled input, the lower 512
  -- bits get VINIT (fresh message) or VxDN (continuation).
  --
  -- NOTE(review): in state chain this process does not load a new block into
  -- S, although p_v and p_fsm accept one there -- confirm whether multi-block
  -- operation through the chain state is expected to work.
  -- NOTE(review): when continuing, the value placed in S is VxDN, which at
  -- that point already includes BF1xD of the incoming block -- verify this
  -- against the ECHO specification.
  -----------------------------------------------------------------------------
  p_s : process (SxD, InENxSI, StatexDP, DataShufflexD, VxDN, CntxSP, LastxSP)
  begin  -- process p_s
    SxDN <= SxD;

    case StatexDP is
      when init =>
        if InENxSI = '1' then
          SxDN <= DataShufflexD & VINIT;
        end if;

      when run =>
        if CntxSP(4 downto 0) = CSTOP then
          if InENxSI = '1' then
            if LastxSP = '1' then
              -- the block that just finished was the last one of its
              -- message: start over from the initial vector
              SxDN <= DataShufflexD & VINIT;
            else
              SxDN <= DataShufflexD & VxDN;
            end if;
          end if;
        end if;

      when others => null;
    end case;
  end process p_s;

  -----------------------------------------------------------------------------
  -- Next state for V, O (Output)
  -----------------------------------------------------------------------------
  p_v : process (VxDP, OxDP, BF1xD, BF2xD, StatexDP, InENxSI, CntxSP, LastxSP)
  begin  -- process p_v
    -- default
    VxDN <= VxDP;                       -- by default do not update
    OxDN <= OxDP;

    case StatexDP is
      when init =>
        VxDN <= VINIT;                  -- initialize
        if InENxSI = '1' then
          VxDN <= VINIT xor BF1xD;      -- add the message to initial value
        end if;

      when run =>
        if CntxSP(4 downto 0) = CSTOP then  -- the last round
          VxDN <= VxDP xor BF2xD;       -- add the SxDN of run 31 which will be
                                        -- the final state

          -- we are done with computing at this stage write the output and
          -- be done
          OxDN <= VxDP(511 downto 256) xor BF2xD(511 downto 256);

          if InENxSI = '1' then         -- new block starting
            if LastxSP = '1' then       -- is this the last block
              VxDN <= VINIT xor BF1xD;  -- Clear V, get Input
            else
              VxDN <= VxDP xor BF2xD xor BF1xD;  -- We keep the V, add the last
                                                 -- state and also add the
                                                 -- input which has arrived
            end if;
          end if;
        end if;

      when chain =>
        if InENxSI = '1' then
          VxDN <= VxDP xor BF1xD;       -- add the message
        end if;

      when others => null;
    end case;
  end process p_v;

  -----------------------------------------------------------------------------
  -- main FSM
  --
  -- init : counter is held at CINIT until a block arrives (InENxSI = '1').
  -- run  : counter increments every cycle; when its five low bits reach
  --        CSTOP the output enable is raised for the following cycle and
  --        either a new block is accepted immediately or the FSM goes to
  --        init (finished block was the last one) or chain (more to come).
  -- chain: counter is frozen until the next block arrives.
  --
  -- LastxSP always describes the block that is currently being processed.
  -----------------------------------------------------------------------------
  p_fsm : process (CntxSP, LastxSI, LastxSP, InENxSI, StatexDP)
  begin  -- process p_fsm
    -- defaults
    StatexDN <= StatexDP;
    OutEnxSN <= '0';                    -- output is not ready
    CntxSN   <= std_logic_vector(unsigned(CntxSP) + 1);
    LastxSN  <= LastxSP;

    case StatexDP is
      when init =>
        CntxSN <= CINIT;
        if InENxSI = '1' then           -- there is new data
          StatexDN <= run;              -- go to run state
          if LastxSI = '1' then         -- this will be the last block
            LastxSN <= '1';             -- record the last block
          end if;
        end if;

      when run =>
        if CntxSP(4 downto 0) = CSTOP then
          OutEnxSN <= '1';              -- Next cycle the output is ready
          if InENxSI = '1' then         -- New data is here
            StatexDN <= run;            -- Go to run state
            if LastxSP = '1' then       -- was this the last ??
              CntxSN <= CINIT;          -- reset the counter
            end if;
            -- Record the last flag of the block accepted right now. The flag
            -- of the finished block is replaced by it; previously LastxSI was
            -- ignored here, so a last block arriving back-to-back was treated
            -- as a non-last block.
            LastxSN <= LastxSI;
          else
            if LastxSP = '1' then       -- is this the last message block ??
              StatexDN <= init;         -- restart counters
              LastxSN  <= '0';          -- reset also Last
            else
              StatexDN <= chain;        -- keep the count
            end if;
          end if;
        end if;

      when chain =>
        CntxSN <= CntxSP;               -- wait until there is new data
        if InENxSI = '1' then           -- there is new data
          StatexDN <= run;              -- now continue
          if LastxSI = '1' then         -- this will be the last block
            LastxSN <= '1';             -- keep the information
          end if;
        end if;

      when others => null;
    end case;
  end process p_fsm;

  -- Outputs
  -- Now from the register
  DataOutxDO <= OxDP;

  -- Output Enable is also registered
  OutEnxSO <= OutEnxSP;

  -----------------------------------------------------------------------------
  -- Clocked process for all the registers
  -----------------------------------------------------------------------------
  p_clk : process (CLKxCI, RSTxRBI)
  begin  -- process p_clk
    if RSTxRBI = '0' then               -- asynchronous reset (active low)
      SxDP     <= (others => '0');
      VxDP     <= VINIT;
      T1xDP    <= (others => '0');
      T2xDP    <= (others => '0');
      T3xDP    <= (others => '0');
      T4xDP    <= (others => '0');
      OxDP     <= (others => '0');
      CntxSP   <= CINIT;
      LastxSP  <= '0';
      OutEnxSP <= '0';
      StatexDP <= init;
    elsif CLKxCI'event and CLKxCI = '1' then  -- rising clock edge
      SxDP     <= SxDN;
      VxDP     <= VxDN;
      T1xDP    <= T1xDN;
      T2xDP    <= T2xDN;
      T3xDP    <= T3xDN;
      T4xDP    <= T4xDN;
      OxDP     <= OxDN;
      CntxSP   <= CntxSN;
      LastxSP  <= LastxSN;
      OutEnxSP <= OutEnxSN;
      StatexDP <= StatexDN;
    end if;
  end process p_clk;

end fast;