------------------------------------------------------------
-- Copyright: 2010 Integrated Systems Laboratory, ETH Zurich
-- http://www.iis.ee.ethz.ch/~sha3
------------------------------------------------------------
-------------------------------------------------------------------------------
-- Title      : ECHO 0.2 Gb/s implementation
-- Project    :
-------------------------------------------------------------------------------
-- File       : echo_slow.vhd
-- Author     : Frank/Luca account
-- Company    : Integrated Systems Laboratory, ETH Zurich
-- Created    : 2010-02-23
-- Last update: 2010-04-13
-- Platform   : ModelSim (simulation), Synopsys (synthesis)
-- Standard   : VHDL'87
-------------------------------------------------------------------------------
-- Description: This is a slow ECHO datapath with a single 32bit AES
-------------------------------------------------------------------------------
-- Copyright (c) 2010 Integrated Systems Laboratory, ETH Zurich
-------------------------------------------------------------------------------
-- Revisions  :
-- Date        Version  Author  Description
-- 2010-02-23  1.0      sha3    Created
-- 2010-04-12  1.1      sha3    There is a small problem with the assignment
--                              ordering in general. This is being addressed
--                              Added a register to keep the SALT round. This
--                              is the round where we do not use the key
-------------------------------------------------------------------------------

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

entity echo is
  port (
    DataInxDI  : in  std_logic_vector(1535 downto 0);  -- All inputs parallel
    LastxSI    : in  std_logic;   -- 1: this data will be the last round
                                  -- 0: continue chaining
    InENxSI    : in  std_logic;   -- 1: DataInxDI/LastxSI are accepted in the
                                  --    states 'init' and 'chain'
    DataOutxDO : out std_logic_vector(255 downto 0);
    OutEnxSO   : out std_logic;   -- 1 for one cycle (state 'last')
    CLKxCI     : in  std_logic;
    RSTxRBI    : in  std_logic);  -- asynchronous reset, active low
end echo;

architecture slow of echo is

  -- This file is based on the FAST ECHO implementation. Several things
  -- have been modified in order to use as much as possible from the fast
  -- echo.

  -- Input data with its twelve 128-bit words in reversed order
  signal DataShufflexD : std_logic_vector(1535 downto 0);

  -- initialization string for VxDP according to section 2.1, on pg 7 of the
  -- ECHO description
  constant VINIT : std_logic_vector(511 downto 0) :=
    (496 => '1', 368 => '1', 240 => '1', 112 => '1', others => '0');

  -- somehow we fixed the length of the message to be one 1376. It is not
  -- difficult to make it work with any length, but at the moment this is the
  -- easiest alternative
  -- (15 hex digits & 4 bits = 64 bits, value 0x560 = 1376)
  constant CINIT : std_logic_vector(63 downto 0) := X"000000000000056"&"0000";

  -- this has to be one less than CINIT(4 downto 0);
  -- NOTE(review): CSTOP is not referenced anywhere in this architecture and
  -- its value does not match the statement above (CINIT(4 downto 0) is
  -- "00000") -- confirm whether it can be dropped.
  constant CSTOP : std_logic_vector(4 downto 0) := "10111";

  -- Chaining value register (V) and big state register (S)
  signal VxDN, VxDP        : std_logic_vector(511 downto 0);
  signal SxDN, SxDP        : std_logic_vector(2047 downto 0);
  signal BigSubxD, BigMixD : std_logic_vector(2047 downto 0);

  -- The big state viewed as 16 words of 128 bits
  type s_type is array (0 to 15) of std_logic_vector(127 downto 0);
  signal BSxD, BMxD, SPxD, AESxD : s_type;

  -- this is the 128 bit state of the aes
  signal APxD, ANxD : std_logic_vector(127 downto 0);

  -- the 4 temporary storage that we have for the shiftrows
  signal ATxDN, ATxDP : std_logic_vector(31 downto 0);

  -- and this is the 32 bit that comes in and out
  signal AInxD, ASubxD, AMixD, AKeyxD, AOutxD : std_logic_vector(31 downto 0);

  type BytesType is array (0 to 15) of std_logic_vector (7 downto 0);
  signal AxD, BxD : BytesType;

  -- The name is Big Mixcolumns Input and BigMixcolumnsOutput
  type bm_type is array (0 to 63) of std_logic_vector(31 downto 0);
  signal BMIxD, BMOxD : bm_type;

  -- Big Final Signals
  signal BF1xD, BF2xD : std_logic_vector(511 downto 0);

  signal KeyxD : std_logic_vector(127 downto 0);
  signal KSxD  : std_logic_vector(63 downto 0);

  signal CntxSP, CntxSN       : std_logic_vector(63 downto 0);
  signal SCntxSP, SCntxSN     : integer range 0 to 15;
  signal RndCntxSP, RndCntxSN : integer range 0 to 15;
  signal LastxSP, LastxSN     : std_logic;

  type states_type is (init, chain, aes0, aes1, aes2, aes3, bigmix, last);
  signal StatexDP, StatexDN : states_type;

  signal SaltxSP, SaltxSN : std_logic;

  component sbox
    port (
      InpxDI : in  std_logic_vector(7 downto 0);
      OupxDO : out std_logic_vector(7 downto 0));
  end component;

  component mixcolumn
    port (
      InpxDI : in  std_logic_vector(31 downto 0);
      OupxDO : out std_logic_vector(31 downto 0));
  end component;

begin  -- slow

  -- reshuffling, strange way of writing in data.. at least for me
  -- (reverses the order of the twelve 128-bit input words)
  gen_shuffle : for i in 0 to 11 generate
    DataShufflexD((i+1)*128-1 downto i*128)
      <= DataInxDI((12-i)*128-1 downto (11-i)*128);
  end generate gen_shuffle;

  -- make typing easier
  gen_smap : for i in 0 to 15 generate
    SPxD(i)                               <= SxDP((128*(i+1))-1 downto 128*i);
    BigMixD((128*(i+1))-1 downto 128*i)   <= BMxD(i);
    BigSubxD((128*(i+1))-1 downto 128*i)  <= AESxD(i);
  end generate gen_smap;

  -------------------------------------------------------------------------------
  -- S Register
  -------------------------------------------------------------------------------
  -- input multiplexer chooses the next state of SxDN
  p_inmux : process (DataShufflexD, VxDP, SxDP, StatexDP, BigMixD, BigSubxD)
  begin  -- process p_inmux
    SxDN <= SxDP;
    case StatexDP is
      when init   => SxDN <= DataShufflexD & VINIT;
      when chain  => SxDN <= DataShufflexD & VxDP;
      when bigmix => SxDN <= BigMixD;
      -- aes0..aes3 and last: write back the word currently in the AES unit.
      -- In 'last' BxD equals AxD, so S is left unchanged.
      when others => SxDN <= BigSubxD;
    end case;
  end process p_inmux;

  -------------------------------------------------------------------------------
  -- AES block
  -------------------------------------------------------------------------------
  -- Multiplexer to select one of the 16 BigStates for AES (APxD)
  -- At the same time write back the 128 bit value back to S
  --
  -- This is a 2-level mux. At the first level we pick 128 out 2048
  -- At the second level we will pick one out of APxD
  APxD <= SPxD(SCntxSP);

  p_AESdata : process (SCntxSP, SPxD, ANxD)
  begin  -- process p_AESdata
    AESxD          <= SPxD;
    AESxD(SCntxSP) <= ANxD;   -- only the selected word is replaced
  end process p_AESdata;

  -- for readability
  -- APxD is the input to the AES block
  --   this is mapped to AxD(i)
  -- The output of the AES Block is ANxD
  --   this is mapped to BxD(i)
  -- Byte 0 is the most significant byte of the 128-bit word.
  gen_byte_assign : for i in 0 to 15 generate
    AxD(i)                                <= APxD(((15-i)*8)+7 downto (15-i)*8);
    ANxD(((15-i)*8)+7 downto (15-i)*8)    <= BxD(i);
  end generate gen_byte_assign;

  -- One AES round is spread over the four states aes0..aes3. In each state
  -- four input bytes (AInxD) are sent through sbox, mixcolumn and key
  -- addition; the 32-bit result AOutxD holds bytes 4*k .. 4*k+3 of the new
  -- word in state aes<k>.
  -- A result byte is written to BxD immediately when the old byte at that
  -- position is not read as an input in a later state. Otherwise it is
  -- parked in the temporary register ATxD and copied to BxD in the state
  -- in which the old byte is read for the last time.
  -- All bytes not listed in a state keep their current value (BxD <= AxD).
  p_AESround : process (AxD, StatexDP, ATxDP, AOutxD)
  begin  -- process p_AESround
    AInxD <= AxD(0) & AxD(5) & AxD(10) & AxD(15);
    ATxDN <= ATxDP;
    BxD   <= AxD;

    case StatexDP is
      when aes0 =>
        AInxD               <= AxD(0) & AxD(5) & AxD(10) & AxD(15);  -- 0 5 10 15
        BxD(0)              <= AOutxD(31 downto 24);  -- 0
        ATxDN( 7 downto  0) <= AOutxD(23 downto 16);  -- T1 = 1
        ATxDN(15 downto  8) <= AOutxD(15 downto  8);  -- T2 = 2
        ATxDN(23 downto 16) <= AOutxD( 7 downto  0);  -- T3 = 3

      when aes1 =>
        AInxD               <= AxD(4) & AxD(9) & AxD(14) & AxD(3);   -- 4 9 14 3
        BxD(4)              <= AOutxD(31 downto 24);  -- 4
        BxD(5)              <= AOutxD(23 downto 16);  -- 5
        ATxDN(23 downto 16) <= AOutxD(15 downto  8);  -- T3 = 6
        ATxDN(31 downto 24) <= AOutxD( 7 downto  0);  -- T4 = 7
        BxD(3)              <= ATxDP(23 downto 16);   -- 3 (parked in aes0)

      when aes2 =>
        AInxD               <= AxD(8) & AxD(13) & AxD(2) & AxD(7);   -- 8 13 2 7
        BxD(8)              <= AOutxD(31 downto 24);  -- 8
        BxD(9)              <= AOutxD(23 downto 16);  -- 9
        BxD(10)             <= AOutxD(15 downto  8);  -- 10
        ATxDN(15 downto  8) <= AOutxD( 7 downto  0);  -- T2 = 11
        BxD(2)              <= ATxDP(15 downto  8);   -- 2 (parked in aes0)
        BxD(7)              <= ATxDP(31 downto 24);   -- 7 (parked in aes1)

      when aes3 =>
        AInxD               <= AxD(12) & AxD(1) & AxD(6) & AxD(11);  -- 12 1 6 11
        BxD(12)             <= AOutxD(31 downto 24);  -- 12
        BxD(13)             <= AOutxD(23 downto 16);  -- 13
        BxD(14)             <= AOutxD(15 downto  8);  -- 14
        BxD(15)             <= AOutxD( 7 downto  0);  -- 15
        BxD(1)              <= ATxDP( 7 downto  0);   -- 1  (parked in aes0)
        BxD(6)              <= ATxDP(23 downto 16);   -- 6  (parked in aes1)
        BxD(11)             <= ATxDP(15 downto  8);   -- 11 (parked in aes2)

      when others => null;
    end case;
  end process p_AESround;

  -- Instantiate four Sboxes
  g_sbox : for i in 0 to 3 generate
    i_sbox : sbox
      port map (
        InpxDI => AInxD (((i+1)*8)-1 downto i*8),
        OupxDO => ASubxD(((i+1)*8)-1 downto i*8));
  end generate g_sbox;

  -- Now the mixcolumn
  i_mixcolumn : mixcolumn
    port map (
      InpxDI => ASubxD,
      OupxDO => AMixD);

  -------------------------------------------------------------------------------
  -- Shuffle since ECHO has a strange way of representing data
  -------------------------------------------------------------------------------
  -- (reverses the byte order of the 64-bit counter)
  gen_keyshuffle : for i in 0 to 7 generate
    KSxD((i+1)*8-1 downto i*8) <= CntxSP((8-i)*8-1 downto (7-i)*8);
  end generate gen_keyshuffle;

  -- Determine the round key
  -- Note that every odd round the key will be all zeroes (instead of the Salt)
  -- If SaltxSP = '1' we are in an odd round
  KeyxD <= KSxD & X"0000000000000000" when SaltxSP = '0' else (others => '0');

  -- Select the small part of the roundkey
  p_AES_key_part : process (KeyxD, StatexDP)
  begin  -- process p_AES_key_part
    AKeyxD <= KeyxD(31 downto 0);
    case StatexDP is
      when aes0   => AKeyxD <= KeyxD(127 downto 96);
      when aes1   => AKeyxD <= KeyxD( 95 downto 64);
      when aes2   => AKeyxD <= KeyxD( 63 downto 32);
      when aes3   => AKeyxD <= KeyxD( 31 downto  0);
      when others => null;
    end case;
  end process p_AES_key_part;

  -- Now add the Key to the part we have mixed
  AOutxD <= AMixD xor AKeyxD;

  -- The AOutxD will be copied to the appropriate BxD or ATxD by
  -- the process p_AESround above.

  -------------------------------------------------------------------------------
  -- Big MixColumn
  -------------------------------------------------------------------------------
  -- Now the big Shiftrows; BS == BigShift
  BSxD( 0) <= SPxD( 0);
  BSxD( 4) <= SPxD( 4);
  BSxD( 8) <= SPxD( 8);
  BSxD(12) <= SPxD(12);

  BSxD( 1) <= SPxD( 5);
  BSxD( 5) <= SPxD( 9);
  BSxD( 9) <= SPxD(13);
  BSxD(13) <= SPxD( 1);

  BSxD( 2) <= SPxD(10);
  BSxD( 6) <= SPxD(14);
  BSxD(10) <= SPxD( 2);
  BSxD(14) <= SPxD( 6);

  BSxD( 3) <= SPxD(15);
  BSxD( 7) <= SPxD( 3);
  BSxD(11) <= SPxD( 7);
  BSxD(15) <= SPxD(11);

  -- I think instead of adding the multiplexers here, it would be
  -- better to do everything in parallel. However, we need to see this
  -- as this is a fairly large chunk. 64 mixcols in parallel
  gen_bigmix : for i in 0 to 3 generate
    gen_bigmiy : for j in 0 to 15 generate
      --map inputs: byte j of the four words of big column i
      BMIxD(i*16 + j) <= BSxD( i*4   )((j+1)*8-1 downto j*8) &
                         BSxD((i*4)+1)((j+1)*8-1 downto j*8) &
                         BSxD((i*4)+2)((j+1)*8-1 downto j*8) &
                         BSxD((i*4)+3)((j+1)*8-1 downto j*8);

      --instantiate the mixcolumns
      i_mixcolumn : mixcolumn
        port map (
          InpxDI => BMIxD(i*16 + j),
          OupxDO => BMOxD(i*16 + j));

      --map outputs
      BMxD( i*4   )((j+1)*8-1 downto j*8) <= BMOxD(i*16 + j)(31 downto 24);
      BMxD((i*4)+1)((j+1)*8-1 downto j*8) <= BMOxD(i*16 + j)(23 downto 16);
      BMxD((i*4)+2)((j+1)*8-1 downto j*8) <= BMOxD(i*16 + j)(15 downto  8);
      BMxD((i*4)+3)((j+1)*8-1 downto j*8) <= BMOxD(i*16 + j)( 7 downto  0);
    end generate gen_bigmiy;
  end generate gen_bigmix;

  -- BM is mapped to BigMixD in the gen statement gen_smap;

  -------------------------------------------------------------------------------
  -- Big Final
  -- This is divided into two steps. Once the new data is here it is immediately
  -- XOR'ed and added to V (BF1)
  -- At the end of calculation State S is XOR'ed with V again
  -- (BF2)
  -- Technically the VxDP <= VxDN xor BF1xD xor BF2xD. However BF1 is available
  -- at the very beginning and BF2 is available at the very end.
  -------------------------------------------------------------------------------

  -- this is the State XOR'ed. This is the present state
  BF2xD(511 downto 384) <= SPxD(0) xor SPxD(4) xor SPxD( 8) xor SPxD(12);
  BF2xD(383 downto 256) <= SPxD(1) xor SPxD(5) xor SPxD( 9) xor SPxD(13);
  BF2xD(255 downto 128) <= SPxD(2) xor SPxD(6) xor SPxD(10) xor SPxD(14);
  BF2xD(127 downto   0) <= SPxD(3) xor SPxD(7) xor SPxD(11) xor SPxD(15);

  -- this one is the message XOR'ed
  -- Note that we need the shuffled data in for this as well
  BF1xD(511 downto 384) <= DataShufflexD( 127 downto    0) xor
                           DataShufflexD( 639 downto  512) xor
                           DataShufflexD(1151 downto 1024);
  BF1xD(383 downto 256) <= DataShufflexD( 255 downto  128) xor
                           DataShufflexD( 767 downto  640) xor
                           DataShufflexD(1279 downto 1152);
  BF1xD(255 downto 128) <= DataShufflexD( 383 downto  256) xor
                           DataShufflexD( 895 downto  768) xor
                           DataShufflexD(1407 downto 1280);
  BF1xD(127 downto   0) <= DataShufflexD( 511 downto  384) xor
                           DataShufflexD(1023 downto  896) xor
                           DataShufflexD(1535 downto 1408);

  -------------------------------------------------------------------------------
  -- V Register
  -------------------------------------------------------------------------------
  p_v : process (VxDP, StatexDP, BF1xD, BF2xD, InENxSI)
  begin  -- process p_v
    VxDN <= VxDP;
    case StatexDP is
      when init =>
        -- beginning; recomputed from VINIT every cycle, so waiting in
        -- 'init' does not accumulate anything
        VxDN <= VINIT xor BF1xD;
      when chain =>
        -- we have new data. The update is XOR-accumulating, so it must only
        -- happen in the cycle in which the block is accepted; otherwise
        -- every wait cycle in 'chain' would fold DataInxDI into V again.
        if InENxSI = '1' then
          VxDN <= VxDP xor BF1xD;
        end if;
      when last =>
        VxDN <= VxDP xor BF2xD;    -- last cycle
      when others => null;
    end case;
  end process p_v;

  -------------------------------------------------------------------------------
  -- Main FSM
  -------------------------------------------------------------------------------
  p_fsm : process (StatexDP, LastxSI, InENxSI, SCntxSP, RndCntxSP, CntxSP,
                   LastxSP, SaltxSP)
  begin  -- process p_fsm
    -- defaults
    StatexDN  <= StatexDP;
    OutEnxSO  <= '0';
    RndCntxSN <= RndCntxSP;
    SCntxSN   <= SCntxSP;
    CntxSN    <= CntxSP;
    LastxSN   <= LastxSP;
    SaltxSN   <= SaltxSP;

    case StatexDP is
      when init =>
        RndCntxSN <= 0;
        SCntxSN   <= 0;
        CntxSN    <= CINIT;
        if InENxSI = '1' then
          StatexDN <= aes0;
          if LastxSI = '1' then
            LastxSN <= '1';
          end if;
        end if;

      when chain =>
        RndCntxSN <= 0;
        SCntxSN   <= 0;
        -- Cnt continues
        if InENxSI = '1' then
          StatexDN <= aes0;
          -- The last-block flag must also be latched for blocks that arrive
          -- while chaining; otherwise 'last' always returns to 'chain' and
          -- the FSM never gets back to 'init' for multi-block messages.
          if LastxSI = '1' then
            LastxSN <= '1';
          end if;
        end if;

      when aes0 => StatexDN <= aes1;
      when aes1 => StatexDN <= aes2;
      when aes2 => StatexDN <= aes3;

      when aes3 =>
        -- the counter advances once per 128-bit word in the keyed pass
        if SaltxSP = '0' then
          CntxSN <= std_logic_vector(unsigned(CntxSP) + 1);
        end if;

        if SCntxSP = 15 then
          -- all 16 words of the big state have been processed
          SCntxSN <= 0;
          if RndCntxSP = 15 then
            RndCntxSN <= 0;
          else
            RndCntxSN <= RndCntxSP + 1;
          end if;
          -- We will use the SaltxSP to determine in which round we are
          if SaltxSP = '0' then
            StatexDN <= aes0;      -- even
            SaltxSN  <= '1';
          else
            StatexDN <= bigmix;    -- odd
            SaltxSN  <= '0';
          end if;
        else
          SCntxSN  <= SCntxSP + 1;
          StatexDN <= aes0;
        end if;

      when bigmix =>
        -- be careful here RndCntxSP will be reset in the preceding aes3
        -- round, so we need to check for 0 to determine the end.
        if RndCntxSP = 0 then
          StatexDN <= last;
        else
          StatexDN <= aes0;        -- not finished
        end if;

      when last =>
        OutEnxSO <= '1';
        -- Normally we would have an ack here
        if LastxSP = '1' then      -- Was last block
          LastxSN  <= '0';         -- clear Last
          StatexDN <= init;
        else
          StatexDN <= chain;
        end if;

      when others => null;
    end case;
  end process p_fsm;

  -------------------------------------------------------------------------------
  -- The Output
  -------------------------------------------------------------------------------
  -- output is valid for *ONE* cycle in the state 'last'
  -- it is not ideal, we would probably have liked to have an output
  -- acknowledge but at the moment this will suffice
  -- (VxDN is used, i.e. the value that V takes on the next clock edge)
  DataOutxDO <= VxDN(511 downto 256);

  -------------------------------------------------------------------------------
  -- Clocked process for all the registers
  -------------------------------------------------------------------------------
  p_clk : process (CLKxCI, RSTxRBI)
  begin  -- process p_clk
    if RSTxRBI = '0' then                   -- asynchronous reset (active low)
      SxDP      <= (others => '0');
      VxDP      <= VINIT;
      ATxDP     <= (others => '0');
      CntxSP    <= CINIT;
      LastxSP   <= '0';
      SCntxSP   <= 0;
      RndCntxSP <= 0;
      StatexDP  <= init;
      SaltxSP   <= '0';
    elsif CLKxCI'event and CLKxCI = '1' then  -- rising clock edge
      SxDP      <= SxDN;
      VxDP      <= VxDN;
      ATxDP     <= ATxDN;
      CntxSP    <= CntxSN;
      LastxSP   <= LastxSN;
      SCntxSP   <= SCntxSN;
      RndCntxSP <= RndCntxSN;
      StatexDP  <= StatexDN;
      SaltxSP   <= SaltxSN;
    end if;
  end process p_clk;

end slow;