------------------------------------------------------------
-- Copyright: 2010 Integrated Sytems Laboratory, ETH Zurich
-- http://www.iis.ee.ethz.ch/~sha3
------------------------------------------------------------
-------------------------------------------------------------------------------
-- Title : ECHO 0.2 Gb/s implementation
-- Project :
-------------------------------------------------------------------------------
-- File : echo_slow.vhd
-- Author : Frank/Luca account
-- Company : Integrated Systems Laboratory, ETH Zurich
-- Created : 2010-02-23
-- Last update: 2010-04-13
-- Platform : ModelSim (simulation), Synopsys (synthesis)
-- Standard : VHDL'87
-------------------------------------------------------------------------------
-- Description: This is a slow ECHO datapath with a single 32bit AES
-------------------------------------------------------------------------------
-- Copyright (c) 2010 Integrated Systems Laboratory, ETH Zurich
-------------------------------------------------------------------------------
-- Revisions :
-- Date Version Author Description
-- 2010-02-23 1.0 sha3 Created
-- 2010-04-12 1.1 sha3 There is a small problem with the assignment
-- ordering in general. This is being addressed.
-- Added a register to keep the SALT round. This
-- is the round where we do not use the key.
-------------------------------------------------------------------------------
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
-- Port interface of the ECHO core.
-- A complete 1536-bit block is presented in parallel on DataInxDI and is
-- taken over when InENxSI is '1'. The 256-bit result is presented on
-- DataOutxDO while OutEnxSO is '1'.
entity echo is
port (
DataInxDI : in std_logic_vector(1535 downto 0); -- All inputs parallel
LastxSI : in std_logic; -- 1: this data will be the last round
-- 0: continue chaining
InENxSI : in std_logic; -- 1: input data is valid, start processing it
DataOutxDO : out std_logic_vector(255 downto 0); -- result, valid while OutEnxSO = '1'
OutEnxSO : out std_logic; -- 1: DataOutxDO is valid (FSM is in state 'last')
CLKxCI : in std_logic; -- clock, all registers update on the rising edge
RSTxRBI : in std_logic); -- asynchronous reset, active low
end echo;
architecture slow of echo is
-- This file is based on the FAST ECHO implementation. Several things
-- have been modified in order to use as much as possible from the fast
-- echo.
-- DataInxDI with the order of its twelve 128-bit words reversed (gen_shuffle)
signal DataShufflexD : std_logic_vector(1535 downto 0);
-- initialization string for VxDP according to section 2.1, on pg 7 of the
-- ECHO description: bit 112 of each of the four 128-bit words is set
constant VINIT : std_logic_vector(511 downto 0) :=
(496 => '1', 368 => '1' , 240 => '1', 112 => '1', others => '0');
-- The length of the message is fixed to 1376 (X"560", presumably counted in
-- bits); this is the value loaded into the counter CntxSP on reset and in
-- state 'init'. It is not difficult to make it work with any length, but at
-- the moment this is the easiest alternative.
constant CINIT : std_logic_vector(63 downto 0) := X"000000000000056"&"0000";
-- this has to be one less than CINIT(4 downto 0);
-- NOTE(review): CSTOP is not referenced anywhere in this architecture, and
-- "10111" is not CINIT(4 downto 0) - 1 for the current CINIT -- confirm
-- whether this constant is still needed.
constant CSTOP : std_logic_vector(4 downto 0) := "10111";
-- next / present value of the 512-bit register V
signal VxDN, VxDP : std_logic_vector(511 downto 0);
-- next / present value of the 2048-bit register S
-- (SxD is not referenced in this architecture)
signal SxDN, SxDP, SxD : std_logic_vector(2047 downto 0);
-- flattened AESxD and BMxD, the two computed candidates for SxDN (p_inmux)
signal BigSubxD, BigMixD : std_logic_vector(2047 downto 0);
-- the 2048-bit state viewed as sixteen 128-bit words
type s_type is array (0 to 15) of std_logic_vector(127 downto 0);
-- SPxD : the words of SxDP
-- AESxD : SPxD with word SCntxSP replaced by ANxD
-- BSxD : the words after the big shiftrows
-- BMxD : the words after the big mixcolumn
-- (SNxD is not referenced in this architecture)
signal SNxD, BSxD, BMxD, SPxD, AESxD : s_type;
-- this is the 128 bit state of the aes:
-- APxD is the word selected from S, ANxD is the word written back to S
signal APxD,ANxD : std_logic_vector(127 downto 0);
-- the 4 byte-wide temporary registers that we have for the shiftrows
-- (next / present value)
signal ATxDN,ATxDP : std_logic_vector(31 downto 0);
-- and these are the 32 bits that go through the AES slice:
-- AInxD -> sboxes -> ASubxD -> mixcolumn -> AMixD, xor AKeyxD -> AOutxD
signal AInxD, ASubxD, AMixD, AKeyxD, AOutxD : std_logic_vector(31 downto 0);
-- byte-wise view of APxD (AxD) and of ANxD (BxD)
type BytesType is array (0 to 15) of std_logic_vector (7 downto 0);
signal AxD, BxD : BytesType;
-- The name is Big Mixcolumns Input and BigMixcolumnsOutput
type bm_type is array (0 to 63) of std_logic_vector(31 downto 0);
signal BMIxD,BMOxD : bm_type;
-- Big Final Signals
-- BF1xD is derived from the input data, BF2xD from the present state S
signal BF1xD, BF2xD : std_logic_vector(511 downto 0);
-- 128-bit round key; all zeroes while SaltxSP = '1'
signal KeyxD : std_logic_vector(127 downto 0);
-- CntxSP with its byte order reversed (gen_keyshuffle)
signal KSxD : std_logic_vector(63 downto 0);
-- 64-bit counter that is used as the key material
signal CntxSP, CntxSN : std_logic_vector(63 downto 0);
-- index of the 128-bit state word that is currently in the AES slice
signal SCntxSP, SCntxSN : integer range 0 to 15;
-- counts the completed passes over all 16 state words, wraps from 15 to 0
signal RndCntxSP, RndCntxSN : integer range 0 to 15;
-- registered copy of LastxSI
signal LastxSP, LastxSN : std_logic;
-- FSM states: wait for data (init, chain), the four cycles of one AES
-- round on one word (aes0..aes3), the big mixcolumn and the output cycle
type states_type is (init, chain, aes0, aes1, aes2, aes3, bigmix, last);
signal StatexDP, StatexDN : states_type;
-- '1' during the pass in which the round key is forced to all zeroes
signal SaltxSP, SaltxSN : std_logic;
-- 8-bit to 8-bit substitution box; the implementation is not part of this
-- file (presumably the AES S-box -- confirm against the sbox entity).
component sbox
port (
InpxDI : in std_logic_vector(7 downto 0);
OupxDO : out std_logic_vector(7 downto 0));
end component;
-- 32-bit to 32-bit column mixing function; the implementation is not part of
-- this file. It is used once in the AES slice and 64 times in the big
-- mixcolumn.
component mixcolumn
port (
InpxDI : in std_logic_vector(31 downto 0);
OupxDO : out std_logic_vector(31 downto 0));
end component;
begin -- slow
-- Input word reordering: 128-bit word w of DataShufflexD is 128-bit word
-- (11 - w) of DataInxDI, i.e. the twelve input words are taken in reverse
-- order.
gen_shuffle: for w in 0 to 11 generate
  DataShufflexD(128*w + 127 downto 128*w) <=
    DataInxDI(128*(11 - w) + 127 downto 128*(11 - w));
end generate gen_shuffle;
-- Word-wise views of the wide vectors, to make typing easier:
--   SPxD     : the present state split into sixteen 128-bit words
--   BigSubxD : the words of AESxD packed into one 2048-bit vector
--   BigMixD  : the words of BMxD packed into one 2048-bit vector
gen_smap: for w in 0 to 15 generate
  SPxD(w) <= SxDP(128*w + 127 downto 128*w);
  BigSubxD(128*w + 127 downto 128*w) <= AESxD(w);
  BigMixD(128*w + 127 downto 128*w) <= BMxD(w);
end generate gen_smap;
-------------------------------------------------------------------------------
-- S Register
-------------------------------------------------------------------------------
-- Input multiplexer, selects the next value of the state register S:
--   init   : new data block together with the initial value VINIT
--   chain  : new data block together with the present value of V
--   bigmix : result of the big mixcolumn
--   all other states : the state with the word that is currently in the AES
--                      slice replaced by the output of that slice
p_inmux: process (DataShufflexD, VxDP, StatexDP, BigMixD, BigSubxD)
begin  -- process p_inmux
  case StatexDP is
    when init =>
      SxDN <= DataShufflexD & VINIT;
    when chain =>
      SxDN <= DataShufflexD & VxDP;
    when bigmix =>
      SxDN <= BigMixD;
    when aes0 | aes1 | aes2 | aes3 | last =>
      SxDN <= BigSubxD;
  end case;
end process p_inmux;
-------------------------------------------------------------------------------
-- AES block
-------------------------------------------------------------------------------
-- Two-level selection of the data for the AES slice.
-- First level (here): pick the 128-bit word with index SCntxSP out of the
-- 2048-bit state. Second level (p_AESround): pick 32 bits out of that word.
APxD <= SPxD(SCntxSP);
-- Write-back path: AESxD is a copy of the present state in which only the
-- selected word is replaced by the output word ANxD of the AES slice.
p_AESdata: process (SCntxSP, SPxD, ANxD)
begin  -- process p_AESdata
  for w in 0 to 15 loop
    if w = SCntxSP then
      AESxD(w) <= ANxD;
    else
      AESxD(w) <= SPxD(w);
    end if;
  end loop;
end process p_AESdata;
-- Byte-wise views of the AES word, for readability.
-- AxD(b) is byte b of the input word APxD, BxD(b) is byte b of the output
-- word ANxD. Byte 0 is the most significant byte (bits 127 downto 120),
-- byte 15 the least significant one (bits 7 downto 0).
gen_byte_assign: for b in 0 to 15 generate
  AxD(b) <= APxD(127 - 8*b downto 120 - 8*b);
  ANxD(127 - 8*b downto 120 - 8*b) <= BxD(b);
end generate gen_byte_assign;
-- Byte scheduling for one AES round on the selected 128-bit word, spread
-- over the four cycles aes0..aes3.
-- In every cycle four bytes of the word (AxD) are sent through the 32-bit
-- slice (AInxD) and four result bytes come back (AOutxD). The word in S is
-- rewritten every cycle (BxD -> ANxD -> AESxD -> SxDN), so a result byte
-- may only be stored in BxD once the old byte at that position has been
-- used as an input. Result bytes that have to wait are parked in the four
-- byte registers T1..T4:
--   T1 = ATxD( 7 downto  0)   T2 = ATxD(15 downto  8)
--   T3 = ATxD(23 downto 16)   T4 = ATxD(31 downto 24)
-- In the comments below a plain number n stands for byte position n.
p_AESround: process (AxD, StatexDP, ATxDP, AOutxD)
begin -- process p_AESround
-- defaults: the temporaries keep their value and the word passes unchanged
AInxD <= AxD(0) & AxD(5) & AxD(10) & AxD(15);
ATxDN <= ATxDP;
BxD <= AxD;
-- Earlier variant of the schedule, disabled and kept for reference only:
-- case StatexDP is
-- when aes0 =>
-- AInxD AxD(0) & AxD(5) & AxD(10) & AxD(15); -- 0 5 10 15
-- BxD(0) AOutxD(31 downto 24); --0
-- ATxDN( 7 downto 0) AOutxD(23 downto 16); --T1 =5
-- ATxDN(15 downto 8) AOutxD(15 downto 8); --T2 =10
-- ATxDN(23 downto 16) AOutxD( 7 downto 0); --T3 =15
-- when aes1 =>
-- AInxD AxD(4) & AxD(9) & AxD(14)& AxD(3); -- 4 9 14 3
-- BxD(4) AOutxD(31 downto 24); --4
-- ATxDN( 7 downto 0) AOutxD(23 downto 16); --T1=9
-- ATxDN(31 downto 24) AOutxD(15 downto 8); --T4=14
-- BxD(3) AOutxD( 7 downto 0); --3
-- BxD(5) ATxDP ( 7 downto 0); --5
-- when aes2 =>
-- AInxD AxD(8)& AxD(13)& AxD(2) & AxD(7); --8 13 2 7
-- BxD(8) AOutxD(31 downto 24); -- 8
-- ATxDN( 7 downto 0) AOutxD(23 downto 16); -- T1=13
-- BxD(2) AOutxD(15 downto 8); -- 2
-- BxD(7) AOutxD( 7 downto 0); -- 7
-- BxD(9) ATxDP( 7 downto 0); -- 9
-- BxD(10) ATxDP( 15 downto 8); -- 10
-- when aes3 =>
-- AInxD AxD(12)& AxD(1) & AxD(6) & AxD(11); --12 1 6 11
-- BxD(12) AOutxD(31 downto 24); -- 12
-- BxD(1) AOutxD(23 downto 16); -- 1
-- BxD(6) AOutxD(15 downto 8); -- 6
-- BxD(11) AOutxD( 7 downto 0); -- 11
-- BxD(13) ATxDP( 7 downto 0); -- 13
-- BxD(14) ATxDP( 31 downto 24); -- 14
-- BxD(15) ATxDP( 23 downto 16); -- 15
-- when others => null;
-- Active schedule:
case StatexDP is
when aes0 =>
-- inputs 0 5 10 15 give results 0..3; positions 1, 2 and 3 are still
-- needed as inputs in aes3, aes2 and aes1, so those results are parked
AInxD <= AxD(0) & AxD(5) & AxD(10) & AxD(15); -- 0 5 10 15
-- AInxD AxD(0) & AxD(4) & AxD(8) & AxD(12); -- 0 4 8 12
BxD(0) <= AOutxD(31 downto 24); -- result 0, stored directly
ATxDN( 7 downto 0) <= AOutxD(23 downto 16); -- T1 = result 1, stored in aes3
ATxDN(15 downto 8) <= AOutxD(15 downto 8); -- T2 = result 2, stored in aes2
ATxDN(23 downto 16) <= AOutxD( 7 downto 0); -- T3 = result 3, stored in aes1
when aes1 =>
-- inputs 4 9 14 3 give results 4..7; position 3 is consumed now, so the
-- parked result 3 can be stored and T3 is reused for result 6
AInxD <= AxD(4) & AxD(9) & AxD(14)& AxD(3); -- 4 9 14 3
-- AInxD AxD(5) & AxD(9) & AxD(13)& AxD(1); -- 5 9 13 1
BxD(4) <= AOutxD(31 downto 24); -- result 4
BxD(5) <= AOutxD(23 downto 16); -- result 5 (position 5 was used in aes0)
ATxDN(23 downto 16) <= AOutxD(15 downto 8); -- T3 = result 6, stored in aes3
ATxDN(31 downto 24) <= AOutxD( 7 downto 0); -- T4 = result 7, stored in aes2
BxD(3) <= ATxDP( 23 downto 16); -- result 3 from T3 (parked in aes0)
when aes2 =>
-- inputs 8 13 2 7 give results 8..11; positions 2 and 7 are consumed now,
-- so the parked results 2 and 7 can be stored and T2 is reused for 11
AInxD <= AxD(8)& AxD(13)& AxD(2) & AxD(7); --8 13 2 7
-- AInxD AxD(10)& AxD(14)& AxD(2) & AxD(6); --10 14 2 6
BxD(8) <= AOutxD(31 downto 24); -- result 8
BxD(9) <= AOutxD(23 downto 16); -- result 9 (position 9 was used in aes1)
BxD(10) <= AOutxD(15 downto 8); -- result 10 (position 10 was used in aes0)
ATxDN(15 downto 8) <= AOutxD( 7 downto 0); -- T2 = result 11, stored in aes3
BxD(2) <= ATxDP( 15 downto 8); -- result 2 from T2 (parked in aes0)
BxD(7) <= ATxDP( 31 downto 24); -- result 7 from T4 (parked in aes1)
when aes3 =>
-- inputs 12 1 6 11 give results 12..15; all positions have been consumed
-- now, so the remaining parked results 1, 6 and 11 are stored as well
AInxD <= AxD(12)& AxD(1) & AxD(6) & AxD(11); --12 1 6 11
-- AInxD AxD(15)& AxD(3) & AxD(7) & AxD(11); --15 3 7 11
BxD(12) <= AOutxD(31 downto 24); -- result 12
BxD(13) <= AOutxD(23 downto 16); -- result 13
BxD(14) <= AOutxD(15 downto 8); -- result 14
BxD(15) <= AOutxD( 7 downto 0); -- result 15
BxD(1) <= ATxDP( 7 downto 0); -- result 1 from T1 (parked in aes0)
BxD(6) <= ATxDP( 23 downto 16); -- result 6 from T3 (parked in aes1)
BxD(11) <= ATxDP( 15 downto 8); -- result 11 from T2 (parked in aes2)
when others => null; -- outside aes0..aes3 the defaults above apply
end case;
end process p_AESround;
-- The 32-bit AES slice: four substitution boxes, one for each byte of
-- AInxD, followed by a single mixcolumn on the substituted word.
g_sbox: for b in 0 to 3 generate
  i_sbox: sbox
    port map (
      InpxDI => AInxD (8*b + 7 downto 8*b),
      OupxDO => ASubxD(8*b + 7 downto 8*b));
end generate g_sbox;
-- mixcolumn of the slice, its result AMixD still lacks the round key
i_mixcolumn: mixcolumn
  port map (
    InpxDI => ASubxD,
    OupxDO => AMixD);
-------------------------------------------------------------------------------
-- Round key
-------------------------------------------------------------------------------
-- KSxD is the counter CntxSP with the order of its eight bytes reversed:
-- byte b of KSxD is byte (7 - b) of CntxSP.
gen_keyshuffle: for b in 0 to 7 generate
  KSxD(8*b + 7 downto 8*b) <= CntxSP(63 - 8*b downto 56 - 8*b);
end generate gen_keyshuffle;
-- The 128-bit round key.
-- While SaltxSP is '0' the key is the byte-reversed counter in the upper 64
-- bits and zeroes in the lower 64 bits. In every other case (the pass that
-- would use the salt) the key is all zeroes.
KeyxD <= (others => '0') when SaltxSP /= '0' else
         KSxD & X"0000000000000000";
-- Pick the 32-bit part of the round key that belongs to the current cycle:
-- aes0 uses the most significant word, aes1 and aes2 the following ones.
-- aes3, as well as every state outside the AES cycles, uses the least
-- significant word.
p_AES_key_part: process (KeyxD, StatexDP)
begin  -- process p_AES_key_part
  case StatexDP is
    when aes0   => AKeyxD <= KeyxD(127 downto 96);
    when aes1   => AKeyxD <= KeyxD( 95 downto 64);
    when aes2   => AKeyxD <= KeyxD( 63 downto 32);
    when others => AKeyxD <= KeyxD( 31 downto  0);
  end case;
end process p_AES_key_part;
-- Key addition on the mixed column. The process p_AESround distributes
-- AOutxD to the matching bytes of BxD or to the temporaries ATxD.
AOutxD <= AKeyxD xor AMixD;
-------------------------------------------------------------------------------
-- Big MixColumn
-------------------------------------------------------------------------------
-- First the big shiftrows; BS == BigShift.
-- The sixteen state words form a 4x4 arrangement with word index 4*c + r
-- (column c, row r). Row r is rotated by r columns, so the word stored at
-- 4*c + r is fetched from 4*(c + r) + r = 4*c + 5*r, taken modulo 16.
gen_bigshift_col: for c in 0 to 3 generate
  gen_bigshift_row: for r in 0 to 3 generate
    BSxD(4*c + r) <= SPxD((4*c + 5*r) mod 16);
  end generate gen_bigshift_row;
end generate gen_bigshift_col;
-- The big mixcolumn is done fully in parallel instead of multiplexing a
-- smaller number of units. This is a fairly large chunk of logic: 64
-- mixcolumn instances, one for each byte position of each column of words.
gen_bigmix: for col in 0 to 3 generate
  gen_bigmiy: for pos in 0 to 15 generate
    -- input: byte 'pos' of each of the four shifted words of column 'col',
    -- the word with the lowest index in the most significant byte
    BMIxD(16*col + pos) <= BSxD(4*col    )(8*pos + 7 downto 8*pos) &
                           BSxD(4*col + 1)(8*pos + 7 downto 8*pos) &
                           BSxD(4*col + 2)(8*pos + 7 downto 8*pos) &
                           BSxD(4*col + 3)(8*pos + 7 downto 8*pos);
    -- one mixcolumn per byte position
    i_mixcolumn: mixcolumn
      port map (
        InpxDI => BMIxD(16*col + pos),
        OupxDO => BMOxD(16*col + pos));
    -- output: the four result bytes go back to byte 'pos' of the same words
    BMxD(4*col    )(8*pos + 7 downto 8*pos) <= BMOxD(16*col + pos)(31 downto 24);
    BMxD(4*col + 1)(8*pos + 7 downto 8*pos) <= BMOxD(16*col + pos)(23 downto 16);
    BMxD(4*col + 2)(8*pos + 7 downto 8*pos) <= BMOxD(16*col + pos)(15 downto  8);
    BMxD(4*col + 3)(8*pos + 7 downto 8*pos) <= BMOxD(16*col + pos)( 7 downto  0);
  end generate gen_bigmiy;
end generate gen_bigmix;
-- The words BMxD are packed into BigMixD by the generate statement gen_smap.
-------------------------------------------------------------------------------
-- Big Final
-- The update of V is split into two contributions:
--   BF1xD : derived from the incoming data, added to V as soon as the data
--           arrives
--   BF2xD : derived from the present state S, added to V at the very end of
--           the calculation
-- Taken together V is XOR'ed with both BF1xD and BF2xD, the two terms just
-- become available at different times.
-------------------------------------------------------------------------------
gen_bigfinal: for r in 0 to 3 generate
  -- state contribution: XOR of the four state words r, r+4, r+8 and r+12
  BF2xD(511 - 128*r downto 384 - 128*r) <=
    SPxD(r) xor SPxD(r + 4) xor SPxD(r + 8) xor SPxD(r + 12);
  -- message contribution: XOR of the shuffled data words r, r+4 and r+8
  -- (note that the shuffled data is needed here as well)
  BF1xD(511 - 128*r downto 384 - 128*r) <=
    DataShufflexD(128*r       + 127 downto 128*r)       xor
    DataShufflexD(128*(r + 4) + 127 downto 128*(r + 4)) xor
    DataShufflexD(128*(r + 8) + 127 downto 128*(r + 8));
end generate gen_bigfinal;
-------------------------------------------------------------------------------
-- V Register
-------------------------------------------------------------------------------
-- Next-value logic of the register V.
--   init  : V is (re)started from VINIT with the message term BF1xD added.
--           This is recomputed in every cycle spent in 'init', so only the
--           value of the cycle in which the FSM leaves 'init' survives.
--   chain : the message term BF1xD of the new block is added to V. This is
--           an accumulation, so it must happen exactly once per block: it
--           is done only in the cycle in which the block is accepted
--           (InENxSI = '1'). Without this condition V would be XOR'ed with
--           whatever is on the data input in every cycle the FSM waits in
--           'chain'.
--   last  : the state term BF2xD is added to V.
--   all other states : V keeps its value.
p_v: process (VxDP, StatexDP, InENxSI, BF1xD, BF2xD)
begin  -- process p_v
  VxDN <= VxDP;                          -- default: hold
  case StatexDP is
    when init =>
      VxDN <= VINIT xor BF1xD;           -- beginning
    when chain =>
      if InENxSI = '1' then
        VxDN <= VxDP xor BF1xD;          -- we have new data
      end if;
    when last =>
      VxDN <= VxDP xor BF2xD;            -- last cycle
    when others => null;
  end case;
end process p_v;
-------------------------------------------------------------------------------
-- Main FSM
-------------------------------------------------------------------------------
-- Combinational next-state logic of the controller.
--   init / chain : wait for InENxSI. 'init' also reloads the counter with
--                  CINIT, in 'chain' the counter continues.
--   aes0..aes3   : four cycles for one AES round on state word SCntxSP.
--                  After word 15 either a second pass over all words
--                  follows (SaltxSP '0' -> '1') or the big mixcolumn
--                  (SaltxSP '1' -> '0').
--   bigmix       : one cycle; afterwards either the next pass or 'last'.
--   last         : the output is flagged valid for one cycle.
-- RndCntxSP counts the passes over the 16 words and wraps from 15 to 0.
p_fsm: process (StatexDP, LastxSI, InENxSI, SCntxSP, RndCntxSP, CntxSP, LastxSP, SaltxSP)
begin  -- process p_fsm
  -- defaults: every register keeps its value, output is not valid
  StatexDN  <= StatexDP;
  OutEnxSO  <= '0';
  RndCntxSN <= RndCntxSP;
  SCntxSN   <= SCntxSP;
  CntxSN    <= CntxSP;
  LastxSN   <= LastxSP;
  SaltxSN   <= SaltxSP;
  case StatexDP is
    when init =>
      RndCntxSN <= 0;
      SCntxSN   <= 0;
      CntxSN    <= CINIT;
      if InENxSI = '1' then
        StatexDN <= aes0;
        -- remember that this block is the last one
        if LastxSI = '1' then
          LastxSN <= '1';
        end if;
      end if;
    when chain =>
      RndCntxSN <= 0;
      SCntxSN   <= 0;
      -- Cnt continues
      if InENxSI = '1' then
        StatexDN <= aes0;
        -- LastxSI has to be sampled for a chained block as well, otherwise
        -- the FSM can never leave the last -> chain loop and return to
        -- 'init' once chaining has started.
        if LastxSI = '1' then
          LastxSN <= '1';
        end if;
      end if;
    when aes0 => StatexDN <= aes1;
    when aes1 => StatexDN <= aes2;
    when aes2 => StatexDN <= aes3;
    when aes3 =>
      -- the counter advances once per word, but only in the pass that
      -- uses it as key (SaltxSP = '0')
      if SaltxSP = '0' then
        CntxSN <= std_logic_vector(unsigned(CntxSP) + 1);
      end if;
      if SCntxSP = 15 then
        -- all 16 words are done, this pass is complete
        SCntxSN <= 0;
        if RndCntxSP = 15 then
          RndCntxSN <= 0;
        else
          RndCntxSN <= RndCntxSP + 1;
        end if;
        -- We will use the SaltxSP to determine in which round we are
        if SaltxSP = '0' then
          StatexDN <= aes0;             -- even
          SaltxSN  <= '1';
        else
          StatexDN <= bigmix;           -- odd
          SaltxSN  <= '0';
        end if;
      else
        -- continue with the next word
        SCntxSN  <= SCntxSP + 1;
        StatexDN <= aes0;
      end if;
    when bigmix =>
      -- be careful here: RndCntxSP has already been updated in the
      -- preceding aes3 cycle and wraps to 0 after the final pass, so we
      -- need to check for 0 to determine the end.
      if RndCntxSP = 0 then
        StatexDN <= last;
      else
        StatexDN <= aes0;               -- not finished
      end if;
    when last =>
      OutEnxSO <= '1';                  -- Normally we would have an ack here
      if LastxSP = '1' then             -- Was last block
        LastxSN  <= '0';                -- clear Last
        StatexDN <= init;
      else
        StatexDN <= chain;
      end if;
    when others => null;
  end case;
end process p_fsm;
-------------------------------------------------------------------------------
-- The Output
-------------------------------------------------------------------------------
-- The output is valid for *ONE* cycle, in the state 'last' (OutEnxSO = '1').
-- This is not ideal: we would probably have liked to have an output
-- acknowledge, but at the moment this will suffice.
-- Note that the output is taken from the next value VxDN, not from the
-- register VxDP: in 'last' it is the combinational result VxDP xor BF2xD.
DataOutxDO <= VxDN(511 downto 256);
-------------------------------------------------------------------------------
-- Registers
-------------------------------------------------------------------------------
-- The only clocked process of the design. Every *xDP / *xSP signal is a
-- register that takes over its *xDN / *xSN counterpart on the rising clock
-- edge. The reset is asynchronous and active low.
p_clk : process (CLKxCI, RSTxRBI)
begin  -- process p_clk
  if RSTxRBI = '0' then
    -- controller registers
    StatexDP  <= init;
    SCntxSP   <= 0;
    RndCntxSP <= 0;
    SaltxSP   <= '0';
    LastxSP   <= '0';
    CntxSP    <= CINIT;
    -- datapath registers
    VxDP      <= VINIT;
    SxDP      <= (others => '0');
    ATxDP     <= (others => '0');
  elsif CLKxCI'event and CLKxCI = '1' then
    -- controller registers
    StatexDP  <= StatexDN;
    SCntxSP   <= SCntxSN;
    RndCntxSP <= RndCntxSN;
    SaltxSP   <= SaltxSN;
    LastxSP   <= LastxSN;
    CntxSP    <= CntxSN;
    -- datapath registers
    VxDP      <= VxDN;
    SxDP      <= SxDN;
    ATxDP     <= ATxDN;
  end if;
end process p_clk;
end slow;