------------------------------------------------------------
-- Copyright: 2010 Integrated Systems Laboratory, ETH Zurich
-- http://www.iis.ee.ethz.ch/~sha3
------------------------------------------------------------
-------------------------------------------------------------------------------
-- Title : ECHO 20 Gb/s implementation
-- Project :
-------------------------------------------------------------------------------
-- File : echo_fast.vhd
-- Author : Frank/Luca account
-- Company : Integrated Systems Laboratory, ETH Zurich
-- Created : 2010-02-01
-- Last update: 2010-02-05
-- Platform : ModelSim (simulation), Synopsys (synthesis)
-- Standard : VHDL'87
-------------------------------------------------------------------------------
-- Description: This is a fast ECHO datapath with 4 double AES rounds in
-- parallel
-------------------------------------------------------------------------------
-- Copyright (c) 2010 Integrated Systems Laboratory, ETH Zurich
-------------------------------------------------------------------------------
-- Revisions :
-- Date Version Author Description
-- 2010-02-01 1.0 sha3 Created
-------------------------------------------------------------------------------
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
-- Top-level interface of the ECHO datapath.
--   DataInxDI  : one complete 1536-bit message block, presented in parallel.
--   LastxSI    : qualifies the block on DataInxDI; the control FSM records it
--                when a block is accepted in its init and chain states.
--   InENxSI    : '1' indicates that DataInxDI/LastxSI carry a new block.
--   DataOutxDO : 256-bit result, driven from the output register.
--   OutEnxSO   : registered flag, set for the cycle after the 5 counter LSBs
--                reach the stop value in the run state.
--   CLKxCI     : clock, registers update on the rising edge.
--   RSTxRBI    : asynchronous reset, active low.
entity echo is
port (
DataInxDI : in std_logic_vector(1535 downto 0); -- All inputs parallel
LastxSI : in std_logic; -- 1: this data will be the last round
-- 0: continue chaining
InENxSI : in std_logic;
DataOutxDO : out std_logic_vector(255 downto 0);
OutEnxSO : out std_logic;
CLKxCI : in std_logic;
RSTxRBI : in std_logic);
end echo;
architecture fast of echo is
-- Input block after reversing the order of its twelve 128-bit words
signal DataShufflexD : std_logic_vector(1535 downto 0);
-- initialization string for VxDP according to section 2.1, on pg 7 of the
-- ECHO description
constant VINIT : std_logic_vector(511 downto 0) :=
(496 => '1', 368 => '1' , 240 => '1', 112 => '1', others => '0');
-- The message length is currently fixed to 1376 bits: CINIT(61 downto 2)
-- is 0x56, which becomes 1376 once the 4-bit word index is appended to form
-- a round key. It would not be difficult to make this work with any length,
-- but at the moment this is the easiest alternative.
constant CINIT : std_logic_vector(61 downto 0) := X"000000000000056"&"00";
-- this has to be one less than CINIT(4 downto 0);
-- (CINIT(4 downto 0) is "11000", so the 5 LSBs wrap to CSTOP after 32 cycles)
constant CSTOP : std_logic_vector(4 downto 0) := "10111";
-- Naming convention used below: xDP/xSP = register output (present value),
-- xDN/xSN = register input (next value).
-- V: 512-bit chaining vector, O: 256-bit output register
signal VxDN, VxDP : std_logic_vector(511 downto 0);
signal OxDN, OxDP : std_logic_vector(255 downto 0);
-- S: 2048-bit state. SxD is the state as updated by p_write, SxDN is what is
-- finally registered (either SxD or a freshly loaded block, see p_s)
signal SxDN, SxDP,SxD : std_logic_vector(2047 downto 0);
-- The same state viewed as sixteen 128-bit words
type s_type is array (0 to 15) of std_logic_vector(127 downto 0);
signal SNxD, SPxD : s_type;
-- Temporary registers that hold results whose destination word has not yet
-- been consumed (see p_write)
signal T1xDP, T2xDP, T3xDP, T4xDP : std_logic_vector(127 downto 0);
signal T1xDN, T2xDN, T3xDN, T4xDN : std_logic_vector(127 downto 0);
-- A: inputs of the four doubleAES instances, B: their outputs,
-- BM: the B words after the Big MixColumns step
signal A1xD, A2xD, A3xD, A4xD : std_logic_vector(127 downto 0);
signal B1xD, B2xD, B3xD, B4xD : std_logic_vector(127 downto 0);
signal BM1xD, BM2xD, BM3xD, BM4xD : std_logic_vector(127 downto 0);
-- The name is Big Mixcolumns Input and BigMixcolumnsOutput
type bm_type is array (0 to 15) of std_logic_vector(31 downto 0);
signal BMIxD,BMOxD : bm_type;
-- Big Final Signals
-- BF1: XOR of the message words, BF2: XOR of the next-state words
signal BF1xD, BF2xD : std_logic_vector(511 downto 0);
-- Round keys before (K) and after (KS) byte-order reversal
signal K1xD, K2xD, K3xD, K4xD : std_logic_vector(63 downto 0);
signal K1SxD, K2SxD, K3SxD, K4SxD : std_logic_vector(63 downto 0);
-- Cycle counter; bits 61..2 also form the upper part of the round keys
signal CntxSP, CntxSN : std_logic_vector(61 downto 0);
-- Recorded "last block" flag
signal LastxSP, LastxSN : std_logic;
signal OutEnxSN, OutEnxSP : std_logic;
-- Two LSBs of the counter: selects one of four cycles
signal PartCntxS : std_logic_vector(1 downto 0);
-- NOTE(review): the value 'last' is declared but no process in this
-- architecture ever assigns it.
type states_type is (init, run, chain, last);
signal StatexDP, StatexDN : states_type;
-- 128-bit combinational block keyed with a 64-bit round key
-- (presumably two AES rounds, going by the name -- defined elsewhere)
component doubleAES
port (
InxDI : in std_logic_vector(127 downto 0);
OutxDO : out std_logic_vector(127 downto 0);
RoundKeyxDI : in std_logic_vector(63 downto 0));
end component;
-- 32-bit (4-byte) column mixer, defined elsewhere
component mixcolumn
port (
InpxDI : in std_logic_vector(31 downto 0);
OupxDO : out std_logic_vector(31 downto 0));
end component;
begin -- fast
-- Word-wise views of the 2048-bit state, to keep the rest of the code short:
--   SPxD(w) is 128-bit word w of the registered state SxDP
--   SNxD(w) (next value of word w) is packed back into the flat vector SxD
gen_smap: for w in 0 to 15 generate
  SPxD(w)                         <= SxDP(128*w + 127 downto 128*w);
  SxD(128*w + 127 downto 128*w)   <= SNxD(w);
end generate gen_smap;
-- The two counter LSBs show which cycle we are in: one pass over all sixteen
-- state words takes four clock cycles.
PartCntxS <= CntxSP(1 downto 0);
-------------------------------------------------------------------------------
-- Select input for the four AESblocks
-- This essentially also implements the Big ShiftRows on S
-------------------------------------------------------------------------------
-- In cycle p the four AES inputs are the state words
--   4p, (4p+5) mod 16, (4p+10) mod 16 and (4p+15) mod 16.
-- Any PartCntxS value other than "00", "01", "10" selects the last entry.
A1xD <= SPxD(0)  when PartCntxS = "00" else
        SPxD(4)  when PartCntxS = "01" else
        SPxD(8)  when PartCntxS = "10" else
        SPxD(12);

A2xD <= SPxD(5)  when PartCntxS = "00" else
        SPxD(9)  when PartCntxS = "01" else
        SPxD(13) when PartCntxS = "10" else
        SPxD(1);

A3xD <= SPxD(10) when PartCntxS = "00" else
        SPxD(14) when PartCntxS = "01" else
        SPxD(2)  when PartCntxS = "10" else
        SPxD(6);

A4xD <= SPxD(15) when PartCntxS = "00" else
        SPxD(3)  when PartCntxS = "01" else
        SPxD(7)  when PartCntxS = "10" else
        SPxD(11);
-------------------------------------------------------------------------------
-- Calculate the Roundkeys
--
-- This is now more complex as we do not pick the S'es in order but we pick
-- them post-BigShiftRows
-------------------------------------------------------------------------------
-- Each round key is the upper 60 counter bits followed by the 4-bit index of
-- the state word that the corresponding AES instance is processing in this
-- cycle (same word order as the AES input selection).
p_roundkey: process (CntxSP, PartCntxS)
  variable CntHi : std_logic_vector(59 downto 0);
begin  -- process p_roundkey
  CntHi := CntxSP(61 downto 2);
  case PartCntxS is
    when "00" =>                        -- words 0, 5, 10, 15
      K1xD <= CntHi & "0000";
      K2xD <= CntHi & "0101";
      K3xD <= CntHi & "1010";
      K4xD <= CntHi & "1111";
    when "01" =>                        -- words 4, 9, 14, 3
      K1xD <= CntHi & "0100";
      K2xD <= CntHi & "1001";
      K3xD <= CntHi & "1110";
      K4xD <= CntHi & "0011";
    when "10" =>                        -- words 8, 13, 2, 7
      K1xD <= CntHi & "1000";
      K2xD <= CntHi & "1101";
      K3xD <= CntHi & "0010";
      K4xD <= CntHi & "0111";
    when others =>                      -- words 12, 1, 6, 11
      K1xD <= CntHi & "1100";
      K2xD <= CntHi & "0001";
      K3xD <= CntHi & "0110";
      K4xD <= CntHi & "1011";
  end case;
end process p_roundkey;
-------------------------------------------------------------------------------
-- Shuffle since ECHO has a strange way of representing data
-------------------------------------------------------------------------------
-- Reverse the byte order of every 64-bit round key:
-- destination byte b takes source byte 7-b.
gen_keyshuffle: for b in 0 to 7 generate
  K1SxD(8*b + 7 downto 8*b) <= K1xD(63 - 8*b downto 56 - 8*b);
  K2SxD(8*b + 7 downto 8*b) <= K2xD(63 - 8*b downto 56 - 8*b);
  K3SxD(8*b + 7 downto 8*b) <= K3xD(63 - 8*b downto 56 - 8*b);
  K4SxD(8*b + 7 downto 8*b) <= K4xD(63 - 8*b downto 56 - 8*b);
end generate gen_keyshuffle;
-------------------------------------------------------------------------------
-- Instantiate four double AES
-------------------------------------------------------------------------------
-- Instance n transforms the selected state word AnxD into BnxD using the
-- byte-reversed round key KnSxD. All four instances work in parallel.
i_doubleAES1: doubleAES
port map (
InxDI => A1xD,
OutxDO => B1xD,
RoundKeyxDI => K1SxD);
i_doubleAES2: doubleAES
port map (
InxDI => A2xD,
OutxDO => B2xD,
RoundKeyxDI => K2SxD);
i_doubleAES3: doubleAES
port map (
InxDI => A3xD,
OutxDO => B3xD,
RoundKeyxDI => K3SxD);
i_doubleAES4: doubleAES
port map (
InxDI => A4xD,
OutxDO => B4xD,
RoundKeyxDI => K4SxD);
-------------------------------------------------------------------------------
-- Now comes the Big Mixed Column
-------------------------------------------------------------------------------
-- One mixcolumn instance per byte position: byte n of the four AES outputs
-- forms a 32-bit column (B1 in the most significant byte, B4 in the least),
-- and the mixed column is spread back over byte n of BM1..BM4.
gen_bigmixcols: for n in 0 to 15 generate
  -- gather byte n of B1..B4
  BMIxD(n) <= B1xD(8*n + 7 downto 8*n) &
              B2xD(8*n + 7 downto 8*n) &
              B3xD(8*n + 7 downto 8*n) &
              B4xD(8*n + 7 downto 8*n);

  -- mix the column
  i_mixcolumn: mixcolumn
    port map (
      InpxDI => BMIxD(n),
      OupxDO => BMOxD(n));

  -- scatter the result to byte n of BM1..BM4
  BM1xD(8*n + 7 downto 8*n) <= BMOxD(n)(31 downto 24);
  BM2xD(8*n + 7 downto 8*n) <= BMOxD(n)(23 downto 16);
  BM3xD(8*n + 7 downto 8*n) <= BMOxD(n)(15 downto 8);
  BM4xD(8*n + 7 downto 8*n) <= BMOxD(n)(7 downto 0);
end generate gen_bigmixcols;
-------------------------------------------------------------------------------
-- This process determines the next state for the S and Temp registers
-------------------------------------------------------------------------------
-- Combinational next-value logic for the state words and the temporaries.
-- Cycle p produces the new words 4p .. 4p+3 on BM1..BM4. A result whose old
-- word is still needed as an AES input in a later cycle is parked in a
-- temporary register and copied into the state once that word has been read.
p_write: process (BM1xD, BM2xD, BM3xD, BM4xD,
                  T1xDP, T2xDP, T3xDP, T4xDP,
                  SPxD, PartCntxS)
begin  -- process p_write
  -- everything holds its value unless overridden below
  SNxD  <= SPxD;
  T1xDN <= T1xDP;
  T2xDN <= T2xDP;
  T3xDN <= T3xDP;
  T4xDN <= T4xDP;

  if PartCntxS = "00" then
    -- new words 0..3: only word 0 can be stored right away
    SNxD(0) <= BM1xD;
    T1xDN   <= BM2xD;                   -- T1 holds new word 1
    T2xDN   <= BM3xD;                   -- T2 holds new word 2
    T3xDN   <= BM4xD;                   -- T3 holds new word 3
  elsif PartCntxS = "01" then
    -- new words 4..7; word 3 is released from T3
    SNxD(3) <= T3xDP;
    SNxD(4) <= BM1xD;
    SNxD(5) <= BM2xD;
    T3xDN   <= BM3xD;                   -- T3 holds new word 6
    T4xDN   <= BM4xD;                   -- T4 holds new word 7
  elsif PartCntxS = "10" then
    -- new words 8..11; words 2 and 7 are released from T2 and T4
    SNxD(2)  <= T2xDP;
    SNxD(7)  <= T4xDP;
    SNxD(8)  <= BM1xD;
    SNxD(9)  <= BM2xD;
    SNxD(10) <= BM3xD;
    T2xDN    <= BM4xD;                  -- T2 holds new word 11
  else
    -- new words 12..15; words 1, 6 and 11 are released from T1, T3 and T2
    SNxD(1)  <= T1xDP;
    SNxD(6)  <= T3xDP;
    SNxD(11) <= T2xDP;
    SNxD(12) <= BM1xD;
    SNxD(13) <= BM2xD;
    SNxD(14) <= BM3xD;
    SNxD(15) <= BM4xD;
  end if;
end process p_write;
-------------------------------------------------------------------------------
-- Big Final
-- This is divided into two steps. Once the new data is here it is immediately
-- XOR'ed and added to V (BF1)
-- At the end of calculation 31st clock the State S is XOR'ed with V again
-- (BF2)
-- Technically VxDN = VxDP xor BF1xD xor BF2xD. However BF1 is available
-- at the very beginning and BF2 is available at the very end.
-------------------------------------------------------------------------------
-- Big Final XOR trees, one 128-bit slice per index c (slice 0 is the most
-- significant 128 bits of BF1xD/BF2xD):
--   BF2 slice c = XOR of the next-state words c, c+4, c+8 and c+12
--   BF1 slice c = XOR of the shuffled message words c, c+4 and c+8
-- Note that the message contribution uses the shuffled input data as well.
gen_bigfinal: for c in 0 to 3 generate
  BF2xD(511 - 128*c downto 384 - 128*c) <=
    SNxD(c) xor SNxD(c + 4) xor SNxD(c + 8) xor SNxD(c + 12);

  BF1xD(511 - 128*c downto 384 - 128*c) <=
    DataShufflexD(128*c +  127 downto 128*c) xor
    DataShufflexD(128*c +  639 downto 128*c + 512) xor
    DataShufflexD(128*c + 1151 downto 128*c + 1024);
end generate gen_bigfinal;
-------------------------------------------------------------------------------
-- State Machines
--
-- There are (at the moment) three state machines:
-- p_s: controls what happens with the STATE (SxDN)
-- p_v: controls the VECTOR (VxDN) and at the same time to allow fast working
-- also the OUTPUT (OxDN)
-- p_fsm: is for the main control flow, Counters, Last Flag, OutEn signal
--
-- The first two should produce larger multiplexer structures, the last one is
-- the 'real' FSM of the system.
-------------------------------------------------------------------------------
-- Reverse the order of the twelve 128-bit words of the input block:
-- shuffled word w is input word 11-w.
gen_shuffle: for w in 0 to 11 generate
  DataShufflexD(128*w + 127 downto 128*w) <=
    DataInxDI(128*(11 - w) + 127 downto 128*(11 - w));
end generate gen_shuffle;
-- Next value of the state register S.
-- Normally S follows the round logic (SxD). When a new block is accepted the
-- state is loaded instead: the shuffled message goes into the upper 1536
-- bits and a 512-bit vector into the lower 512 bits.
-- NOTE(review): no load happens in the chain state, although p_v absorbs the
-- incoming message into V there -- confirm that this is intended.
p_s: process (SxD, InENxSI, StatexDP, DataShufflexD, VxDN, CntxSP, LastxSP)
begin  -- process p_s
  SxDN <= SxD;                          -- default: keep iterating

  if InENxSI = '1' then
    if StatexDP = init then
      -- first block: start from the initial vector
      SxDN <= DataShufflexD & VINIT;
    elsif StatexDP = run and CntxSP(4 downto 0) = CSTOP then
      -- block arriving exactly when the previous one finishes
      if LastxSP = '1' then
        SxDN <= DataShufflexD & VINIT;  -- previous block was the last one
      else
        SxDN <= DataShufflexD & VxDN;   -- continue with the next V value
      end if;
    end if;
  end if;
end process p_s;
-- Next values of the vector register V and of the output register O.
-- Both hold their value unless one of the cases below applies.
p_v: process (VxDP, OxDP, BF1xD, BF2xD, StatexDP, InENxSI, CntxSP, LastxSP)
begin  -- process p_v
  VxDN <= VxDP;
  OxDN <= OxDP;

  if StatexDP = init then
    -- keep V at its initial value until a block arrives, then add the message
    if InENxSI = '1' then
      VxDN <= VINIT xor BF1xD;
    else
      VxDN <= VINIT;
    end if;

  elsif StatexDP = run then
    if CntxSP(4 downto 0) = CSTOP then
      -- final cycle of the computation: the upper half of V xor BF2 is
      -- written to the output register
      OxDN <= VxDP(511 downto 256) xor BF2xD(511 downto 256);

      if InENxSI /= '1' then
        -- no new block: just fold the final state into V
        VxDN <= VxDP xor BF2xD;
      elsif LastxSP = '1' then
        -- new block after a last block: restart V and add the new message
        VxDN <= VINIT xor BF1xD;
      else
        -- new block while chaining: fold in the final state and the message
        VxDN <= VxDP xor BF2xD xor BF1xD;
      end if;
    end if;

  elsif StatexDP = chain then
    -- waiting for the next block: add its message once it arrives
    if InENxSI = '1' then
      VxDN <= VxDP xor BF1xD;
    end if;
  end if;
end process p_v;
-- main FSM
-- Controls the state, the cycle counter, the recorded Last flag and the
-- (registered) output enable.
--   init  : counter held at CINIT until the first block arrives
--   run   : counter runs; when its 5 LSBs reach CSTOP the computation is
--           finished, OutEn is raised for the next cycle, and either a new
--           block is accepted immediately or the FSM goes to init/chain
--   chain : counter frozen while waiting for the next block of a message
-- LastxSI is recorded whenever a block is accepted, in every state. The run
-- state previously ignored it, so a block accepted back-to-back could never
-- be marked as the last one.
p_fsm: process (CntxSP, LastxSI, LastxSP, InENxSI, StatexDP)
begin  -- process p_fsm
  -- defaults
  StatexDN <= StatexDP;
  OutEnxSN <= '0';                      -- output is not ready
  CntxSN   <= std_logic_vector(unsigned(CntxSP) + 1);
  LastxSN  <= LastxSP;

  case StatexDP is
    when init =>
      CntxSN <= CINIT;
      if InENxSI = '1' then             -- there is new data
        StatexDN <= run;                -- go to run state
        if LastxSI = '1' then           -- this will be the last block
          LastxSN <= '1';               -- record the last block
        end if;
      end if;

    when run =>
      if CntxSP(4 downto 0) = CSTOP then
        OutEnxSN <= '1';                -- next cycle the output is ready
        if InENxSI = '1' then           -- new data is here
          StatexDN <= run;              -- stay in run state
          if LastxSP = '1' then         -- the finished block was the last one
            CntxSN <= CINIT;            -- so the new block starts the count over
          end if;
          -- Record the Last flag of the block being accepted now; this also
          -- clears the flag of the block that has just finished.
          if LastxSI = '1' then
            LastxSN <= '1';
          else
            LastxSN <= '0';
          end if;
        else
          if LastxSP = '1' then         -- is this the last message block ??
            StatexDN <= init;           -- restart counters
            LastxSN  <= '0';            -- reset also Last
          else
            StatexDN <= chain;          -- keep the count
          end if;
        end if;
      end if;

    when chain =>
      CntxSN <= CntxSP;                 -- wait until there is new data
      if InENxSI = '1' then             -- there is new data
        StatexDN <= run;                -- now continue
        if LastxSI = '1' then           -- this will be the last block
          LastxSN <= '1';               -- keep the information
        end if;
      end if;

    when others => null;
  end case;
end process p_fsm;
-- Outputs
-- Both outputs come straight from registers (OxDP and OutEnxSP); there is no
-- combinational logic between the flip-flops and the ports.
-- Now from the register
DataOutxDO <= OxDP;
-- Output Enable is also registered
OutEnxSO <= OutEnxSP;
-------------------------------------------------------------------------------
-- Clocked process for all the registers
-------------------------------------------------------------------------------
-- All registers of the design: asynchronous active-low reset, otherwise each
-- register takes its next value on the rising clock edge.
p_clk : process (CLKxCI, RSTxRBI)
begin  -- process p_clk
  if RSTxRBI = '0' then
    -- control registers
    StatexDP <= init;
    CntxSP   <= CINIT;
    LastxSP  <= '0';
    OutEnxSP <= '0';
    -- data registers
    VxDP     <= VINIT;
    SxDP     <= (others => '0');
    OxDP     <= (others => '0');
    T1xDP    <= (others => '0');
    T2xDP    <= (others => '0');
    T3xDP    <= (others => '0');
    T4xDP    <= (others => '0');
  elsif CLKxCI'event and CLKxCI = '1' then
    -- control registers
    StatexDP <= StatexDN;
    CntxSP   <= CntxSN;
    LastxSP  <= LastxSN;
    OutEnxSP <= OutEnxSN;
    -- data registers
    VxDP     <= VxDN;
    SxDP     <= SxDN;
    OxDP     <= OxDN;
    T1xDP    <= T1xDN;
    T2xDP    <= T2xDN;
    T3xDP    <= T3xDN;
    T4xDP    <= T4xDN;
  end if;
end process p_clk;
end fast;