------------------------------------------------------------
-- Copyright: 2010 Integrated Sytems Laboratory, ETH Zurich
--            http://www.iis.ee.ethz.ch/~sha3
------------------------------------------------------------
-------------------------------------------------------------------------------
-- Title      : ECHO 20 Gb/s implementation
-- Project    : 
-------------------------------------------------------------------------------
-- File       : echo_fast.vhd
-- Author     : Frank/Luca account  
-- Company    : Integrated Systems Laboratory, ETH Zurich
-- Created    : 2010-02-01
-- Last update: 2010-02-05
-- Platform   : ModelSim (simulation), Synopsys (synthesis)
-- Standard   : VHDL'87
-------------------------------------------------------------------------------
-- Description: This is a fast ECHO datapath with 4 double AES rounds in
--              parallel 
-------------------------------------------------------------------------------
-- Copyright (c) 2010 Integrated Systems Laboratory, ETH Zurich
-------------------------------------------------------------------------------
-- Revisions  :
-- Date        Version  Author  Description
-- 2010-02-01  1.0      sha3	Created
-------------------------------------------------------------------------------

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

entity echo is
  
  port (
    DataInxDI  : in  std_logic_vector(1535 downto 0);  -- All inputs parallel
    LastxSI    : in  std_logic;         -- 1: this data will be the last round
                                        -- 0: continue chaining
    InENxSI    : in  std_logic;
    DataOutxDO : out std_logic_vector(255 downto 0);
    OutEnxSO   : out std_logic;
    CLKxCI     : in  std_logic;
    RSTxRBI    : in  std_logic);

end echo;

architecture fast of echo is

 signal DataShufflexD : std_logic_vector(1535 downto 0);
  
 -- initialization string for VxDP according to section 2.1, on pg 7 of the
 -- ECHO description
 constant VINIT : std_logic_vector(511 downto 0) :=
     (496 => '1', 368 => '1' , 240 => '1', 112 => '1', others => '0');

-- somehow we fixed the length of the message to be one 1376. It is not
-- difficult to make it work with any length, but at the moment this is the
-- easiest alternative
 constant CINIT : std_logic_vector(61 downto 0) := X"000000000000056"&"00";
-- this has to be one less than CINIT(4 downto 0);
 constant CSTOP : std_logic_vector(4 downto 0) := "10111";
 
 
 signal VxDN, VxDP : std_logic_vector(511 downto 0);
 signal OxDN, OxDP : std_logic_vector(255 downto 0);
 
 signal SxDN, SxDP,SxD           : std_logic_vector(2047 downto 0);
 type s_type is array (0 to 15) of std_logic_vector(127 downto 0);
 signal SNxD, SPxD               : s_type;

 
 signal T1xDP, T2xDP, T3xDP, T4xDP : std_logic_vector(127 downto 0);
 signal T1xDN, T2xDN, T3xDN, T4xDN : std_logic_vector(127 downto 0);

 signal A1xD, A2xD, A3xD, A4xD     : std_logic_vector(127 downto 0);
 signal B1xD, B2xD, B3xD, B4xD     : std_logic_vector(127 downto 0);
 signal BM1xD, BM2xD, BM3xD, BM4xD : std_logic_vector(127 downto 0);
 
 -- The name is Big Mixcolumns Input and BigMixcolumnsOutput
 type bm_type is array (0 to 15) of std_logic_vector(31 downto 0);
 signal BMIxD,BMOxD : bm_type;

-- Big Final Signals 
 signal BF1xD, BF2xD : std_logic_vector(511 downto 0);
 
 signal K1xD, K2xD, K3xD, K4xD : std_logic_vector(63 downto 0);
 signal K1SxD, K2SxD, K3SxD, K4SxD : std_logic_vector(63 downto 0);

 signal CntxSP, CntxSN     : std_logic_vector(61 downto 0);
 signal LastxSP, LastxSN   : std_logic;
 signal OutEnxSN, OutEnxSP : std_logic;
 signal PartCntxS          : std_logic_vector(1 downto 0);
 
 type states_type is (init, run, chain, last);
 signal StatexDP, StatexDN : states_type;
 
 component doubleAES
   port (
     InxDI       : in  std_logic_vector(127 downto 0);
     OutxDO      : out std_logic_vector(127 downto 0);
     RoundKeyxDI : in  std_logic_vector(63 downto 0));
 end component;
 
 component mixcolumn
   port (
     InpxDI : in  std_logic_vector(31 downto 0);
     OupxDO : out std_logic_vector(31 downto 0));
 end component;
 
begin  -- fast

-- this is to make typing slightly easier
-- perl -e 'for $i (0..15){ print "SP",$i,"xDSxDP(",128*($i+1)-1," downto ",128*$i,");\n"}'

gen_smap: for i in 0 to 15 generate
  SPxD(i) <= SxDP((128*(i+1)) -1 downto 128*i);
  SxD((128*(i+1)) -1 downto 128*i) <= SNxD(i);
end generate gen_smap;

  
PartCntxS <= CntxSP(1 downto 0);        -- shows which cycle we are in
                                        -- The total calculation is four cock
                                        -- cycles 

-------------------------------------------------------------------------------
-- Select input for the four AESblocks
-- This essentially also implements the Big ShiftRows on S
-------------------------------------------------------------------------------
with PartCntxS select
  A1xD <=  SPxD(0)  when "00",
           SPxD(4)  when "01",
           SPxD(8)  when "10",
           SPxD(12) when others;

with PartCntxS select
  A2xD <=  SPxD(5)  when "00",
           SPxD(9)  when "01",
           SPxD(13) when "10",
           SPxD(1)  when others;


with PartCntxS select
  A3xD <=  SPxD(10) when "00",
           SPxD(14) when "01",
           SPxD(2)  when "10",
           SPxD(6)  when others;

with PartCntxS select
  A4xD <=  SPxD(15) when "00",
           SPxD(3)  when "01",
           SPxD(7)  when "10",
           SPxD(11) when others;

-------------------------------------------------------------------------------
-- Calculate the Roundkeys
--
-- This is now more complex as we do not pick the S'es in order but we pick
-- them post-BigShiftRows
-------------------------------------------------------------------------------

with PartCntxS select
  K1xD <=  CntxSP(61 downto 2) & "0000"  when "00",
           CntxSP(61 downto 2) & "0100"  when "01",
           CntxSP(61 downto 2) & "1000"  when "10",
           CntxSP(61 downto 2) & "1100"  when others;

with PartCntxS select
  K2xD <=  CntxSP(61 downto 2) & "0101"  when "00",
           CntxSP(61 downto 2) & "1001"  when "01",
           CntxSP(61 downto 2) & "1101"  when "10",
           CntxSP(61 downto 2) & "0001"  when others;

with PartCntxS select
  K3xD <=  CntxSP(61 downto 2) & "1010"  when "00",
           CntxSP(61 downto 2) & "1110"  when "01",
           CntxSP(61 downto 2) & "0010"  when "10",
           CntxSP(61 downto 2) & "0110"  when others;

with PartCntxS select
  K4xD <=  CntxSP(61 downto 2) & "1111"  when "00",
           CntxSP(61 downto 2) & "0011"  when "01",
           CntxSP(61 downto 2) & "0111"  when "10",
           CntxSP(61 downto 2) & "1011"  when others;

-------------------------------------------------------------------------------
-- Shuffle since ECHO has a strange way of representing data 
-------------------------------------------------------------------------------

gen_keyshuffle: for i in 0 to 7 generate
  K1SxD((i+1)*8-1 downto i*8) <= K1xD ((8-i)*8-1 downto (7-i)*8);
  K2SxD((i+1)*8-1 downto i*8) <= K2xD ((8-i)*8-1 downto (7-i)*8);
  K3SxD((i+1)*8-1 downto i*8) <= K3xD ((8-i)*8-1 downto (7-i)*8);
  K4SxD((i+1)*8-1 downto i*8) <= K4xD ((8-i)*8-1 downto (7-i)*8);  
end generate gen_keyshuffle;
          

          
-------------------------------------------------------------------------------
-- Instantiate four double AES 
-------------------------------------------------------------------------------
i_doubleAES1: doubleAES
  port map (
    InxDI       => A1xD,
    OutxDO      => B1xD,
    RoundKeyxDI => K1SxD);

i_doubleAES2: doubleAES
  port map (
    InxDI       => A2xD,
    OutxDO      => B2xD,
    RoundKeyxDI => K2SxD);

i_doubleAES3: doubleAES
  port map (
    InxDI       => A3xD,
    OutxDO      => B3xD,
    RoundKeyxDI => K3SxD);

i_doubleAES4: doubleAES
  port map (
    InxDI       => A4xD,
    OutxDO      => B4xD,
    RoundKeyxDI => K4SxD);

-------------------------------------------------------------------------------
-- Now comes the Big Mixed Column
-------------------------------------------------------------------------------
gen_bigmixcols: for i in 0 to 15 generate
-- map inputs
    BMIxD(i) <= B1xD((i+1)*8-1 downto i*8) &
                B2xD((i+1)*8-1 downto i*8) &
                B3xD((i+1)*8-1 downto i*8) &
                B4xD((i+1)*8-1 downto i*8);

--instantiate the mixcolumns
    i_mixcolumn: mixcolumn
      port map (
        InpxDI => BMIxD(i),
        OupxDO => BMOxD(i));

-- map back to four signals
    BM1xD((i+1)*8-1 downto i*8) <= BMOxD(i)(31 downto 24);
    BM2xD((i+1)*8-1 downto i*8) <= BMOxD(i)(23 downto 16);
    BM3xD((i+1)*8-1 downto i*8) <= BMOxD(i)(15 downto  8);
    BM4xD((i+1)*8-1 downto i*8) <= BMOxD(i)( 7 downto  0);
  
end generate gen_bigmixcols;

-------------------------------------------------------------------------------
-- This process determines the next state for the S and Temp registers
-------------------------------------------------------------------------------
                
p_write: process (SPxD, PartCntxS,
                  BM1xD,BM2xD,BM3xD,BM4xD,
                  T1xDP,T2xDP,T3xDP,T4xDP)
begin  -- process p_write
   --defaults
   SNxD <= SPxD;
   T1xDN <= T1xDP;
   T2xDN <= T2xDP;
   T3xDN <= T3xDP;
   T4xDN <= T4xDP;

   case PartCntxS is
     when "00" =>   SNxD(0)  <= BM1xD;     -- 0
                    T1xDN    <= BM2xD;     -- T1= 1
                    T2xDN    <= BM3xD;     -- T2= 2
                    T3xDN    <= BM4xD;     -- T3= 3 
     when "01" =>   SNxD(4)  <= BM1xD;     -- 4
                    SNxD(5)  <= BM2xD;     -- 5
                    T3xDN    <= BM3xD;     -- T3=6
                    T4xDN    <= BM4xD;     -- T4=7
                    SNxD(3)  <= T3xDP;     -- 3
     when "10" =>   SNxD(8)  <= BM1xD;     -- 8
                    SNxD(9)  <= BM2xD;     -- 9
                    SNxD(10) <= BM3xD;     -- 10
                    T2xDN    <= BM4xD;     -- T2=11
                    SNxD(2)  <= T2xDP;     -- 2
                    SNxD(7)  <= T4xDP;     -- 7                  
     when others => SNxD(12) <= BM1xD;     -- 12
                    SNxD(13) <= BM2xD;     -- 13
                    SNxD(14) <= BM3xD;     -- 14
                    SNxD(15) <= BM4xD;     -- 15
                    SNxD(1)  <= T1xDP;     -- 1
                    SNxD(6)  <= T3xDP;     -- 6
                    SNxD(11) <= T2xDP;     -- 11
   end case;
end process p_write;

-------------------------------------------------------------------------------
-- Big Final
-- This is divided into two steps. Once the new data is here it is immediately
-- XOR'ed and added to V (BF1)
-- At the end of calculation 31st clock the State S is XOR'ed with V again
-- (BF2)
-- Technically the VxDP VxDN xor BF1xD xor BF2xD. However BF1 is available
-- at the very beginning and BF2 is available at the very end.
-------------------------------------------------------------------------------

-- this is the State XOR'ed. This is the next state
BF2xD(511 downto 384) <= SNxD(0) xor SNxD(4) xor SNxD(8)  xor SNxD(12) ; 
BF2xD(383 downto 256) <= SNxD(1) xor SNxD(5) xor SNxD(9)  xor SNxD(13) ;
BF2xD(255 downto 128) <= SNxD(2) xor SNxD(6) xor SNxD(10) xor SNxD(14) ;
BF2xD(127 downto   0) <= SNxD(3) xor SNxD(7) xor SNxD(11) xor SNxD(15) ;

-- this one is the message XOR'ed
-- NOte that we need the shuffled data in for this as well 
BF1xD(511 downto 384) <= DataShufflexD( 127 downto    0) xor DataShufflexD( 639 downto  512) xor DataShufflexD(1151 downto 1024);
BF1xD(383 downto 256) <= DataShufflexD( 255 downto  128) xor DataShufflexD( 767 downto  640) xor DataShufflexD(1279 downto 1152);
BF1xD(255 downto 128) <= DataShufflexD( 383 downto  256) xor DataShufflexD( 895 downto  768) xor DataShufflexD(1407 downto 1280);
BF1xD(127 downto   0) <= DataShufflexD( 511 downto  384) xor DataShufflexD(1023 downto  896) xor DataShufflexD(1535 downto 1408);


-------------------------------------------------------------------------------
-- State Machines
--
-- There are (at the moment) three state machines:
--   p_s: controls what happens with the STATE (SxDN)
--   p_v: controls the VECTOR (VxDN) and at the same time to allow fast working
--        also the OUTPUT (OxDN)
--   p_fsm: is for the main control flow, Counters, Last Flag, OutEn signal
--
-- The first two should produce larger multiplexer structures, the last one is
-- the 'real' FSM of the system.
-------------------------------------------------------------------------------

-- reshuffling, strange way of writing in data.. at least for me
gen_shuffle: for i in 0 to 11 generate
  DataShufflexD((i+1)*128-1 downto i*128) <= DataInxDI((12-i)*128 -1 downto (11-i)*128);
end generate gen_shuffle;




p_s: process (SxD, InENxSI, StatexDP, DataShufflexD, VxDN, CntxSP, LastxSP)
begin  -- process p_s
  SxDN <= SxD;
  case StatexDP is
    when init =>
        if InENxSI='1' then
          SxDN <= DataShufflexD & VINIT;
        end if;
    when run  =>
      if CntxSP(4 downto 0)=CSTOP then
        if InENxSI='1' then
          if LastxSP='1' then
            SxDN <= DataShufflexD & VINIT;
          else
            SxDN <= DataShufflexD & VxDN;
          end if;
        end if;
      end if;
    when others => null;
  end case;
end process p_s;




-- Next state for V, O (Output)
p_v: process (VxDP, OxDP, BF1xD, BF2xD, StatexDP, InENxSI, CntxSP, LastxSP)
begin  -- process p_v
  -- default
  VxDN <= VxDP;                         -- by default do not update
  OxDN <= OxDP; 
  
  case StatexDP is
    when init =>
        VxDN <= VINIT;                  -- initialize
        if InENxSI = '1' then
          VxDN <= VINIT xor BF1xD;       -- add the message to initial value
        end if;
    when run =>
      if CntxSP(4 downto 0) = CSTOP then  -- the last round          
          VxDN <= VxDP xor BF2xD;       -- add the SxDN of run 31 which will be
                                        -- the final state
          -- we are done with computing at this stage write the output and be done
          OxDN <= VxDP(511 downto 256) xor BF2xD(511 downto 256);  
         if InENxSI='1' then            -- new block starting
           if LastxSP = '1' then        -- is this the last block
              VxDN <= VINIT xor BF1xD;  -- Clear V, get Input
           else
              VxDN <= VxDP xor BF2xD xor BF1xD;  -- We keep the V, add the last
                                                 -- state and also add the
                                                 -- input which has arrived
           end if;
         end if;  
       end if;
    when chain =>
        if InENxSI='1' then
          VxDN <= VxDP xor BF1xD;       -- add the message
        end if;
    when others => null; 
  end case;
end process p_v;

-- main FSM
p_fsm: process (CntxSP, LastxSI, LastxSP, InENxSI, StatexDP)
begin  -- process p_fsm
  --defaults
  StatexDN <= StatexDP;
  OutEnxSN <= '0';                      -- output is not ready
  CntxSN <= std_logic_vector(unsigned (CntxSP) + "1");
  LastxSN <= LastxSP; 
  
  case StatexDP is
    when init  =>
        CntxSN <= CINIT;
         if InENxSI='1' then            -- there is new data
           StatexDN <= run;             -- go to run state
           if LastxSI='1' then          -- this will be the last block
             LastxSN <= '1';            -- record the last block
           end if;              
         end if;
    when run =>
       if CntxSP(4 downto 0) = CSTOP then
         OutEnxSN <= '1';               -- Next cycle the output is ready
         if InENxSI='1' then            -- New data is here
           StatexDN <= run;             -- Go to run state
           if LastxSP = '1' then        -- was this the last ??
             LastxSN <= '0';            -- Clear the last flag
             CntxSN <= CINIT;           -- reset the counter
           end if;
         else  
           if LastxSP = '1' then          -- is this the last message block ?? 
             StatexDN <= init;            -- restart counters
             LastxSN <= '0';              -- reset also Last
           else
             StatexDN <= chain;           -- keep the count
           end if;
         end if;  
       end if;
    when chain =>
         CntxSN <= CntxSP;              -- wait until there is new data
         if InENxSI='1' then            -- there is new data
           StatexDN <= run;             -- now continue
           if LastxSI='1' then          -- this will be the last block
             LastxSN <= '1';            -- keep the information
           end if;              
         end if;
    when others => null;
  end case;

end process p_fsm;

-- Outputs
-- Now from the register
DataOutxDO <= OxDP; 
-- Output Enable is also registered 
OutEnxSO <= OutEnxSP;

-------------------------------------------------------------------------------
-- Clocked process for all the registers
-------------------------------------------------------------------------------


p_clk : process (CLKxCI, RSTxRBI)
begin  -- process p_clk
  if RSTxRBI = '0' then                     -- asynchronous reset (active low)
    SxDP     <= (others => '0');
    VxDP     <= VINIT;
    T1xDP    <= (others => '0');
    T2xDP    <= (others => '0');
    T3xDP    <= (others => '0');
    T4xDP    <= (others => '0');
    OxDP     <= (others => '0');
    CntxSP   <= CINIT;
    LastxSP  <= '0';
    OutEnxSP <= '0';
    StatexDP <= init;
  elsif CLKxCI'event and CLKxCI = '1' then  -- rising clock edge
    SxDP     <= SxDN;
    VxDP     <= VxDN;
    T1xDP    <= T1xDN;
    T2xDP    <= T2xDN;
    T3xDP    <= T3xDN;
    T4xDP    <= T4xDN;
    OxDP     <= OxDN;
    CntxSP   <= CntxSN;
    LastxSP  <= LastxSN;
    OutEnxSP <= OutEnxSN;
    StatexDP <= StatexDN;
  end if;
end process p_clk;

                

end fast;

Generated on Fri Sep 24 10:39:12 CEST 2010
Home