upload bin folder
Browse files- instruction/Makefile +13 -0
- instruction/gen_instruction.py +612 -0
- instruction/instruction_1024T_32L_no_write_back_logit_everyT_mxint4.bin +3 -0
- instruction/instruction_1024T_32L_no_write_back_logit_everyT_mxint8.bin +3 -0
- instruction/instruction_1024T_32L_write_back_logit_everyT_mxint4.bin +3 -0
- instruction/instruction_1024T_32L_write_back_logit_everyT_mxint8.bin +3 -0
instruction/Makefile
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Declare the targets as phony so they always run, even if a file named
# "clean" or "gen_inst" exists. Without the leading dot, "PHONY" is an ordinary
# target and, being first, becomes the default goal (running clean + gen_inst).
.PHONY: clean gen_inst
|
| 2 |
+
|
| 3 |
+
# true or false
|
| 4 |
+
# if true, generate LLM head instructions for FPGA simulation (if false, make sure the host machine has AVX512)
|
| 5 |
+
SIM_LLM_HEAD = true
|
| 6 |
+
|
| 7 |
+
gen_inst:
|
| 8 |
+
python gen_instruction.py --sim_cmode mxint4 $(if $(filter-out false,$(SIM_LLM_HEAD)),--sim_llm_head)
|
| 9 |
+
python gen_instruction.py --sim_cmode mxint8 $(if $(filter-out false,$(SIM_LLM_HEAD)),--sim_llm_head)
|
| 10 |
+
|
| 11 |
+
clean:
|
| 12 |
+
rm -f *.bin
|
| 13 |
+
|
instruction/gen_instruction.py
ADDED
|
@@ -0,0 +1,612 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
# operation groups
|
| 4 |
+
NOP = 0b00000000000001
|
| 5 |
+
TEST_BW = 0b00000000000010
|
| 6 |
+
LOAD = 0b00000000000100
|
| 7 |
+
STORE = 0b00000000001000
|
| 8 |
+
MLP_WM = 0b00000000010000
|
| 9 |
+
MLP_QKT = 0b00000000100000
|
| 10 |
+
QKT_M_RSQRT = 0b00000001000000
|
| 11 |
+
MLP_HP = 0b00000010000000
|
| 12 |
+
GATE = 0b00000100000000
|
| 13 |
+
RRMS = 0b00001000000000
|
| 14 |
+
RMSNORM = 0b00010000000000
|
| 15 |
+
SOFTMAX = 0b00100000000000
|
| 16 |
+
RESIDUAL = 0b01000000000000
|
| 17 |
+
ROPE = 0b10000000000000
|
| 18 |
+
# cmode groups
|
| 19 |
+
MX_INT8 = 0b0
|
| 20 |
+
MX_INT4 = 0b1
|
| 21 |
+
# stage groups
|
| 22 |
+
SUM = 0b0
|
| 23 |
+
GEN = 0b1
|
| 24 |
+
# nonlinear groups
|
| 25 |
+
NO_ACT = 0b0
|
| 26 |
+
SILU = 0b1
|
| 27 |
+
# load targets groups
|
| 28 |
+
NULL = 0b000
|
| 29 |
+
ACT = 0b001
|
| 30 |
+
ACT_S = 0b010
|
| 31 |
+
RESI = 0b011
|
| 32 |
+
RESI_S = 0b100
|
| 33 |
+
WEIGHT_S = 0b101
|
| 34 |
+
# write back groups
|
| 35 |
+
NO_WB = 0b00000001
|
| 36 |
+
WB = 0b00000010
|
| 37 |
+
WB_KV = 0b00000100
|
| 38 |
+
WB_KV_S = 0b00001000
|
| 39 |
+
WB_ACT = 0b00010000
|
| 40 |
+
WB_ACT_S = 0b00100000
|
| 41 |
+
WB_RESI = 0b01000000
|
| 42 |
+
WB_RESI_S = 0b10000000
|
| 43 |
+
|
| 44 |
+
# address space
|
| 45 |
+
WQ_BASE_ADDR , WQ_LAYER_OFFSET = 0x00000000, 0x00100000
|
| 46 |
+
WK_BASE_ADDR , WK_LAYER_OFFSET = 0x02000000, 0x00100000
|
| 47 |
+
WV_BASE_ADDR , WV_LAYER_OFFSET = 0x04000000, 0x00100000
|
| 48 |
+
WO_BASE_ADDR , WO_LAYER_OFFSET = 0x06000000, 0x00100000
|
| 49 |
+
W1_BASE_ADDR , W1_LAYER_OFFSET = 0x08000000, 0x002B0000
|
| 50 |
+
W3_BASE_ADDR , W3_LAYER_OFFSET = 0x0D600000, 0x002B0000
|
| 51 |
+
W2_BASE_ADDR , W2_LAYER_OFFSET = 0x12C00000, 0x002B0000
|
| 52 |
+
WQS_BASE_ADDR , WQS_LAYER_OFFSET = 0x18200000, 0x00008000
|
| 53 |
+
WKS_BASE_ADDR , WKS_LAYER_OFFSET = 0x18300000, 0x00008000
|
| 54 |
+
WVS_BASE_ADDR , WVS_LAYER_OFFSET = 0x18400000, 0x00008000
|
| 55 |
+
WOS_BASE_ADDR , WOS_LAYER_OFFSET = 0x18500000, 0x00008000
|
| 56 |
+
W1S_BASE_ADDR , W1S_LAYER_OFFSET = 0x18600000, 0x0015800
|
| 57 |
+
W3S_BASE_ADDR , W3S_LAYER_OFFSET = 0x188B0000, 0x0015800
|
| 58 |
+
W2S_BASE_ADDR , W2S_LAYER_OFFSET = 0x18B60000, 0x0018000
|
| 59 |
+
KC_BASE_ADDR , KC_LAYER_OFFSET , KC_TOKEN_OFFSET = 0x18E60000, 0x0080000 , 0x100
|
| 60 |
+
VC_BASE_ADDR , VC_LAYER_OFFSET , VC_TOKEN_OFFSET = 0x19E60000, 0x0080000 , 0x100
|
| 61 |
+
KCS_BASE_ADDR , KCS_LAYER_OFFSET , KCS_TOKEN_OFFSET = 0x1AE60000, 0x0040000 , 0x80
|
| 62 |
+
VCS_BASE_ADDR , VCS_LAYER_OFFSET , VCS_TOKEN_OFFSET = 0x1B660000, 0x0040000 , 0x80
|
| 63 |
+
ACT_BASE_ADDR , ACT_TOKEN_OFFSET = 0x1BE60000, 0x1000
|
| 64 |
+
RESI_BASE_ADDR , RESI_TOKEN_OFFSET = 0x1C660000, 0x1000
|
| 65 |
+
ACTS_BASE_ADDR , ACTS_TOKEN_OFFSET = 0x1CE60000, 0x80
|
| 66 |
+
RESIS_BASE_ADDR , RESIS_TOKEN_OFFSET = 0x1CEA0000, 0x80
|
| 67 |
+
PRENORM_ADDR , PRENORM_LAYER_OFFSET = 0x1CEE0000, 0x1000
|
| 68 |
+
POSTNORM_ADDR , POSTNORM_LAYER_OFFSET = 0x1CF00000, 0x1000
|
| 69 |
+
ROPE_BASE_ADDR , ROPE_TOKEN_OFFSET = 0x1CF20000, 0xC0
|
| 70 |
+
PRENORMS_ADDR , PRENORMS_LAYER_OFFSET = 0x1CFE0000, 0x80
|
| 71 |
+
POSTNORMS_ADDR , POSTNORMS_LAYER_OFFSET = 0x1CFE1000, 0x80
|
| 72 |
+
OUTNORM_ADDR = 0x1D840000
|
| 73 |
+
OUTNORMS_ADDR = 0x1D841000
|
| 74 |
+
WHEAD_BASE_ADDR , WHEAD_LAYER_OFFSET = 0x1D000000, 0x100000
|
| 75 |
+
WHEADS_BASE_ADDR , WHEADS_LAYER_OFFSET = 0x1D800000, 0x8000
|
| 76 |
+
HEAD_OUT_BASE_ADDR , HEAD_OUT_LAYER_OFFSET = 0x1E000000, 0x1000
|
| 77 |
+
HEADS_OUT_BASE_ADDR , HEADS_OUT_LAYER_OFFSET = 0x1E008000, 0x80
|
| 78 |
+
|
| 79 |
+
class Instruction:
    """One instruction record that can be rendered as a 512-character bit string.

    Every constructor argument is stored unchanged as a public, mutable
    attribute of the same name, so callers may build a template once and
    patch individual fields (for example ``token`` or ``input_addr``) before
    each call to :meth:`to_binary`.
    """

    # (attribute name, bit width) pairs in packing order: the first entry ends
    # up in the most significant position of the encoded payload.
    _FIELD_LAYOUT = (
        ("op", 14),
        ("dq_en", 1),
        ("stage", 1),
        ("token", 11),
        ("load_target", 3),
        ("cmode", 1),
        ("nonlinear", 1),
        ("write_back", 8),
        ("input_dim", 16),
        ("output_dim", 16),
        ("input_addr", 32),
        ("scale_addr", 32),
        ("output_addr", 32),
        ("layer_offset", 32),
        ("token_offset", 32),
        ("num_cb_ws", 16),
        ("num_cb_wm", 16),
    )

    # Length of the returned bit string; the payload is left-padded with zeros.
    _WORD_BITS = 512

    def __init__(self,
                 op,
                 dq_en,
                 stage,
                 token,
                 load_target,
                 cmode,
                 nonlinear,
                 write_back,
                 input_dim,
                 output_dim,
                 input_addr,
                 scale_addr,
                 output_addr,
                 layer_offset,
                 token_offset,
                 num_cb_ws,
                 num_cb_wm
                 ):
        """Store each field value on the instance under its own name."""
        self.op = op
        self.dq_en = dq_en
        self.stage = stage
        self.token = token
        self.load_target = load_target
        self.cmode = cmode
        self.nonlinear = nonlinear
        self.write_back = write_back
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.input_addr = input_addr
        self.scale_addr = scale_addr
        self.output_addr = output_addr
        self.layer_offset = layer_offset
        self.token_offset = token_offset
        self.num_cb_ws = num_cb_ws
        self.num_cb_wm = num_cb_wm

    def to_binary(self, inst_num: int, inst_info: str):
        """Return the instruction as a 512-character string of '0'/'1'.

        Args:
            inst_num: Running instruction index; only used in the log line.
            inst_info: Short human-readable label; only used in the log line.

        Returns:
            The fields concatenated in ``_FIELD_LAYOUT`` order, each rendered
            at its fixed width, left-padded with zeros to 512 characters.

        Raises:
            ValueError: If a field value is negative or does not fit in its
                bit width. Previously such a value silently widened its slot
                and shifted every other field inside the word.
        """
        print('INFO: {:30s} , Instruction id: {}'.format(inst_info, inst_num))

        encoded_fields = []
        for field_name, width in self._FIELD_LAYOUT:
            value = getattr(self, field_name)
            # A value outside [0, 2**width) would not occupy exactly `width`
            # characters, which would misalign the rest of the word.
            if not 0 <= value < (1 << width):
                raise ValueError(
                    "Instruction field '{}' = {} does not fit in {} bit(s)".format(
                        field_name, value, width
                    )
                )
            encoded_fields.append(f"{value:0{width}b}")

        payload = "".join(encoded_fields)
        return payload.rjust(self._WORD_BITS, "0")
|
| 130 |
+
|
| 131 |
+
def gen_inst(op, dq_en, stage, token, load_target, cmode, nonlinear, write_back, input_dim, output_dim, input_addr, scale_addr, output_addr, layer_offset, token_offset, num_cb_ws, num_cb_wm):
    """Create an Instruction from the seventeen field values, passed through in order."""
    field_values = (
        op, dq_en, stage, token,
        load_target, cmode, nonlinear, write_back,
        input_dim, output_dim,
        input_addr, scale_addr, output_addr,
        layer_offset, token_offset,
        num_cb_ws, num_cb_wm,
    )
    return Instruction(*field_values)
|
| 133 |
+
# =====================================================================================================================================================================================================================
|
| 134 |
+
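# Test bandwidth (NOTE(review): inst_test_bw is assigned again in the MXINT8 templates section below with stage GEN, which replaces this SUM-stage template -- confirm which one is intended)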
|
| 135 |
+
# =====================================================================================================================================================================================================================
|
| 136 |
+
inst_test_bw = gen_inst(TEST_BW, 0, SUM, 0, NULL, MX_INT8, NO_ACT, NO_WB, 0, 0 , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 0 )
|
| 137 |
+
# =====================================================================================================================================================================================================================
|
| 138 |
+
# MXINT8 Instruction templates
|
| 139 |
+
# =====================================================================================================================================================================================================================
|
| 140 |
+
inst_nop = gen_inst(NOP, 0, SUM, 0, NULL, MX_INT8, NO_ACT, NO_WB, 0, 0 , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 0 )
|
| 141 |
+
inst_test_bw = gen_inst(TEST_BW, 0, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 0, 0 , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 0 )
|
| 142 |
+
inst_load_resi = gen_inst(LOAD, 0, GEN, 0, RESI, MX_INT8, NO_ACT, NO_WB, 4096, 0 , RESI_BASE_ADDR, 0x00000000, 0x00000000, 0x00000000, RESI_TOKEN_OFFSET, 0, 0 )
|
| 143 |
+
inst_load_in_act = gen_inst(LOAD, 0, GEN, 0, ACT, MX_INT8, NO_ACT, NO_WB, 4096, 0 , ACT_BASE_ADDR, 0x00000000, 0x00000000, 0x00000000, ACT_TOKEN_OFFSET, 0, 0 )
|
| 144 |
+
inst_load_resi_s = gen_inst(LOAD, 0, GEN, 0, RESI_S, MX_INT8, NO_ACT, NO_WB, 128, 0 , RESIS_BASE_ADDR, 0x00000000, 0x00000000, 0x00000000, RESIS_TOKEN_OFFSET, 0, 0 )
|
| 145 |
+
inst_load_in_act_s = gen_inst(LOAD, 0, GEN, 0, ACT_S, MX_INT8, NO_ACT, NO_WB, 128, 0 , ACTS_BASE_ADDR, 0x00000000, 0x00000000, 0x00000000, ACTS_TOKEN_OFFSET, 0, 0 )
|
| 146 |
+
inst_mlp_wm_q = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, NO_ACT, NO_WB, 4096, 4096 , WQ_BASE_ADDR, WQS_BASE_ADDR, 0x00000000, WQ_LAYER_OFFSET, 0x00000000, 128, 4096 )
|
| 147 |
+
inst_mlp_wm_k = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, NO_ACT, NO_WB, 4096, 4096 , WK_BASE_ADDR, WKS_BASE_ADDR, 0x00000000, WK_LAYER_OFFSET, 0x00000000, 128, 4096 )
|
| 148 |
+
inst_mlp_wm_v = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, NO_ACT, WB, 4096, 4096 , WV_BASE_ADDR, WVS_BASE_ADDR, 0x00000000, WV_LAYER_OFFSET, 0x00000000, 128, 4096 )
|
| 149 |
+
inst_mlp_wm_o = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, NO_ACT, NO_WB, 4096, 4096 , WO_BASE_ADDR, WOS_BASE_ADDR, 0x00000000, WO_LAYER_OFFSET, 0x00000000, 128, 4096 )
|
| 150 |
+
inst_mlp_wm_w1 = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, SILU, NO_WB, 4096, 11008 , W1_BASE_ADDR, W1S_BASE_ADDR, 0x00000000, W1_LAYER_OFFSET, 0x00000000, 32, 1024 )
|
| 151 |
+
inst_mlp_wm_w3 = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, NO_ACT, NO_WB, 4096, 11008 , W3_BASE_ADDR, W3S_BASE_ADDR, 0x00000000, W3_LAYER_OFFSET, 0x00000000, 32, 1024 )
|
| 152 |
+
inst_mlp_wm_w2 = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, NO_ACT, NO_WB, 11008, 4096 , W2_BASE_ADDR, W2S_BASE_ADDR, 0x00000000, W2_LAYER_OFFSET, 0x00000000, 24, 688 )
|
| 153 |
+
inst_gate = gen_inst(GATE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 11008, 11008 , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 0 )
|
| 154 |
+
inst_residual = gen_inst(RESIDUAL, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB, 4096, 4096 , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 0 )
|
| 155 |
+
inst_store_act = gen_inst(STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_ACT, 4096, 4096 , 0x00000000, 0x00000000, ACT_BASE_ADDR, 0x00000000, ACT_TOKEN_OFFSET, 0, 0 )
|
| 156 |
+
inst_store_act_s = gen_inst(STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_ACT_S, 4096, 4096 , 0x00000000, 0x00000000, ACTS_BASE_ADDR, 0x00000000, ACTS_TOKEN_OFFSET, 0, 0 )
|
| 157 |
+
inst_store_resi = gen_inst(STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_ACT, 4096, 4096 , 0x00000000, 0x00000000, RESI_BASE_ADDR, 0x00000000, RESI_TOKEN_OFFSET, 0, 0 )
|
| 158 |
+
inst_store_resi_s = gen_inst(STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_ACT_S, 4096, 4096 , 0x00000000, 0x00000000, RESIS_BASE_ADDR, 0x00000000, RESIS_TOKEN_OFFSET, 0, 0 )
|
| 159 |
+
inst_rope_nwb = gen_inst(ROPE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 192, 4096 , ROPE_BASE_ADDR, 0x00000000, 0x00000000, 0x00000000, ROPE_TOKEN_OFFSET, 0, 0 )
|
| 160 |
+
inst_rope_wb = gen_inst(ROPE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB, 192, 4096 , ROPE_BASE_ADDR, 0x00000000, 0x00000000, 0x00000000, ROPE_TOKEN_OFFSET, 0, 0 )
|
| 161 |
+
inst_store_k = gen_inst(STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_KV, 4096, 4096 , 0x00000000, 0x00000000, KC_BASE_ADDR, 0x00000000, KC_TOKEN_OFFSET, 0, 0 )
|
| 162 |
+
inst_store_k_s = gen_inst(STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_KV_S, 4096, 4096 , 0x00000000, 0x00000000, KCS_BASE_ADDR, 0x00000000, KCS_TOKEN_OFFSET, 0, 0 )
|
| 163 |
+
inst_store_v = gen_inst(STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_KV, 4096, 4096 , 0x00000000, 0x00000000, VC_BASE_ADDR, 0x00000000, VC_TOKEN_OFFSET, 0, 0 )
|
| 164 |
+
inst_store_v_s = gen_inst(STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_KV_S, 4096, 4096 , 0x00000000, 0x00000000, VCS_BASE_ADDR, 0x00000000, VCS_TOKEN_OFFSET, 0, 0 )
|
| 165 |
+
inst_mlp_qkt = gen_inst(MLP_QKT, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4, 1 , KC_BASE_ADDR, KCS_BASE_ADDR, 0x00000000, 0x00000000, KCS_TOKEN_OFFSET, 4, 32 )
|
| 166 |
+
inst_qkt_m_rsqrt = gen_inst(QKT_M_RSQRT, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 1, 1 , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1, 512 )
|
| 167 |
+
inst_softmax = gen_inst(SOFTMAX, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 0, 1 , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1, 0 )
|
| 168 |
+
inst_mlp_hp = gen_inst(MLP_HP, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4, 4096 , VC_BASE_ADDR, VCS_BASE_ADDR, 0x00000000, 0x00000000, VCS_TOKEN_OFFSET, 4, 32 )
|
| 169 |
+
inst_prerrms = gen_inst(RRMS, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4096, 1 , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 128 )
|
| 170 |
+
inst_load_pre_wm = gen_inst(LOAD, 0, GEN, 0, ACT, MX_INT8, NO_ACT, NO_WB, 4096, 0 , PRENORM_ADDR, 0x00000000, 0x00000000, PRENORM_LAYER_OFFSET, 0x00000000, 0, 0 )
|
| 171 |
+
inst_load_pre_ws = gen_inst(LOAD, 0, GEN, 0, ACT_S, MX_INT8, NO_ACT, NO_WB, 128, 0 , PRENORMS_ADDR, 0x00000000, 0x00000000, PRENORMS_LAYER_OFFSET, 0x00000000, 0, 0 )
|
| 172 |
+
inst_prermsnorm = gen_inst(RMSNORM, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4096, 4096 , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 128 )
|
| 173 |
+
inst_postrrms = gen_inst(RRMS, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4096, 1 , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 128 )
|
| 174 |
+
inst_load_post_wm = gen_inst(LOAD, 0, GEN, 0, ACT, MX_INT8, NO_ACT, NO_WB, 4096, 0 , POSTNORM_ADDR, 0x00000000, 0x00000000, POSTNORM_LAYER_OFFSET, 0x00000000, 0, 0 )
|
| 175 |
+
inst_load_post_ws = gen_inst(LOAD, 0, GEN, 0, ACT_S, MX_INT8, NO_ACT, NO_WB, 128, 0 , POSTNORMS_ADDR, 0x00000000, 0x00000000, POSTNORMS_LAYER_OFFSET, 0x00000000, 0, 0 )
|
| 176 |
+
inst_postrmsnorm = gen_inst(RMSNORM, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4096, 4096 , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 128 )
|
| 177 |
+
# =====================================================================================================================================================================================================================
|
| 178 |
+
# DECODER OUT Instruction templates
|
| 179 |
+
# =====================================================================================================================================================================================================================
|
| 180 |
+
inst_outrrms = gen_inst(RRMS, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4096, 1 , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 128 )
|
| 181 |
+
inst_load_out_wm = gen_inst(LOAD, 0, GEN, 0, ACT, MX_INT8, NO_ACT, NO_WB, 4096, 0 , OUTNORM_ADDR, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 0 )
|
| 182 |
+
inst_load_out_ws = gen_inst(LOAD, 0, GEN, 0, ACT_S, MX_INT8, NO_ACT, NO_WB, 128, 0 , OUTNORMS_ADDR, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 0 )
|
| 183 |
+
inst_outrmsnorm = gen_inst(RMSNORM, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4096, 4096 , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 128 )
|
| 184 |
+
inst_mlp_wm_whead = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, NO_ACT, WB, 4096, 4096 , WHEAD_BASE_ADDR, WHEADS_BASE_ADDR, 0x00000000, WHEAD_LAYER_OFFSET, 0x00000000, 128, 4096 )
|
| 185 |
+
inst_store_head = gen_inst(STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_ACT, 4096, 4096 , 0x00000000, 0x00000000, HEAD_OUT_BASE_ADDR, 0x00000000, HEAD_OUT_LAYER_OFFSET, 0, 0 )
|
| 186 |
+
inst_store_head_s = gen_inst(STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_ACT_S, 4096, 4096 , 0x00000000, 0x00000000, HEADS_OUT_BASE_ADDR, 0x00000000, HEADS_OUT_LAYER_OFFSET, 0, 0 )
|
| 187 |
+
# =====================================================================================================================================================================================================================
|
| 188 |
+
# MXINT4 Instruction templates
|
| 189 |
+
# =====================================================================================================================================================================================================================
|
| 190 |
+
inst_mlp_wm_whead_mxint4= gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, NO_ACT, WB, 4096, 4096 , WHEAD_BASE_ADDR, WHEADS_BASE_ADDR, 0x00000000, WHEAD_LAYER_OFFSET, 0x00000000, 128, 2048 )
|
| 191 |
+
inst_mlp_wm_q_mxint4 = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, NO_ACT, NO_WB, 4096, 4096 , WQ_BASE_ADDR, WQS_BASE_ADDR, 0x00000000, WQ_LAYER_OFFSET, 0x00000000, 128, 2048 )
|
| 192 |
+
inst_mlp_wm_k_mxint4 = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, NO_ACT, NO_WB, 4096, 4096 , WK_BASE_ADDR, WKS_BASE_ADDR, 0x00000000, WK_LAYER_OFFSET, 0x00000000, 128, 2048 )
|
| 193 |
+
inst_mlp_wm_v_mxint4 = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, NO_ACT, WB, 4096, 4096 , WV_BASE_ADDR, WVS_BASE_ADDR, 0x00000000, WV_LAYER_OFFSET, 0x00000000, 128, 2048 )
|
| 194 |
+
inst_mlp_wm_o_mxint4 = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, NO_ACT, NO_WB, 4096, 4096 , WO_BASE_ADDR, WOS_BASE_ADDR, 0x00000000, WO_LAYER_OFFSET, 0x00000000, 128, 2048 )
|
| 195 |
+
inst_mlp_wm_w1_mxint4 = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, SILU, NO_WB, 4096, 11008 , W1_BASE_ADDR, W1S_BASE_ADDR, 0x00000000, W1_LAYER_OFFSET, 0x00000000, 32, 512 )
|
| 196 |
+
inst_mlp_wm_w3_mxint4 = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, NO_ACT, NO_WB, 4096, 11008 , W3_BASE_ADDR, W3S_BASE_ADDR, 0x00000000, W3_LAYER_OFFSET, 0x00000000, 32, 512 )
|
| 197 |
+
inst_mlp_wm_w2_mxint4 = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, NO_ACT, NO_WB, 11008, 4096 , W2_BASE_ADDR, W2S_BASE_ADDR, 0x00000000, W2_LAYER_OFFSET, 0x00000000, 24, 344 )
|
| 198 |
+
# =====================================================================================================================================================================================================================
|
| 199 |
+
# Instruction templates end
|
| 200 |
+
# =====================================================================================================================================================================================================================
|
| 201 |
+
# gen instructions parameters
|
| 202 |
+
parser = argparse.ArgumentParser(description='Generate instruction binary file for simulation')
|
| 203 |
+
parser.add_argument('--sim_cmode', type=str, default='mxint8', choices=['mxint8', 'mxint4'], help='Simulation compute mode')
|
| 204 |
+
parser.add_argument('--sim_llm_head', action='store_true', help='if true, generate LLM head instructions for FPGA simulation')
|
| 205 |
+
args = parser.parse_args()
|
| 206 |
+
|
| 207 |
+
SIM_CMODE = args.sim_cmode
|
| 208 |
+
TEST_OP_GROUP = 'demo'
|
| 209 |
+
SIM_LLM_HEAD = args.sim_llm_head
|
| 210 |
+
SIM_LOGIT_FLAG = '' if SIM_LLM_HEAD else '_no'
|
| 211 |
+
SIM_NUM_TOKEN = 1024
|
| 212 |
+
SIM_NUM_LAYER = 32
|
| 213 |
+
|
| 214 |
+
if __name__ == "__main__":
|
| 215 |
+
if TEST_OP_GROUP == 'demo':
|
| 216 |
+
current_token = 0
|
| 217 |
+
tmp_output_dim = 1
|
| 218 |
+
current_inst_cnt = 0
|
| 219 |
+
file_name = "instruction_{}T_32L{}_write_back_logit_everyT_{}.bin".format(SIM_NUM_TOKEN, SIM_LOGIT_FLAG, SIM_CMODE)
|
| 220 |
+
|
| 221 |
+
with open(file_name, "wb") as f:
|
| 222 |
+
for tk in range(SIM_NUM_TOKEN):
|
| 223 |
+
print("Gen {} th token instruction start".format(tk+1))
|
| 224 |
+
# LOAD IN_ACT
|
| 225 |
+
inst_load_in_act.input_addr = ACT_BASE_ADDR + ACT_TOKEN_OFFSET * tk
|
| 226 |
+
inst_load_in_act.token = current_token
|
| 227 |
+
binary_instruction = inst_load_in_act.to_binary(current_inst_cnt, 'LOAD IN_ACT')
|
| 228 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 229 |
+
current_inst_cnt += 1
|
| 230 |
+
# LOAD IN_ACT_S
|
| 231 |
+
inst_load_in_act_s.input_addr = ACTS_BASE_ADDR + ACTS_TOKEN_OFFSET * tk
|
| 232 |
+
inst_load_in_act_s.token = current_token
|
| 233 |
+
binary_instruction = inst_load_in_act_s.to_binary(current_inst_cnt, 'LOAD IN_ACT_S')
|
| 234 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 235 |
+
current_inst_cnt += 1
|
| 236 |
+
|
| 237 |
+
for l in range(SIM_NUM_LAYER):
|
| 238 |
+
print("Gen {} th token, {} th layer instruction".format(tk+1, l+1))
|
| 239 |
+
# RRMS
|
| 240 |
+
inst_prerrms.token = current_token
|
| 241 |
+
binary_instruction = inst_prerrms.to_binary(current_inst_cnt, 'PRE RRMS')
|
| 242 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 243 |
+
current_inst_cnt += 1
|
| 244 |
+
# LOAD NORM weight
|
| 245 |
+
inst_load_pre_wm.input_addr = PRENORM_ADDR + PRENORM_LAYER_OFFSET * l
|
| 246 |
+
inst_load_pre_wm.token = current_token
|
| 247 |
+
binary_instruction = inst_load_pre_wm.to_binary(current_inst_cnt, 'LOAD PRENORM weight')
|
| 248 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 249 |
+
current_inst_cnt += 1
|
| 250 |
+
# LOAD NORM weight scale
|
| 251 |
+
inst_load_pre_ws.input_addr = PRENORMS_ADDR + PRENORMS_LAYER_OFFSET * l
|
| 252 |
+
inst_load_pre_ws.token = current_token
|
| 253 |
+
binary_instruction = inst_load_pre_ws.to_binary(current_inst_cnt, 'LOAD PRENORM weight scale')
|
| 254 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 255 |
+
current_inst_cnt += 1
|
| 256 |
+
# RMSNORM
|
| 257 |
+
inst_prermsnorm.token = current_token
|
| 258 |
+
binary_instruction = inst_prermsnorm.to_binary(current_inst_cnt, 'PRE RMSNORM')
|
| 259 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 260 |
+
current_inst_cnt += 1
|
| 261 |
+
|
| 262 |
+
if SIM_CMODE == 'mxint8':
|
| 263 |
+
# MLP_WM Wv
|
| 264 |
+
inst_mlp_wm_v.input_addr = WV_BASE_ADDR + WV_LAYER_OFFSET * l
|
| 265 |
+
inst_mlp_wm_v.scale_addr = WVS_BASE_ADDR + WVS_LAYER_OFFSET * l
|
| 266 |
+
inst_mlp_wm_v.token = current_token
|
| 267 |
+
binary_instruction = inst_mlp_wm_v.to_binary(current_inst_cnt, 'MLP_WM Wv')
|
| 268 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 269 |
+
current_inst_cnt += 1
|
| 270 |
+
else:
|
| 271 |
+
# MLP_WM Wv
|
| 272 |
+
inst_mlp_wm_v_mxint4.input_addr = WV_BASE_ADDR + (WV_LAYER_OFFSET//2) * l
|
| 273 |
+
inst_mlp_wm_v_mxint4.scale_addr = WVS_BASE_ADDR + WVS_LAYER_OFFSET * l
|
| 274 |
+
inst_mlp_wm_v_mxint4.token = current_token
|
| 275 |
+
binary_instruction = inst_mlp_wm_v_mxint4.to_binary(current_inst_cnt, 'MLP_WM Wv')
|
| 276 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 277 |
+
current_inst_cnt += 1
|
| 278 |
+
|
| 279 |
+
# STORE V elem
|
| 280 |
+
inst_store_v.output_addr = VC_BASE_ADDR + VC_LAYER_OFFSET * l + VC_TOKEN_OFFSET * tk
|
| 281 |
+
inst_store_v.token = current_token
|
| 282 |
+
binary_instruction = inst_store_v.to_binary(current_inst_cnt, 'STORE V elem')
|
| 283 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 284 |
+
current_inst_cnt += 1
|
| 285 |
+
# STORE V scale
|
| 286 |
+
inst_store_v_s.output_addr = VCS_BASE_ADDR + VCS_LAYER_OFFSET * l + VCS_TOKEN_OFFSET * tk
|
| 287 |
+
inst_store_v_s.token = current_token
|
| 288 |
+
binary_instruction = inst_store_v_s.to_binary(current_inst_cnt, 'STORE V scale')
|
| 289 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 290 |
+
current_inst_cnt += 1
|
| 291 |
+
|
| 292 |
+
if SIM_CMODE == 'mxint8':
|
| 293 |
+
# MLP_WM Wk
|
| 294 |
+
inst_mlp_wm_k.input_addr = WK_BASE_ADDR + WK_LAYER_OFFSET * l
|
| 295 |
+
inst_mlp_wm_k.scale_addr = WKS_BASE_ADDR + WKS_LAYER_OFFSET * l
|
| 296 |
+
inst_mlp_wm_k.token = current_token
|
| 297 |
+
binary_instruction = inst_mlp_wm_k.to_binary(current_inst_cnt, 'MLP_WM Wk')
|
| 298 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 299 |
+
current_inst_cnt += 1
|
| 300 |
+
else:
|
| 301 |
+
# MLP_WM Wk
|
| 302 |
+
inst_mlp_wm_k_mxint4.input_addr = WK_BASE_ADDR + (WK_LAYER_OFFSET//2) * l
|
| 303 |
+
inst_mlp_wm_k_mxint4.scale_addr = WKS_BASE_ADDR + WKS_LAYER_OFFSET * l
|
| 304 |
+
inst_mlp_wm_k_mxint4.token = current_token
|
| 305 |
+
binary_instruction = inst_mlp_wm_k_mxint4.to_binary(current_inst_cnt, 'MLP_WM Wk')
|
| 306 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 307 |
+
current_inst_cnt += 1
|
| 308 |
+
|
| 309 |
+
# ROPE WB (K)
|
| 310 |
+
inst_rope_wb.input_addr = ROPE_BASE_ADDR + ROPE_TOKEN_OFFSET * tk
|
| 311 |
+
inst_rope_wb.token = current_token
|
| 312 |
+
binary_instruction = inst_rope_wb.to_binary(current_inst_cnt, 'ROPE WB (K)')
|
| 313 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 314 |
+
current_inst_cnt += 1
|
| 315 |
+
# STORE K elem
|
| 316 |
+
inst_store_k.output_addr = KC_BASE_ADDR + KC_LAYER_OFFSET * l + KC_TOKEN_OFFSET * tk
|
| 317 |
+
inst_store_k.token = current_token
|
| 318 |
+
binary_instruction = inst_store_k.to_binary(current_inst_cnt, 'STORE K elem')
|
| 319 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 320 |
+
current_inst_cnt += 1
|
| 321 |
+
# STORE K scale
|
| 322 |
+
inst_store_k_s.output_addr = KCS_BASE_ADDR + KCS_LAYER_OFFSET * l + KCS_TOKEN_OFFSET * tk
|
| 323 |
+
inst_store_k_s.token = current_token
|
| 324 |
+
binary_instruction = inst_store_k_s.to_binary(current_inst_cnt, 'STORE K scale')
|
| 325 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 326 |
+
current_inst_cnt += 1
|
| 327 |
+
|
| 328 |
+
if SIM_CMODE == 'mxint8':
|
| 329 |
+
# MLP_WM Wq
|
| 330 |
+
inst_mlp_wm_q.input_addr = WQ_BASE_ADDR + WQ_LAYER_OFFSET * l
|
| 331 |
+
inst_mlp_wm_q.scale_addr = WQS_BASE_ADDR + WQS_LAYER_OFFSET * l
|
| 332 |
+
inst_mlp_wm_q.token = current_token
|
| 333 |
+
binary_instruction = inst_mlp_wm_q.to_binary(current_inst_cnt, 'MLP_WM Wq')
|
| 334 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 335 |
+
current_inst_cnt += 1
|
| 336 |
+
else:
|
| 337 |
+
# MLP_WM Wq
|
| 338 |
+
inst_mlp_wm_q_mxint4.input_addr = WQ_BASE_ADDR + (WQ_LAYER_OFFSET//2) * l
|
| 339 |
+
inst_mlp_wm_q_mxint4.scale_addr = WQS_BASE_ADDR + WQS_LAYER_OFFSET * l
|
| 340 |
+
inst_mlp_wm_q_mxint4.token = current_token
|
| 341 |
+
binary_instruction = inst_mlp_wm_q_mxint4.to_binary(current_inst_cnt, 'MLP_WM Wq')
|
| 342 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 343 |
+
current_inst_cnt += 1
|
| 344 |
+
|
| 345 |
+
# ROPE NO_WB (Q)
|
| 346 |
+
inst_rope_nwb.input_addr = ROPE_BASE_ADDR + ROPE_TOKEN_OFFSET * tk
|
| 347 |
+
inst_rope_nwb.token = current_token
|
| 348 |
+
binary_instruction = inst_rope_nwb.to_binary(current_inst_cnt, 'ROPE NO_WB (Q)')
|
| 349 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 350 |
+
current_inst_cnt += 1
|
| 351 |
+
# MLP_QKT
|
| 352 |
+
inst_mlp_qkt.input_addr = KC_BASE_ADDR + KC_LAYER_OFFSET * l
|
| 353 |
+
inst_mlp_qkt.scale_addr = KCS_BASE_ADDR + KCS_LAYER_OFFSET * l
|
| 354 |
+
inst_mlp_qkt.output_dim = tmp_output_dim
|
| 355 |
+
inst_mlp_qkt.input_dim = tmp_output_dim * inst_mlp_qkt.num_cb_ws
|
| 356 |
+
inst_mlp_qkt.token = current_token
|
| 357 |
+
binary_instruction = inst_mlp_qkt.to_binary(current_inst_cnt, 'MLP_QKT')
|
| 358 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 359 |
+
current_inst_cnt += 1
|
| 360 |
+
# QKT_M_RSQRT
|
| 361 |
+
inst_qkt_m_rsqrt.output_dim = tmp_output_dim
|
| 362 |
+
inst_qkt_m_rsqrt.input_dim = tmp_output_dim * inst_qkt_m_rsqrt.num_cb_ws
|
| 363 |
+
inst_qkt_m_rsqrt.token = current_token
|
| 364 |
+
binary_instruction = inst_qkt_m_rsqrt.to_binary(current_inst_cnt, 'QKT_M_RSQRT')
|
| 365 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 366 |
+
current_inst_cnt += 1
|
| 367 |
+
# SOFTMAX
|
| 368 |
+
inst_softmax.output_dim = tmp_output_dim
|
| 369 |
+
inst_softmax.token = current_token
|
| 370 |
+
binary_instruction = inst_softmax.to_binary(current_inst_cnt, 'SOFTMAX')
|
| 371 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 372 |
+
current_inst_cnt += 1
|
| 373 |
+
# MLP_HP (SxV)
|
| 374 |
+
inst_mlp_hp.input_addr = VC_BASE_ADDR + VC_LAYER_OFFSET * l
|
| 375 |
+
inst_mlp_hp.scale_addr = VCS_BASE_ADDR + VCS_LAYER_OFFSET * l
|
| 376 |
+
inst_mlp_hp.input_dim = tmp_output_dim * inst_mlp_hp.num_cb_ws
|
| 377 |
+
inst_mlp_hp.token = current_token
|
| 378 |
+
binary_instruction = inst_mlp_hp.to_binary(current_inst_cnt, 'MLP_HP (SxV)')
|
| 379 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 380 |
+
current_inst_cnt += 1
|
| 381 |
+
|
| 382 |
+
if SIM_CMODE == 'mxint8':
|
| 383 |
+
# MLP_WM Wo
|
| 384 |
+
inst_mlp_wm_o.input_addr = WO_BASE_ADDR + WO_LAYER_OFFSET * l
|
| 385 |
+
inst_mlp_wm_o.scale_addr = WOS_BASE_ADDR + WOS_LAYER_OFFSET * l
|
| 386 |
+
inst_mlp_wm_o.token = current_token
|
| 387 |
+
binary_instruction = inst_mlp_wm_o.to_binary(current_inst_cnt, 'MLP_WM Wo')
|
| 388 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 389 |
+
current_inst_cnt += 1
|
| 390 |
+
else:
|
| 391 |
+
# MLP_WM Wo
|
| 392 |
+
inst_mlp_wm_o_mxint4.input_addr = WO_BASE_ADDR + (WO_LAYER_OFFSET//2) * l
|
| 393 |
+
inst_mlp_wm_o_mxint4.scale_addr = WOS_BASE_ADDR + WOS_LAYER_OFFSET * l
|
| 394 |
+
inst_mlp_wm_o_mxint4.token = current_token
|
| 395 |
+
binary_instruction = inst_mlp_wm_o_mxint4.to_binary(current_inst_cnt, 'MLP_WM Wo')
|
| 396 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 397 |
+
current_inst_cnt += 1
|
| 398 |
+
|
| 399 |
+
# LOAD RESI
|
| 400 |
+
inst_load_resi.input_addr = RESI_BASE_ADDR + RESI_TOKEN_OFFSET * tk
|
| 401 |
+
inst_load_resi.token = current_token
|
| 402 |
+
binary_instruction = inst_load_resi.to_binary(current_inst_cnt, 'LOAD RESI')
|
| 403 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 404 |
+
current_inst_cnt += 1
|
| 405 |
+
# LOAD RESI_S
|
| 406 |
+
inst_load_resi_s.input_addr = RESIS_BASE_ADDR + RESIS_TOKEN_OFFSET * tk
|
| 407 |
+
inst_load_resi_s.token = current_token
|
| 408 |
+
binary_instruction = inst_load_resi_s.to_binary(current_inst_cnt, 'LOAD RESI_S')
|
| 409 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 410 |
+
current_inst_cnt += 1
|
| 411 |
+
# RESIDUAL
|
| 412 |
+
inst_residual.token = current_token
|
| 413 |
+
binary_instruction = inst_residual.to_binary(current_inst_cnt, 'RESIDUAL')
|
| 414 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 415 |
+
current_inst_cnt += 1
|
| 416 |
+
# STORE RESI
|
| 417 |
+
inst_store_resi.output_addr = RESI_BASE_ADDR + RESI_TOKEN_OFFSET * tk
|
| 418 |
+
inst_store_resi.token = current_token
|
| 419 |
+
binary_instruction = inst_store_resi.to_binary(current_inst_cnt, 'STORE RESI')
|
| 420 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 421 |
+
current_inst_cnt += 1
|
| 422 |
+
# STORE RESI_S
|
| 423 |
+
inst_store_resi_s.output_addr = RESIS_BASE_ADDR + RESIS_TOKEN_OFFSET * tk
|
| 424 |
+
inst_store_resi_s.token = current_token
|
| 425 |
+
binary_instruction = inst_store_resi_s.to_binary(current_inst_cnt, 'STORE RESI_S')
|
| 426 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 427 |
+
current_inst_cnt += 1
|
| 428 |
+
# POST RRMS
|
| 429 |
+
inst_postrrms.token = current_token
|
| 430 |
+
binary_instruction = inst_postrrms.to_binary(current_inst_cnt, 'POST RRMS')
|
| 431 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 432 |
+
current_inst_cnt += 1
|
| 433 |
+
# LOAD POSTNORM weight
|
| 434 |
+
inst_load_post_wm.input_addr = POSTNORM_ADDR + POSTNORM_LAYER_OFFSET * l
|
| 435 |
+
inst_load_post_wm.token = current_token
|
| 436 |
+
binary_instruction = inst_load_post_wm.to_binary(current_inst_cnt, 'LOAD POSTNORM weight')
|
| 437 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 438 |
+
current_inst_cnt += 1
|
| 439 |
+
# LOAD POSTNORM weight scale
|
| 440 |
+
inst_load_post_ws.input_addr = POSTNORMS_ADDR + POSTNORMS_LAYER_OFFSET * l
|
| 441 |
+
inst_load_post_ws.token = current_token
|
| 442 |
+
binary_instruction = inst_load_post_ws.to_binary(current_inst_cnt, 'LOAD POSTNORM weight scale')
|
| 443 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 444 |
+
current_inst_cnt += 1
|
| 445 |
+
# POST RMSNORM
|
| 446 |
+
inst_postrmsnorm.token = current_token
|
| 447 |
+
binary_instruction = inst_postrmsnorm.to_binary(current_inst_cnt, 'POST RMSNORM')
|
| 448 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 449 |
+
current_inst_cnt += 1
|
| 450 |
+
|
| 451 |
+
if SIM_CMODE == 'mxint8':
|
| 452 |
+
# MLP_WM W1
|
| 453 |
+
inst_mlp_wm_w1.input_addr = W1_BASE_ADDR + W1_LAYER_OFFSET * l
|
| 454 |
+
inst_mlp_wm_w1.scale_addr = W1S_BASE_ADDR + W1S_LAYER_OFFSET * l
|
| 455 |
+
inst_mlp_wm_w1.token = current_token
|
| 456 |
+
binary_instruction = inst_mlp_wm_w1.to_binary(current_inst_cnt, 'MLP_WM W1')
|
| 457 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 458 |
+
current_inst_cnt += 1
|
| 459 |
+
# MLP_WM W3
|
| 460 |
+
inst_mlp_wm_w3.input_addr = W3_BASE_ADDR + W3_LAYER_OFFSET * l
|
| 461 |
+
inst_mlp_wm_w3.scale_addr = W3S_BASE_ADDR + W3S_LAYER_OFFSET * l
|
| 462 |
+
inst_mlp_wm_w3.token = current_token
|
| 463 |
+
binary_instruction = inst_mlp_wm_w3.to_binary(current_inst_cnt, 'MLP_WM W3')
|
| 464 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 465 |
+
current_inst_cnt += 1
|
| 466 |
+
else:
|
| 467 |
+
# MLP_WM W1
|
| 468 |
+
inst_mlp_wm_w1_mxint4.input_addr = W1_BASE_ADDR + (W1_LAYER_OFFSET//2) * l
|
| 469 |
+
inst_mlp_wm_w1_mxint4.scale_addr = W1S_BASE_ADDR + W1S_LAYER_OFFSET * l
|
| 470 |
+
inst_mlp_wm_w1_mxint4.token = current_token
|
| 471 |
+
binary_instruction = inst_mlp_wm_w1_mxint4.to_binary(current_inst_cnt, 'MLP_WM W1')
|
| 472 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 473 |
+
current_inst_cnt += 1
|
| 474 |
+
# MLP_WM W3
|
| 475 |
+
inst_mlp_wm_w3_mxint4.input_addr = W3_BASE_ADDR + (W3_LAYER_OFFSET//2) * l
|
| 476 |
+
inst_mlp_wm_w3_mxint4.scale_addr = W3S_BASE_ADDR + W3S_LAYER_OFFSET * l
|
| 477 |
+
inst_mlp_wm_w3_mxint4.token = current_token
|
| 478 |
+
binary_instruction = inst_mlp_wm_w3_mxint4.to_binary(current_inst_cnt, 'MLP_WM W3')
|
| 479 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 480 |
+
current_inst_cnt += 1
|
| 481 |
+
|
| 482 |
+
# GATE
|
| 483 |
+
inst_gate.token = current_token
|
| 484 |
+
binary_instruction = inst_gate.to_binary(current_inst_cnt, 'GATE')
|
| 485 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 486 |
+
current_inst_cnt += 1
|
| 487 |
+
|
| 488 |
+
if SIM_CMODE == 'mxint8':
|
| 489 |
+
# MLP_WM W2
|
| 490 |
+
inst_mlp_wm_w2.input_addr = W2_BASE_ADDR + W2_LAYER_OFFSET * l
|
| 491 |
+
inst_mlp_wm_w2.scale_addr = W2S_BASE_ADDR + W2S_LAYER_OFFSET * l
|
| 492 |
+
inst_mlp_wm_w2.token = current_token
|
| 493 |
+
binary_instruction = inst_mlp_wm_w2.to_binary(current_inst_cnt, 'MLP_WM W2')
|
| 494 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 495 |
+
current_inst_cnt += 1
|
| 496 |
+
else:
|
| 497 |
+
# MLP_WM W2
|
| 498 |
+
inst_mlp_wm_w2_mxint4.input_addr = W2_BASE_ADDR + (W2_LAYER_OFFSET//2) * l
|
| 499 |
+
inst_mlp_wm_w2_mxint4.scale_addr = W2S_BASE_ADDR + W2S_LAYER_OFFSET * l
|
| 500 |
+
inst_mlp_wm_w2_mxint4.token = current_token
|
| 501 |
+
binary_instruction = inst_mlp_wm_w2_mxint4.to_binary(current_inst_cnt, 'MLP_WM W2')
|
| 502 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 503 |
+
current_inst_cnt += 1
|
| 504 |
+
|
| 505 |
+
# LOAD RESI
|
| 506 |
+
inst_load_resi.input_addr = RESI_BASE_ADDR + RESI_TOKEN_OFFSET * tk
|
| 507 |
+
inst_load_resi.token = current_token
|
| 508 |
+
binary_instruction = inst_load_resi.to_binary(current_inst_cnt, 'LOAD RESI')
|
| 509 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 510 |
+
current_inst_cnt += 1
|
| 511 |
+
# LOAD RESI_S
|
| 512 |
+
inst_load_resi_s.input_addr = RESIS_BASE_ADDR + RESIS_TOKEN_OFFSET * tk
|
| 513 |
+
inst_load_resi_s.token = current_token
|
| 514 |
+
binary_instruction = inst_load_resi_s.to_binary(current_inst_cnt, 'LOAD RESI_S')
|
| 515 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 516 |
+
current_inst_cnt += 1
|
| 517 |
+
# RESIDUAL
|
| 518 |
+
inst_residual.token = current_token
|
| 519 |
+
binary_instruction = inst_residual.to_binary(current_inst_cnt, 'RESIDUAL')
|
| 520 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 521 |
+
current_inst_cnt += 1
|
| 522 |
+
# STORE RESI
|
| 523 |
+
inst_store_resi.output_addr = RESI_BASE_ADDR + RESI_TOKEN_OFFSET * tk
|
| 524 |
+
inst_store_resi.token = current_token
|
| 525 |
+
binary_instruction = inst_store_resi.to_binary(current_inst_cnt, 'STORE RESI')
|
| 526 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 527 |
+
current_inst_cnt += 1
|
| 528 |
+
# STORE RESI_S
|
| 529 |
+
inst_store_resi_s.output_addr = RESIS_BASE_ADDR + RESIS_TOKEN_OFFSET * tk
|
| 530 |
+
inst_store_resi_s.token = current_token
|
| 531 |
+
binary_instruction = inst_store_resi_s.to_binary(current_inst_cnt, 'STORE RESI_S')
|
| 532 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 533 |
+
current_inst_cnt += 1
|
| 534 |
+
# END OF LAYER
|
| 535 |
+
|
| 536 |
+
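# NOTE: disabled variant of the condition below; it additionally required the last token (tk == SIM_NUM_TOKEN-1):
# if tk == SIM_NUM_TOKEN-1 and SIM_NUM_LAYER == 32 and SIM_LLM_HEAD: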
|
| 537 |
+
if SIM_NUM_LAYER == 32 and SIM_LLM_HEAD:
|
| 538 |
+
# OUT RRMS
|
| 539 |
+
inst_outrrms.token = current_token
|
| 540 |
+
binary_instruction = inst_outrrms.to_binary(current_inst_cnt, 'OUT RRMS')
|
| 541 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 542 |
+
current_inst_cnt += 1
|
| 543 |
+
# LOAD OUTNORM weight
|
| 544 |
+
inst_load_out_wm.input_addr = OUTNORM_ADDR
|
| 545 |
+
inst_load_out_wm.token = current_token
|
| 546 |
+
binary_instruction = inst_load_out_wm.to_binary(current_inst_cnt, 'LOAD OUTNORM weight')
|
| 547 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 548 |
+
current_inst_cnt += 1
|
| 549 |
+
# LOAD OUTNORM weight scale
|
| 550 |
+
inst_load_out_ws.input_addr = OUTNORMS_ADDR
|
| 551 |
+
inst_load_out_ws.token = current_token
|
| 552 |
+
binary_instruction = inst_load_out_ws.to_binary(current_inst_cnt, 'LOAD OUTNORM weight scale')
|
| 553 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 554 |
+
current_inst_cnt += 1
|
| 555 |
+
# OUT RMSNORM
|
| 556 |
+
inst_outrmsnorm.token = current_token
|
| 557 |
+
binary_instruction = inst_outrmsnorm.to_binary(current_inst_cnt, 'OUT RMSNORM')
|
| 558 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 559 |
+
current_inst_cnt += 1
|
| 560 |
+
|
| 561 |
+
for it in range(8):
|
| 562 |
+
if SIM_CMODE == 'mxint8':
|
| 563 |
+
# MLP_WM Whead
|
| 564 |
+
inst_mlp_wm_whead.input_addr = WHEAD_BASE_ADDR + WHEAD_LAYER_OFFSET * it
|
| 565 |
+
inst_mlp_wm_whead.scale_addr = WHEADS_BASE_ADDR + WHEADS_LAYER_OFFSET * it
|
| 566 |
+
inst_mlp_wm_whead.token = current_token
|
| 567 |
+
binary_instruction = inst_mlp_wm_whead.to_binary(current_inst_cnt, 'MLP_WM Whead')
|
| 568 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 569 |
+
current_inst_cnt += 1
|
| 570 |
+
else:
|
| 571 |
+
# MLP_WM Whead
|
| 572 |
+
inst_mlp_wm_whead_mxint4.input_addr = WHEAD_BASE_ADDR + (WHEAD_LAYER_OFFSET//2) * it
|
| 573 |
+
inst_mlp_wm_whead_mxint4.scale_addr = WHEADS_BASE_ADDR + WHEADS_LAYER_OFFSET * it
|
| 574 |
+
inst_mlp_wm_whead_mxint4.token = current_token
|
| 575 |
+
binary_instruction = inst_mlp_wm_whead_mxint4.to_binary(current_inst_cnt, 'MLP_WM Whead')
|
| 576 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 577 |
+
current_inst_cnt += 1
|
| 578 |
+
|
| 579 |
+
# STORE HEAD
|
| 580 |
+
inst_store_head.output_addr = HEAD_OUT_BASE_ADDR + HEAD_OUT_LAYER_OFFSET * it
|
| 581 |
+
inst_store_head.token = current_token
|
| 582 |
+
binary_instruction = inst_store_head.to_binary(current_inst_cnt, 'STORE HEAD')
|
| 583 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 584 |
+
current_inst_cnt += 1
|
| 585 |
+
# STORE HEAD_S
|
| 586 |
+
inst_store_head_s.output_addr = HEADS_OUT_BASE_ADDR + HEADS_OUT_LAYER_OFFSET * it
|
| 587 |
+
inst_store_head_s.token = current_token
|
| 588 |
+
binary_instruction = inst_store_head_s.to_binary(current_inst_cnt, 'STORE HEAD_S')
|
| 589 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 590 |
+
current_inst_cnt += 1
|
| 591 |
+
|
| 592 |
+
|
| 593 |
+
# Adjust parameters
|
| 594 |
+
tmp_output_dim += 1
|
| 595 |
+
current_token += 1
|
| 596 |
+
# END OF TOKEN
|
| 597 |
+
|
| 598 |
+
# end instruction NOP
|
| 599 |
+
binary_instruction = inst_nop.to_binary(current_inst_cnt, 'End instruction NOP')
|
| 600 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 601 |
+
current_inst_cnt += 1
|
| 602 |
+
# END OF SIMULATION
|
| 603 |
+
print('INFO: Total Instruction Count: {}'.format(current_inst_cnt))
|
| 604 |
+
|
| 605 |
+
elif TEST_OP_GROUP == 'test_bw':
|
| 606 |
+
current_inst_cnt = 0
|
| 607 |
+
with open("../instruction/instruction_test_bw.bin", "wb") as f:
|
| 608 |
+
for i in range(SIM_NUM_TOKEN):
|
| 609 |
+
binary_instruction = inst_test_bw.to_binary(current_inst_cnt, 'TEST BW')
|
| 610 |
+
f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
|
| 611 |
+
current_inst_cnt += 1
|
| 612 |
+
print('INFO: Total Instruction Count: {}'.format(current_inst_cnt))
|
instruction/instruction_1024T_32L_no_write_back_logit_everyT_mxint4.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4227f9d80b5024e2f0ca32b283513ed47fe752f4c7c21104eb0dca6228e570b4
|
| 3 |
+
size 75628608
|
instruction/instruction_1024T_32L_no_write_back_logit_everyT_mxint8.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e1e0e514eca48728cf2a4c5d5fa6d8bb09299f1fa701c61438dd47ce492f9788
|
| 3 |
+
size 75628608
|
instruction/instruction_1024T_32L_write_back_logit_everyT_mxint4.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7baec5523f2c7558828f9fa2c1add85791613c1f721a1228af1c1d426e7f73f8
|
| 3 |
+
size 77463616
|
instruction/instruction_1024T_32L_write_back_logit_everyT_mxint8.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:beef5a09e1c473e54182fb81f7963fe7c83ff8a99275ecb47a0954f8ec154695
|
| 3 |
+
size 77463616
|