benchang1110 committed on
Commit
7e84e49
·
verified ·
1 Parent(s): 146b025

upload bin folder

Browse files
instruction/Makefile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# `clean` and `gen_inst` never produce files with those names, so declare them
# phony via the special target `.PHONY` (with the leading dot). Without the dot
# the line only defines an ordinary target called `PHONY`.
.PHONY: clean gen_inst

# SIM_LLM_HEAD: true or false.
# If true, generate LLM head instructions for FPGA simulation
# (if false, make sure the host machine has AVX512).
SIM_LLM_HEAD = true

# Generate the instruction binaries for both compute modes. Any value of
# SIM_LLM_HEAD other than `false` adds the --sim_llm_head flag.
gen_inst:
	python gen_instruction.py --sim_cmode mxint4 $(if $(filter-out false,$(SIM_LLM_HEAD)),--sim_llm_head)
	python gen_instruction.py --sim_cmode mxint8 $(if $(filter-out false,$(SIM_LLM_HEAD)),--sim_llm_head)

# Remove every generated binary in this directory.
clean:
	rm -f *.bin

instruction/gen_instruction.py ADDED
@@ -0,0 +1,612 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
# operation groups: every operation owns exactly one bit
NOP         = 1 << 0
TEST_BW     = 1 << 1
LOAD        = 1 << 2
STORE       = 1 << 3
MLP_WM      = 1 << 4
MLP_QKT     = 1 << 5
QKT_M_RSQRT = 1 << 6
MLP_HP      = 1 << 7
GATE        = 1 << 8
RRMS        = 1 << 9
RMSNORM     = 1 << 10
SOFTMAX     = 1 << 11
RESIDUAL    = 1 << 12
ROPE        = 1 << 13
# cmode groups
MX_INT8 = 0
MX_INT4 = 1
# stage groups
SUM = 0
GEN = 1
# nonlinear groups
NO_ACT = 0
SILU   = 1
# load targets groups: consecutive codes
NULL     = 0
ACT      = 1
ACT_S    = 2
RESI     = 3
RESI_S   = 4
WEIGHT_S = 5
# write back groups: every mode owns exactly one bit
NO_WB     = 1 << 0
WB        = 1 << 1
WB_KV     = 1 << 2
WB_KV_S   = 1 << 3
WB_ACT    = 1 << 4
WB_ACT_S  = 1 << 5
WB_RESI   = 1 << 6
WB_RESI_S = 1 << 7
43
+
44
# address space
# Naming: *_BASE_ADDR / *_ADDR is where a region starts, *_LAYER_OFFSET is the
# step the generator multiplies by the layer index, *_TOKEN_OFFSET the step it
# multiplies by the token index.
WQ_BASE_ADDR = 0x00000000
WQ_LAYER_OFFSET = 0x00100000
WK_BASE_ADDR = 0x02000000
WK_LAYER_OFFSET = 0x00100000
WV_BASE_ADDR = 0x04000000
WV_LAYER_OFFSET = 0x00100000
WO_BASE_ADDR = 0x06000000
WO_LAYER_OFFSET = 0x00100000
W1_BASE_ADDR = 0x08000000
W1_LAYER_OFFSET = 0x002B0000
W3_BASE_ADDR = 0x0D600000
W3_LAYER_OFFSET = 0x002B0000
W2_BASE_ADDR = 0x12C00000
W2_LAYER_OFFSET = 0x002B0000
WQS_BASE_ADDR = 0x18200000
WQS_LAYER_OFFSET = 0x00008000
WKS_BASE_ADDR = 0x18300000
WKS_LAYER_OFFSET = 0x00008000
WVS_BASE_ADDR = 0x18400000
WVS_LAYER_OFFSET = 0x00008000
WOS_BASE_ADDR = 0x18500000
WOS_LAYER_OFFSET = 0x00008000
W1S_BASE_ADDR = 0x18600000
W1S_LAYER_OFFSET = 0x00015800
W3S_BASE_ADDR = 0x188B0000
W3S_LAYER_OFFSET = 0x00015800
W2S_BASE_ADDR = 0x18B60000
W2S_LAYER_OFFSET = 0x00018000
KC_BASE_ADDR = 0x18E60000
KC_LAYER_OFFSET = 0x00080000
KC_TOKEN_OFFSET = 0x100
VC_BASE_ADDR = 0x19E60000
VC_LAYER_OFFSET = 0x00080000
VC_TOKEN_OFFSET = 0x100
KCS_BASE_ADDR = 0x1AE60000
KCS_LAYER_OFFSET = 0x00040000
KCS_TOKEN_OFFSET = 0x80
VCS_BASE_ADDR = 0x1B660000
VCS_LAYER_OFFSET = 0x00040000
VCS_TOKEN_OFFSET = 0x80
ACT_BASE_ADDR = 0x1BE60000
ACT_TOKEN_OFFSET = 0x1000
RESI_BASE_ADDR = 0x1C660000
RESI_TOKEN_OFFSET = 0x1000
ACTS_BASE_ADDR = 0x1CE60000
ACTS_TOKEN_OFFSET = 0x80
RESIS_BASE_ADDR = 0x1CEA0000
RESIS_TOKEN_OFFSET = 0x80
PRENORM_ADDR = 0x1CEE0000
PRENORM_LAYER_OFFSET = 0x1000
POSTNORM_ADDR = 0x1CF00000
POSTNORM_LAYER_OFFSET = 0x1000
ROPE_BASE_ADDR = 0x1CF20000
ROPE_TOKEN_OFFSET = 0xC0
PRENORMS_ADDR = 0x1CFE0000
PRENORMS_LAYER_OFFSET = 0x80
POSTNORMS_ADDR = 0x1CFE1000
POSTNORMS_LAYER_OFFSET = 0x80
OUTNORM_ADDR = 0x1D840000
OUTNORMS_ADDR = 0x1D841000
WHEAD_BASE_ADDR = 0x1D000000
WHEAD_LAYER_OFFSET = 0x100000
WHEADS_BASE_ADDR = 0x1D800000
WHEADS_LAYER_OFFSET = 0x8000
HEAD_OUT_BASE_ADDR = 0x1E000000
HEAD_OUT_LAYER_OFFSET = 0x1000
HEADS_OUT_BASE_ADDR = 0x1E008000
HEADS_OUT_LAYER_OFFSET = 0x80
78
+
79
class Instruction:
    """Mutable record of instruction fields that serialises to a 512-bit string.

    Every constructor argument is stored under an attribute of the same name,
    so callers can patch individual fields (for example an address or the
    token index) before each call to ``to_binary``.
    """

    # Width of the emitted word in bits; shorter payloads are zero-padded.
    _WORD_BITS = 512

    # (attribute name, width in bits), most-significant field first.
    # The widths add up to 264 bits.
    _FIELD_WIDTHS = (
        ("op", 14),
        ("dq_en", 1),
        ("stage", 1),
        ("token", 11),
        ("load_target", 3),
        ("cmode", 1),
        ("nonlinear", 1),
        ("write_back", 8),
        ("input_dim", 16),
        ("output_dim", 16),
        ("input_addr", 32),
        ("scale_addr", 32),
        ("output_addr", 32),
        ("layer_offset", 32),
        ("token_offset", 32),
        ("num_cb_ws", 16),
        ("num_cb_wm", 16),
    )

    def __init__(self,
                 op,
                 dq_en,
                 stage,
                 token,
                 load_target,
                 cmode,
                 nonlinear,
                 write_back,
                 input_dim,
                 output_dim,
                 input_addr,
                 scale_addr,
                 output_addr,
                 layer_offset,
                 token_offset,
                 num_cb_ws,
                 num_cb_wm
                 ):
        """Store each field value unchanged; range checks happen in ``to_binary``."""
        self.op = op
        self.dq_en = dq_en
        self.stage = stage
        self.token = token
        self.load_target = load_target
        self.cmode = cmode
        self.nonlinear = nonlinear
        self.write_back = write_back
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.input_addr = input_addr
        self.scale_addr = scale_addr
        self.output_addr = output_addr
        self.layer_offset = layer_offset
        self.token_offset = token_offset
        self.num_cb_ws = num_cb_ws
        self.num_cb_wm = num_cb_wm

    def to_binary(self, inst_num: int, inst_info: str) -> str:
        """Return the instruction as a string of exactly 512 '0'/'1' characters.

        Fields are concatenated in ``_FIELD_WIDTHS`` order, each zero-padded to
        its own width, and the result is left-padded with zeros to 512 bits.

        Args:
            inst_num: Running instruction index; only used in the log line.
            inst_info: Short label for the instruction; only used in the log line.

        Returns:
            The 512-character bit string, most-significant bit first.

        Raises:
            ValueError: If a field is negative or does not fit its bit width.
                Without this check an oversized field would silently lengthen
                the string and shift every following field.
        """
        print('INFO: {:30s} , Instruction id: {}'.format(inst_info, inst_num))
        chunks = []
        for name, width in self._FIELD_WIDTHS:
            value = getattr(self, name)
            if not 0 <= value < (1 << width):
                raise ValueError(
                    "Instruction field '{}' = {} does not fit in {} unsigned bit(s)".format(
                        name, value, width
                    )
                )
            chunks.append(format(value, "0{}b".format(width)))
        # Left-pad so the fields occupy the least-significant bits of the word.
        return "".join(chunks).rjust(self._WORD_BITS, "0")
130
+
131
def gen_inst(op, dq_en, stage, token, load_target, cmode, nonlinear, write_back, input_dim, output_dim, input_addr, scale_addr, output_addr, layer_offset, token_offset, num_cb_ws, num_cb_wm):
    """Create an ``Instruction`` whose fields are the given values, one-to-one."""
    return Instruction(
        op=op,
        dq_en=dq_en,
        stage=stage,
        token=token,
        load_target=load_target,
        cmode=cmode,
        nonlinear=nonlinear,
        write_back=write_back,
        input_dim=input_dim,
        output_dim=output_dim,
        input_addr=input_addr,
        scale_addr=scale_addr,
        output_addr=output_addr,
        layer_offset=layer_offset,
        token_offset=token_offset,
        num_cb_ws=num_cb_ws,
        num_cb_wm=num_cb_wm,
    )
133
# ======================================================================
# Instruction templates
#
# Argument order of every gen_inst call below:
#   op, dq_en, stage, token, load_target, cmode, nonlinear, write_back,
#   input_dim, output_dim, input_addr, scale_addr, output_addr,
#   layer_offset, token_offset, num_cb_ws, num_cb_wm
# ======================================================================
# Test bandwidth
# NOTE(review): inst_test_bw is bound again in the MXINT8 section (stage GEN),
# so this SUM-stage object is replaced before anything visible uses it.
inst_test_bw = gen_inst(TEST_BW, 0, SUM, 0, NULL, MX_INT8, NO_ACT, NO_WB, 0, 0, 0, 0, 0, 0, 0, 0, 0)
# ======================================================================
# MXINT8 Instruction templates
# ======================================================================
inst_nop = gen_inst(NOP, 0, SUM, 0, NULL, MX_INT8, NO_ACT, NO_WB, 0, 0, 0, 0, 0, 0, 0, 0, 0)
inst_test_bw = gen_inst(TEST_BW, 0, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 0, 0, 0, 0, 0, 0, 0, 0, 0)
inst_load_resi = gen_inst(LOAD, 0, GEN, 0, RESI, MX_INT8, NO_ACT, NO_WB, 4096, 0, RESI_BASE_ADDR, 0, 0, 0, RESI_TOKEN_OFFSET, 0, 0)
inst_load_in_act = gen_inst(LOAD, 0, GEN, 0, ACT, MX_INT8, NO_ACT, NO_WB, 4096, 0, ACT_BASE_ADDR, 0, 0, 0, ACT_TOKEN_OFFSET, 0, 0)
inst_load_resi_s = gen_inst(LOAD, 0, GEN, 0, RESI_S, MX_INT8, NO_ACT, NO_WB, 128, 0, RESIS_BASE_ADDR, 0, 0, 0, RESIS_TOKEN_OFFSET, 0, 0)
inst_load_in_act_s = gen_inst(LOAD, 0, GEN, 0, ACT_S, MX_INT8, NO_ACT, NO_WB, 128, 0, ACTS_BASE_ADDR, 0, 0, 0, ACTS_TOKEN_OFFSET, 0, 0)
inst_mlp_wm_q = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, NO_ACT, NO_WB, 4096, 4096, WQ_BASE_ADDR, WQS_BASE_ADDR, 0, WQ_LAYER_OFFSET, 0, 128, 4096)
inst_mlp_wm_k = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, NO_ACT, NO_WB, 4096, 4096, WK_BASE_ADDR, WKS_BASE_ADDR, 0, WK_LAYER_OFFSET, 0, 128, 4096)
inst_mlp_wm_v = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, NO_ACT, WB, 4096, 4096, WV_BASE_ADDR, WVS_BASE_ADDR, 0, WV_LAYER_OFFSET, 0, 128, 4096)
inst_mlp_wm_o = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, NO_ACT, NO_WB, 4096, 4096, WO_BASE_ADDR, WOS_BASE_ADDR, 0, WO_LAYER_OFFSET, 0, 128, 4096)
inst_mlp_wm_w1 = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, SILU, NO_WB, 4096, 11008, W1_BASE_ADDR, W1S_BASE_ADDR, 0, W1_LAYER_OFFSET, 0, 32, 1024)
inst_mlp_wm_w3 = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, NO_ACT, NO_WB, 4096, 11008, W3_BASE_ADDR, W3S_BASE_ADDR, 0, W3_LAYER_OFFSET, 0, 32, 1024)
inst_mlp_wm_w2 = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, NO_ACT, NO_WB, 11008, 4096, W2_BASE_ADDR, W2S_BASE_ADDR, 0, W2_LAYER_OFFSET, 0, 24, 688)
inst_gate = gen_inst(GATE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 11008, 11008, 0, 0, 0, 0, 0, 0, 0)
inst_residual = gen_inst(RESIDUAL, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB, 4096, 4096, 0, 0, 0, 0, 0, 0, 0)
inst_store_act = gen_inst(STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_ACT, 4096, 4096, 0, 0, ACT_BASE_ADDR, 0, ACT_TOKEN_OFFSET, 0, 0)
inst_store_act_s = gen_inst(STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_ACT_S, 4096, 4096, 0, 0, ACTS_BASE_ADDR, 0, ACTS_TOKEN_OFFSET, 0, 0)
# NOTE(review): the two RESI stores below use WB_ACT / WB_ACT_S while WB_RESI /
# WB_RESI_S are defined but not referenced here -- confirm this is intended.
inst_store_resi = gen_inst(STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_ACT, 4096, 4096, 0, 0, RESI_BASE_ADDR, 0, RESI_TOKEN_OFFSET, 0, 0)
inst_store_resi_s = gen_inst(STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_ACT_S, 4096, 4096, 0, 0, RESIS_BASE_ADDR, 0, RESIS_TOKEN_OFFSET, 0, 0)
inst_rope_nwb = gen_inst(ROPE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 192, 4096, ROPE_BASE_ADDR, 0, 0, 0, ROPE_TOKEN_OFFSET, 0, 0)
inst_rope_wb = gen_inst(ROPE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB, 192, 4096, ROPE_BASE_ADDR, 0, 0, 0, ROPE_TOKEN_OFFSET, 0, 0)
inst_store_k = gen_inst(STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_KV, 4096, 4096, 0, 0, KC_BASE_ADDR, 0, KC_TOKEN_OFFSET, 0, 0)
inst_store_k_s = gen_inst(STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_KV_S, 4096, 4096, 0, 0, KCS_BASE_ADDR, 0, KCS_TOKEN_OFFSET, 0, 0)
inst_store_v = gen_inst(STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_KV, 4096, 4096, 0, 0, VC_BASE_ADDR, 0, VC_TOKEN_OFFSET, 0, 0)
inst_store_v_s = gen_inst(STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_KV_S, 4096, 4096, 0, 0, VCS_BASE_ADDR, 0, VCS_TOKEN_OFFSET, 0, 0)
inst_mlp_qkt = gen_inst(MLP_QKT, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4, 1, KC_BASE_ADDR, KCS_BASE_ADDR, 0, 0, KCS_TOKEN_OFFSET, 4, 32)
inst_qkt_m_rsqrt = gen_inst(QKT_M_RSQRT, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 1, 1, 0, 0, 0, 0, 0, 1, 512)
inst_softmax = gen_inst(SOFTMAX, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 0, 1, 0, 0, 0, 0, 0, 1, 0)
inst_mlp_hp = gen_inst(MLP_HP, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4, 4096, VC_BASE_ADDR, VCS_BASE_ADDR, 0, 0, VCS_TOKEN_OFFSET, 4, 32)
inst_prerrms = gen_inst(RRMS, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4096, 1, 0, 0, 0, 0, 0, 0, 128)
inst_load_pre_wm = gen_inst(LOAD, 0, GEN, 0, ACT, MX_INT8, NO_ACT, NO_WB, 4096, 0, PRENORM_ADDR, 0, 0, PRENORM_LAYER_OFFSET, 0, 0, 0)
inst_load_pre_ws = gen_inst(LOAD, 0, GEN, 0, ACT_S, MX_INT8, NO_ACT, NO_WB, 128, 0, PRENORMS_ADDR, 0, 0, PRENORMS_LAYER_OFFSET, 0, 0, 0)
inst_prermsnorm = gen_inst(RMSNORM, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4096, 4096, 0, 0, 0, 0, 0, 0, 128)
inst_postrrms = gen_inst(RRMS, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4096, 1, 0, 0, 0, 0, 0, 0, 128)
inst_load_post_wm = gen_inst(LOAD, 0, GEN, 0, ACT, MX_INT8, NO_ACT, NO_WB, 4096, 0, POSTNORM_ADDR, 0, 0, POSTNORM_LAYER_OFFSET, 0, 0, 0)
inst_load_post_ws = gen_inst(LOAD, 0, GEN, 0, ACT_S, MX_INT8, NO_ACT, NO_WB, 128, 0, POSTNORMS_ADDR, 0, 0, POSTNORMS_LAYER_OFFSET, 0, 0, 0)
inst_postrmsnorm = gen_inst(RMSNORM, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4096, 4096, 0, 0, 0, 0, 0, 0, 128)
# ======================================================================
# DECODER OUT Instruction templates
# ======================================================================
inst_outrrms = gen_inst(RRMS, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4096, 1, 0, 0, 0, 0, 0, 0, 128)
inst_load_out_wm = gen_inst(LOAD, 0, GEN, 0, ACT, MX_INT8, NO_ACT, NO_WB, 4096, 0, OUTNORM_ADDR, 0, 0, 0, 0, 0, 0)
inst_load_out_ws = gen_inst(LOAD, 0, GEN, 0, ACT_S, MX_INT8, NO_ACT, NO_WB, 128, 0, OUTNORMS_ADDR, 0, 0, 0, 0, 0, 0)
inst_outrmsnorm = gen_inst(RMSNORM, 1, GEN, 0, NULL, MX_INT8, NO_ACT, NO_WB, 4096, 4096, 0, 0, 0, 0, 0, 0, 128)
inst_mlp_wm_whead = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT8, NO_ACT, WB, 4096, 4096, WHEAD_BASE_ADDR, WHEADS_BASE_ADDR, 0, WHEAD_LAYER_OFFSET, 0, 128, 4096)
inst_store_head = gen_inst(STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_ACT, 4096, 4096, 0, 0, HEAD_OUT_BASE_ADDR, 0, HEAD_OUT_LAYER_OFFSET, 0, 0)
inst_store_head_s = gen_inst(STORE, 1, GEN, 0, NULL, MX_INT8, NO_ACT, WB_ACT_S, 4096, 4096, 0, 0, HEADS_OUT_BASE_ADDR, 0, HEADS_OUT_LAYER_OFFSET, 0, 0)
# ======================================================================
# MXINT4 Instruction templates
# (same operands as the MXINT8 MLP_WM templates, with cmode MX_INT4 and
#  num_cb_wm halved)
# ======================================================================
inst_mlp_wm_whead_mxint4 = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, NO_ACT, WB, 4096, 4096, WHEAD_BASE_ADDR, WHEADS_BASE_ADDR, 0, WHEAD_LAYER_OFFSET, 0, 128, 2048)
inst_mlp_wm_q_mxint4 = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, NO_ACT, NO_WB, 4096, 4096, WQ_BASE_ADDR, WQS_BASE_ADDR, 0, WQ_LAYER_OFFSET, 0, 128, 2048)
inst_mlp_wm_k_mxint4 = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, NO_ACT, NO_WB, 4096, 4096, WK_BASE_ADDR, WKS_BASE_ADDR, 0, WK_LAYER_OFFSET, 0, 128, 2048)
inst_mlp_wm_v_mxint4 = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, NO_ACT, WB, 4096, 4096, WV_BASE_ADDR, WVS_BASE_ADDR, 0, WV_LAYER_OFFSET, 0, 128, 2048)
inst_mlp_wm_o_mxint4 = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, NO_ACT, NO_WB, 4096, 4096, WO_BASE_ADDR, WOS_BASE_ADDR, 0, WO_LAYER_OFFSET, 0, 128, 2048)
inst_mlp_wm_w1_mxint4 = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, SILU, NO_WB, 4096, 11008, W1_BASE_ADDR, W1S_BASE_ADDR, 0, W1_LAYER_OFFSET, 0, 32, 512)
inst_mlp_wm_w3_mxint4 = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, NO_ACT, NO_WB, 4096, 11008, W3_BASE_ADDR, W3S_BASE_ADDR, 0, W3_LAYER_OFFSET, 0, 32, 512)
inst_mlp_wm_w2_mxint4 = gen_inst(MLP_WM, 1, GEN, 0, WEIGHT_S, MX_INT4, NO_ACT, NO_WB, 11008, 4096, W2_BASE_ADDR, W2S_BASE_ADDR, 0, W2_LAYER_OFFSET, 0, 24, 344)
# ======================================================================
# Instruction templates end
# ======================================================================
201
# gen instructions parameters
# Command-line options are parsed at module level; the SIM_* names below are
# what the generation script reads.
parser = argparse.ArgumentParser(
    description='Generate instruction binary file for simulation',
)
parser.add_argument(
    '--sim_cmode',
    type=str,
    default='mxint8',
    choices=['mxint8', 'mxint4'],
    help='Simulation compute mode',
)
parser.add_argument(
    '--sim_llm_head',
    action='store_true',
    help='if true, generate LLM head instructions for FPGA simulation',
)
args = parser.parse_args()

SIM_CMODE = args.sim_cmode
TEST_OP_GROUP = 'demo'
SIM_LLM_HEAD = args.sim_llm_head
# Inserted into the output file name: empty when the LLM head is generated,
# '_no' otherwise.
if SIM_LLM_HEAD:
    SIM_LOGIT_FLAG = ''
else:
    SIM_LOGIT_FLAG = '_no'
SIM_NUM_TOKEN = 1024
SIM_NUM_LAYER = 32
213
+
214
+ if __name__ == "__main__":
215
+ if TEST_OP_GROUP == 'demo':
216
+ current_token = 0
217
+ tmp_output_dim = 1
218
+ current_inst_cnt = 0
219
+ file_name = "instruction_{}T_32L{}_write_back_logit_everyT_{}.bin".format(SIM_NUM_TOKEN, SIM_LOGIT_FLAG, SIM_CMODE)
220
+
221
+ with open(file_name, "wb") as f:
222
+ for tk in range(SIM_NUM_TOKEN):
223
+ print("Gen {} th token instruction start".format(tk+1))
224
+ # LOAD IN_ACT
225
+ inst_load_in_act.input_addr = ACT_BASE_ADDR + ACT_TOKEN_OFFSET * tk
226
+ inst_load_in_act.token = current_token
227
+ binary_instruction = inst_load_in_act.to_binary(current_inst_cnt, 'LOAD IN_ACT')
228
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
229
+ current_inst_cnt += 1
230
+ # LOAD IN_ACT
231
+ inst_load_in_act_s.input_addr = ACTS_BASE_ADDR + ACTS_TOKEN_OFFSET * tk
232
+ inst_load_in_act_s.token = current_token
233
+ binary_instruction = inst_load_in_act_s.to_binary(current_inst_cnt, 'LOAD IN_ACT_S')
234
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
235
+ current_inst_cnt += 1
236
+
237
+ for l in range(SIM_NUM_LAYER):
238
+ print("Gen {} th token, {} th layer instruction".format(tk+1, l+1))
239
+ # RRMS
240
+ inst_prerrms.token = current_token
241
+ binary_instruction = inst_prerrms.to_binary(current_inst_cnt, 'PRE RRMS')
242
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
243
+ current_inst_cnt += 1
244
+ # LOAD NORM weight
245
+ inst_load_pre_wm.input_addr = PRENORM_ADDR + PRENORM_LAYER_OFFSET * l
246
+ inst_load_pre_wm.token = current_token
247
+ binary_instruction = inst_load_pre_wm.to_binary(current_inst_cnt, 'LOAD PRENORM weight')
248
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
249
+ current_inst_cnt += 1
250
+ # LOAD NORM weight scale
251
+ inst_load_pre_ws.input_addr = PRENORMS_ADDR + PRENORMS_LAYER_OFFSET * l
252
+ inst_load_pre_ws.token = current_token
253
+ binary_instruction = inst_load_pre_ws.to_binary(current_inst_cnt, 'LOAD PRENORM weight scale')
254
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
255
+ current_inst_cnt += 1
256
+ # RMSNORM
257
+ inst_prermsnorm.token = current_token
258
+ binary_instruction = inst_prermsnorm.to_binary(current_inst_cnt, 'PRE RMSNORM')
259
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
260
+ current_inst_cnt += 1
261
+
262
+ if SIM_CMODE == 'mxint8':
263
+ # MLP_WM Wv
264
+ inst_mlp_wm_v.input_addr = WV_BASE_ADDR + WV_LAYER_OFFSET * l
265
+ inst_mlp_wm_v.scale_addr = WVS_BASE_ADDR + WVS_LAYER_OFFSET * l
266
+ inst_mlp_wm_v.token = current_token
267
+ binary_instruction = inst_mlp_wm_v.to_binary(current_inst_cnt, 'MLP_WM Wv')
268
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
269
+ current_inst_cnt += 1
270
+ else:
271
+ # MLP_WM Wv
272
+ inst_mlp_wm_v_mxint4.input_addr = WV_BASE_ADDR + (WV_LAYER_OFFSET//2) * l
273
+ inst_mlp_wm_v_mxint4.scale_addr = WVS_BASE_ADDR + WVS_LAYER_OFFSET * l
274
+ inst_mlp_wm_v_mxint4.token = current_token
275
+ binary_instruction = inst_mlp_wm_v_mxint4.to_binary(current_inst_cnt, 'MLP_WM Wv')
276
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
277
+ current_inst_cnt += 1
278
+
279
+ # STORE V elem
280
+ inst_store_v.output_addr = VC_BASE_ADDR + VC_LAYER_OFFSET * l + VC_TOKEN_OFFSET * tk
281
+ inst_store_v.token = current_token
282
+ binary_instruction = inst_store_v.to_binary(current_inst_cnt, 'STORE V elem')
283
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
284
+ current_inst_cnt += 1
285
+ # STORE V scale
286
+ inst_store_v_s.output_addr = VCS_BASE_ADDR + VCS_LAYER_OFFSET * l + VCS_TOKEN_OFFSET * tk
287
+ inst_store_v_s.token = current_token
288
+ binary_instruction = inst_store_v_s.to_binary(current_inst_cnt, 'STORE V scale')
289
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
290
+ current_inst_cnt += 1
291
+
292
+ if SIM_CMODE == 'mxint8':
293
+ # MLP_WM Wk
294
+ inst_mlp_wm_k.input_addr = WK_BASE_ADDR + WK_LAYER_OFFSET * l
295
+ inst_mlp_wm_k.scale_addr = WKS_BASE_ADDR + WKS_LAYER_OFFSET * l
296
+ inst_mlp_wm_k.token = current_token
297
+ binary_instruction = inst_mlp_wm_k.to_binary(current_inst_cnt, 'MLP_WM Wk')
298
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
299
+ current_inst_cnt += 1
300
+ else:
301
+ # MLP_WM Wk
302
+ inst_mlp_wm_k_mxint4.input_addr = WK_BASE_ADDR + (WK_LAYER_OFFSET//2) * l
303
+ inst_mlp_wm_k_mxint4.scale_addr = WKS_BASE_ADDR + WKS_LAYER_OFFSET * l
304
+ inst_mlp_wm_k_mxint4.token = current_token
305
+ binary_instruction = inst_mlp_wm_k_mxint4.to_binary(current_inst_cnt, 'MLP_WM Wk')
306
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
307
+ current_inst_cnt += 1
308
+
309
+ # ROPE WB (K)
310
+ inst_rope_wb.input_addr = ROPE_BASE_ADDR + ROPE_TOKEN_OFFSET * tk
311
+ inst_rope_wb.token = current_token
312
+ binary_instruction = inst_rope_wb.to_binary(current_inst_cnt, 'ROPE WB (K)')
313
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
314
+ current_inst_cnt += 1
315
+ # STORE K elem
316
+ inst_store_k.output_addr = KC_BASE_ADDR + KC_LAYER_OFFSET * l + KC_TOKEN_OFFSET * tk
317
+ inst_store_k.token = current_token
318
+ binary_instruction = inst_store_k.to_binary(current_inst_cnt, 'STORE K elem')
319
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
320
+ current_inst_cnt += 1
321
+ # STORE K scale
322
+ inst_store_k_s.output_addr = KCS_BASE_ADDR + KCS_LAYER_OFFSET * l + KCS_TOKEN_OFFSET * tk
323
+ inst_store_k_s.token = current_token
324
+ binary_instruction = inst_store_k_s.to_binary(current_inst_cnt, 'STORE K scale')
325
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
326
+ current_inst_cnt += 1
327
+
328
+ if SIM_CMODE == 'mxint8':
329
+ # MLP_WM Wq
330
+ inst_mlp_wm_q.input_addr = WQ_BASE_ADDR + WQ_LAYER_OFFSET * l
331
+ inst_mlp_wm_q.scale_addr = WQS_BASE_ADDR + WQS_LAYER_OFFSET * l
332
+ inst_mlp_wm_q.token = current_token
333
+ binary_instruction = inst_mlp_wm_q.to_binary(current_inst_cnt, 'MLP_WM Wq')
334
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
335
+ current_inst_cnt += 1
336
+ else:
337
+ # MLP_WM Wq
338
+ inst_mlp_wm_q_mxint4.input_addr = WQ_BASE_ADDR + (WQ_LAYER_OFFSET//2) * l
339
+ inst_mlp_wm_q_mxint4.scale_addr = WQS_BASE_ADDR + WQS_LAYER_OFFSET * l
340
+ inst_mlp_wm_q_mxint4.token = current_token
341
+ binary_instruction = inst_mlp_wm_q_mxint4.to_binary(current_inst_cnt, 'MLP_WM Wq')
342
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
343
+ current_inst_cnt += 1
344
+
345
+ # ROPE NO_WB (Q)
346
+ inst_rope_nwb.input_addr = ROPE_BASE_ADDR + ROPE_TOKEN_OFFSET * tk
347
+ inst_rope_nwb.token = current_token
348
+ binary_instruction = inst_rope_nwb.to_binary(current_inst_cnt, 'ROPE NO_WB (Q)')
349
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
350
+ current_inst_cnt += 1
351
+ # MLP_QKT
352
+ inst_mlp_qkt.input_addr = KC_BASE_ADDR + KC_LAYER_OFFSET * l
353
+ inst_mlp_qkt.scale_addr = KCS_BASE_ADDR + KCS_LAYER_OFFSET * l
354
+ inst_mlp_qkt.output_dim = tmp_output_dim
355
+ inst_mlp_qkt.input_dim = tmp_output_dim * inst_mlp_qkt.num_cb_ws
356
+ inst_mlp_qkt.token = current_token
357
+ binary_instruction = inst_mlp_qkt.to_binary(current_inst_cnt, 'MLP_QKT')
358
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
359
+ current_inst_cnt += 1
360
+ # QKT_M_RSQRT
361
+ inst_qkt_m_rsqrt.output_dim = tmp_output_dim
362
+ inst_qkt_m_rsqrt.input_dim = tmp_output_dim * inst_qkt_m_rsqrt.num_cb_ws
363
+ inst_qkt_m_rsqrt.token = current_token
364
+ binary_instruction = inst_qkt_m_rsqrt.to_binary(current_inst_cnt, 'QKT_M_RSQRT')
365
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
366
+ current_inst_cnt += 1
367
+ # SOFTMAX
368
+ inst_softmax.output_dim = tmp_output_dim
369
+ inst_softmax.token = current_token
370
+ binary_instruction = inst_softmax.to_binary(current_inst_cnt, 'SOFTMAX')
371
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
372
+ current_inst_cnt += 1
373
+ # MLP_HP (SxV)
374
+ inst_mlp_hp.input_addr = VC_BASE_ADDR + VC_LAYER_OFFSET * l
375
+ inst_mlp_hp.scale_addr = VCS_BASE_ADDR + VCS_LAYER_OFFSET * l
376
+ inst_mlp_hp.input_dim = tmp_output_dim * inst_mlp_hp.num_cb_ws
377
+ inst_mlp_hp.token = current_token
378
+ binary_instruction = inst_mlp_hp.to_binary(current_inst_cnt, 'MLP_HP (SxV)')
379
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
380
+ current_inst_cnt += 1
381
+
382
+ if SIM_CMODE == 'mxint8':
383
+ # MLP_WM Wo
384
+ inst_mlp_wm_o.input_addr = WO_BASE_ADDR + WO_LAYER_OFFSET * l
385
+ inst_mlp_wm_o.scale_addr = WOS_BASE_ADDR + WOS_LAYER_OFFSET * l
386
+ inst_mlp_wm_o.token = current_token
387
+ binary_instruction = inst_mlp_wm_o.to_binary(current_inst_cnt, 'MLP_WM Wo')
388
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
389
+ current_inst_cnt += 1
390
+ else:
391
+ # MLP_WM Wo
392
+ inst_mlp_wm_o_mxint4.input_addr = WO_BASE_ADDR + (WO_LAYER_OFFSET//2) * l
393
+ inst_mlp_wm_o_mxint4.scale_addr = WOS_BASE_ADDR + WOS_LAYER_OFFSET * l
394
+ inst_mlp_wm_o_mxint4.token = current_token
395
+ binary_instruction = inst_mlp_wm_o_mxint4.to_binary(current_inst_cnt, 'MLP_WM Wo')
396
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
397
+ current_inst_cnt += 1
398
+
399
+ # LOAD RESI
400
+ inst_load_resi.input_addr = RESI_BASE_ADDR + RESI_TOKEN_OFFSET * tk
401
+ inst_load_resi.token = current_token
402
+ binary_instruction = inst_load_resi.to_binary(current_inst_cnt, 'LOAD RESI')
403
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
404
+ current_inst_cnt += 1
405
+ # LOAD RESI_S
406
+ inst_load_resi_s.input_addr = RESIS_BASE_ADDR + RESIS_TOKEN_OFFSET * tk
407
+ inst_load_resi_s.token = current_token
408
+ binary_instruction = inst_load_resi_s.to_binary(current_inst_cnt, 'LOAD RESI_S')
409
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
410
+ current_inst_cnt += 1
411
+ # RESIDUAL
412
+ inst_residual.token = current_token
413
+ binary_instruction = inst_residual.to_binary(current_inst_cnt, 'RESIDUAL')
414
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
415
+ current_inst_cnt += 1
416
+ # STORE RESI
417
+ inst_store_resi.output_addr = RESI_BASE_ADDR + RESI_TOKEN_OFFSET * tk
418
+ inst_store_resi.token = current_token
419
+ binary_instruction = inst_store_resi.to_binary(current_inst_cnt, 'STORE RESI')
420
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
421
+ current_inst_cnt += 1
422
+ # STORE RESI_S
423
+ inst_store_resi_s.output_addr = RESIS_BASE_ADDR + RESIS_TOKEN_OFFSET * tk
424
+ inst_store_resi_s.token = current_token
425
+ binary_instruction = inst_store_resi_s.to_binary(current_inst_cnt, 'STORE RESI_S')
426
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
427
+ current_inst_cnt += 1
428
+ # RRMS
429
+ inst_postrrms.token = current_token
430
+ binary_instruction = inst_postrrms.to_binary(current_inst_cnt, 'POST RRMS')
431
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
432
+ current_inst_cnt += 1
433
+ # LOAD NORM weight
434
+ inst_load_post_wm.input_addr = POSTNORM_ADDR + POSTNORM_LAYER_OFFSET * l
435
+ inst_load_post_wm.token = current_token
436
+ binary_instruction = inst_load_post_wm.to_binary(current_inst_cnt, 'LOAD POSTNORM weight')
437
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
438
+ current_inst_cnt += 1
439
+ # LOAD NORM weight scale
440
+ inst_load_post_ws.input_addr = POSTNORMS_ADDR + POSTNORMS_LAYER_OFFSET * l
441
+ inst_load_post_ws.token = current_token
442
+ binary_instruction = inst_load_post_ws.to_binary(current_inst_cnt, 'LOAD POSTNORM weight scale')
443
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
444
+ current_inst_cnt += 1
445
+ # RMSNORM
446
+ inst_postrmsnorm.token = current_token
447
+ binary_instruction = inst_postrmsnorm.to_binary(current_inst_cnt, 'POST RMSNORM')
448
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
449
+ current_inst_cnt += 1
450
+
451
+ if SIM_CMODE == 'mxint8':
452
+ # MLP_WM W1
453
+ inst_mlp_wm_w1.input_addr = W1_BASE_ADDR + W1_LAYER_OFFSET * l
454
+ inst_mlp_wm_w1.scale_addr = W1S_BASE_ADDR + W1S_LAYER_OFFSET * l
455
+ inst_mlp_wm_w1.token = current_token
456
+ binary_instruction = inst_mlp_wm_w1.to_binary(current_inst_cnt, 'MLP_WM W1')
457
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
458
+ current_inst_cnt += 1
459
+ # MLP_WM W3
460
+ inst_mlp_wm_w3.input_addr = W3_BASE_ADDR + W3_LAYER_OFFSET * l
461
+ inst_mlp_wm_w3.scale_addr = W3S_BASE_ADDR + W3S_LAYER_OFFSET * l
462
+ inst_mlp_wm_w3.token = current_token
463
+ binary_instruction = inst_mlp_wm_w3.to_binary(current_inst_cnt, 'MLP_WM W3')
464
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
465
+ current_inst_cnt += 1
466
+ else:
467
+ # MLP_WM W1
468
+ inst_mlp_wm_w1_mxint4.input_addr = W1_BASE_ADDR + (W1_LAYER_OFFSET//2) * l
469
+ inst_mlp_wm_w1_mxint4.scale_addr = W1S_BASE_ADDR + W1S_LAYER_OFFSET * l
470
+ inst_mlp_wm_w1_mxint4.token = current_token
471
+ binary_instruction = inst_mlp_wm_w1_mxint4.to_binary(current_inst_cnt, 'MLP_WM W1')
472
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
473
+ current_inst_cnt += 1
474
+ # MLP_WM W3
475
+ inst_mlp_wm_w3_mxint4.input_addr = W3_BASE_ADDR + (W3_LAYER_OFFSET//2) * l
476
+ inst_mlp_wm_w3_mxint4.scale_addr = W3S_BASE_ADDR + W3S_LAYER_OFFSET * l
477
+ inst_mlp_wm_w3_mxint4.token = current_token
478
+ binary_instruction = inst_mlp_wm_w3_mxint4.to_binary(current_inst_cnt, 'MLP_WM W3')
479
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
480
+ current_inst_cnt += 1
481
+
482
+ # GATE
483
+ inst_gate.token = current_token
484
+ binary_instruction = inst_gate.to_binary(current_inst_cnt, 'GATE')
485
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
486
+ current_inst_cnt += 1
487
+
488
+ if SIM_CMODE == 'mxint8':
489
+ # MLP_WM W2
490
+ inst_mlp_wm_w2.input_addr = W2_BASE_ADDR + W2_LAYER_OFFSET * l
491
+ inst_mlp_wm_w2.scale_addr = W2S_BASE_ADDR + W2S_LAYER_OFFSET * l
492
+ inst_mlp_wm_w2.token = current_token
493
+ binary_instruction = inst_mlp_wm_w2.to_binary(current_inst_cnt, 'MLP_WM W2')
494
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
495
+ current_inst_cnt += 1
496
+ else:
497
+ # MLP_WM W2
498
+ inst_mlp_wm_w2_mxint4.input_addr = W2_BASE_ADDR + (W2_LAYER_OFFSET//2) * l
499
+ inst_mlp_wm_w2_mxint4.scale_addr = W2S_BASE_ADDR + W2S_LAYER_OFFSET * l
500
+ inst_mlp_wm_w2_mxint4.token = current_token
501
+ binary_instruction = inst_mlp_wm_w2_mxint4.to_binary(current_inst_cnt, 'MLP_WM W2')
502
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
503
+ current_inst_cnt += 1
504
+
505
+ # LOAD RESI
506
+ inst_load_resi.input_addr = RESI_BASE_ADDR + RESI_TOKEN_OFFSET * tk
507
+ inst_load_resi.token = current_token
508
+ binary_instruction = inst_load_resi.to_binary(current_inst_cnt, 'LOAD RESI')
509
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
510
+ current_inst_cnt += 1
511
+ # LOAD RESI_S
512
+ inst_load_resi_s.input_addr = RESIS_BASE_ADDR + RESIS_TOKEN_OFFSET * tk
513
+ inst_load_resi_s.token = current_token
514
+ binary_instruction = inst_load_resi_s.to_binary(current_inst_cnt, 'LOAD RESI_S')
515
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
516
+ current_inst_cnt += 1
517
+ # RESIDUAL
518
+ inst_residual.token = current_token
519
+ binary_instruction = inst_residual.to_binary(current_inst_cnt, 'RESIDUAL')
520
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
521
+ current_inst_cnt += 1
522
+ # STORE RESI
523
+ inst_store_resi.output_addr = RESI_BASE_ADDR + RESI_TOKEN_OFFSET * tk
524
+ inst_store_resi.token = current_token
525
+ binary_instruction = inst_store_resi.to_binary(current_inst_cnt, 'STORE RESI')
526
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
527
+ current_inst_cnt += 1
528
+ # STORE RESI_S
529
+ inst_store_resi_s.output_addr = RESIS_BASE_ADDR + RESIS_TOKEN_OFFSET * tk
530
+ inst_store_resi_s.token = current_token
531
+ binary_instruction = inst_store_resi_s.to_binary(current_inst_cnt, 'STORE RESI_S')
532
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
533
+ current_inst_cnt += 1
534
+ # END OF LAYER
535
+
536
+ # Disabled alternative (adds a tk == SIM_NUM_TOKEN-1 check): if tk == SIM_NUM_TOKEN-1 and SIM_NUM_LAYER == 32 and SIM_LLM_HEAD:
537
+ if SIM_NUM_LAYER == 32 and SIM_LLM_HEAD:
538
+ # OUT RRMS
539
+ inst_outrrms.token = current_token
540
+ binary_instruction = inst_outrrms.to_binary(current_inst_cnt, 'OUT RRMS')
541
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
542
+ current_inst_cnt += 1
543
+ # LOAD OUTNORM weight
544
+ inst_load_out_wm.input_addr = OUTNORM_ADDR
545
+ inst_load_out_wm.token = current_token
546
+ binary_instruction = inst_load_out_wm.to_binary(current_inst_cnt, 'LOAD OUTNORM weight')
547
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
548
+ current_inst_cnt += 1
549
+ # LOAD OUTNORM weight scale
550
+ inst_load_out_ws.input_addr = OUTNORMS_ADDR
551
+ inst_load_out_ws.token = current_token
552
+ binary_instruction = inst_load_out_ws.to_binary(current_inst_cnt, 'LOAD OUTNORM weight scale')
553
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
554
+ current_inst_cnt += 1
555
+ # OUT RMSNORM
556
+ inst_outrmsnorm.token = current_token
557
+ binary_instruction = inst_outrmsnorm.to_binary(current_inst_cnt, 'OUT RMSNORM')
558
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
559
+ current_inst_cnt += 1
560
+
561
+ for it in range(8):
562
+ if SIM_CMODE == 'mxint8':
563
+ # MLP_WM Whead
564
+ inst_mlp_wm_whead.input_addr = WHEAD_BASE_ADDR + WHEAD_LAYER_OFFSET * it
565
+ inst_mlp_wm_whead.scale_addr = WHEADS_BASE_ADDR + WHEADS_LAYER_OFFSET * it
566
+ inst_mlp_wm_whead.token = current_token
567
+ binary_instruction = inst_mlp_wm_whead.to_binary(current_inst_cnt, 'MLP_WM Whead')
568
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
569
+ current_inst_cnt += 1
570
+ else:
571
+ # MLP_WM Whead
572
+ inst_mlp_wm_whead_mxint4.input_addr = WHEAD_BASE_ADDR + (WHEAD_LAYER_OFFSET//2) * it
573
+ inst_mlp_wm_whead_mxint4.scale_addr = WHEADS_BASE_ADDR + WHEADS_LAYER_OFFSET * it
574
+ inst_mlp_wm_whead_mxint4.token = current_token
575
+ binary_instruction = inst_mlp_wm_whead_mxint4.to_binary(current_inst_cnt, 'MLP_WM Whead')
576
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
577
+ current_inst_cnt += 1
578
+
579
+ # STORE HEAD
580
+ inst_store_head.output_addr = HEAD_OUT_BASE_ADDR + HEAD_OUT_LAYER_OFFSET * it
581
+ inst_store_head.token = current_token
582
+ binary_instruction = inst_store_head.to_binary(current_inst_cnt, 'STORE HEAD')
583
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
584
+ current_inst_cnt += 1
585
+ # STORE HEAD_S
586
+ inst_store_head_s.output_addr = HEADS_OUT_BASE_ADDR + HEADS_OUT_LAYER_OFFSET * it
587
+ inst_store_head_s.token = current_token
588
+ binary_instruction = inst_store_head_s.to_binary(current_inst_cnt, 'STORE HEAD_S')
589
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
590
+ current_inst_cnt += 1
591
+
592
+
593
+ # Adjust parameters
594
+ tmp_output_dim += 1
595
+ current_token += 1
596
+ # END OF TOKEN
597
+
598
+ # end instruction NOP
599
+ binary_instruction = inst_nop.to_binary(current_inst_cnt, 'End instruction NOP')
600
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
601
+ current_inst_cnt += 1
602
+ # END OF SIMULATION
603
+ print('INFO: Total Instruction Count: {}'.format(current_inst_cnt))
604
+
605
+ elif TEST_OP_GROUP == 'test_bw':  # 'test_bw' mode: the output file contains only TEST BW instructions
606
+ current_inst_cnt = 0  # running instruction index, passed to to_binary() and reported at the end
607
+ with open("../instruction/instruction_test_bw.bin", "wb") as f:  # relative path: resolved against the current working directory
608
+ for i in range(SIM_NUM_TOKEN):  # emits SIM_NUM_TOKEN instructions in total; the loop index i is not used
609
+ binary_instruction = inst_test_bw.to_binary(current_inst_cnt, 'TEST BW')  # result is consumed below as a string of '0'/'1' characters
610
+ f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))  # parse as base-2, write len/8 bytes little-endian; assumes the bit length is a multiple of 8 - TODO confirm
611
+ current_inst_cnt += 1  # one instruction written per iteration
612
+ print('INFO: Total Instruction Count: {}'.format(current_inst_cnt))  # reports how many instructions were emitted
instruction/instruction_1024T_32L_no_write_back_logit_everyT_mxint4.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4227f9d80b5024e2f0ca32b283513ed47fe752f4c7c21104eb0dca6228e570b4
3
+ size 75628608
instruction/instruction_1024T_32L_no_write_back_logit_everyT_mxint8.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1e0e514eca48728cf2a4c5d5fa6d8bb09299f1fa701c61438dd47ce492f9788
3
+ size 75628608
instruction/instruction_1024T_32L_write_back_logit_everyT_mxint4.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7baec5523f2c7558828f9fa2c1add85791613c1f721a1228af1c1d426e7f73f8
3
+ size 77463616
instruction/instruction_1024T_32L_write_back_logit_everyT_mxint8.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:beef5a09e1c473e54182fb81f7963fe7c83ff8a99275ecb47a0954f8ec154695
3
+ size 77463616