upload bin folder

Browse files

Files changed (6) hide show

instruction/Makefile +13 -0
instruction/gen_instruction.py +612 -0
instruction/instruction_1024T_32L_no_write_back_logit_everyT_mxint4.bin +3 -0
instruction/instruction_1024T_32L_no_write_back_logit_everyT_mxint8.bin +3 -0
instruction/instruction_1024T_32L_write_back_logit_everyT_mxint4.bin +3 -0
instruction/instruction_1024T_32L_write_back_logit_everyT_mxint8.bin +3 -0

instruction/Makefile ADDED Viewed

	@@ -0,0 +1,13 @@

+# gen_inst and clean are actions, not files. The special target needs its leading dot: a plain "PHONY" is an ordinary rule that would become the default goal and run clean (rm -f *.bin) before gen_inst on a bare `make`.
+.PHONY: clean gen_inst
+# true or false
+# if true, generate LLM head instructions for FPGA simulation (if false, make sure the host machine has AVX512)
+SIM_LLM_HEAD = true
+# Default goal: generate the mxint4 and mxint8 instruction binaries; --sim_llm_head is passed unless SIM_LLM_HEAD is "false" or empty.
+gen_inst:
+	python gen_instruction.py --sim_cmode mxint4 $(if $(filter-out false,$(SIM_LLM_HEAD)),--sim_llm_head)
+	python gen_instruction.py --sim_cmode mxint8 $(if $(filter-out false,$(SIM_LLM_HEAD)),--sim_llm_head)
+# Remove every .bin file in this directory.
+clean:
+	rm -f *.bin

instruction/gen_instruction.py ADDED Viewed

	@@ -0,0 +1,612 @@

+import argparse
+# operation groups: one bit set per operation; emitted as the 14-bit op field by Instruction.to_binary
+NOP         = 0b00000000000001
+TEST_BW     = 0b00000000000010
+LOAD        = 0b00000000000100
+STORE       = 0b00000000001000
+MLP_WM      = 0b00000000010000
+MLP_QKT     = 0b00000000100000
+QKT_M_RSQRT = 0b00000001000000
+MLP_HP      = 0b00000010000000
+GATE        = 0b00000100000000
+RRMS        = 0b00001000000000
+RMSNORM     = 0b00010000000000
+SOFTMAX     = 0b00100000000000
+RESIDUAL    = 0b01000000000000
+ROPE        = 0b10000000000000
+# cmode groups: values of the 1-bit cmode field
+MX_INT8     = 0b0
+MX_INT4     = 0b1
+# stage groups: values of the 1-bit stage field
+SUM         = 0b0
+GEN         = 0b1
+# nonlinear groups: values of the 1-bit nonlinear field
+NO_ACT      = 0b0
+SILU        = 0b1
+# load targets groups: values of the 3-bit load_target field
+NULL        = 0b000
+ACT         = 0b001
+ACT_S       = 0b010
+RESI        = 0b011
+RESI_S      = 0b100
+WEIGHT_S    = 0b101
+# write back groups: one bit set per mode; emitted as the 8-bit write_back field
+NO_WB       = 0b00000001
+WB          = 0b00000010
+WB_KV       = 0b00000100
+WB_KV_S     = 0b00001000
+WB_ACT      = 0b00010000
+WB_ACT_S    = 0b00100000
+WB_RESI     = 0b01000000
+WB_RESI_S   = 0b10000000
+# address space: each row unpacks a base address followed by its per-layer and/or per-token offset; the generation loop computes base + layer_offset * layer + token_offset * token
+WQ_BASE_ADDR    , WQ_LAYER_OFFSET                       = 0x00000000, 0x00100000
+WK_BASE_ADDR    , WK_LAYER_OFFSET                       = 0x02000000, 0x00100000
+WV_BASE_ADDR    , WV_LAYER_OFFSET                       = 0x04000000, 0x00100000
+WO_BASE_ADDR    , WO_LAYER_OFFSET                       = 0x06000000, 0x00100000
+W1_BASE_ADDR    , W1_LAYER_OFFSET                       = 0x08000000, 0x002B0000
+W3_BASE_ADDR    , W3_LAYER_OFFSET                       = 0x0D600000, 0x002B0000
+W2_BASE_ADDR    , W2_LAYER_OFFSET                       = 0x12C00000, 0x002B0000
+WQS_BASE_ADDR   , WQS_LAYER_OFFSET                      = 0x18200000, 0x00008000
+WKS_BASE_ADDR   , WKS_LAYER_OFFSET                      = 0x18300000, 0x00008000
+WVS_BASE_ADDR   , WVS_LAYER_OFFSET                      = 0x18400000, 0x00008000
+WOS_BASE_ADDR   , WOS_LAYER_OFFSET                      = 0x18500000, 0x00008000
+W1S_BASE_ADDR   , W1S_LAYER_OFFSET                      = 0x18600000, 0x0015800
+W3S_BASE_ADDR   , W3S_LAYER_OFFSET                      = 0x188B0000, 0x0015800
+W2S_BASE_ADDR   , W2S_LAYER_OFFSET                      = 0x18B60000, 0x0018000
+KC_BASE_ADDR    , KC_LAYER_OFFSET   , KC_TOKEN_OFFSET   = 0x18E60000, 0x0080000     , 0x100  # K/V rows carry both a per-layer and a per-token offset
+VC_BASE_ADDR    , VC_LAYER_OFFSET   , VC_TOKEN_OFFSET   = 0x19E60000, 0x0080000     , 0x100
+KCS_BASE_ADDR   , KCS_LAYER_OFFSET  , KCS_TOKEN_OFFSET  = 0x1AE60000, 0x0040000     , 0x80
+VCS_BASE_ADDR   , VCS_LAYER_OFFSET  , VCS_TOKEN_OFFSET  = 0x1B660000, 0x0040000     , 0x80
+ACT_BASE_ADDR   , ACT_TOKEN_OFFSET                      = 0x1BE60000, 0x1000
+RESI_BASE_ADDR  , RESI_TOKEN_OFFSET                     = 0x1C660000, 0x1000
+ACTS_BASE_ADDR  , ACTS_TOKEN_OFFSET                     = 0x1CE60000, 0x80
+RESIS_BASE_ADDR , RESIS_TOKEN_OFFSET                    = 0x1CEA0000, 0x80
+PRENORM_ADDR    , PRENORM_LAYER_OFFSET                  = 0x1CEE0000, 0x1000
+POSTNORM_ADDR   , POSTNORM_LAYER_OFFSET                 = 0x1CF00000, 0x1000
+ROPE_BASE_ADDR  , ROPE_TOKEN_OFFSET                     = 0x1CF20000, 0xC0
+PRENORMS_ADDR   , PRENORMS_LAYER_OFFSET                 = 0x1CFE0000, 0x80
+POSTNORMS_ADDR  , POSTNORMS_LAYER_OFFSET                = 0x1CFE1000, 0x80
+OUTNORM_ADDR                                            = 0x1D840000  # single address, no per-layer offset
+OUTNORMS_ADDR                                           = 0x1D841000  # single address, no per-layer offset
+WHEAD_BASE_ADDR     ,   WHEAD_LAYER_OFFSET              = 0x1D000000, 0x100000
+WHEADS_BASE_ADDR    ,   WHEADS_LAYER_OFFSET             = 0x1D800000, 0x8000
+HEAD_OUT_BASE_ADDR  ,   HEAD_OUT_LAYER_OFFSET           = 0x1E000000, 0x1000
+HEADS_OUT_BASE_ADDR ,   HEADS_OUT_LAYER_OFFSET          = 0x1E008000, 0x80
+class Instruction:
+    def __init__(self,
+                 op,
+                 dq_en,
+                 stage,
+                 token,
+                 load_target,
+                 cmode,
+                 nonlinear,
+                 write_back,
+                 input_dim,
+                 output_dim,
+                 input_addr,
+                 scale_addr,
+                 output_addr,
+                 layer_offset,
+                 token_offset,
+                 num_cb_ws,
+                 num_cb_wm
+                ):
+        self.op = op
+        self.dq_en = dq_en
+        self.stage = stage
+        self.token = token
+        self.load_target = load_target
+        self.cmode = cmode
+        self.nonlinear = nonlinear
+        self.write_back = write_back
+        self.input_dim = input_dim
+        self.output_dim = output_dim
+        self.input_addr = input_addr
+        self.scale_addr = scale_addr
+        self.output_addr = output_addr
+        self.layer_offset = layer_offset
+        self.token_offset = token_offset
+        self.num_cb_ws = num_cb_ws
+        self.num_cb_wm = num_cb_wm
+    def to_binary(self, inst_num:int, inst_info:str):
+        print('INFO: {:30s} , Instruction id: {}'.format(inst_info, inst_num))
+        # Convert the instruction to a binary format
+        binary_format = (
+            f"{self.op:014b}{self.dq_en:01b}{self.stage:01b}{self.token:011b}"
+            f"{self.load_target:03b}{self.cmode:01b}{self.nonlinear:01b}{self.write_back:08b}"
+            f"{self.input_dim:016b}{self.output_dim:016b}"
+            f"{self.input_addr:032b}{self.scale_addr:032b}{self.output_addr:032b}"
+            f"{self.layer_offset:032b}{self.token_offset:032b}{self.num_cb_ws:016b}{self.num_cb_wm:016b}"
+        )
+        padding_length = 512 - len(binary_format)
+        binary_format = '0' * padding_length + binary_format
+        return binary_format
+def gen_inst(op, dq_en, stage, token, load_target, cmode, nonlinear, write_back, input_dim, output_dim, input_addr, scale_addr, output_addr, layer_offset, token_offset, num_cb_ws, num_cb_wm):
+    return Instruction(op, dq_en, stage, token, load_target, cmode, nonlinear, write_back, input_dim, output_dim, input_addr, scale_addr, output_addr, layer_offset, token_offset, num_cb_ws, num_cb_wm)
+# =====================================================================================================================================================================================================================
+# Test bandwidth
+# =====================================================================================================================================================================================================================
+inst_test_bw        = gen_inst(TEST_BW,     0, SUM, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      0,      0       , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 0      )  # NOTE(review): inst_test_bw is reassigned in the MXINT8 section below with stage GEN, so this SUM-stage template is discarded -- confirm which one is intended
+# =====================================================================================================================================================================================================================
+# MXINT8 Instruction templates -- gen_inst argument order: op, dq_en, stage, token, load_target, cmode, nonlinear, write_back, input_dim, output_dim, input_addr, scale_addr, output_addr, layer_offset, token_offset, num_cb_ws, num_cb_wm. The generation loop overwrites token/address/dim attributes on these shared objects before encoding each instruction.
+# =====================================================================================================================================================================================================================
+inst_nop            = gen_inst(NOP,         0, SUM, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      0,      0       , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 0      )
+inst_test_bw        = gen_inst(TEST_BW,     0, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      0,      0       , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 0      )  # replaces the SUM-stage inst_test_bw defined in the "Test bandwidth" section
+inst_load_resi      = gen_inst(LOAD,        0, GEN, 0, RESI,        MX_INT8, NO_ACT,    NO_WB,      4096,   0       , RESI_BASE_ADDR, 0x00000000, 0x00000000, 0x00000000, RESI_TOKEN_OFFSET, 0, 0      )
+inst_load_in_act    = gen_inst(LOAD,        0, GEN, 0, ACT,         MX_INT8, NO_ACT,    NO_WB,      4096,   0       , ACT_BASE_ADDR, 0x00000000, 0x00000000, 0x00000000, ACT_TOKEN_OFFSET, 0, 0      )
+inst_load_resi_s    = gen_inst(LOAD,        0, GEN, 0, RESI_S,      MX_INT8, NO_ACT,    NO_WB,      128,    0       , RESIS_BASE_ADDR, 0x00000000, 0x00000000, 0x00000000, RESIS_TOKEN_OFFSET, 0, 0      )
+inst_load_in_act_s  = gen_inst(LOAD,        0, GEN, 0, ACT_S,       MX_INT8, NO_ACT,    NO_WB,      128,    0       , ACTS_BASE_ADDR, 0x00000000, 0x00000000, 0x00000000, ACTS_TOKEN_OFFSET, 0, 0      )
+inst_mlp_wm_q       = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT8, NO_ACT,    NO_WB,      4096,   4096    , WQ_BASE_ADDR, WQS_BASE_ADDR, 0x00000000, WQ_LAYER_OFFSET, 0x00000000, 128, 4096    )
+inst_mlp_wm_k       = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT8, NO_ACT,    NO_WB,      4096,   4096    , WK_BASE_ADDR, WKS_BASE_ADDR, 0x00000000, WK_LAYER_OFFSET, 0x00000000, 128, 4096    )
+inst_mlp_wm_v       = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT8, NO_ACT,    WB,         4096,   4096    , WV_BASE_ADDR, WVS_BASE_ADDR, 0x00000000, WV_LAYER_OFFSET, 0x00000000, 128, 4096    )
+inst_mlp_wm_o       = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT8, NO_ACT,    NO_WB,      4096,   4096    , WO_BASE_ADDR, WOS_BASE_ADDR, 0x00000000, WO_LAYER_OFFSET, 0x00000000, 128, 4096    )
+inst_mlp_wm_w1      = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT8, SILU,      NO_WB,      4096,   11008   , W1_BASE_ADDR, W1S_BASE_ADDR, 0x00000000, W1_LAYER_OFFSET, 0x00000000, 32, 1024    )
+inst_mlp_wm_w3      = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT8, NO_ACT,    NO_WB,      4096,   11008   , W3_BASE_ADDR, W3S_BASE_ADDR, 0x00000000, W3_LAYER_OFFSET, 0x00000000, 32, 1024    )
+inst_mlp_wm_w2      = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT8, NO_ACT,    NO_WB,      11008,  4096    , W2_BASE_ADDR, W2S_BASE_ADDR, 0x00000000, W2_LAYER_OFFSET, 0x00000000, 24, 688    )
+inst_gate           = gen_inst(GATE,        1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      11008,  11008   , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 0      )
+inst_residual       = gen_inst(RESIDUAL,    1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB,         4096,   4096    , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 0      )
+inst_store_act      = gen_inst(STORE,       1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB_ACT,     4096,   4096    , 0x00000000, 0x00000000, ACT_BASE_ADDR, 0x00000000, ACT_TOKEN_OFFSET, 0, 0      )
+inst_store_act_s    = gen_inst(STORE,       1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB_ACT_S,   4096,   4096    , 0x00000000, 0x00000000, ACTS_BASE_ADDR, 0x00000000, ACTS_TOKEN_OFFSET, 0, 0      )
+inst_store_resi     = gen_inst(STORE,       1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB_ACT,     4096,   4096    , 0x00000000, 0x00000000, RESI_BASE_ADDR, 0x00000000, RESI_TOKEN_OFFSET, 0, 0      )  # NOTE(review): targets RESI_BASE_ADDR but uses WB_ACT rather than WB_RESI -- confirm intended
+inst_store_resi_s   = gen_inst(STORE,       1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB_ACT_S,   4096,   4096    , 0x00000000, 0x00000000, RESIS_BASE_ADDR, 0x00000000, RESIS_TOKEN_OFFSET, 0, 0      )  # NOTE(review): targets RESIS_BASE_ADDR but uses WB_ACT_S rather than WB_RESI_S -- confirm intended
+inst_rope_nwb       = gen_inst(ROPE,        1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      192,    4096    , ROPE_BASE_ADDR, 0x00000000, 0x00000000, 0x00000000, ROPE_TOKEN_OFFSET, 0, 0      )
+inst_rope_wb        = gen_inst(ROPE,        1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB,         192,    4096    , ROPE_BASE_ADDR, 0x00000000, 0x00000000, 0x00000000, ROPE_TOKEN_OFFSET, 0, 0      )
+inst_store_k        = gen_inst(STORE,       1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB_KV,      4096,   4096    , 0x00000000, 0x00000000, KC_BASE_ADDR, 0x00000000, KC_TOKEN_OFFSET, 0, 0      )
+inst_store_k_s      = gen_inst(STORE,       1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB_KV_S,    4096,   4096    , 0x00000000, 0x00000000, KCS_BASE_ADDR, 0x00000000, KCS_TOKEN_OFFSET, 0, 0      )
+inst_store_v        = gen_inst(STORE,       1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB_KV,      4096,   4096    , 0x00000000, 0x00000000, VC_BASE_ADDR, 0x00000000, VC_TOKEN_OFFSET, 0, 0      )
+inst_store_v_s      = gen_inst(STORE,       1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB_KV_S,    4096,   4096    , 0x00000000, 0x00000000, VCS_BASE_ADDR, 0x00000000, VCS_TOKEN_OFFSET, 0, 0      )
+inst_mlp_qkt        = gen_inst(MLP_QKT,     1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      4,      1       , KC_BASE_ADDR, KCS_BASE_ADDR, 0x00000000, 0x00000000, KCS_TOKEN_OFFSET, 4, 32     )
+inst_qkt_m_rsqrt    = gen_inst(QKT_M_RSQRT, 1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      1,      1       , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1, 512    )
+inst_softmax        = gen_inst(SOFTMAX,     1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      0,      1       , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1, 0      )
+inst_mlp_hp         = gen_inst(MLP_HP,      1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      4,      4096    , VC_BASE_ADDR, VCS_BASE_ADDR, 0x00000000, 0x00000000, VCS_TOKEN_OFFSET, 4, 32     )
+inst_prerrms        = gen_inst(RRMS,        1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      4096,   1       , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 128    )
+inst_load_pre_wm    = gen_inst(LOAD,        0, GEN, 0, ACT,         MX_INT8, NO_ACT,    NO_WB,      4096,   0       , PRENORM_ADDR, 0x00000000, 0x00000000, PRENORM_LAYER_OFFSET, 0x00000000, 0, 0      )
+inst_load_pre_ws    = gen_inst(LOAD,        0, GEN, 0, ACT_S,       MX_INT8, NO_ACT,    NO_WB,      128,    0       , PRENORMS_ADDR, 0x00000000, 0x00000000, PRENORMS_LAYER_OFFSET, 0x00000000, 0, 0      )
+inst_prermsnorm     = gen_inst(RMSNORM,     1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      4096,   4096    , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 128    )
+inst_postrrms       = gen_inst(RRMS,        1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      4096,   1       , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 128    )
+inst_load_post_wm   = gen_inst(LOAD,        0, GEN, 0, ACT,         MX_INT8, NO_ACT,    NO_WB,      4096,   0       , POSTNORM_ADDR, 0x00000000, 0x00000000, POSTNORM_LAYER_OFFSET, 0x00000000, 0, 0      )
+inst_load_post_ws   = gen_inst(LOAD,        0, GEN, 0, ACT_S,       MX_INT8, NO_ACT,    NO_WB,      128,    0       , POSTNORMS_ADDR, 0x00000000, 0x00000000, POSTNORMS_LAYER_OFFSET, 0x00000000, 0, 0      )
+inst_postrmsnorm    = gen_inst(RMSNORM,     1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      4096,   4096    , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 128    )
+# =====================================================================================================================================================================================================================
+# DECODER OUT Instruction templates: RRMS/RMSNORM using the OUTNORM weights, the WHEAD MLP_WM, and the stores to HEAD_OUT / HEADS_OUT (all with cmode MX_INT8)
+# =====================================================================================================================================================================================================================
+inst_outrrms        = gen_inst(RRMS,        1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      4096,   1       , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 128    )
+inst_load_out_wm    = gen_inst(LOAD,        0, GEN, 0, ACT,         MX_INT8, NO_ACT,    NO_WB,      4096,   0       , OUTNORM_ADDR, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 0      )
+inst_load_out_ws    = gen_inst(LOAD,        0, GEN, 0, ACT_S,       MX_INT8, NO_ACT,    NO_WB,      128,    0       , OUTNORMS_ADDR, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 0      )
+inst_outrmsnorm     = gen_inst(RMSNORM,     1, GEN, 0, NULL,        MX_INT8, NO_ACT,    NO_WB,      4096,   4096    , 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0, 128    )
+inst_mlp_wm_whead   = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT8, NO_ACT,    WB,         4096,   4096    , WHEAD_BASE_ADDR, WHEADS_BASE_ADDR, 0x00000000, WHEAD_LAYER_OFFSET, 0x00000000, 128, 4096    )
+inst_store_head     = gen_inst(STORE,       1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB_ACT,     4096,   4096    , 0x00000000, 0x00000000, HEAD_OUT_BASE_ADDR, 0x00000000, HEAD_OUT_LAYER_OFFSET, 0, 0      )  # HEAD_OUT_LAYER_OFFSET is passed in the token_offset position
+inst_store_head_s   = gen_inst(STORE,       1, GEN, 0, NULL,        MX_INT8, NO_ACT,    WB_ACT_S,   4096,   4096    , 0x00000000, 0x00000000, HEADS_OUT_BASE_ADDR, 0x00000000, HEADS_OUT_LAYER_OFFSET, 0, 0      )  # HEADS_OUT_LAYER_OFFSET is passed in the token_offset position
+# =====================================================================================================================================================================================================================
+# MXINT4 Instruction templates: MLP_WM templates with cmode MX_INT4; they use the same addresses and num_cb_ws as the MXINT8 versions, with num_cb_wm halved
+# =====================================================================================================================================================================================================================
+inst_mlp_wm_whead_mxint4= gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT4, NO_ACT,    WB,         4096,   4096    , WHEAD_BASE_ADDR,  WHEADS_BASE_ADDR,   0x00000000, WHEAD_LAYER_OFFSET, 0x00000000, 128, 2048    )
+inst_mlp_wm_q_mxint4    = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT4, NO_ACT,    NO_WB,      4096,   4096    , WQ_BASE_ADDR,     WQS_BASE_ADDR,      0x00000000, WQ_LAYER_OFFSET,    0x00000000, 128, 2048    )
+inst_mlp_wm_k_mxint4    = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT4, NO_ACT,    NO_WB,      4096,   4096    , WK_BASE_ADDR,     WKS_BASE_ADDR,      0x00000000, WK_LAYER_OFFSET,    0x00000000, 128, 2048    )
+inst_mlp_wm_v_mxint4    = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT4, NO_ACT,    WB,         4096,   4096    , WV_BASE_ADDR,     WVS_BASE_ADDR,      0x00000000, WV_LAYER_OFFSET,    0x00000000, 128, 2048    )
+inst_mlp_wm_o_mxint4    = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT4, NO_ACT,    NO_WB,      4096,   4096    , WO_BASE_ADDR,     WOS_BASE_ADDR,      0x00000000, WO_LAYER_OFFSET,    0x00000000, 128, 2048    )
+inst_mlp_wm_w1_mxint4   = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT4, SILU,      NO_WB,      4096,   11008   , W1_BASE_ADDR,     W1S_BASE_ADDR,      0x00000000, W1_LAYER_OFFSET,    0x00000000, 32, 512    )
+inst_mlp_wm_w3_mxint4   = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT4, NO_ACT,    NO_WB,      4096,   11008   , W3_BASE_ADDR,     W3S_BASE_ADDR,      0x00000000, W3_LAYER_OFFSET,    0x00000000, 32, 512    )
+inst_mlp_wm_w2_mxint4   = gen_inst(MLP_WM,      1, GEN, 0, WEIGHT_S,    MX_INT4, NO_ACT,    NO_WB,      11008,  4096    , W2_BASE_ADDR,     W2S_BASE_ADDR,      0x00000000, W2_LAYER_OFFSET,    0x00000000, 24, 344     )
+# =====================================================================================================================================================================================================================
+# Instruction templates end
+# =====================================================================================================================================================================================================================
+# gen instructions parameters (the command line is parsed at module level, i.e. on import as well as when run as a script)
+parser = argparse.ArgumentParser(description='Generate instruction binary file for simulation')
+parser.add_argument('--sim_cmode', type=str, default='mxint8', choices=['mxint8', 'mxint4'], help='Simulation compute mode')
+parser.add_argument('--sim_llm_head', action='store_true', help='if true, generate LLM head instructions for FPGA simulation')
+args = parser.parse_args()
+SIM_CMODE       = args.sim_cmode  # 'mxint8' or 'mxint4'; chooses between the MXINT8 and *_mxint4 MLP_WM templates and is part of the output file name
+TEST_OP_GROUP   = 'demo'  # selects the generation branch under __main__
+SIM_LLM_HEAD    = args.sim_llm_head  # True when --sim_llm_head was given
+SIM_LOGIT_FLAG  = '' if SIM_LLM_HEAD else '_no'  # inserted before "_write_back_logit" in the output file name
+SIM_NUM_TOKEN   = 1024  # number of tokens to generate instructions for; also part of the output file name
+SIM_NUM_LAYER   = 32  # layers generated per token; NOTE(review): the output file name hard-codes "32L" instead of using this value
+if __name__ == "__main__":
+    if TEST_OP_GROUP == 'demo':
+        current_token = 0
+        tmp_output_dim = 1
+        current_inst_cnt = 0
+        file_name = "instruction_{}T_32L{}_write_back_logit_everyT_{}.bin".format(SIM_NUM_TOKEN, SIM_LOGIT_FLAG, SIM_CMODE)
+        with open(file_name, "wb") as f:
+            for tk in range(SIM_NUM_TOKEN):
+                print("Gen {} th token instruction start".format(tk+1))
+                # LOAD IN_ACT
+                inst_load_in_act.input_addr = ACT_BASE_ADDR + ACT_TOKEN_OFFSET * tk
+                inst_load_in_act.token      = current_token
+                binary_instruction = inst_load_in_act.to_binary(current_inst_cnt, 'LOAD IN_ACT')
+                f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                current_inst_cnt += 1
+                # LOAD IN_ACT_S
+                inst_load_in_act_s.input_addr = ACTS_BASE_ADDR + ACTS_TOKEN_OFFSET * tk
+                inst_load_in_act_s.token      = current_token
+                binary_instruction = inst_load_in_act_s.to_binary(current_inst_cnt, 'LOAD IN_ACT_S')
+                f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                current_inst_cnt += 1
+                for l in range(SIM_NUM_LAYER):
+                    print("Gen {} th token, {} th layer instruction".format(tk+1, l+1))
+                    # RRMS
+                    inst_prerrms.token  = current_token
+                    binary_instruction  = inst_prerrms.to_binary(current_inst_cnt, 'PRE RRMS')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # LOAD NORM weight
+                    inst_load_pre_wm.input_addr = PRENORM_ADDR + PRENORM_LAYER_OFFSET * l
+                    inst_load_pre_wm.token      = current_token
+                    binary_instruction          = inst_load_pre_wm.to_binary(current_inst_cnt, 'LOAD PRENORM weight')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # LOAD NORM weight scale
+                    inst_load_pre_ws.input_addr = PRENORMS_ADDR + PRENORMS_LAYER_OFFSET * l
+                    inst_load_pre_ws.token      = current_token
+                    binary_instruction          = inst_load_pre_ws.to_binary(current_inst_cnt, 'LOAD PRENORM weight scale')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # RMSNORM
+                    inst_prermsnorm.token       = current_token
+                    binary_instruction          = inst_prermsnorm.to_binary(current_inst_cnt, 'PRE RMSNORM')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    if SIM_CMODE == 'mxint8':
+                        # MLP_WM Wv
+                        inst_mlp_wm_v.input_addr    = WV_BASE_ADDR + WV_LAYER_OFFSET * l
+                        inst_mlp_wm_v.scale_addr    = WVS_BASE_ADDR + WVS_LAYER_OFFSET * l
+                        inst_mlp_wm_v.token         = current_token
+                        binary_instruction          = inst_mlp_wm_v.to_binary(current_inst_cnt, 'MLP_WM Wv')
+                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                        current_inst_cnt += 1
+                    else:
+                        # MLP_WM Wv
+                        inst_mlp_wm_v_mxint4.input_addr = WV_BASE_ADDR + (WV_LAYER_OFFSET//2) * l
+                        inst_mlp_wm_v_mxint4.scale_addr = WVS_BASE_ADDR + WVS_LAYER_OFFSET * l
+                        inst_mlp_wm_v_mxint4.token      = current_token
+                        binary_instruction              = inst_mlp_wm_v_mxint4.to_binary(current_inst_cnt, 'MLP_WM Wv')
+                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                        current_inst_cnt += 1
+                    # STORE V elem
+                    inst_store_v.output_addr    = VC_BASE_ADDR + VC_LAYER_OFFSET * l + VC_TOKEN_OFFSET * tk
+                    inst_store_v.token          = current_token
+                    binary_instruction          = inst_store_v.to_binary(current_inst_cnt, 'STORE V elem')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # STORE V scale
+                    inst_store_v_s.output_addr  = VCS_BASE_ADDR + VCS_LAYER_OFFSET * l + VCS_TOKEN_OFFSET * tk
+                    inst_store_v_s.token        = current_token
+                    binary_instruction          = inst_store_v_s.to_binary(current_inst_cnt, 'STORE V scale')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    if SIM_CMODE == 'mxint8':
+                        # MLP_WM Wk
+                        inst_mlp_wm_k.input_addr    = WK_BASE_ADDR + WK_LAYER_OFFSET * l
+                        inst_mlp_wm_k.scale_addr    = WKS_BASE_ADDR + WKS_LAYER_OFFSET * l
+                        inst_mlp_wm_k.token         = current_token
+                        binary_instruction          = inst_mlp_wm_k.to_binary(current_inst_cnt, 'MLP_WM Wk')
+                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                        current_inst_cnt += 1
+                    else:
+                        # MLP_WM Wk
+                        inst_mlp_wm_k_mxint4.input_addr = WK_BASE_ADDR + (WK_LAYER_OFFSET//2) * l
+                        inst_mlp_wm_k_mxint4.scale_addr = WKS_BASE_ADDR + WKS_LAYER_OFFSET * l
+                        inst_mlp_wm_k_mxint4.token      = current_token
+                        binary_instruction              = inst_mlp_wm_k_mxint4.to_binary(current_inst_cnt, 'MLP_WM Wk')
+                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                        current_inst_cnt += 1
+                    # ROPE WB (K)
+                    inst_rope_wb.input_addr     = ROPE_BASE_ADDR + ROPE_TOKEN_OFFSET * tk
+                    inst_rope_wb.token          = current_token
+                    binary_instruction          = inst_rope_wb.to_binary(current_inst_cnt, 'ROPE WB (K)')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # STORE K elem
+                    inst_store_k.output_addr    = KC_BASE_ADDR + KC_LAYER_OFFSET * l + KC_TOKEN_OFFSET * tk
+                    inst_store_k.token          = current_token
+                    binary_instruction          = inst_store_k.to_binary(current_inst_cnt, 'STORE K elem')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # STORE K scale
+                    inst_store_k_s.output_addr  = KCS_BASE_ADDR + KCS_LAYER_OFFSET * l + KCS_TOKEN_OFFSET * tk
+                    inst_store_k_s.token        = current_token
+                    binary_instruction          = inst_store_k_s.to_binary(current_inst_cnt, 'STORE K scale')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    if SIM_CMODE == 'mxint8':
+                        # MLP_WM Wq
+                        inst_mlp_wm_q.input_addr    = WQ_BASE_ADDR + WQ_LAYER_OFFSET * l
+                        inst_mlp_wm_q.scale_addr    = WQS_BASE_ADDR + WQS_LAYER_OFFSET * l
+                        inst_mlp_wm_q.token         = current_token
+                        binary_instruction          = inst_mlp_wm_q.to_binary(current_inst_cnt, 'MLP_WM Wq')
+                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                        current_inst_cnt += 1
+                    else:
+                        # MLP_WM Wq
+                        inst_mlp_wm_q_mxint4.input_addr = WQ_BASE_ADDR + (WQ_LAYER_OFFSET//2) * l
+                        inst_mlp_wm_q_mxint4.scale_addr = WQS_BASE_ADDR + WQS_LAYER_OFFSET * l
+                        inst_mlp_wm_q_mxint4.token      = current_token
+                        binary_instruction              = inst_mlp_wm_q_mxint4.to_binary(current_inst_cnt, 'MLP_WM Wq')
+                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                        current_inst_cnt += 1
+                    # ROPE NO_WB (Q)
+                    inst_rope_nwb.input_addr    = ROPE_BASE_ADDR + ROPE_TOKEN_OFFSET * tk
+                    inst_rope_nwb.token         = current_token
+                    binary_instruction          = inst_rope_nwb.to_binary(current_inst_cnt, 'ROPE NO_WB (Q)')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # MLP_QKT
+                    inst_mlp_qkt.input_addr     = KC_BASE_ADDR + KC_LAYER_OFFSET * l
+                    inst_mlp_qkt.scale_addr     = KCS_BASE_ADDR + KCS_LAYER_OFFSET * l
+                    inst_mlp_qkt.output_dim     = tmp_output_dim
+                    inst_mlp_qkt.input_dim      = tmp_output_dim * inst_mlp_qkt.num_cb_ws
+                    inst_mlp_qkt.token          = current_token
+                    binary_instruction          = inst_mlp_qkt.to_binary(current_inst_cnt, 'MLP_QKT')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # QKT_M_RSQRT
+                    inst_qkt_m_rsqrt.output_dim = tmp_output_dim
+                    inst_qkt_m_rsqrt.input_dim  = tmp_output_dim * inst_qkt_m_rsqrt.num_cb_ws
+                    inst_qkt_m_rsqrt.token      = current_token
+                    binary_instruction          = inst_qkt_m_rsqrt.to_binary(current_inst_cnt, 'QKT_M_RSQRT')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # SOFTMAX
+                    inst_softmax.output_dim     = tmp_output_dim
+                    inst_softmax.token          = current_token
+                    binary_instruction          = inst_softmax.to_binary(current_inst_cnt, 'SOFTMAX')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # MLP_HP (SxV): input/scale point at this layer's VC/VCS regions (presumably V cache + scales; confirm); input_dim follows tmp_output_dim
+                    inst_mlp_hp.input_addr      = VC_BASE_ADDR + VC_LAYER_OFFSET * l
+                    inst_mlp_hp.scale_addr      = VCS_BASE_ADDR + VCS_LAYER_OFFSET * l
+                    inst_mlp_hp.input_dim       = tmp_output_dim * inst_mlp_hp.num_cb_ws
+                    inst_mlp_hp.token           = current_token
+                    binary_instruction          = inst_mlp_hp.to_binary(current_inst_cnt, 'MLP_HP (SxV)')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    if SIM_CMODE == 'mxint8':
+                        # MLP_WM Wo (mxint8): weight and scale addresses both advance by their full per-layer offset
+                        inst_mlp_wm_o.input_addr    = WO_BASE_ADDR + WO_LAYER_OFFSET * l
+                        inst_mlp_wm_o.scale_addr    = WOS_BASE_ADDR + WOS_LAYER_OFFSET * l
+                        inst_mlp_wm_o.token         = current_token
+                        binary_instruction          = inst_mlp_wm_o.to_binary(current_inst_cnt, 'MLP_WM Wo')
+                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                        current_inst_cnt += 1
+                    else:
+                        # MLP_WM Wo (any other cmode, i.e. mxint4): weight per-layer offset is halved (WO_LAYER_OFFSET//2); scale offset is not
+                        inst_mlp_wm_o_mxint4.input_addr = WO_BASE_ADDR + (WO_LAYER_OFFSET//2) * l
+                        inst_mlp_wm_o_mxint4.scale_addr = WOS_BASE_ADDR + WOS_LAYER_OFFSET * l
+                        inst_mlp_wm_o_mxint4.token      = current_token
+                        binary_instruction              = inst_mlp_wm_o_mxint4.to_binary(current_inst_cnt, 'MLP_WM Wo')
+                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                        current_inst_cnt += 1
+                    # Residual update after MLP_WM Wo. LOAD RESI: address is indexed by tk (per token), not by layer l
+                    inst_load_resi.input_addr   = RESI_BASE_ADDR + RESI_TOKEN_OFFSET * tk
+                    inst_load_resi.token        = current_token
+                    binary_instruction          = inst_load_resi.to_binary(current_inst_cnt, 'LOAD RESI')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # LOAD RESI_S: companion load from the RESIS region, same tk indexing (presumably the residual's scales; confirm)
+                    inst_load_resi_s.input_addr = RESIS_BASE_ADDR + RESIS_TOKEN_OFFSET * tk
+                    inst_load_resi_s.token      = current_token
+                    binary_instruction          = inst_load_resi_s.to_binary(current_inst_cnt, 'LOAD RESI_S')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # RESIDUAL: no address fields are set; only the token is refreshed
+                    inst_residual.token         = current_token
+                    binary_instruction          = inst_residual.to_binary(current_inst_cnt, 'RESIDUAL')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # STORE RESI: output address is the same expression LOAD RESI read from
+                    inst_store_resi.output_addr = RESI_BASE_ADDR + RESI_TOKEN_OFFSET * tk
+                    inst_store_resi.token       = current_token
+                    binary_instruction          = inst_store_resi.to_binary(current_inst_cnt, 'STORE RESI')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # STORE RESI_S: output address is the same expression LOAD RESI_S read from
+                    inst_store_resi_s.output_addr   = RESIS_BASE_ADDR + RESIS_TOKEN_OFFSET * tk
+                    inst_store_resi_s.token         = current_token
+                    binary_instruction              = inst_store_resi_s.to_binary(current_inst_cnt, 'STORE RESI_S')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # POST RRMS: emitted before the POSTNORM weight loads; only the token is refreshed
+                    inst_postrrms.token             = current_token
+                    binary_instruction              = inst_postrrms.to_binary(current_inst_cnt, 'POST RRMS')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # LOAD POSTNORM weight: per-layer address (POSTNORM_ADDR + POSTNORM_LAYER_OFFSET * l)
+                    inst_load_post_wm.input_addr    = POSTNORM_ADDR + POSTNORM_LAYER_OFFSET * l
+                    inst_load_post_wm.token         = current_token
+                    binary_instruction              = inst_load_post_wm.to_binary(current_inst_cnt, 'LOAD POSTNORM weight')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # LOAD POSTNORM weight scale: per-layer address (POSTNORMS_ADDR + POSTNORMS_LAYER_OFFSET * l)
+                    inst_load_post_ws.input_addr    = POSTNORMS_ADDR + POSTNORMS_LAYER_OFFSET * l
+                    inst_load_post_ws.token         = current_token
+                    binary_instruction              = inst_load_post_ws.to_binary(current_inst_cnt, 'LOAD POSTNORM weight scale')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # POST RMSNORM: emitted right after the POSTNORM weight/scale loads; only the token is refreshed
+                    inst_postrmsnorm.token          = current_token
+                    binary_instruction              = inst_postrmsnorm.to_binary(current_inst_cnt, 'POST RMSNORM')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    if SIM_CMODE == 'mxint8':
+                        # MLP_WM W1 (mxint8): weight and scale addresses advance by their full per-layer offset
+                        inst_mlp_wm_w1.input_addr   = W1_BASE_ADDR + W1_LAYER_OFFSET * l
+                        inst_mlp_wm_w1.scale_addr   = W1S_BASE_ADDR + W1S_LAYER_OFFSET * l
+                        inst_mlp_wm_w1.token        = current_token
+                        binary_instruction          = inst_mlp_wm_w1.to_binary(current_inst_cnt, 'MLP_WM W1')
+                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                        current_inst_cnt += 1
+                        # MLP_WM W3 (mxint8): same addressing scheme as W1, using the W3/W3S regions
+                        inst_mlp_wm_w3.input_addr   = W3_BASE_ADDR + W3_LAYER_OFFSET * l
+                        inst_mlp_wm_w3.scale_addr   = W3S_BASE_ADDR + W3S_LAYER_OFFSET * l
+                        inst_mlp_wm_w3.token        = current_token
+                        binary_instruction          = inst_mlp_wm_w3.to_binary(current_inst_cnt, 'MLP_WM W3')
+                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                        current_inst_cnt += 1
+                    else:
+                        # MLP_WM W1 (any other cmode, i.e. mxint4): weight per-layer offset is halved (W1_LAYER_OFFSET//2); scale offset is not
+                        inst_mlp_wm_w1_mxint4.input_addr    = W1_BASE_ADDR + (W1_LAYER_OFFSET//2) * l
+                        inst_mlp_wm_w1_mxint4.scale_addr    = W1S_BASE_ADDR + W1S_LAYER_OFFSET * l
+                        inst_mlp_wm_w1_mxint4.token         = current_token
+                        binary_instruction                  = inst_mlp_wm_w1_mxint4.to_binary(current_inst_cnt, 'MLP_WM W1')
+                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                        current_inst_cnt += 1
+                        # MLP_WM W3 (mxint4 path): weight per-layer offset is halved (W3_LAYER_OFFSET//2); scale offset is not
+                        inst_mlp_wm_w3_mxint4.input_addr    = W3_BASE_ADDR + (W3_LAYER_OFFSET//2) * l
+                        inst_mlp_wm_w3_mxint4.scale_addr    = W3S_BASE_ADDR + W3S_LAYER_OFFSET * l
+                        inst_mlp_wm_w3_mxint4.token         = current_token
+                        binary_instruction                  = inst_mlp_wm_w3_mxint4.to_binary(current_inst_cnt, 'MLP_WM W3')
+                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                        current_inst_cnt += 1
+                    # GATE: emitted between the W1/W3 pair and W2; only the token is refreshed
+                    inst_gate.token             = current_token
+                    binary_instruction          = inst_gate.to_binary(current_inst_cnt, 'GATE')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    if SIM_CMODE == 'mxint8':
+                        # MLP_WM W2 (mxint8): weight and scale addresses advance by their full per-layer offset
+                        inst_mlp_wm_w2.input_addr   = W2_BASE_ADDR + W2_LAYER_OFFSET * l
+                        inst_mlp_wm_w2.scale_addr   = W2S_BASE_ADDR + W2S_LAYER_OFFSET * l
+                        inst_mlp_wm_w2.token        = current_token
+                        binary_instruction          = inst_mlp_wm_w2.to_binary(current_inst_cnt, 'MLP_WM W2')
+                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                        current_inst_cnt += 1
+                    else:
+                        # MLP_WM W2 (any other cmode, i.e. mxint4): weight per-layer offset is halved (W2_LAYER_OFFSET//2); scale offset is not
+                        inst_mlp_wm_w2_mxint4.input_addr    = W2_BASE_ADDR + (W2_LAYER_OFFSET//2) * l
+                        inst_mlp_wm_w2_mxint4.scale_addr    = W2S_BASE_ADDR + W2S_LAYER_OFFSET * l
+                        inst_mlp_wm_w2_mxint4.token         = current_token
+                        binary_instruction                  = inst_mlp_wm_w2_mxint4.to_binary(current_inst_cnt, 'MLP_WM W2')
+                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                        current_inst_cnt += 1
+                    # Residual update after MLP_WM W2. LOAD RESI: address is indexed by tk (per token), not by layer l
+                    inst_load_resi.input_addr   = RESI_BASE_ADDR + RESI_TOKEN_OFFSET * tk
+                    inst_load_resi.token        = current_token
+                    binary_instruction          = inst_load_resi.to_binary(current_inst_cnt, 'LOAD RESI')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # LOAD RESI_S: companion load from the RESIS region, same tk indexing (presumably the residual's scales; confirm)
+                    inst_load_resi_s.input_addr = RESIS_BASE_ADDR + RESIS_TOKEN_OFFSET * tk
+                    inst_load_resi_s.token      = current_token
+                    binary_instruction          = inst_load_resi_s.to_binary(current_inst_cnt, 'LOAD RESI_S')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # RESIDUAL: no address fields are set; only the token is refreshed
+                    inst_residual.token         = current_token
+                    binary_instruction          = inst_residual.to_binary(current_inst_cnt, 'RESIDUAL')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # STORE RESI: output address is the same expression LOAD RESI read from
+                    inst_store_resi.output_addr = RESI_BASE_ADDR + RESI_TOKEN_OFFSET * tk
+                    inst_store_resi.token       = current_token
+                    binary_instruction          = inst_store_resi.to_binary(current_inst_cnt, 'STORE RESI')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # STORE RESI_S: output address is the same expression LOAD RESI_S read from
+                    inst_store_resi_s.output_addr   = RESIS_BASE_ADDR + RESIS_TOKEN_OFFSET * tk
+                    inst_store_resi_s.token         = current_token
+                    binary_instruction              = inst_store_resi_s.to_binary(current_inst_cnt, 'STORE RESI_S')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # END OF LAYER
+                # LLM head sequence: runs for every token when enabled. Disabled last-token-only variant: if tk == SIM_NUM_TOKEN-1 and SIM_NUM_LAYER == 32 and SIM_LLM_HEAD:
+                if SIM_NUM_LAYER == 32 and SIM_LLM_HEAD:
+                    # OUT RRMS: emitted before the OUTNORM weight loads; only the token is refreshed
+                    inst_outrrms.token              = current_token
+                    binary_instruction              = inst_outrrms.to_binary(current_inst_cnt, 'OUT RRMS')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # LOAD OUTNORM weight: single fixed address (no per-layer offset)
+                    inst_load_out_wm.input_addr     = OUTNORM_ADDR
+                    inst_load_out_wm.token          = current_token
+                    binary_instruction              = inst_load_out_wm.to_binary(current_inst_cnt, 'LOAD OUTNORM weight')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # LOAD OUTNORM weight scale: single fixed address (no per-layer offset)
+                    inst_load_out_ws.input_addr     = OUTNORMS_ADDR
+                    inst_load_out_ws.token          = current_token
+                    binary_instruction              = inst_load_out_ws.to_binary(current_inst_cnt, 'LOAD OUTNORM weight scale')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    # OUT RMSNORM: emitted right after the OUTNORM weight/scale loads; only the token is refreshed
+                    inst_outrmsnorm.token           = current_token
+                    binary_instruction              = inst_outrmsnorm.to_binary(current_inst_cnt, 'OUT RMSNORM')
+                    f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                    current_inst_cnt += 1
+                    for it in range(8):
+                        if SIM_CMODE == 'mxint8':
+                            # MLP_WM Whead (mxint8), chunk `it` of 8. NOTE(review): 8 is hard-coded; presumably the head weight is split into 8 chunks; confirm
+                            inst_mlp_wm_whead.input_addr    = WHEAD_BASE_ADDR + WHEAD_LAYER_OFFSET * it
+                            inst_mlp_wm_whead.scale_addr    = WHEADS_BASE_ADDR + WHEADS_LAYER_OFFSET * it
+                            inst_mlp_wm_whead.token         = current_token
+                            binary_instruction              = inst_mlp_wm_whead.to_binary(current_inst_cnt, 'MLP_WM Whead')
+                            f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                            current_inst_cnt += 1
+                        else:
+                            # MLP_WM Whead (any other cmode, i.e. mxint4), chunk `it`: weight per-chunk offset is halved (WHEAD_LAYER_OFFSET//2); scale offset is not
+                            inst_mlp_wm_whead_mxint4.input_addr = WHEAD_BASE_ADDR + (WHEAD_LAYER_OFFSET//2) * it
+                            inst_mlp_wm_whead_mxint4.scale_addr = WHEADS_BASE_ADDR + WHEADS_LAYER_OFFSET * it
+                            inst_mlp_wm_whead_mxint4.token      = current_token
+                            binary_instruction                  = inst_mlp_wm_whead_mxint4.to_binary(current_inst_cnt, 'MLP_WM Whead')
+                            f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                            current_inst_cnt += 1
+                        # STORE HEAD: output address for chunk `it` in the HEAD_OUT region (same offset in both cmodes)
+                        inst_store_head.output_addr     = HEAD_OUT_BASE_ADDR + HEAD_OUT_LAYER_OFFSET * it
+                        inst_store_head.token           = current_token
+                        binary_instruction              = inst_store_head.to_binary(current_inst_cnt, 'STORE HEAD')
+                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                        current_inst_cnt += 1
+                        # STORE HEAD_S: output address for chunk `it` in the HEADS_OUT region
+                        inst_store_head_s.output_addr   = HEADS_OUT_BASE_ADDR + HEADS_OUT_LAYER_OFFSET * it
+                        inst_store_head_s.token         = current_token
+                        binary_instruction              = inst_store_head_s.to_binary(current_inst_cnt, 'STORE HEAD_S')
+                        f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                        current_inst_cnt += 1
+                # Advance per-token state: tmp_output_dim and current_token each grow by one before the next token
+                tmp_output_dim += 1
+                current_token += 1
+                # END OF TOKEN
+            # Terminate the stream: a single NOP is written once, after the token loop, and is included in the count
+            binary_instruction = inst_nop.to_binary(current_inst_cnt, 'End instruction NOP')
+            f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+            current_inst_cnt += 1
+            # END OF SIMULATION
+        print('INFO: Total Instruction Count: {}'.format(current_inst_cnt))
+    elif TEST_OP_GROUP == 'test_bw':
+        current_inst_cnt = 0
+        with open("../instruction/instruction_test_bw.bin", "wb") as f:
+            for i in range(SIM_NUM_TOKEN):
+                binary_instruction = inst_test_bw.to_binary(current_inst_cnt, 'TEST BW')
+                f.write(int(binary_instruction, 2).to_bytes(len(binary_instruction) // 8, byteorder='little'))
+                current_inst_cnt += 1
+        print('INFO: Total Instruction Count: {}'.format(current_inst_cnt))

instruction/instruction_1024T_32L_no_write_back_logit_everyT_mxint4.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4227f9d80b5024e2f0ca32b283513ed47fe752f4c7c21104eb0dca6228e570b4
+size 75628608

instruction/instruction_1024T_32L_no_write_back_logit_everyT_mxint8.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e1e0e514eca48728cf2a4c5d5fa6d8bb09299f1fa701c61438dd47ce492f9788
+size 75628608

instruction/instruction_1024T_32L_write_back_logit_everyT_mxint4.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7baec5523f2c7558828f9fa2c1add85791613c1f721a1228af1c1d426e7f73f8
+size 77463616

instruction/instruction_1024T_32L_write_back_logit_everyT_mxint8.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:beef5a09e1c473e54182fb81f7963fe7c83ff8a99275ecb47a0954f8ec154695
+size 77463616