asd52403 committed on
Commit
da67f74
·
1 Parent(s): 5898447

init commit

Browse files
inference/configs/config_16B.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "vocab_size": 102400,
3
+ "dim": 2048,
4
+ "inter_dim": 10944,
5
+ "moe_inter_dim": 1408,
6
+ "n_layers": 27,
7
+ "n_dense_layers": 1,
8
+ "n_heads": 16,
9
+ "n_routed_experts": 64,
10
+ "n_shared_experts": 2,
11
+ "n_activated_experts": 6,
12
+ "route_scale": 1.0,
13
+ "q_lora_rank": 0,
14
+ "kv_lora_rank": 512,
15
+ "qk_nope_head_dim": 128,
16
+ "qk_rope_head_dim": 64,
17
+ "v_head_dim": 128,
18
+ "mscale": 0.707
19
+ }
inference/configs/config_236B.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "vocab_size": 102400,
3
+ "dim": 5120,
4
+ "inter_dim": 12288,
5
+ "moe_inter_dim": 1536,
6
+ "n_layers": 60,
7
+ "n_dense_layers": 1,
8
+ "n_heads": 128,
9
+ "n_routed_experts": 160,
10
+ "n_shared_experts": 2,
11
+ "n_activated_experts": 6,
12
+ "n_expert_groups": 8,
13
+ "n_limited_groups": 3,
14
+ "route_scale": 16.0,
15
+ "q_lora_rank": 1536,
16
+ "kv_lora_rank": 512,
17
+ "qk_nope_head_dim": 128,
18
+ "qk_rope_head_dim": 64,
19
+ "v_head_dim": 128
20
+ }
inference/configs/config_671B.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "vocab_size": 129280,
3
+ "dim": 7168,
4
+ "inter_dim": 18432,
5
+ "moe_inter_dim": 2048,
6
+ "n_layers": 61,
7
+ "n_dense_layers": 3,
8
+ "n_heads": 128,
9
+ "n_routed_experts": 256,
10
+ "n_shared_experts": 1,
11
+ "n_activated_experts": 8,
12
+ "n_expert_groups": 8,
13
+ "n_limited_groups": 4,
14
+ "route_scale": 2.5,
15
+ "score_func": "sigmoid",
16
+ "q_lora_rank": 1536,
17
+ "kv_lora_rank": 512,
18
+ "qk_nope_head_dim": 128,
19
+ "qk_rope_head_dim": 64,
20
+ "v_head_dim": 128,
21
+ "dtype": "fp8"
22
+ }
inference/configs/config_671B_test.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "vocab_size": 8080,
3
+ "dim": 7168,
4
+ "inter_dim": 1152,
5
+ "moe_inter_dim": 2048,
6
+ "n_layers": 61,
7
+ "n_dense_layers": 3,
8
+ "n_heads": 8,
9
+ "n_routed_experts": 16,
10
+ "n_shared_experts": 1,
11
+ "n_activated_experts": 8,
12
+ "n_expert_groups": 8,
13
+ "n_limited_groups": 4,
14
+ "route_scale": 2.5,
15
+ "score_func": "sigmoid",
16
+ "q_lora_rank": 1536,
17
+ "kv_lora_rank": 512,
18
+ "qk_nope_head_dim": 128,
19
+ "qk_rope_head_dim": 64,
20
+ "v_head_dim": 128,
21
+ "dtype": "fp8"
22
+ }
23
+
inference/convert2.py ADDED
@@ -0,0 +1,630 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ from argparse import ArgumentParser
4
+ from glob import glob
5
+ from tqdm import tqdm, trange
6
+
7
+ import torch
8
+ import ctypes
9
+ from safetensors.torch import safe_open, save_file
10
+ from kernel import weight_dequant
11
+
12
+
13
+ mapping = {
14
+ "embed_tokens": ("embed", 0),
15
+ "input_layernorm": ("attn_norm", None),
16
+ "post_attention_layernorm": ("ffn_norm", None),
17
+ "q_proj": ("wq", 0),
18
+ "q_a_proj": ("wq_a", None),
19
+ "q_a_layernorm": ("q_norm", None),
20
+ "q_b_proj": ("wq_b", 0),
21
+ "kv_a_proj_with_mqa": ("wkv_a", None),
22
+ "kv_a_layernorm": ("kv_norm", None),
23
+ "kv_b_proj": ("wkv_b", 0),
24
+ "o_proj": ("wo", 1),
25
+ "gate": ("gate", None),
26
+ "gate_proj": ("w1", 0),
27
+ "down_proj": ("w2", 1),
28
+ "up_proj": ("w3", 0),
29
+ "norm": ("norm", None),
30
+ "lm_head": ("head", 0),
31
+ "scale": ("scale", None),
32
+ }
33
+
34
+ EmbedsInOneFile = 256
35
+ EmbedsZKDir = "zkdata/embeds/"
36
+
37
+ wkv_b_1_rescales = [32, 34, 37, 36, 33, 32, 33, 33, 30, 32,
38
+ 32, 30, 31, 30, 29, 30, 29, 30, 29, 29,
39
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
40
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
41
+ 29, 29, 29, 29, 29, 29, 29, 29, 30, 30,
42
+ 29, 29, 30, 30, 30, 30, 29, 30, 30, 29, 30]
43
+
44
+ wkv_b_2_rescales = [31, 32, 32, 31, 32, 30, 30, 30, 30, 30,
45
+ 30, 30, 30, 29, 29, 29, 29, 30, 29, 29,
46
+ 29, 29, 29, 29, 30, 30, 30, 29, 29, 29,
47
+ 29, 29, 30, 29, 30, 29, 30, 29, 29, 29,
48
+ 30, 29, 29, 29, 29, 30, 29, 30, 30, 30,
49
+ 29, 29, 29, 30, 30, 29, 29, 29, 30, 30, 30]
50
+
51
+ wo_rescales = [31, 32, 32, 32, 32, 31, 32, 31, 31, 31,
52
+ 31, 31, 31, 31, 30, 31, 31, 32, 31, 31,
53
+ 31, 30, 30, 30, 30, 30, 30, 30, 30, 30,
54
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
55
+ 30, 30, 30, 31, 30, 31, 30, 30, 31, 31,
56
+ 31, 30, 31, 31, 31, 30, 31, 31, 31, 31, 32 ]
57
+
58
+ gate_rescales = [0, 0, 0, 33, 32, 32, 32, 31, 32, 31, 30,
59
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
60
+ 32, 31, 32, 31, 32, 32, 32, 32, 31, 32,
61
+ 32, 31, 32, 32, 32, 32, 32, 32, 32, 32,
62
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
63
+ 32, 32, 32, 33, 33, 33, 33, 33, 32, 32 ]
64
+
65
+ w1_rescales = [32, 32, 32]
66
+ w2_rescales = [31, 32, 31]
67
+ w3_rescales = [32, 33, 32]
68
+
69
+ shared_w1_rescales = [0, 0, 0, 30, 30, 29, 29, 29, 28, 29,
70
+ 29, 28, 29, 29, 29, 29, 29, 29, 29, 29,
71
+ 29, 29, 29, 30, 30, 30, 30, 30, 30, 30,
72
+ 30, 30, 30, 30, 29, 29, 30, 29, 29, 30,
73
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
74
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29]
75
+
76
+ shared_w2_rescales = [0, 0, 0, 30, 30, 30, 30, 30, 29, 29,
77
+ 30, 29, 29, 29, 30, 30, 30, 30, 30, 29,
78
+ 29, 29, 29, 29, 29, 29, 29, 30, 30, 29,
79
+ 29, 29, 29, 29, 29, 29, 29, 30, 29, 29,
80
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
81
+ 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30]
82
+
83
+ shared_w3_rescales = [0, 0, 0, 30, 30, 30, 30, 30, 29, 29,
84
+ 30, 29, 29, 29, 30, 30, 30, 29, 30, 29,
85
+ 29, 29, 29, 29, 29, 29, 30, 30, 30, 30,
86
+ 29, 29, 29, 29, 29, 29, 29, 30, 30, 29,
87
+ 30, 29, 29, 29, 29, 30, 29, 29, 30, 30,
88
+ 29, 30, 30, 30, 29, 29, 30, 30, 30, 29, 28]
89
+
90
+ layer_state_dict0 = [{} for _ in range(61)]
91
+ layer_state_dict = [{} for _ in range(61)]
92
+
93
+ experts = [ [{} for _j in range(256)] for _i in range(61)]
94
+
95
+ def getF32PrintStr(ele):
96
+ v = int(ele.cpu().view(torch.uint32).item())
97
+ ex = str((v >> 23 & 0xFF) - 127)
98
+ r = '(1+' + str(v & 0x7FFFFF) + '/8388608)'
99
+ if v & 0x80000000:
100
+ vstr = '-' + r + '*2^' + ex
101
+ else:
102
+ vstr = r + '*2^' + ex
103
+ return vstr
104
+
105
+ def getBF16PrintStr(ele):
106
+ v = int(ele.cpu().view(torch.uint16).item())
107
+ ex = v >> 7 & 0xFF
108
+ r = '(1+' + str(v & 0x7F) + '/128)'
109
+ rraw = v & 0x7F
110
+
111
+ if v & 0x8000:
112
+ vstr = '-' + r + '*2^' + str(ex - 127)
113
+ else:
114
+ vstr = r + '*2^' + str(ex - 127)
115
+ return vstr
116
+
117
def getBF8PrintStr(ele):
    """Render a scalar FP8 (e4m3-style: 4-bit exponent, bias 7, 3-bit mantissa)
    tensor as an exact bit-level string ``(1+mantissa/8)*2^exponent``.

    BUG FIX: the original converted the exponent to a *string* and then
    compared it to the ints -7 and 8, so the extreme-exponent debug print
    could never fire. Keep the exponent as an int for the comparison.
    """
    v = int(ele.cpu().view(torch.uint8).item())
    exponent = (v >> 3 & 0xF) - 7  # 4-bit exponent field, bias 7
    r = '(1+' + str(v & 0x7) + '/8)'

    if v & 0x80:  # sign bit
        vstr = '-' + r + '*2^' + str(exponent)
    else:
        vstr = r + '*2^' + str(exponent)

    # Log values at the edges of the representable exponent range.
    if exponent == -7 or exponent == 8:
        print(vstr)
    return vstr
130
+
131
def mem(i):
    """Print a one-line CUDA memory report (allocated / reserved / peak), tagged with *i*."""
    mib = 1024 ** 2
    alloc = torch.cuda.memory_allocated() / mib
    reserved = torch.cuda.memory_reserved() / mib
    peak = torch.cuda.max_memory_allocated() / mib
    print(f"{i} allocated={alloc:.1f}MB, reserved={reserved:.1f}MB, max={peak:.1f}MB", flush=True)
136
+
137
def handle_expert_w(layer_id, expert_id, idx, param_weight, weight_name, scale, typ, shape, experts_save_path):
    """Convert one routed-expert projection (w1/w2/w3) to fixed-point and stage it.

    Dequantizes the fp8 weight with its block scale (looked up from the raw
    per-layer state in ``layer_state_dict0``), multiplies by ``2**scale``,
    rounds to int32, and stores the result (plus the scale) under
    ``experts[layer_id][expert_id]``. Once all three projections and their
    scales are present (6 entries), the expert is written out as
    ``<experts_save_path>/<expert_id>.safetensors`` and the slot is cleared.

    Args:
        layer_id: transformer layer index.
        expert_id: routed-expert index within the layer.
        idx: projection index (1, 2 or 3 -> saved as w{idx}).
        param_weight: raw fp8 weight tensor from the checkpoint.
        weight_name: original parameter name (ends in '.weight').
        scale: log2 fixed-point exponent for this expert/projection.
        typ / shape: tensor type string and shape, for logging only.
        experts_save_path: directory receiving the per-expert files.
    """
    global layer_state_dict0
    global experts

    # The fp8 block scales were staged under the matching '....scale' key.
    scale_name = weight_name.replace('weight', 'scale')
    param_scale = layer_state_dict0[layer_id][scale_name]

    weight = weight_dequant(param_weight.cuda(), param_scale.cuda())
    # scale = experts_w3_rescales[layer_id][expert_id]
    rescale = 2 ** scale
    # NOTE(review): param_int stays on GPU until save_file — confirm intended.
    param_int = (weight.to(torch.float32) * rescale).round().to(torch.int32)
    # layer_state_dict[layer_id][weight_name] = param_int.cpu()
    # layer_state_dict[layer_id][scale_name] = torch.tensor(scale, dtype=torch.int32)
    weight_name2 = f'w{idx}.weight'
    scale_name2 = f'w{idx}.scale'
    experts[layer_id][expert_id][weight_name2] = param_int
    experts[layer_id][expert_id][scale_name2] = torch.tensor(scale, dtype=torch.int32)

    if len(experts[layer_id][expert_id]) == 6:  # w1, w2, w3 plus their matching scales
        save_file(experts[layer_id][expert_id], os.path.join(experts_save_path, f"{expert_id}.safetensors"))
        experts[layer_id][expert_id] = {}

    print(f'layer {layer_id} expert {expert_id} w{idx} type: {typ}, shape: {shape}, weight_name: {weight_name}, scale_name: {scale_name}')
160
+
161
def saveTensor(fileName, t):
    """Write tensor *t* to *fileName* as raw C-order bytes (native byte order).

    BUG FIX: the original opened the file twice — first in text mode ("w",
    utf-8), which only truncated the file and shadowed the handle, then again
    in binary mode nested inside the first ``with``. A single binary open is
    sufficient and avoids the redundant truncate/close cycle.
    """
    t = t.detach()
    if t.device.type != "cpu":
        t = t.cpu()
    t = t.contiguous()
    with open(fileName, "wb") as f:
        f.write(t.numpy().tobytes(order="C"))
169
+
170
def main(hf_ckpt_path, save_path, n_experts, mp):
    """
    Converts and saves model checkpoint files into a specified format.

    Loads an fp8 HF checkpoint, dequantizes each weight, converts it to a
    fixed-point int32/int64 representation (per-tensor power-of-two rescale),
    and writes per-layer / per-expert safetensors files plus head/norm/embed
    files into *save_path*.

    Args:
        hf_ckpt_path (str): Path to the directory containing the input checkpoint files.
        save_path (str): Path to the directory where the converted checkpoint files will be saved.
        n_experts (int): Total number of experts in the model.
        mp (int): Model parallelism factor.

    Returns:
        None
    """
    torch.cuda.set_device(0)
    # Default dtype for torch computation; BF16 is used here.
    torch.set_default_dtype(torch.bfloat16)
    # Cap CPU-side torch at 8 threads to avoid thread-contention overhead.
    torch.set_num_threads(8)
    # Fixed seed so every process draws identical random numbers at init.
    torch.manual_seed(965)

    # n_local_experts = n_experts // mp
    # state_dicts = [{} for _ in range(mp)]

    head_state_dict = {}
    norm_state_dict = {}
    embed_state_dict = {}

    # Per-layer, per-expert log2 rescale exponents read from w1/w2/w3.txt
    # (one whitespace-separated row of ints per layer).
    experts_w1_rescales = []
    experts_w2_rescales = []
    experts_w3_rescales = []

    with open("w1.txt", "r", encoding="utf-8") as f1:
        for line in f1:
            layer_line = line.strip().split()
            int_list = [int(s) for s in layer_line]
            experts_w1_rescales.append(int_list)

    with open("w2.txt", "r", encoding="utf-8") as f2:
        for line in f2:
            layer_line = line.strip().split()
            int_list = [int(s) for s in layer_line]
            experts_w2_rescales.append(int_list)

    with open("w3.txt", "r", encoding="utf-8") as f3:
        for line in f3:
            layer_line = line.strip().split()
            int_list = [int(s) for s in layer_line]
            experts_w3_rescales.append(int_list)

    # Pass 1: stream every *.safetensors shard, rename parameters to the local
    # naming scheme via `mapping`, convert head/norm/embed immediately, and
    # stage per-layer tensors in layer_state_dict0 for pass 2.
    for file_path in tqdm(glob(os.path.join(hf_ckpt_path, "*.safetensors"))):
        with safe_open(file_path, framework="pt", device="cpu") as f:
            print('Opening ' + file_path, flush=True)
            for name in f.keys():
                # print('name 1: ', name)
                # Skip the extra MTP layer (index 61).
                if "model.layers.61" in name:
                    continue

                param: torch.Tensor = f.get_tensor(name)
                if name.startswith("model."):
                    name = name[len("model."):]
                name = name.replace("self_attn", "attn")
                name = name.replace("mlp", "ffn")
                name = name.replace("weight_scale_inv", "scale")
                name = name.replace("e_score_correction_bias", "bias")
                key = name.split(".")[-2]
                assert key in mapping, f"Key {key} not found in mapping"
                # print('key::: ' + key)
                new_key, dim = mapping[key]
                # print('dim::: ' + str(dim))
                name = name.replace(key, new_key)

                ns = name.split(".")
                comp = ns[0]
                if comp == 'head':
                    name2 = name[len('head.'):]
                    print('head: ' + name2)

                    # Head weights -> int64 fixed point with scale 2^43.
                    param_int = (param.to(torch.float32) * (2 ** 43)).round().to(torch.int64)
                    head_state_dict[name2] = param_int
                elif comp == 'norm':
                    name2 = name[len('norm.'):]
                    print('norm: ' + name2)

                    # Final norm -> int64 fixed point with scale 2^15.
                    param_int = (param.to(torch.float32) * (2 ** 15)).round().to(torch.int64)
                    norm_state_dict[name2] = param_int
                elif comp == 'embed':
                    name2 = name[len('embed.'):]
                    print('embed: ' + name2)

                    # Embedding -> int64 fixed point with scale 2^31.
                    param_int = (param.to(torch.float32) * (2 ** 31)).round().to(torch.int64)
                    embed_state_dict[name2] = param_int

                    # Also export embedding rows in fixed-size .bin chunks.
                    # NOTE(review): a trailing partial chunk (< EmbedsInOneFile
                    # rows) is silently dropped by the floor division — confirm.
                    os.makedirs(EmbedsZKDir, exist_ok=True)
                    fileCount = param_int.shape[0] // EmbedsInOneFile
                    for i in range(0, fileCount):
                        saveTensor(EmbedsZKDir + str(i) + '.bin', param_int[i * EmbedsInOneFile : (i+1) * EmbedsInOneFile].cpu())
                elif comp == 'layers':
                    layer_id = int(ns[1])
                    name2 = '.'.join(ns[2:])
                    layer_state_dict0[layer_id][name2] = param

    print('Finish loading state dict from disk! ++++++++++')

    # Pass 2: per layer, dequantize + fixed-point-convert each staged tensor
    # and write layer-<id>.safetensors (experts go to separate per-expert files
    # via handle_expert_w).
    # for layer_id, states in enumerate(layer_state_dict0):
    for layer_id in range(len(layer_state_dict0)):
        os.makedirs(f'{save_path}/experts-{layer_id}', exist_ok=True)

        states = layer_state_dict0[layer_id]

        for name, param in states.items():
            ns = name.split(".")
            typ = param.type()
            shape = param.shape

            if ns[0] == 'attn_norm':
                print(f'layer {layer_id} {name}, type: {typ}', flush=True)
                if ns[1] == 'weight':
                    # Attention norm scale 2^21.
                    param_int = (param.to(torch.float32) * (2 ** 21)).round().to(torch.int32)
                    layer_state_dict[layer_id][name] = param_int
            elif ns[0] == 'ffn_norm':
                print(f'layer {layer_id} {name}, type: {typ}', flush=True)
                if ns[1] == 'weight':
                    # FFN norm scale 2^23.
                    param_int2 = (param.to(torch.float32) * (2 ** 23)).round().to(torch.int32)
                    layer_state_dict[layer_id][name] = param_int2
            elif ns[0] == 'ffn':
                if len(ns) == 3:
                    # Dense-layer MLP (w1/w2/w3) and the MoE gate live here.
                    # '.scale' entries are consumed alongside the matching
                    # '.weight', so they are skipped when seen directly.
                    if ns[1] == 'w1' and ns[2] == 'scale':
                        continue
                    elif ns[1] == 'w1' and ns[2] == 'weight':
                        param_weight = param.cuda()
                        weight_name = name

                        scale_name = name.replace('weight', 'scale')
                        param_scale = states[scale_name]

                        weight = weight_dequant(param_weight, param_scale.cuda())
                        scale = w1_rescales[layer_id]
                        rescale = 2 ** scale
                        param_int = (weight.to(torch.float32) * rescale).round().to(torch.int32)
                        layer_state_dict[layer_id][weight_name] = param_int.cpu()
                        layer_state_dict[layer_id][scale_name] = torch.tensor(scale, dtype=torch.int32)

                        print(f'layer {layer_id} w1 weight, type: {typ}, shape: {shape}, weight_name: {weight_name}, scale_name: {name}', flush=True)
                    elif ns[1] == 'w2' and ns[2] == 'scale':
                        continue
                    elif ns[1] == 'w2' and ns[2] == 'weight':
                        param_weight = param.cuda()
                        weight_name = name

                        scale_name = name.replace('weight', 'scale')
                        param_scale = states[scale_name]

                        weight = weight_dequant(param_weight, param_scale.cuda())
                        scale = w2_rescales[layer_id]
                        rescale = 2 ** scale
                        param_int = (weight.to(torch.float32) * rescale).round().to(torch.int32)
                        layer_state_dict[layer_id][weight_name] = param_int.cpu()
                        layer_state_dict[layer_id][scale_name] = torch.tensor(scale, dtype=torch.int32)

                        print(f'layer {layer_id} w2 weight, type: {typ}, shape: {shape}, weight_name: {weight_name}, scale_name: {name}', flush=True)
                    elif ns[1] == 'w3' and ns[2] == 'scale':
                        continue
                    elif ns[1] == 'w3' and ns[2] == 'weight':
                        param_weight = param.cuda()
                        weight_name = name

                        scale_name = name.replace('weight', 'scale')
                        param_scale = states[scale_name]

                        weight = weight_dequant(param_weight, param_scale.cuda())
                        scale = w3_rescales[layer_id]
                        rescale = 2 ** scale
                        param_int = (weight.to(torch.float32) * rescale).round().to(torch.int32)
                        layer_state_dict[layer_id][weight_name] = param_int.cpu()
                        layer_state_dict[layer_id][scale_name] = torch.tensor(scale, dtype=torch.int32)

                        print(f'layer {layer_id} w3 weight, type: {typ}, shape: {shape}, weight_name: {weight_name}, scale_name: {name}', flush=True)

                    elif ns[1] == 'gate' and ns[2] == 'weight':
                        # MoE router weight (stored in bf16, not fp8): scale per layer.
                        gate_rescale = 2 ** gate_rescales[layer_id]
                        gate_int = (param.to(torch.float32) * gate_rescale).round().to(torch.int32)
                        layer_state_dict[layer_id][name] = gate_int.cpu()
                        rescale_name = name.replace('weight', 'scale')
                        layer_state_dict[layer_id][rescale_name] = torch.tensor(gate_rescales[layer_id], dtype=torch.int32)
                        print(f'layer {layer_id}: gate_weight_name: {name}, gate_scale_name: {rescale_name}')
                    elif ns[1] == 'gate' and ns[2] == 'bias':
                        # Router score-correction bias, scale 2^23.
                        bias_int = (param.to(torch.float32) * (2 ** 23)).round().to(torch.int32)
                        layer_state_dict[layer_id][name] = bias_int.cpu()
                        print(f'layer {layer_id} bias: {name}')
                    else:
                        layer_state_dict[layer_id][name] = param
                elif len(ns) == 4:
                    # Shared-expert projections: ffn.shared_experts.wX.weight/scale.
                    if ns[1] == 'shared_experts':
                        if (ns[2] == 'w1' or ns[2] == 'w2' or ns[2] == 'w3') and ns[3] == 'scale':
                            continue
                        elif ns[2] == 'w1' and ns[3] == 'weight':
                            param_weight = param.cuda()
                            weight_name = name

                            scale_name = name.replace('weight', 'scale')
                            param_scale = states[scale_name]

                            weight = weight_dequant(param_weight, param_scale.cuda())
                            scale = shared_w1_rescales[layer_id]
                            rescale = 2 ** scale
                            param_int = (weight.to(torch.float32) * rescale).round().to(torch.int32)
                            layer_state_dict[layer_id][weight_name] = param_int.cpu()
                            layer_state_dict[layer_id][scale_name] = torch.tensor(scale, dtype=torch.int32)
                            print(f'layer {layer_id} shared_expert w1 type: {typ}, shape: {shape}, weight_name: {weight_name}, scale_name: {scale_name}')
                        elif ns[2] == 'w2' and ns[3] == 'weight':
                            param_weight = param.cuda()
                            weight_name = name

                            scale_name = name.replace('weight', 'scale')
                            param_scale = states[scale_name]

                            weight = weight_dequant(param_weight, param_scale.cuda())
                            scale = shared_w2_rescales[layer_id]
                            rescale = 2 ** scale
                            param_int = (weight.to(torch.float32) * rescale).round().to(torch.int32)
                            layer_state_dict[layer_id][weight_name] = param_int.cpu()
                            layer_state_dict[layer_id][scale_name] = torch.tensor(scale, dtype=torch.int32)
                            print(f'layer {layer_id} shared_expert w2 type: {typ}, shape: {shape}, weight_name: {weight_name}, scale_name: {scale_name}')
                        elif ns[2] == 'w3' and ns[3] == 'weight':
                            param_weight = param.cuda()
                            weight_name = name

                            scale_name = name.replace('weight', 'scale')
                            param_scale = states[scale_name]

                            weight = weight_dequant(param_weight, param_scale.cuda())
                            scale = shared_w3_rescales[layer_id]
                            rescale = 2 ** scale
                            param_int = (weight.to(torch.float32) * rescale).round().to(torch.int32)
                            layer_state_dict[layer_id][weight_name] = param_int.cpu()
                            layer_state_dict[layer_id][scale_name] = torch.tensor(scale, dtype=torch.int32)
                            print(f'layer {layer_id} shared_expert w3 type: {typ}, shape: {shape}, weight_name: {weight_name}, scale_name: {scale_name}')
                        else:
                            layer_state_dict[layer_id][name] = param
                    else:
                        layer_state_dict[layer_id][name] = param
                elif len(ns) == 5:
                    # Routed experts: ffn.experts.<expert_id>.wX.weight/scale,
                    # converted and saved per expert by handle_expert_w().
                    if ns[1] == 'experts':
                        expert_id = int(ns[2])
                        if (ns[3] == 'w1' or ns[3] == 'w2' or ns[3] == 'w3') and ns[4] == 'scale':
                            continue
                        elif ns[3] == 'w1' and ns[4] == 'weight':
                            scale = experts_w1_rescales[layer_id][expert_id]
                            handle_expert_w(layer_id, expert_id, 1, param, name, scale, typ, shape, f'{save_path}/experts-{layer_id}')
                        elif ns[3] == 'w2' and ns[4] == 'weight':
                            scale = experts_w2_rescales[layer_id][expert_id]
                            handle_expert_w(layer_id, expert_id, 2, param, name, scale, typ, shape, f'{save_path}/experts-{layer_id}')
                        elif ns[3] == 'w3' and ns[4] == 'weight':
                            scale = experts_w3_rescales[layer_id][expert_id]
                            handle_expert_w(layer_id, expert_id, 3, param, name, scale, typ, shape, f'{save_path}/experts-{layer_id}')
                        else:
                            layer_state_dict[layer_id][name] = param
                    else:
                        layer_state_dict[layer_id][name] = param
            elif ns[0] == 'attn':
                if len(ns) == 3:
                    if ns[1] == 'wq_a' and ns[2] == 'scale':
                        continue
                    elif ns[1] == 'wq_a' and ns[2] == 'weight':
                        param_weight = param.cuda()
                        weight_name = name

                        scale_name = name.replace('weight', 'scale')
                        param_scale = states[scale_name]

                        weight = weight_dequant(param_weight, param_scale.cuda())

                        # wq_a scale 2^30 (no per-layer table; no scale entry saved).
                        weight_int = (weight.to(torch.float32) * (2 ** 30)).round().to(torch.int32)

                        layer_state_dict[layer_id][weight_name] = weight_int.cpu()

                        print(f'layer {layer_id} wq_a weight, type: {typ}, shape: {shape}', flush=True)
                    elif ns[1] == 'q_norm':
                        print(f'layer {layer_id} q_norm, type: {typ}, shape: {shape}', flush=True)

                        param_int3 = (param.to(torch.float32) * (2 ** 19)).round().to(torch.int32)
                        layer_state_dict[layer_id][name] = param_int3
                    elif ns[1] == 'kv_norm':
                        print(f'layer {layer_id} kv_norm, type: {typ}, shape: {shape}', flush=True)

                        param_int4 = (param.to(torch.float32) * (2 ** 23)).round().to(torch.int32)
                        layer_state_dict[layer_id][name] = param_int4
                    elif ns[1] == 'wq_b' and ns[2] == 'scale':
                        continue
                    elif ns[1] == 'wq_b' and ns[2] == 'weight':
                        param_weight = param.cuda()
                        weight_name = name

                        scale_name = name.replace('weight', 'scale')
                        param_scale = states[scale_name]

                        weight = weight_dequant(param_weight, param_scale.cuda())

                        weight_int = (weight.to(torch.float32) * (2 ** 30)).round().to(torch.int32)

                        # Split per-head (128 heads x 192 dims) into the nope
                        # part (first 128 dims) and the rope part (last 64).
                        weight_int = weight_int.view(128, 192, 1536)
                        wq_b1, wq_b2 = torch.split(weight_int, [128, 64], dim=-2)

                        print(f'layer {layer_id} wq_b1 weight, shape: {wq_b1.shape}, wq_b2 weight, shape: {wq_b2.shape}', flush=True)

                        wq_b1 = wq_b1.reshape(128 * 128, 1536)
                        wq_b2 = wq_b2.reshape(128 * 64, 1536)
                        wq_b1_name = weight_name.replace('wq_b', 'wq_b1')
                        wq_b2_name = weight_name.replace('wq_b', 'wq_b2')

                        # layer_state_dict[layer_id][weight_name] = weight_int.cpu()
                        layer_state_dict[layer_id][wq_b1_name] = wq_b1.cpu()
                        layer_state_dict[layer_id][wq_b2_name] = wq_b2.cpu()

                        print(f'layer {layer_id} wq_b weight, type: {typ}, shape: {shape}', flush=True)
                    elif ns[1] == 'wkv_a' and ns[2] == 'scale':
                        continue
                    elif ns[1] == 'wkv_a' and ns[2] == 'weight':
                        param_weight = param.cuda()
                        weight_name = name

                        scale_name = name.replace('weight', 'scale')
                        param_scale = states[scale_name]

                        weight = weight_dequant(param_weight, param_scale.cuda())

                        weight_int = (weight.to(torch.float32) * (2 ** 29)).round().to(torch.int32)

                        # layer_state_dict[layer_id][weight_name] = weight_int.cpu()

                        # Split 576 output rows into kv-lora (512) and rope (64) parts.
                        weight_int = weight_int.view(576, 7168)
                        wkv_a1, wkv_a2 = torch.split(weight_int, [512, 64], dim=-2)

                        print(f'layer {layer_id} wkv_a1 weight, shape: {wkv_a1.shape}, wkv_a2 weight, shape: {wkv_a2.shape}', flush=True)

                        wkv_a1_name = weight_name.replace('wkv_a', 'wkv_a1')
                        wkv_a2_name = weight_name.replace('wkv_a', 'wkv_a2')

                        # layer_state_dict[layer_id][weight_name] = weight_int.cpu()
                        layer_state_dict[layer_id][wkv_a1_name] = wkv_a1.cpu()
                        layer_state_dict[layer_id][wkv_a2_name] = wkv_a2.cpu()

                        print(f'layer {layer_id} wkv_a weight, type: {typ}, shape: {shape}', flush=True)
                    elif ns[1] == 'wkv_b' and ns[2] == 'scale':
                        continue
                    elif ns[1] == 'wkv_b' and ns[2] == 'weight':
                        param_weight = param.cuda()
                        weight_name = name

                        scale_name = name.replace('weight', 'scale')
                        param_scale = states[scale_name]

                        weight = weight_dequant(param_weight, param_scale.cuda())

                        # Per head (128 heads): first 128 rows -> K up-projection,
                        # last 128 rows -> V up-projection; each rescaled with its
                        # own per-layer exponent table.
                        wkv_b = weight.view(128, 256, 512)

                        wkv_b_1 = wkv_b[:, :128]
                        wkv_b_1 = wkv_b_1.reshape(128 * 128, 512)
                        scale1 = wkv_b_1_rescales[layer_id]
                        wkv_b_1_rescale = 2 ** scale1
                        wkv_b_1_int = torch.round(wkv_b_1.to(torch.float32) * wkv_b_1_rescale).to(torch.int32)

                        wkv_b_2 = wkv_b[:, -128:]
                        wkv_b_2 = wkv_b_2.reshape(128 * 128, 512)
                        scale2 = wkv_b_2_rescales[layer_id]
                        wkv_b_2_rescale = 2 ** scale2
                        wkv_b_2_int = torch.round(wkv_b_2.to(torch.float32) * wkv_b_2_rescale).to(torch.int32)

                        wkv_b_1_name = weight_name.replace("wkv_b", "wkv_b_1")
                        wkv_b_1_scale_name = scale_name.replace("wkv_b", "wkv_b_1")
                        layer_state_dict[layer_id][wkv_b_1_name] = wkv_b_1_int.cpu()
                        layer_state_dict[layer_id][wkv_b_1_scale_name] = torch.tensor(scale1, dtype=torch.int32)

                        wkv_b_2_name = weight_name.replace("wkv_b", "wkv_b_2")
                        wkv_b_2_scale_name = scale_name.replace("wkv_b", "wkv_b_2")
                        layer_state_dict[layer_id][wkv_b_2_name] = wkv_b_2_int.cpu()
                        layer_state_dict[layer_id][wkv_b_2_scale_name] = torch.tensor(scale2, dtype=torch.int32)

                        print(f'layer {layer_id} wkv_b, type: {typ}, shape: {shape}, wkv_b_1 weight: {wkv_b_1_name}, wkv_b_1 scale: {wkv_b_1_scale_name}, wkv_b_2 weight: {wkv_b_2_name}, wkv_b_2 scale: {wkv_b_2_scale_name}', flush=True)
                    elif ns[1] == 'wo' and ns[2] == 'scale':
                        continue
                    elif ns[1] == 'wo' and ns[2] == 'weight':
                        param_weight = param.cuda()
                        weight_name = name

                        scale_name = name.replace('weight', 'scale')
                        param_scale = states[scale_name]

                        weight = weight_dequant(param_weight, param_scale.cuda())

                        scale = wo_rescales[layer_id]
                        rescale = 2 ** scale

                        if layer_id != 58:
                            param_int = (weight.to(torch.float32) * rescale).round().to(torch.int32)
                        else:
                            # Layer 58 special case: the single largest-magnitude
                            # wo element is zeroed and marked with INT32_MIN as a
                            # sentinel. NOTE(review): presumably an outlier that
                            # would overflow the fixed-point range — confirm.
                            wo_abs = weight.abs().cpu()
                            maxpos = wo_abs.argmax()
                            row, col = divmod(maxpos.item(), weight.size(1))
                            print(f'maxpos: {maxpos}, {row} {col}', flush=True)

                            vstr = getBF16PrintStr(weight[row][col])
                            print(f'weight[{row}][{col}]: {vstr}', flush=True)
                            weight[row][col] = 0
                            param_int = (weight.to(torch.float32) * rescale).round().to(torch.int32)
                            param_int[row][col] = -(2 ** 31)

                        layer_state_dict[layer_id][weight_name] = param_int.cpu()
                        layer_state_dict[layer_id][scale_name] = torch.tensor(scale, dtype=torch.int32)

                        print(f'layer {layer_id} wo weight, type: {typ}, shape: {shape}, weight: {weight_name}, scale: {scale_name}', flush=True)
                    else:
                        layer_state_dict[layer_id][name] = param
                else:
                    layer_state_dict[layer_id][name] = param
            else:
                layer_state_dict[layer_id][name] = param

        save_file(layer_state_dict[layer_id], os.path.join(save_path, f"layer-{layer_id}.safetensors"))
        print(f'Finish saving layer {layer_id}', flush=True)
        # Drop both staged dicts to release memory before the next layer.
        layer_state_dict0[layer_id] = {}
        layer_state_dict[layer_id] = {}

    print('Finish opening')

    os.makedirs(save_path, exist_ok=True)

    print(layer_state_dict)
    print(experts)

    save_file(head_state_dict, os.path.join(save_path, f"head_int.safetensors"))
    save_file(norm_state_dict, os.path.join(save_path, f"norm_int.safetensors"))
    save_file(embed_state_dict, os.path.join(save_path, f"embed_int.safetensors"))
    # for i, st in enumerate(layer_state_dict):
    #     # print(f'{i} {st['attn_norm.weight']}', flush=True)
    #     # print(f'{i} {st['ffn_norm.weight']}', flush=True)
    #     save_file(st, os.path.join(save_path, f"layer-{i}.safetensors"))
    #     print(f'Finish saving layer {i}', flush=True)

    # for i in trange(mp):
    #     save_file(state_dicts[i], os.path.join(save_path, f"model{i}-mp{mp}.safetensors"))

    # print('Finish saving files')

    # Copy tokenizer files alongside the converted weights.
    for file_path in glob(os.path.join(hf_ckpt_path, "*token*")):
        new_file_path = os.path.join(save_path, os.path.basename(file_path))
        shutil.copyfile(file_path, new_file_path)
620
+
621
+
622
if __name__ == "__main__":
    # CLI entry point: convert an fp8 HF checkpoint into the fixed-point layout.
    parser = ArgumentParser()
    parser.add_argument("--hf-ckpt-path", type=str, required=True)
    parser.add_argument("--save-path", type=str, required=True)
    parser.add_argument("--n-experts", type=int, required=True)
    parser.add_argument("--model-parallel", type=int, required=True)
    args = parser.parse_args()
    assert args.n_experts % args.model_parallel == 0, "Number of experts must be divisible by model parallelism"
    main(args.hf_ckpt_path, args.save_path, args.n_experts, args.model_parallel)
inference/generate.py ADDED
@@ -0,0 +1,458 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import json
4
+ from argparse import ArgumentParser
5
+ from typing import List
6
+ from torch import nn
7
+
8
+ import torch
9
+ import torch.distributed as dist
10
+ from transformers import AutoTokenizer
11
+ from safetensors.torch import load_file, load_model
12
+
13
+ from model import Transformer, ModelArgs, Block
14
+ from concurrent.futures import ThreadPoolExecutor
15
+ from kernel import softmax_q21, softmax_q19
16
+
17
# When True, enable the SNARK-related code paths elsewhere in this file.
snark = False

# Populated at startup (see load_model2 / main) before use.
model = None
# NOTE(review): `[tensor] * 61` repeats a reference to ONE tensor, so all 61
# list entries alias the same storage — writing layer i's cache writes every
# layer's. If these are meant to be independent per-layer KV / positional
# caches mutated in place, this is a bug (a list comprehension would allocate
# 61 distinct tensors); confirm whether the entries are reassigned before use.
kv_caches = [ torch.zeros(1, 4096 * 4, 512, dtype=torch.int64) ] * 61
pe_caches = [ torch.zeros(1, 4096 * 4, 64, dtype=torch.int64) ] * 61
state_dicts = [None] * 61
23
+
24
+ def getF32PrintStr(ele):
25
+ v = int(ele.cpu().view(torch.uint32).item())
26
+ ex = str((v >> 23 & 0xFF) - 127)
27
+ r = '(1+' + str(v & 0x7FFFFF) + '/8388608)'
28
+ if v & 0x80000000:
29
+ vstr = '-' + r + '*2^' + ex
30
+ else:
31
+ vstr = r + '*2^' + ex
32
+ return vstr
33
+
34
+ def getBF16PrintStr(ele):
35
+ v = int(ele.cpu().view(torch.uint16).item())
36
+ ex = v >> 7 & 0xFF
37
+ r = '(1+' + str(v & 0x7F) + '/128)'
38
+ rraw = v & 0x7F
39
+
40
+ if v & 0x8000:
41
+ vstr = '-' + r + '*2^' + str(ex - 127)
42
+ else:
43
+ vstr = r + '*2^' + str(ex - 127)
44
+ return vstr
45
+
46
def mem(i):
    """Print a one-line CUDA memory report (allocated / reserved / peak), tagged with *i*."""
    to_mib = lambda nbytes: nbytes / 1024 ** 2
    alloc = to_mib(torch.cuda.memory_allocated())
    reserved = to_mib(torch.cuda.memory_reserved())
    peak = to_mib(torch.cuda.max_memory_allocated())
    print(f"{i} allocated={alloc:.1f}MB, reserved={reserved:.1f}MB, max={peak:.1f}MB", flush=True)
51
+
52
def load_model2(ckpt_path):
    """Load the shared (non-layer) weights — embedding, final norm and output
    head — from ckpt_path into the global model."""
    global model

    with torch.device("cuda"):
        for part in ("embed", "norm", "head"):
            load_model(getattr(model, part), os.path.join(ckpt_path, f"{part}_int.safetensors"))
59
+
60
# NOTE: incoming logits are fixed-point with scale 2^21.
def sample(logits, temperature: float = 1.0):
    """
    Samples a token from the logits using temperature scaling.

    The pipeline is all integer (fixed-point) arithmetic: logits arrive with
    scale 2^21 and the custom softmax_q21 kernel emits probabilities at the
    same scale.  With the hard-coded sample_open = False this function is
    currently a plain greedy argmax and `temperature` is ignored.

    Args:
        logits (torch.Tensor): The logits tensor for token predictions.
        temperature (float, optional): Temperature for scaling logits. Defaults to 1.0.

    Returns:
        torch.Tensor: The sampled token.
    """
    # Original floating-point sampling path, kept for reference:
    # logits = logits.to(torch.float32) * (2 ** -15)
    # typ = logits.type()
    # print(f'sample logits type: {typ}')
    # logits = logits / max(temperature, 1e-5)
    # probs = torch.softmax(logits, dim=-1)
    # return probs.div_(torch.empty_like(probs).exponential_(1)).argmax(dim=-1)

    # Debug switch: True enables the full fixed-point Gumbel-style sampler.
    sample_open = False

    if sample_open:
        maxx = logits.abs().max()
        typ = logits.type()
        print(f'sample logits type: {typ}, shape: {logits.shape}, abs max: {maxx}')
        if temperature > 1e-5:
            # Integer temperature scaling (floor division).
            temp_int = int(temperature)
            # logits = (logits + temp_int // 2) // temp_int
            logits = logits // temp_int
            print(f'temp_int: {temp_int}', flush=True)
        else:
            logits = logits * (10 ** 5)
        # print(f'sample 22 logits type: {typ}, shape: {logits.shape}, logits: {logits}')
        # probs = torch.softmax(logits, dim=-1)

        # softmax_q21 expects a trailing singleton dim — see kernel module.
        logits = logits.unsqueeze(2)

        max0 = logits.abs().max()
        print(f'sample 2233 logits shape: {logits.shape}, abs max0: {max0}')

        # probs come back rescaled to 2^21.
        probs = torch.empty_like(logits, dtype=torch.int64, device='cuda')
        softmax_q21(logits.contiguous(), probs)

        probs = probs.squeeze(2)

        # print(f'sample 2233 probs shape: {probs.shape}')

        typ2 = probs.type()
        max1 = probs.abs().max()
        print(f'sample 33 probs type: {typ2}, shape: {probs.shape}, probs abs max: {max1}', flush=True)

        # Exponential noise for Gumbel-max style sampling (float, then
        # converted to fixed-point with a small offset to avoid div-by-zero).
        rand = torch.empty_like(probs, dtype=torch.float32, device='cuda').exponential_(1)
        rand_abs = rand.abs()
        rmin = getF32PrintStr(rand_abs.min())
        rmax = getF32PrintStr(rand_abs.max())
        print(f'sample 333 rand abs min: {rmin}, max: {rmax}', flush=True)

        # rand = (rand * (2 ** 21)).round().to(torch.int64) + (2 ** 15)
        rand = (rand * (2 ** 10)).round().to(torch.int64) + (2 ** 4)
        max2 = rand.abs().max()
        min2 = rand.abs().min()
        print(f'sample 55 rand abs min: {min2}, max: {max2}', flush=True)

        # probs rescaled to 2^21 (original), currently 2^10.
        # probs = (probs * (2 ** 21) + rand // 2) // rand
        probs = (probs * (2 ** 10)) // rand

        max3 = probs.abs().max()
        print(f'sample 66 probs abs max: {max3}', flush=True)

        res = probs.argmax(dim=-1)
        tid = res[0][0].item()
        tv = probs[0][0][tid]
        randv = rand[0][0][tid]
        # typ3 = res.type()
        print(f'sample 44 res: {res}, tid: {tid}, tv: {tv}, randv: {randv}')
    else:
        # Greedy path: pick the highest logit directly.
        probs = logits.unsqueeze(2)
        max3 = probs.abs().max()
        print(f'sample 66 probs abs max: {max3}', flush=True)

        res = probs.argmax(dim=-1)
    return res
144
+
145
def saveTensor(fileName, t):
    """Serialize tensor t to fileName as raw C-order bytes.

    Fix: the original opened the file TWICE — first in text mode ("w",
    utf-8), then again in binary mode inside that context.  The outer text
    handle was never written to and only truncated the file redundantly; a
    single binary open is both correct and sufficient.

    Args:
        fileName (str): Destination path (overwritten).
        t (torch.Tensor): Tensor to dump; moved to CPU and made contiguous
            first, so device tensors are accepted.
    """
    t = t.detach()
    if t.device.type != "cpu":
        t = t.cpu()
    t = t.contiguous()
    with open(fileName, "wb") as f:
        # .numpy() shares memory with the tensor; tobytes copies it out C-order.
        f.write(t.numpy().tobytes(order="C"))
154
+
155
# model: the model that produces the result tokens (DeepSeek architecture).
# prompt_tokens: the tokenized prompts; per the original notes, shape
#     (batch_size, total_len, 7168) after embedding — TODO confirm.
# max_new_tokens: cap on generated tokens; generation stops at this count or
#     at the end-of-sequence marker (eos_id), whichever comes first.
# eos_id: id of the <end▁of▁sentence> token; a sequence stops when it appears.
# temperature: sampling temperature — higher is more random; 0 means greedy.
# The prompt input is List[List[int]]: outer list is the batch, inner list one
# already-tokenized prompt (batch = 1 for a single "Who are you?" example).

@torch.inference_mode()
def generate(
    # model: Transformer,
    ckpt_path: str,
    args: ModelArgs,
    tokenizer: AutoTokenizer,
    prompt_tokens: List[List[int]],
    max_new_tokens: int,
    eos_id: int,
    temperature: float = 1.0
) -> List[List[int]]:
    """
    Generates new tokens based on the given prompt tokens using the specified model.

    Layers are swapped onto the GPU one at a time (rebuilt from the CPU-side
    state_dicts and the module-level kv/pe caches) to bound GPU memory.

    Args:
        ckpt_path (str): Checkpoint directory used to rebuild each Block.
        args (ModelArgs): Model hyper-parameters.
        tokenizer (AutoTokenizer): Tokenizer for debug decoding.
        prompt_tokens (List[List[int]]): A list of lists containing the prompt tokens for each sequence.
        max_new_tokens (int): The maximum number of new tokens to generate.
        eos_id (int): The end-of-sequence token ID.
        temperature (float, optional): The temperature value for sampling. Defaults to 1.0.

    Returns:
        List[List[int]]: A list of lists containing the generated tokens for each sequence.
    """

    # NOTE(review): `layers` is declared global but never assigned here.
    global model, layers
    global kv_caches, pe_caches

    prompt_lens = [len(t) for t in prompt_tokens]
    assert max(prompt_lens) <= args.max_seq_len, f"Prompt length exceeds model maximum sequence length (max_seq_len={args.max_seq_len})"
    total_len = min(args.max_seq_len, max_new_tokens + max(prompt_lens))
    # (batch_size, total_len) filled with -1; -1 marks "not yet filled".
    tokens = torch.full((len(prompt_tokens), total_len), -1, dtype=torch.long, device="cuda")
    # Copy each prompt into the front of its row; the tail is generation space.
    for i, t in enumerate(prompt_tokens):
        tokens[i, :len(t)] = torch.tensor(t, dtype=torch.long, device="cuda")

    beginstr = tokenizer.decode(tokens[0][0:prompt_lens[0]], skip_special_tokens=True)
    # torch.cuda.synchronize()
    print(' ++++++ token:', beginstr, flush=True)

    prev_pos = 0
    # Per-sequence completion flags; all start unfinished.
    finished = torch.tensor([False] * len(prompt_tokens), device="cuda")
    # Mask of positions already holding prompt tokens (token != -1) so the
    # autoregressive loop never overwrites user-provided tokens.
    prompt_mask = tokens != -1

    # Start generating at the end of the SHORTEST prompt so every sequence in
    # the batch gets a complete continuation.
    for cur_pos in range(min(prompt_lens), total_len):
        print(f'prev_pos: {prev_pos}, cur_pos: {cur_pos}, total_len: {total_len}', flush=True)
        t = tokenizer.decode(tokens[0][prev_pos:cur_pos], skip_special_tokens=True)
        print(str(cur_pos) + ' ---------- token list: ' + str(tokens[0][prev_pos:cur_pos].tolist()), flush=True)

        if snark:
            # Dump the step inputs for the zk-SNARK pipeline.
            os.makedirs(f'zkdata/pos_{prev_pos}', exist_ok=True)
            saveTensor(f'zkdata/pos_{prev_pos}/tokens.bin', tokens[0][prev_pos:cur_pos].cpu())

        # logits = model.forward(tokens[:, prev_pos:cur_pos], prev_pos)

        h, start_pos, seqlen = model.prep_inference(tokens[:, prev_pos:cur_pos], prev_pos)
        print('h 1 shape: ' + str(h.shape), flush=True)

        # Run the transformer one layer at a time, materializing each Block
        # on the GPU and tearing it down afterwards to cap GPU memory.
        for i in range(args.n_layers):
            print(f'begin layer {i} -----------------', flush=True)
            with torch.device("cuda"):

                with torch.no_grad():
                    if hasattr(model.layers[i], 'attn_norm'):
                        del model.layers[i].attn_norm.weight

                    model.layers[i] = Block(i, args, ckpt_path)
                    model.layers[i].load_state_dict(state_dicts[i], False)
                    # Restore this layer's caches (copied onto the GPU).
                    model.layers[i].attn.kv_cache = kv_caches[i].to('cuda')
                    model.layers[i].attn.pe_cache = pe_caches[i].to('cuda')

                h = model.layer_inference(i, h, start_pos, seqlen)

                # Stash the (updated) caches and drop the layer.
                kv_caches[i] = model.layers[i].attn.kv_cache
                pe_caches[i] = model.layers[i].attn.pe_cache
                model.layers[i] = nn.Module()

        # Debug path: greedy candidate from the last position's hidden state.
        tmph = model.norm(h)[0][:, -1]

        tmph_abs = tmph.abs()
        tmph_min = tmph_abs.min()
        tmph_max = tmph_abs.max()
        print(f'tmph_abs min: {tmph_min}, max: {tmph_max}', flush=True)

        tmplogits = model.head(tmph[None, :])

        tmp_next_token = tmplogits.argmax(dim=-1)
        tid = tmp_next_token[0][0].item()
        tmp_logit = tmplogits[0][0][tid]
        tmp_completion = tokenizer.decode([tmp_next_token[0][0]], skip_special_tokens=True)
        print(f'position {cur_pos} tid: {tid}, tmp_logit:{tmp_logit}, candidate: {tmp_completion}', flush=True)

        # logits are fixed-point with scale 2^21.
        logits = model.finish_inference(h)

        if temperature > 0:
            next_token = sample(logits, temperature)
        else:
            next_token = logits.argmax(dim=-1)
        # Keep prompt tokens untouched; only write into generated positions.
        next_token = torch.where(prompt_mask[:, cur_pos], tokens[:, cur_pos], next_token)
        # print('next_token shape: ' + str(next_token.shape))
        tokens[:, cur_pos] = next_token
        # A row finishes when a GENERATED (non-prompt) position emits eos_id;
        # once every row is finished, generation ends.
        finished |= torch.logical_and(~prompt_mask[:, cur_pos], next_token.view(-1) == eos_id)
        prev_pos = cur_pos

        completion = tokenizer.decode(tokens[0][0:cur_pos+1], skip_special_tokens=True)
        print(f'---------- Result: position {cur_pos}, token: {completion}', flush=True)

        if finished.all():
            break
    # Strip prompts and truncate at the first eos per sequence.
    completion_tokens = []
    for i, toks in enumerate(tokens.tolist()):
        toks = toks[prompt_lens[i]:prompt_lens[i]+max_new_tokens]
        if eos_id in toks:
            toks = toks[:toks.index(eos_id)]
        completion_tokens.append(toks)
    return completion_tokens
286
+
287
+
288
def main(
    ckpt_path: str,
    config: str,
    input_file: str = "",
    interactive: bool = True,
    max_new_tokens: int = 100,
    temperature: float = 1.0,
) -> None:
    """
    Main function to load the model and perform interactive or batch text generation.

    Args:
        ckpt_path (str): Path to the model checkpoint directory.
        config (str): Path to the model configuration file.
        input_file (str, optional): Path to a file containing input prompts. Defaults to "".
        interactive (bool, optional): Whether to run in interactive mode. Defaults to True.
        max_new_tokens (int, optional): Maximum number of new tokens to generate. Defaults to 100.
        temperature (float, optional): Temperature for sampling. Defaults to 1.0.
    """
    global model

    # WORLD_SIZE is the total number of processes (GPUs) in the job.
    world_size = int(os.getenv("WORLD_SIZE", "1"))
    # RANK is this process's global index in [0, world_size).
    rank = int(os.getenv("RANK", "0"))
    # LOCAL_RANK would be the per-node index (unused in single-GPU mode).
    # local_rank = int(os.getenv("LOCAL_RANK", "0"))
    print('WORLD_SIZE: ' + str(world_size) + ', rank: ' + str(rank))
    # Distributed (NCCL) initialization is currently disabled — this script
    # runs single-process; the broadcast logic below is kept for when it is
    # re-enabled.
    # if world_size > 1:
    #     dist.init_process_group("nccl")
    # global print
    # Silence non-master ranks so only rank 0 logs:
    # if rank != 0:
    #     print = lambda *_, **__: None
    # Pin this process to its GPU:
    # torch.cuda.set_device(local_rank)
    torch.cuda.set_device(0)
    # Default dtype for freshly-created tensors is bfloat16.
    torch.set_default_dtype(torch.bfloat16)
    # Cap CPU-side threading to avoid resource contention.
    torch.set_num_threads(8)
    # Fixed seed so all processes draw identical random numbers.
    torch.manual_seed(965)
    with open(config) as f:
        args = ModelArgs(**json.load(f))
    print(args)
    # Preload every layer's weights to CPU; generate() swaps them onto the
    # GPU one layer at a time to bound GPU memory usage.

    for i in range(args.n_layers):
        modelPath = os.path.join(ckpt_path, f"layer-{i}.safetensors")
        state_dicts[i] = load_file(modelPath, device="cpu")

    with torch.device("cuda"):
        model = Transformer(args)

    tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
    load_model2(ckpt_path)

    # with torch.device("cuda"):
    #     freqs_cis_orig = precompute_freqs_cis(args)
    #     load_model2(ckpt_path)

    # tokenizer.encode maps text to token ids; tokenizer.decode maps back.
    # generate() keeps producing tokens until the stop token is emitted.
    # tokenizer.decode(generate(model, [tokenizer.encode("DeepSeek")], 200, -1, 1.)[0])
    # cmp1 = tokenizer.decode(generate(ckpt_path, args, tokenizer, [tokenizer.encode("DeepSeek")], 2, -1, 1.)[0])
    # print(' ---------- DeepSeek result: ' + str(cmp1), flush=True)
    # print('begin to load model: ' + f"model{rank}-mp{world_size}.safetensors")

    if rank == 0:
        # !!! This block can leak GPU memory.
        embed_abs = model.embed.weight.detach().cpu().abs()
        abs_min = torch.min(embed_abs)
        abs_max = torch.max(embed_abs)
        print('embed abs_min: ' + str(abs_min), flush=True)
        print('embed abs_max: ' + str(abs_max), flush=True)
    else:
        pass

    if interactive:
        messages = []
        while True:
            if world_size == 1:
                prompt = input(">>> ")
            # In multi-process mode only rank 0 reads stdin; it broadcasts the
            # prompt via dist.broadcast_object_list and the other ranks block
            # on the same call, so every process ends up with the same prompt.
            elif rank == 0:
                prompt = input(">>> ")
                objects = [prompt]
                dist.broadcast_object_list(objects, 0)
            else:
                objects = [None]
                dist.broadcast_object_list(objects, 0)
                prompt = objects[0]
            if prompt == "/exit":
                break
            elif prompt == "/clear":
                messages.clear()
                continue
            # Chat history accumulates as [{"role": ..., "content": ...}], e.g.
            # [{"role": "user", "content": "Hello, who are you?"}].
            messages.append({"role": "user", "content": prompt})
            # apply_chat_template renders the history into the model's chat
            # format; with its default tokenize=True it returns a List[int]
            # of input ids ready for generate().
            prompt_tokens = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
            # prompt_tokens = tokenizer(prompt, add_special_tokens=True)
            # Generate, decode, print, and fold the reply back into history.
            # completion_tokens = generate(model, [prompt_tokens], max_new_tokens, tokenizer.eos_token_id, temperature)
            # with torch.no_grad():
            completion_tokens = generate(ckpt_path, args, tokenizer, [prompt_tokens], max_new_tokens, tokenizer.eos_token_id, temperature)
            # completion_tokens = generate(ckpt_path, args, tokenizer, [prompt_tokens['input_ids']], max_new_tokens, tokenizer.eos_token_id, temperature)
            completion = tokenizer.decode(completion_tokens[0], skip_special_tokens=True)
            print(completion)
            messages.append({"role": "assistant", "content": completion})
    else:
        # Batch mode: one prompt per line of input_file.
        with open(input_file) as f:
            prompts = [line.strip() for line in f.readlines()]
        assert len(prompts) <= args.max_batch_size, f"Number of prompts exceeds maximum batch size ({args.max_batch_size})"
        prompt_tokens = [tokenizer.apply_chat_template([{"role": "user", "content": prompt}], add_generation_prompt=True) for prompt in prompts]
        # completion_tokens = generate(model, prompt_tokens, max_new_tokens, tokenizer.eos_token_id, temperature)
        completion_tokens = generate(ckpt_path, args, tokenizer, prompt_tokens, max_new_tokens, tokenizer.eos_token_id, temperature)
        completions = tokenizer.batch_decode(completion_tokens, skip_special_tokens=True)
        for prompt, completion in zip(prompts, completions):
            print("Prompt:", prompt)
            print("Completion:", completion)
            print()

    if world_size > 1:
        dist.destroy_process_group()
432
+
433
+
434
if __name__ == "__main__":
    """
    Command-line interface for distributed text generation.

    Arguments:
        --ckpt-path (str): Path to the model checkpoint directory.
        --config (str): Path to the model hyper-parameter configuration file.
        --input-file (str, optional): File containing prompts for batch processing
            (one prompt per line).
        --interactive (bool, optional): Enable interactive chat ("Q&A") mode.
        --max-new-tokens (int, optional): Maximum number of new tokens to generate.
            Defaults to 200.
        --temperature (float, optional): Temperature for sampling. Defaults to 0.2.

    Raises:
        AssertionError: If neither input-file nor interactive mode is specified.
    """
    parser = ArgumentParser()
    parser.add_argument("--ckpt-path", type=str, required=True)
    parser.add_argument("--config", type=str, required=True)
    parser.add_argument("--input-file", type=str, default="")
    parser.add_argument("--interactive", action="store_true")
    parser.add_argument("--max-new-tokens", type=int, default=200)
    parser.add_argument("--temperature", type=float, default=0.2)
    args = parser.parse_args()
    assert args.input_file or args.interactive, "Either input-file or interactive mode must be specified"
    main(args.ckpt_path, args.config, args.input_file, args.interactive, args.max_new_tokens, args.temperature)
inference/int64_gemm.cu ADDED
@@ -0,0 +1,1030 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // int64_gemm.cu
2
+ #include <cuda_runtime.h>
3
+ #include <stdint.h>
4
+ #include <stdio.h>
5
+
6
// Batched fixed-point GEMM with quotient/remainder outputs:
//   sum     = sum_k (A[b,m,k] / a_rescale) * (B[n,k] / b_rescale)   (128-bit)
//   C[b,m,n] = sum >> c_rescale,  R[b,m,n] = low c_rescale bits of sum.
// B has no batch dimension and is broadcast; c_rescale is a SHIFT COUNT.
extern "C" __global__ void int64_32_bmm_broadcast_kernel(
    const int64_t* __restrict__ A, // (B, M, K)
    const int32_t* __restrict__ B, // (N, K)
    int64_t* __restrict__ C, // (B, M, N) quotient
    int64_t* __restrict__ R, // (B, M, N) remainder
    const int64_t a_rescale,
    const int64_t b_rescale,
    const int64_t c_rescale,
    int Bdim, int M, int K, int N)
{
    int b = blockIdx.z; // batch
    int row = blockIdx.y * blockDim.y + threadIdx.y; // M
    int col = blockIdx.x * blockDim.x + threadIdx.x; // N

    if (row < M && col < N) {
        __int128_t sum = 0;
        // Fix: build the remainder mask with a 128-bit 1.  The original
        // `(1 << c_rescale) - 1` shifts a 32-bit int, which is undefined
        // behaviour for c_rescale >= 31.
        __int128_t mask = (__int128_t(1) << c_rescale) - 1;
        // 64-bit linear offsets so b*M*K etc. cannot overflow 32-bit int
        // for large tensors.
        long long a_base = (long long)b * M * K + (long long)row * K;
        long long b_base = (long long)col * K;
        for (int k = 0; k < K; ++k) {
            int64_t a_val = A[a_base + k]; // A[b, row, k]
            int32_t b_val = B[b_base + k]; // B[col, k]
            sum += __int128_t(a_val / a_rescale) * __int128_t(b_val / b_rescale);
        }
        long long ind = (long long)b * M * N + (long long)row * N + col;
        C[ind] = int64_t(sum >> c_rescale); // C[b, row, col]
        R[ind] = int64_t(sum & mask);       // R[b, row, col]
    }
}
35
+
36
// Host launcher: one thread per (row, col) output element in 32x32 tiles,
// one grid z-slice per batch.
extern "C" void int64_32_bmm_broadcast_launcher(
    const int64_t* A, const int32_t* B, int64_t* C, int64_t* R,
    const int64_t a_rescale, const int64_t b_rescale, const int64_t c_rescale,
    int Bdim, int M, int K, int N)
{
    const dim3 block(32, 32);
    const unsigned gx = (N + block.x - 1) / block.x;
    const unsigned gy = (M + block.y - 1) / block.y;
    const dim3 grid(gx, gy, Bdim);

    int64_32_bmm_broadcast_kernel<<<grid, block>>>(A, B, C, R, a_rescale, b_rescale, c_rescale, Bdim, M, K, N);
}
48
+
49
// Same as int64_32_bmm_broadcast_kernel, but with int64 weights:
//   sum     = sum_k (A[b,m,k] / a_rescale) * (B[n,k] / b_rescale)   (128-bit)
//   C[b,m,n] = sum >> c_rescale,  R[b,m,n] = low c_rescale bits of sum.
extern "C" __global__ void int64_64_bmm_broadcast_kernel(
    const int64_t* __restrict__ A, // (B, M, K)
    const int64_t* __restrict__ B, // (N, K)
    int64_t* __restrict__ C, // (B, M, N) quotient
    int64_t* __restrict__ R, // (B, M, N) remainder
    const int64_t a_rescale,
    const int64_t b_rescale,
    const int64_t c_rescale,
    int Bdim, int M, int K, int N)
{
    int b = blockIdx.z; // batch
    int row = blockIdx.y * blockDim.y + threadIdx.y; // M
    int col = blockIdx.x * blockDim.x + threadIdx.x; // N

    if (row < M && col < N) {
        __int128_t sum = 0;
        // Fix: build the remainder mask with a 128-bit 1.  The original
        // `(1 << c_rescale) - 1` shifts a 32-bit int, which is undefined
        // behaviour for c_rescale >= 31.
        __int128_t mask = (__int128_t(1) << c_rescale) - 1;
        // 64-bit linear offsets so the flat indices cannot overflow int.
        long long a_base = (long long)b * M * K + (long long)row * K;
        long long b_base = (long long)col * K;
        for (int k = 0; k < K; ++k) {
            int64_t a_val = A[a_base + k]; // A[b, row, k]
            int64_t b_val = B[b_base + k]; // B[col, k]
            sum += __int128_t(a_val / a_rescale) * __int128_t(b_val / b_rescale);
        }
        long long ind = (long long)b * M * N + (long long)row * N + col;
        C[ind] = int64_t(sum >> c_rescale); // C[b, row, col]
        R[ind] = int64_t(sum & mask);       // R[b, row, col]
    }
}
78
+
79
// Host launcher: 32x32 thread tiles over (M, N), one grid z-slice per batch.
extern "C" void int64_64_bmm_broadcast_launcher(
    const int64_t* A, const int64_t* B, int64_t* C, int64_t* R,
    const int64_t a_rescale, const int64_t b_rescale, const int64_t c_rescale,
    int Bdim, int M, int K, int N)
{
    const dim3 block(32, 32);
    const unsigned gx = (N + block.x - 1) / block.x;
    const unsigned gy = (M + block.y - 1) / block.y;
    const dim3 grid(gx, gy, Bdim);

    int64_64_bmm_broadcast_kernel<<<grid, block>>>(A, B, C, R, a_rescale, b_rescale, c_rescale, Bdim, M, K, N);
}
91
+
92
// Convert a (rows, cols) bfloat16 matrix (raw uint16 bits) to int32
// fixed-point: output = value * 2^rescale, computed exactly from the
// sign / exponent / mantissa fields (no floating-point arithmetic).
extern "C" __global__ void bf16_to_int32_2d_kernel(const uint16_t* input, int32_t* output, int rows, int cols, int rescale)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < rows && col < cols) {
        int idx = row * cols + col;

        int v0 = input[idx];
        int ex0 = ((v0 >> 7) & 0xFF) - 127;   // unbiased exponent
        int r0 = v0 & 0x7F;                   // 7-bit mantissa

        // Exact zero (either sign) maps to 0.
        if (ex0 == -127 && r0 == 0) {
            output[idx] = 0;
            return;
        }
        // NOTE(review): subnormals (ex0 == -127, r0 != 0) still get the
        // implicit leading 1 below — assumes inputs are never subnormal;
        // confirm upstream.

        int ex2 = ex0 + rescale;
        int r2 = r0 + 128;                    // mantissa with implicit 1 (x128)
        uint32_t v = 0;
        // NOTE(review): `1 << ex2` is a 32-bit shift — undefined for
        // |ex2| >= 31; assumes rescale keeps ex2 in range.
        if(ex2 >= 0) {
            v = r2 * (1 << ex2);
        } else {
            v = r2 / (1 << -ex2);
        }

        if (v0 & 0x8000) {
            v = -v;                           // negate via unsigned wraparound
        }

        output[idx] = v;
    }
}
125
+
126
// Host launcher: one thread per matrix element, 32x32 tiles.
extern "C" void bf16_to_int32_2d(const uint16_t* input, int32_t* output, int rows, int cols, int rescale) {
    const dim3 tile(32, 32);
    const dim3 grid((cols + tile.x - 1) / tile.x,
                    (rows + tile.y - 1) / tile.y);

    bf16_to_int32_2d_kernel<<<grid, tile>>>(input, output, rows, cols, rescale);
}
133
+
134
// Convert bf16 wkv_b weights (raw uint16 bits) to int32 fixed-point with a
// FIXED scale of 2^25, clamping any value with unbiased exponent >= -1
// (|value| >= 0.5) to 0x7FFFFFFF.
// NOTE(review): the clamp ignores the sign bit — large NEGATIVE inputs also
// produce +0x7FFFFFFF; confirm that inputs are pre-bounded.
extern "C" __global__ void wkv_b_bf16_to_int32_kernel(const uint16_t* input, int32_t* output, int rows, int cols)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < rows && col < cols) {
        int idx = row * cols + col;

        int v0 = input[idx];
        int ex0 = ((v0 >> 7) & 0xFF) - 127;   // unbiased exponent
        int r0 = v0 & 0x7F;                   // 7-bit mantissa

        // Exact zero maps to 0.
        if (ex0 == -127 && r0 == 0) {
            output[idx] = 0;
            return;
        }

        // Clamp |value| >= 0.5 to INT32_MAX.
        if (ex0 >= -1) {
            output[idx] = 0x7FFFFFFF;
            return;
        }

        int ex2 = ex0 + 25;                   // fixed 2^25 rescale
        int r2 = r0 + 128;                    // mantissa with implicit 1 (x128)
        uint32_t v = 0;
        if(ex2 >= 0) {
            v = r2 * (1 << ex2);
        } else {
            v = r2 / (1 << -ex2);
        }

        if (v0 & 0x8000) {
            v = -v;                           // negate via unsigned wraparound
        }

        output[idx] = v;
    }
}
172
+
173
// Host launcher: one thread per matrix element, 32x32 tiles.
extern "C" void wkv_b_bf16_to_int32(const uint16_t* input, int32_t* output, int rows, int cols) {
    const dim3 tile(32, 32);
    const dim3 grid((cols + tile.x - 1) / tile.x,
                    (rows + tile.y - 1) / tile.y);

    wkv_b_bf16_to_int32_kernel<<<grid, tile>>>(input, output, rows, cols);
}
180
+
181
// Convert a (rows, cols) float32 matrix (raw uint32 bits) to int64
// fixed-point: output = value * 2^rescale, computed exactly from the
// sign / exponent / mantissa fields (no floating-point arithmetic).
extern "C" __global__ void float32_to_int64_2d_kernel(const uint32_t* input, int64_t* output, int rows, int cols, int rescale)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < rows && col < cols) {
        int idx = row * cols + col;

        uint32_t v0 = input[idx];
        int ex0 = ((v0 >> 23) & 0xFF) - 127;   // unbiased exponent
        int r0 = v0 & 0x7FFFFF;                // 23-bit mantissa

        // Exact zero (either sign) maps to 0.
        if (ex0 == -127 && r0 == 0) {
            output[idx] = 0;
            return;
        }

        int ex2 = ex0 + rescale;
        int64_t r2 = r0 + 8388608;             // mantissa with implicit 1 (x2^23)
        int64_t v = 0;
        // Fix: shift a 64-bit 1.  The original `1 << ex2` shifts a 32-bit
        // int, which is undefined behaviour for ex2 >= 31 even though the
        // result is stored in an int64 — values whose scaled exponent
        // exceeded 30 were silently corrupted.
        if (ex2 >= 0) {
            v = r2 * ((int64_t)1 << ex2);
        } else {
            v = r2 / ((int64_t)1 << -ex2);
        }

        if (v0 & 0x80000000) {
            v = -v;
        }

        output[idx] = v;
    }
}
214
+
215
// Host launcher: one thread per matrix element, 32x32 tiles.
extern "C" void float32_to_int64_2d(const uint32_t* input, int64_t* output, int rows, int cols, int rescale) {
    const dim3 tile(32, 32);
    const dim3 grid((cols + tile.x - 1) / tile.x,
                    (rows + tile.y - 1) / tile.y);

    float32_to_int64_2d_kernel<<<grid, tile>>>(input, output, rows, cols, rescale);
}
222
+
223
// Element-wise complex multiply in int64 fixed-point.  A holds interleaved
// (re, im) pairs of shape [batch, seq, head, dim]; B (effectively
// [1, seq, 1, dim]) is broadcast over batch and heads.  Each 64x64-bit
// product is rescaled by an effective `>> 42`, reassembled from __mul64hi
// (high 64 bits) and the low 64-bit product: low 42 bits of the high word
// shifted up 22, OR'd with bits 42..63 of the low word.
// NOTE(review): the 0x3FFFFFFFFFF mask drops the sign-extension bits of
// __mul64hi, so negative products rely on two's-complement wraparound of
// the final add/sub — confirm against the reference implementation.
extern "C" __global__ void complex_int64_mul_kernel(
    const int64_t* __restrict__ A,
    const int64_t* __restrict__ B,
    int64_t* __restrict__ C,
    // int64_t high_rescale, int64_t row_rescale,
    int batchSize, int seqLen, int headCount, int headDim)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int total = batchSize * seqLen * headCount * headDim;
    if (idx >= total) return;

    // Decompose the flat thread index into (batch, seq, head, dim).
    int i = idx;
    int dimId = i % headDim; i /= headDim;
    int headId = i % headCount; i /= headCount;
    int seqId = i % seqLen; i /= seqLen;
    int batchId = i;

    // A index (one complex pair per element)
    int a_idx = ((batchId * seqLen + seqId) * headCount + headId) * headDim + dimId;

    // B index (broadcast over batch and head)
    int b_idx = ((0 * seqLen + seqId) * 1 + 0) * headDim + dimId;

    int64_t a0 = A[2 * a_idx];
    int64_t a1 = A[2 * a_idx + 1];
    int64_t b0 = B[2 * b_idx];
    int64_t b1 = B[2 * b_idx + 1];

    // C[2 * a_idx] = (a0 * b0 - a1 * b1) / c_resacle;
    // C[2 * a_idx + 1] = (a0 * b1 + a1 * b0) / c_resacle;

    // C[2 * a_idx] = __mul64hi(a0, b0) * high_rescale + a0 * b0 / row_rescale) - (__mul64hi(a1, b1) * high_rescale + a1 * b1 / row_rescale);
    // C[2 * a_idx + 1] = (__mul64hi(a0, b1) * high_rescale + a0 * b1 / row_rescale) + (__mul64hi(a1, b0) * high_rescale + a1 * b0 / row_rescale);
    // Each term is (x*y) >> 42, assembled from the 128-bit product halves.
    int64_t a0b0 = ((__mul64hi(a0, b0) & 0x3FFFFFFFFFF) << 22) | (((a0 * b0) >> 42) & 0x3FFFFF);
    int64_t a1b1 = ((__mul64hi(a1, b1) & 0x3FFFFFFFFFF) << 22) | (((a1 * b1) >> 42) & 0x3FFFFF);
    int64_t a0b1 = ((__mul64hi(a0, b1) & 0x3FFFFFFFFFF) << 22) | (((a0 * b1) >> 42) & 0x3FFFFF);
    int64_t a1b0 = ((__mul64hi(a1, b0) & 0x3FFFFFFFFFF) << 22) | (((a1 * b0) >> 42) & 0x3FFFFF);

    // (a0 + a1 i) * (b0 + b1 i)
    C[2 * a_idx] = a0b0 - a1b1;
    C[2 * a_idx + 1] = a0b1 + a1b0;

    // Debug print, kept for reference:
    // if(idx == 32) {
    //     printf("%d %d %d, %d %d %d %d (%d %d %d %d): (%ld, %ld i) * (%ld, %ld i) = (%ld, %ld i)\n",
    //         idx, a_idx, b_idx,
    //         batchSize, seqLen, headCount, headDim,
    //         batchId, seqId, headId, dimId,
    //         a0, a1, b0, b1, C[2 * a_idx], C[2 * a_idx + 1]);
    // }
}
273
+
274
// Host launcher: one thread per complex element, 256-thread blocks.
extern "C" void complex_int64_mul(
    const int64_t* A, const int64_t* B, int64_t* C,
    // const int64_t high_rescale, const int64_t row_rescale,
    int batchSize, int seqLen, int headCount, int headDim)
{
    const int elem_count = batchSize * seqLen * headCount * headDim;
    const int per_block = 256;
    const int grid = (elem_count + per_block - 1) / per_block;

    complex_int64_mul_kernel<<<grid, per_block>>>(A, B, C,
        batchSize, seqLen, headCount, headDim);
}
287
+
288
+
289
// Fixed-point RMS-norm apply step with int32 weights:
//   C[s, d] = floor( A[s, d] * W[d] / rms[s] )
// The product is taken in 128-bit, and the truncating C division is adjusted
// to FLOOR division so negative values round consistently downward
// (assumes rms[s] > 0 — TODO confirm).
extern "C" __global__ void rms_norm_kernel_32(
    const int64_t* __restrict__ A,
    const int32_t* __restrict__ W,
    const int64_t* __restrict__ rms,
    int64_t* __restrict__ C,
    int seqLen, int Dim)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int total = seqLen * Dim;
    if (idx >= total) return;

    // Decompose the flat index into (sequence, dim).
    int dimId = idx % Dim;
    int seqId = idx / Dim;

    // A index
    int a_idx = seqId * Dim + dimId;

    // W index (broadcast over the sequence)
    int w_idx = dimId;

    int64_t a = A[a_idx];
    int32_t w = W[w_idx];
    int64_t r = rms[seqId];

    __int128 prod = ( __int128)a * ( __int128)w; // multiply in 128-bit: no overflow
    __int128 qq = prod / (__int128)r; // truncated quotient
    __int128 rr = prod % (__int128)r; // truncated remainder
    if(rr < 0) {
        // Convert truncation to floor: q -= 1, remainder += divisor.
        qq = qq - 1;
        rr = rr + r;
    }

    int64_t res = (int64_t)qq;

    C[a_idx] = res;
}
326
+
327
// Identical to rms_norm_kernel_32 but with int64 weights:
//   C[s, d] = floor( A[s, d] * W[d] / rms[s] )
// 128-bit intermediate product; truncating division adjusted to floor
// (assumes rms[s] > 0 — TODO confirm).
extern "C" __global__ void rms_norm_kernel_64(
    const int64_t* __restrict__ A,
    const int64_t* __restrict__ W,
    const int64_t* __restrict__ rms,
    int64_t* __restrict__ C,
    int seqLen, int Dim)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int total = seqLen * Dim;
    if (idx >= total) return;

    // Decompose the flat index into (sequence, dim).
    int dimId = idx % Dim;
    int seqId = idx / Dim;

    // A index
    int a_idx = seqId * Dim + dimId;

    // W index (broadcast over the sequence)
    int w_idx = dimId;

    int64_t a = A[a_idx];
    int64_t w = W[w_idx];
    int64_t r = rms[seqId];

    __int128 prod = ( __int128)a * ( __int128)w; // multiply in 128-bit: no overflow
    __int128 qq = prod / (__int128)r; // truncated quotient
    __int128 rr = prod % (__int128)r; // truncated remainder
    if(rr < 0) {
        // Convert truncation to floor: q -= 1, remainder += divisor.
        qq = qq - 1;
        rr = rr + r;
    }

    int64_t res = (int64_t)qq;

    C[a_idx] = res;
}
364
+
365
// Host launcher for the int32-weight RMS-norm kernel: one thread per
// (seq, dim) element, 256-thread blocks.
extern "C" void rms_norm_32(
    const int64_t* A, const int32_t* W, const int64_t* rms, int64_t* C,
    int seqLen, int Dim)
{
    const int elem_count = seqLen * Dim;
    const int tpb = 256;
    const int grid = (elem_count + tpb - 1) / tpb;

    rms_norm_kernel_32<<<grid, tpb>>>(A, W, rms, C, seqLen, Dim);
}
375
+
376
// Host launcher for the int64-weight RMS-norm kernel: one thread per
// (seq, dim) element, 256-thread blocks.
extern "C" void rms_norm_64(
    const int64_t* A, const int64_t* W, const int64_t* rms, int64_t* C,
    int seqLen, int Dim)
{
    const int elem_count = seqLen * Dim;
    const int tpb = 256;
    const int grid = (elem_count + tpb - 1) / tpb;

    rms_norm_kernel_64<<<grid, tpb>>>(A, W, rms, C, seqLen, Dim);
}
386
+
387
+ extern "C" __global__ void einsum_bshd_hdc_bshc_kernel(
388
+ const int64_t* q_nope, // [B, S, H, D]
389
+ const int32_t* wkv_b_1, // [H, D, C]
390
+ int64_t* out, // [B, S, H, C]
391
+ int64_t rescale,
392
+ int B, int S, int H, int D, int C)
393
+ {
394
+ int b = blockIdx.x; // batch
395
+ int s = blockIdx.y; // sequence
396
+ int h = blockIdx.z; // head
397
+ int c = threadIdx.x; // output channel
398
+
399
+ if (c >= C) return;
400
+
401
+ __int128_t sum = rescale / 2;
402
+ int q_base = ((b * S + s) * H + h) * D;
403
+ int w_base = h * D * C + c;
404
+ for (int d = 0; d < D; d++) {
405
+ // int w_idx = (h * D + d) * C + c;
406
+ sum += __int128_t(q_nope[q_base + d]) * __int128_t(wkv_b_1[w_base + d * C]);
407
+ }
408
+
409
+ // sum /= rescale;
410
+ int64_t sum2 = int64_t(sum >> rescale);
411
+
412
+ int out_idx = ((b * S + s) * H + h) * C + c;
413
+ out[out_idx] = sum2;
414
+ }
415
+
416
+ extern "C" void einsum_bshd_hdc_bshc(const int64_t* q_nope, const int32_t* wkv_b_1, int64_t* out,
417
+ int64_t rescale, int B, int S, int H, int D, int C) {
418
+
419
+ dim3 grid(B, S, H);
420
+ dim3 block(C);
421
+
422
+ einsum_bshd_hdc_bshc_kernel<<<grid, block>>>(
423
+ q_nope, wkv_b_1, out, rescale,
424
+ B, S, H, D, C);
425
+ }
426
+
427
+ extern "C" __global__ void einsum_bshc_btc_bsht_kernel(
428
+ const int64_t* __restrict__ A, // [B, S, H, C]
429
+ const int64_t* __restrict__ B, // [B, T, C]
430
+ int64_t* __restrict__ C, // [B, S, H, T]
431
+ int64_t rescale,
432
+ int Bsz, int S, int H, int T, int Cdim)
433
+ {
434
+ int b = blockIdx.x;
435
+ int s = blockIdx.y;
436
+ int h = blockIdx.z;
437
+ int t = threadIdx.x;
438
+
439
+ if (t >= T) return;
440
+
441
+ // 计算 A[b,s,h,:] 和 B[b,t,:] 的内积
442
+ __int128_t sum = rescale / 2;
443
+
444
+ int A_base = ((b * S + s) * H + h) * Cdim;
445
+ int B_base = (b * T + t) * Cdim;
446
+ for (int c = 0; c < Cdim; c++) {
447
+ // int idxB = (b * T + t) * Cdim + c;
448
+ sum += __int128_t(A[A_base + c]) * __int128_t(B[B_base + c]);
449
+ }
450
+
451
+ // sum /= rescale;
452
+ int64_t sum2 = int64_t(sum >> rescale);
453
+
454
+ int idxC = ((b * S + s) * H + h) * T + t;
455
+ C[idxC] = sum2;
456
+ }
457
+
458
+ extern "C" void einsum_bshc_btc_bsht(const int64_t* A, const int64_t* B, int64_t* C,
459
+ int64_t rescale, int Bsz, int S, int H, int T, int Cdim)
460
+ {
461
+ dim3 grid(Bsz, S, H);
462
+ dim3 block(T);
463
+
464
+ einsum_bshc_btc_bsht_kernel<<<grid, block>>>(
465
+ A, B, C, rescale,
466
+ Bsz, S, H, T, Cdim);
467
+ }
468
+
469
+ extern "C" __global__ void einsum_bsht_btc_bshc_kernel(
470
+ const int64_t* __restrict__ A,
471
+ const int64_t* __restrict__ B,
472
+ int64_t* __restrict__ C,
473
+ int64_t rescale,
474
+ int Bsz, int S, int H, int T, int Cdim)
475
+ {
476
+ int b = blockIdx.x;
477
+ int s = blockIdx.y;
478
+ int h = blockIdx.z;
479
+ int c = threadIdx.x;
480
+
481
+ if (c >= Cdim) return;
482
+
483
+ __int128_t sum = rescale / 2;
484
+
485
+ int A_base = ((b * S + s) * H + h) * T;
486
+ int B_base = b * T * Cdim + c;
487
+ for (int t = 0; t < T; ++t) {
488
+ // int idxB = (b * T + t) * Cdim + c;
489
+ sum += __int128_t(A[A_base + t]) * __int128_t(B[B_base + t * Cdim]);
490
+ }
491
+
492
+ // sum /= rescale;
493
+ int64_t sum2 = int64_t(sum >> rescale);
494
+
495
+ const int idxC = ((b * S + s) * H + h) * Cdim + c;
496
+ C[idxC] = sum2;
497
+ }
498
+
499
+ extern "C" void einsum_bsht_btc_bshc(
500
+ const int64_t* A, const int64_t* B, int64_t* C,
501
+ int64_t rescale, int Bsz, int S, int H, int T, int Cdim)
502
+ {
503
+ dim3 grid(Bsz, S, H);
504
+ dim3 block(Cdim);
505
+
506
+ einsum_bsht_btc_bshc_kernel<<<grid, block>>>(
507
+ A, B, C, rescale,
508
+ Bsz, S, H, T, Cdim);
509
+ }
510
+
511
+ extern "C" __global__ void einsum_bshc_hdc_bshd_kernel(
512
+ const int64_t* __restrict__ A,
513
+ const int32_t* __restrict__ B,
514
+ int64_t* __restrict__ C,
515
+ int64_t rescale,
516
+ int Bsz, int S, int H, int D, int Cdim)
517
+ {
518
+ int b = blockIdx.x;
519
+ int s = blockIdx.y;
520
+ int h = blockIdx.z;
521
+ int d = threadIdx.x;
522
+
523
+ if (d >= D) return;
524
+
525
+ __int128_t sum = 0;
526
+ int A_base = ((b * S + s) * H + h) * Cdim;
527
+ int B_base = (h * D + d) * Cdim;
528
+ for (int c = 0; c < Cdim; ++c) {
529
+ sum += __int128_t(A[A_base + c]) * __int128_t(B[B_base + c]);
530
+ }
531
+
532
+ // sum = (sum + rescale / 2) / rescale;
533
+ int64_t sum2 = int64_t(sum >> rescale);
534
+
535
+ const int idxC = ((b * S + s) * H + h) * D + d;
536
+ C[idxC] = sum2;
537
+ }
538
+
539
+ extern "C" void einsum_bshc_hdc_bshd(const int64_t* A, const int32_t* B, int64_t* C,
540
+ int64_t rescale, int Bsz, int S, int H, int D, int Cdim)
541
+ {
542
+ dim3 grid(Bsz, S, H);
543
+ dim3 block(D);
544
+
545
+ einsum_bshc_hdc_bshd_kernel<<<grid, block>>>(
546
+ A, B, C, rescale,
547
+ Bsz, S, H, D, Cdim
548
+ );
549
+ }
550
+
551
+ // static const int64_t LOG2E_Q32 = 6196328019ULL; // log2(e)*2^32
552
// Fixed-point constants for the softmax kernels below.
static const int64_t LOG2E_Q21 = 3025551;  // round(log2(e) * 2^21), Q21 fixed point
static const int64_t LOG2E_Q19 = 756388;   // round(log2(e) * 2^19), Q19 fixed point
// Number of index bits of the 2^x fraction lookup tables (table size 2^LOG_TABLE_SIZE).
// static const int LOG_TABLE_SIZE = 10;
static const int LOG_TABLE_SIZE = 8;
556
+ // static uint64_t EXP2_FRAC_LUT[256] = { /* 预生成:round(2^(i/256)*2^32) */ };
557
+ // static int64_t EXP2_FRAC_LUT[256] = { /* 预生成:round(2^(i/256)*2^32) */ };
558
+ // EXP2_FRAC_LUT = torch.zeros([256, ], dtype=torch.int64, device="cuda")
559
+
560
+ // extern "C" void softmax_q21_to_probs(const int64_t* R, int n, int64_t* P_q21) {
561
+ // int32_t Rmax = R[0];
562
+ // for (int i = 1; i < n; ++i) if (R[i] > Rmax) Rmax = R[i];
563
+
564
+ // // printf("Rmax: %d\n", Rmax);
565
+
566
+ // int64_t sumW = 0;
567
+ // static thread_local int64_t Wbuf[4096]; // 或动态分配
568
+ // for (int i = 0; i < n; ++i) {
569
+ // int64_t d = R[i] - Rmax; // Δ_i (<=0)
570
+
571
+ // // 剪裁:小于 -16 的差值近似为 0
572
+ // if (d < -(16 << 21))
573
+ // {
574
+ // Wbuf[i] = 0;
575
+ // continue;
576
+ // }
577
+
578
+ // // y = d * log2(e) / 2^21 (Q32)
579
+ // int64_t y = (d * LOG2E_Q21) >> 21;
580
+
581
+ // // printf("y: %ld\n", y);
582
+
583
+ // int64_t k = (-y) >> 21; // 整数部分 取正数(k > 0)
584
+ // int64_t f = (-y) & 0x1FFFFF; // 小数部分 取正数(Q21, f > 0)
585
+
586
+ // // printf("k: %ld, f: %ld\n", k, f);
587
+
588
+ // int64_t t = EXP2_FRAC_LUT[ f >> 13 ]; // 2^(frac(y)) in Q21, 13 = 21 - 8, 取小数部分 转换成整数之后的 高8位
589
+ // // int64_t t = 0;
590
+
591
+ // int64_t wi = (k >= 32) ? 0u : (t >> k); // 2^(-k) * t, 右移
592
+ // Wbuf[i] = wi;
593
+ // sumW += wi;
594
+ // }
595
+
596
+ // // 归一化到 Q21 概率
597
+ // for (int i = 0; i < n; ++i) {
598
+ // int64_t num = Wbuf[i] << 21; // 提升精度
599
+ // P_q21[i] = sumW ? (num / sumW) : 0;
600
+ // }
601
+ // }
602
+
603
+ // start of softmax_q21 ------------------------
604
+
605
+ extern "C" __global__ void softmax_kernel_q21(
606
+ int64_t* R, // [B, S, H, T]
607
+ int64_t* C, // [B, S, H, T]
608
+ int64_t LOG2E_Q21, int64_t* EXP2_FRAC_LUT,
609
+ int Bsz, int S, int H, int T)
610
+ {
611
+ int b = blockIdx.x;
612
+ int s = blockIdx.y;
613
+ int h = threadIdx.x;
614
+
615
+ if (h >= H) return;
616
+
617
+ int idxbase = ((b * S + s) * H + h) * T;
618
+
619
+ int64_t Rmax = R[idxbase];
620
+ for (int i = 1; i < T; ++i) if (R[idxbase + i] > Rmax) Rmax = R[idxbase + i];
621
+
622
+ int64_t sumW = 0;
623
+ for (int i = 0; i < T; i++) {
624
+ int64_t d = R[idxbase + i] - Rmax; // Δ_i (d <= 0)
625
+
626
+ // 剪裁:小于 -64 的差值近似为 0
627
+ if (d < -(64 << 21))
628
+ {
629
+ R[idxbase + i] = 0;
630
+ continue;
631
+ }
632
+
633
+ // y = d * log2(e) / 2^21 (Q21, y <= 0)
634
+ int64_t y = (d * LOG2E_Q21 + (1 << 20)) >> 21;
635
+
636
+ int64_t k = (-y) >> 21; // 整数部分 取正数(k > 0)
637
+ int64_t f = (-y) & 0x1FFFFF; // 小数部分 取正数(Q21)
638
+
639
+ // printf("k: %ld, f: %ld\n", k, f);
640
+
641
+ int64_t t = EXP2_FRAC_LUT[ f >> (21 - LOG_TABLE_SIZE) ]; // 2^(frac(y)) in Q21, 取小数部分 转换成整数之后的高 LOG_TABLE_SIZE 位
642
+ // int64_t t = 0;
643
+
644
+ int64_t wi = (k >= 64) ? 0 : (t >> k); // 2^(-k) * t, 右移
645
+
646
+ // if(b == 0 && s == 3 && h == 127) {
647
+ // printf("i: %d, d: %ld, y: %ld, k: %ld, f: %ld, t: %ld, wi: %ld\n", i, d, y, k, f, t, wi);
648
+ // }
649
+
650
+ R[idxbase + i] = wi;
651
+ sumW += wi;
652
+ }
653
+
654
+ // 归一化到 Q21 概率
655
+ for (int i = 0; i < T; ++i) {
656
+ int64_t num = R[idxbase + i] << 21; // 提升精度
657
+ C[idxbase + i] = sumW ? ((num + sumW / 2) / sumW) : 0;
658
+ }
659
+ }
660
+
661
+ extern "C" void softmax_q21(int64_t* R, int64_t* C, int64_t* EXP2_FRAC_LUT,
662
+ int Bsz, int S, int H, int T)
663
+ {
664
+ dim3 grid(Bsz, S);
665
+ dim3 block(H);
666
+
667
+ softmax_kernel_q21<<<grid, block>>>(
668
+ R, C, LOG2E_Q21, EXP2_FRAC_LUT,
669
+ Bsz, S, H, T);
670
+ }
671
+
672
+ extern "C" void softmax_init_q21(int64_t* EXP2_FRAC_LUT)
673
+ {
674
+ // printf("inited!\n");
675
+ int x2_21 = 1 << 21;
676
+ for(int i = 0; i < (1 << LOG_TABLE_SIZE); i++) {
677
+ // EXP2_FRAC_LUT[i] = uint64_t(std::pow(2, i / 256.0) * 4294967296);
678
+ EXP2_FRAC_LUT[i] = int64_t(std::pow(2, i * (-1.0f) / (1 << LOG_TABLE_SIZE)) * x2_21);
679
+ }
680
+ }
681
+ // -- end of softmax_q21 -----------------------------
682
+
683
+ // start of softmax_q19 ------------------------
684
+ extern "C" __global__ void softmax_kernel_q19(
685
+ int64_t* R, // [B, S, H, T]
686
+ int64_t* C, // [B, S, H, T]
687
+ int64_t LOG2E_Q19, int64_t* EXP2_FRAC_LUT,
688
+ int Bsz, int S, int H, int T)
689
+ {
690
+ int b = blockIdx.x;
691
+ int s = blockIdx.y;
692
+ int h = threadIdx.x;
693
+
694
+ if (h >= H) return;
695
+
696
+ int idxbase = ((b * S + s) * H + h) * T;
697
+
698
+ int64_t Rmax = R[idxbase];
699
+ for (int i = 1; i < T; ++i) if (R[idxbase + i] > Rmax) Rmax = R[idxbase + i];
700
+
701
+ int64_t sumW = 0;
702
+ for (int i = 0; i < T; i++) {
703
+ u_int64_t d = Rmax - R[idxbase + i]; // Δ_i (d >= 0)
704
+
705
+ // 剪裁:小于 -64 的差值近似为 0, d > (64 << 19)的条件 比 (k >= 64) 要宽松
706
+ if (d > (64 << 19))
707
+ {
708
+ R[idxbase + i] = 0;
709
+ continue;
710
+ }
711
+
712
+ // y = d * log2(e) / 2^19 (Q19, y >= 0)
713
+ int64_t y = int64_t((__int128_t(d) * __int128_t(LOG2E_Q19)) >> 19);
714
+
715
+ int64_t k = y >> 19; // 整数部分 取正数(k > 0)
716
+ int64_t f = y & 0x7FFFF; // 小数部分 取正数(Q19)
717
+
718
+ // printf("k: %ld, f: %ld\n", k, f);
719
+
720
+ int64_t t = EXP2_FRAC_LUT[ f >> (19 - LOG_TABLE_SIZE) ]; // 2^(frac(y)) in Q19, 取小数部分 转换成整数之后的高 LOG_TABLE_SIZE 位
721
+ // int64_t t = 0;
722
+
723
+ int64_t wi = (k >= 64) ? 0 : (t >> k); // 2^(-k) * t, 右移
724
+
725
+ // if(b == 0 && s == 2 && h == 2) {
726
+ // printf("i: %d, d: %ld, y: %ld, k: %ld, f: %ld, t: %ld, wi: %ld\n", i, d, y, k, f, t, wi);
727
+ // }
728
+
729
+ R[idxbase + i] = wi;
730
+ sumW += wi;
731
+ }
732
+
733
+ // if(b == 0 && s == 9 && h == 7) {
734
+ // printf("sumW: %ld\n", sumW);
735
+ // }
736
+
737
+ // 归一化到 Q19 概率
738
+ for (int i = 0; i < T; ++i) {
739
+ int64_t num = R[idxbase + i] << 19; // 提升精度
740
+ C[idxbase + i] = sumW ? (num / sumW) : 0;
741
+
742
+ // if(b == 0 && s == 9 && h == 7) {
743
+ // printf("i: %d, r: %ld, num: %ld, c: %ld\n", i, R[idxbase + i], num, C[idxbase + i]);
744
+ // }
745
+ }
746
+ }
747
+
748
+ extern "C" void softmax_q19(int64_t* R, int64_t* C, int64_t* EXP2_FRAC_LUT,
749
+ int Bsz, int S, int H, int T)
750
+ {
751
+ dim3 grid(Bsz, S);
752
+ dim3 block(H);
753
+
754
+ softmax_kernel_q19<<<grid, block>>>(
755
+ R, C, LOG2E_Q19, EXP2_FRAC_LUT,
756
+ Bsz, S, H, T);
757
+ }
758
+
759
+ extern "C" void softmax_init_q19(int64_t* EXP2_FRAC_LUT)
760
+ {
761
+ // printf("inited!\n");
762
+ int x2_19 = 1 << 19;
763
+ for(int i = 0; i < (1 << LOG_TABLE_SIZE); i++) {
764
+ // EXP2_FRAC_LUT[i] = uint64_t(std::pow(2, i / 256.0) * 4294967296);
765
+ EXP2_FRAC_LUT[i] = int64_t(std::pow(2, i * (-1.0f) / (1 << LOG_TABLE_SIZE)) * x2_19);
766
+ }
767
+ }
768
+ // -- end of softmax_q19 -----------------------------
769
+
770
+ // -- start of silu_q25 -----------------------------
771
// Fixed-point constants for the Q25 SiLU/sigmoid kernels below.
static const int64_t LOG2E_Q25 = 48408813;        // round(log2(e) * 2^25)
static const int64_t exp2_25 = 33554432;          // 2^25 == 1.0 in Q25
static const int64_t exp2_50 = 1125899906842624;  // 2^50, numerator for Q25 reciprocal
774
+
775
+ extern "C" __global__ void silu_kernel_q25(
776
+ int64_t* R, // [B, S, Dim]
777
+ int64_t* C, // [B, S, Dim]
778
+ int64_t LOG2E_Q25, int64_t* EXP2_FRAC_LUT_Q25,
779
+ int Bsz, int S, int Dim)
780
+ {
781
+ int b = blockIdx.x;
782
+ int s = blockIdx.y;
783
+ int d = blockIdx.z * blockDim.x + threadIdx.x;
784
+
785
+ if (d >= Dim) return;
786
+
787
+ int idx = (b * S + s) * Dim + d;
788
+ int64_t r = R[idx];
789
+
790
+ // 饱和区裁剪(可调阈值 64)
791
+ const int64_t LIM = (int64_t)64 << 25;
792
+ if (r >= LIM) // σ≈1 -> SiLU ~= x
793
+ {
794
+ C[idx] = r;
795
+ return;
796
+ }
797
+ if (r <= -LIM) // σ≈0 -> SiLU ~= 0
798
+ {
799
+ C[idx] = 0;
800
+ return;
801
+ }
802
+
803
+ // y = - x * log2(e) / 2^25 (Q25)
804
+ int64_t y = -int64_t((__int128_t(r) * __int128_t(LOG2E_Q25)) >> 25);
805
+
806
+ // u ≈ 2^y = e^{-x} = 2^k * 2^f (Q25)
807
+ int64_t u = 0;
808
+ int64_t k = y >> 25; // 整数部分
809
+ int64_t f = y & 0x1FFFFFF; // 小数部分 (Q25)
810
+ int64_t t = EXP2_FRAC_LUT_Q25[f >> (25 - LOG_TABLE_SIZE)]; // 2^(frac) in Q25
811
+ if (k > -63)
812
+ u = (k < 0) ? (t >> (-k)) : (t << k); // 一般 k<=0
813
+
814
+ // σ = 1 / (1 + u) (Q25)
815
+ int64_t q = exp2_50 / (exp2_25 + u); // Q25
816
+
817
+ // SiLU = x * σ : (Q25 * Q25) >> 25 → Q25
818
+ C[idx] = ((r * q) >> 25);
819
+ }
820
+
821
+ extern "C" void silu_q25(int64_t* R, int64_t* C, int64_t* EXP2_FRAC_LUT_25,
822
+ int Bsz, int S, int Dim)
823
+ {
824
+ dim3 grid(Bsz, S, (Dim + 255) / 256);
825
+ dim3 block(256, 1, 1);
826
+
827
+ silu_kernel_q25<<<grid, block>>>(
828
+ R, C, LOG2E_Q25, EXP2_FRAC_LUT_25,
829
+ Bsz, S, Dim);
830
+ }
831
+
832
+ extern "C" void silu_init_q25(int64_t* EXP2_FRAC_LUT)
833
+ {
834
+ int tableSize = 1 << LOG_TABLE_SIZE;
835
+ for(int i = 0; i < tableSize; i++) {
836
+ // EXP2_FRAC_LUT[i] = uint64_t(std::pow(2, i / 1024.0) * 2^25);
837
+ EXP2_FRAC_LUT[i] = int64_t(std::pow(2, i * 1.0f / tableSize) * exp2_25);
838
+ }
839
+ }
840
+
841
+ extern "C" __global__ void sigmoid_kernel_q25(
842
+ int64_t* R, // [B, S, Dim]
843
+ int64_t* C, // [B, S, Dim]
844
+ int64_t LOG2E_Q25, int64_t* EXP2_FRAC_LUT_Q25,
845
+ int Bsz, int S, int Dim)
846
+ {
847
+ int b = blockIdx.x;
848
+ int s = blockIdx.y;
849
+ int d = blockIdx.z * blockDim.x + threadIdx.x;
850
+
851
+ if (d >= Dim) return;
852
+
853
+ int idx = (b * S + s) * Dim + d;
854
+ int64_t r = R[idx];
855
+
856
+ // 饱和区裁剪(可调阈值 64)
857
+ const int64_t LIM = (int64_t)64 << 25;
858
+ if (r >= LIM) // σ≈1 -> SiLU ~= x
859
+ {
860
+ C[idx] = r;
861
+ return;
862
+ }
863
+ if (r <= -LIM) // σ≈0 -> SiLU ~= 0
864
+ {
865
+ C[idx] = 0;
866
+ return;
867
+ }
868
+
869
+ // y = - x * log2(e) / 2^25 (Q25)
870
+ int64_t y = -int64_t((__int128_t(r) * __int128_t(LOG2E_Q25)) >> 25);
871
+
872
+ // u ≈ 2^y = e^{-x} = 2^k * 2^f (Q25)
873
+ int64_t u = 0;
874
+ int64_t k = y >> 25; // 整数部分
875
+ int64_t f = y & 0x1FFFFFF; // 小数部分 (Q25)
876
+ int64_t t = EXP2_FRAC_LUT_Q25[f >> (25 - LOG_TABLE_SIZE)]; // 2^(frac) in Q25
877
+ if (k > -63)
878
+ u = (k < 0) ? (t >> (-k)) : (t << k); // 一般 k<=0
879
+
880
+ // σ = 1 / (1 + u) (Q25)
881
+ C[idx] = exp2_50 / (exp2_25 + u);
882
+ }
883
+
884
+ extern "C" void sigmoid_q25(int64_t* R, int64_t* C, int64_t* EXP2_FRAC_LUT_25,
885
+ int Bsz, int S, int Dim)
886
+ {
887
+ dim3 grid(Bsz, S, (Dim + 255) / 256);
888
+ dim3 block(256, 1, 1);
889
+
890
+ sigmoid_kernel_q25<<<grid, block>>>(
891
+ R, C, LOG2E_Q25, EXP2_FRAC_LUT_25,
892
+ Bsz, S, Dim);
893
+ }
894
+
895
+ // -- end of silu_q25 -----------------------------
896
+
897
+
898
+ // -- start of silu_q23 -----------------------------
899
// Fixed-point constants for the Q23 SiLU/sigmoid kernels below.
static const int64_t LOG2E_Q23 = 12102203;      // round(log2(e) * 2^23)
static const int64_t exp2_23 = 8388608;         // 2^23 == 1.0 in Q23
static const int64_t exp2_46 = 70368744177664;  // 2^46, numerator for Q23 reciprocal
902
+
903
+ extern "C" __global__ void silu_kernel_q23(
904
+ int64_t* R, // [B, S, Dim]
905
+ int64_t* C, // [B, S, Dim]
906
+ int64_t LOG2E_Q23, int64_t* EXP2_FRAC_LUT_Q23,
907
+ int Bsz, int S, int Dim)
908
+ {
909
+ int b = blockIdx.x;
910
+ int s = blockIdx.y;
911
+ int d = blockIdx.z * blockDim.x + threadIdx.x;
912
+
913
+ if (d >= Dim) return;
914
+
915
+ int idx = (b * S + s) * Dim + d;
916
+ int64_t r = R[idx];
917
+
918
+ // 饱和区裁剪(可调阈值 64)
919
+ const int64_t LIM = (int64_t)64 << 23;
920
+ if (r >= LIM) // σ≈1 -> SiLU ~= x
921
+ {
922
+ C[idx] = r;
923
+ return;
924
+ }
925
+ if (r <= -LIM) // σ≈0 -> SiLU ~= 0
926
+ {
927
+ C[idx] = 0;
928
+ return;
929
+ }
930
+
931
+ // y = - x * log2(e) / 2^23 (Q23)
932
+ int64_t y = int64_t((__int128_t(-r) * __int128_t(LOG2E_Q23)) >> 23);
933
+
934
+ // u ≈ 2^y = e^{-x} = 2^k * 2^f (Q23)
935
+ int64_t u = 0;
936
+ int64_t k = y >> 23; // 整数部分
937
+ int64_t f = y & 0x7FFFFF; // 小数部分 (Q23)
938
+ int64_t t = EXP2_FRAC_LUT_Q23[f >> (23 - LOG_TABLE_SIZE)]; // 2^(frac) in Q23
939
+ if (k > -63)
940
+ u = (k < 0) ? (t >> (-k)) : (t << k); // 一般 k<=0
941
+
942
+ // σ = 1 / (1 + u) (Q23)
943
+ int64_t q = exp2_46 / (exp2_23 + u); // Q23
944
+
945
+ // SiLU = x * σ : (Q23 * Q23) >> 23 → Q23
946
+ C[idx] = ((r * q) >> 23);
947
+ }
948
+
949
+ extern "C" void silu_q23(int64_t* R, int64_t* C, int64_t* EXP2_FRAC_LUT_23,
950
+ int Bsz, int S, int Dim)
951
+ {
952
+ dim3 grid(Bsz, S, (Dim + 255) / 256);
953
+ dim3 block(256, 1, 1);
954
+
955
+ silu_kernel_q23<<<grid, block>>>(
956
+ R, C, LOG2E_Q23, EXP2_FRAC_LUT_23,
957
+ Bsz, S, Dim);
958
+ }
959
+
960
+ extern "C" void silu_init_q23(int64_t* EXP2_FRAC_LUT)
961
+ {
962
+ int tableSize = 1 << LOG_TABLE_SIZE;
963
+ for(int i = 0; i < tableSize; i++) {
964
+ // EXP2_FRAC_LUT[i] = uint64_t(std::pow(2, i / 1024.0) * 2^23);
965
+ EXP2_FRAC_LUT[i] = int64_t(std::pow(2, i * 1.0f / tableSize) * exp2_23);
966
+ }
967
+ }
968
+
969
+ extern "C" __global__ void sigmoid_kernel_q23(
970
+ int64_t* R, // [B, S, Dim]
971
+ int64_t* C, // [B, S, Dim]
972
+ int64_t LOG2E_Q23, int64_t* EXP2_FRAC_LUT_Q23,
973
+ int Bsz, int S, int Dim)
974
+ {
975
+ int b = blockIdx.x;
976
+ int s = blockIdx.y;
977
+ int d = blockIdx.z * blockDim.x + threadIdx.x;
978
+
979
+ if (d >= Dim) return;
980
+
981
+ int idx = (b * S + s) * Dim + d;
982
+ int64_t r = R[idx];
983
+
984
+ // 饱和区裁剪(可调阈值 64)
985
+ const int64_t LIM = (int64_t)64 << 23;
986
+ if (r >= LIM) // σ≈1 -> SiLU ~= x
987
+ {
988
+ printf("r: %ld >= LIM", r);
989
+ C[idx] = r;
990
+ return;
991
+ }
992
+ if (r <= -LIM) // σ≈0 -> SiLU ~= 0
993
+ {
994
+ printf("r: %ld <= -LIM", r);
995
+ C[idx] = 0;
996
+ return;
997
+ }
998
+
999
+ // y = - x * log2(e) / 2^23 (Q23)
1000
+ int64_t y = int64_t((__int128_t(-r) * __int128_t(LOG2E_Q23)) >> 23);
1001
+
1002
+ // u ≈ 2^y = e^{-x} = 2^k * 2^f (Q23)
1003
+ int64_t u = 0;
1004
+ int64_t k = y >> 23; // 整数部分
1005
+ int64_t f = y & 0x7FFFFF; // 小数部分 (Q23)
1006
+ int64_t t = EXP2_FRAC_LUT_Q23[f >> (23 - LOG_TABLE_SIZE)]; // 2^(frac) in Q23
1007
+ if (k > -63)
1008
+ u = (k < 0) ? (t >> (-k)) : (t << k); // 一般 k<=0
1009
+
1010
+ // if(s == 0 && d == 4)
1011
+ // {
1012
+ // printf("s: %d, d: %d, x: %ld, y: %ld, k: %ld, f: %ld, t: %ld, u: %ld\n", s, d, r, y, k, f, t, u);
1013
+ // }
1014
+
1015
+ // σ = 1 / (1 + u) (Q23)
1016
+ C[idx] = exp2_46 / (exp2_23 + u);
1017
+ }
1018
+
1019
+ extern "C" void sigmoid_q23(int64_t* R, int64_t* C, int64_t* EXP2_FRAC_LUT_23,
1020
+ int Bsz, int S, int Dim)
1021
+ {
1022
+ dim3 grid(Bsz, S, (Dim + 255) / 256);
1023
+ dim3 block(256, 1, 1);
1024
+
1025
+ sigmoid_kernel_q23<<<grid, block>>>(
1026
+ R, C, LOG2E_Q23, EXP2_FRAC_LUT_23,
1027
+ Bsz, S, Dim);
1028
+ }
1029
+
1030
+ // -- end of silu_q23 -----------------------------
inference/kernel.py ADDED
@@ -0,0 +1,724 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Tuple
2
+
3
+ import math
4
+ import random
5
+ import torch
6
+ import ctypes
7
+ import triton
8
+ import triton.language as tl
9
+ from triton import Config
10
+
11
+
12
+
13
@triton.jit
def act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr):
    """Per-block absmax quantization kernel.

    Each program instance handles one contiguous block of ``BLOCK_SIZE``
    elements: it computes the block scale as max(|x|) / 448 (448 being the
    largest finite float8_e4m3fn value), writes x / scale into ``y_ptr``
    (cast to the output element type) and the scale itself into ``s_ptr``.

    Args:
        x_ptr: Pointer to the input tensor.
        y_ptr: Pointer to the quantized output tensor.
        s_ptr: Pointer to the per-block scaling factors.
        BLOCK_SIZE: Number of elements processed per program instance.
    """
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    block = tl.load(x_ptr + offsets).to(tl.float32)
    # NOTE(review): a block of all zeros yields scale 0 and a 0/0 divide;
    # presumably inputs are never all-zero per block — confirm.
    scale = tl.max(tl.abs(block)) / 448.
    quantized = (block / scale).to(y_ptr.dtype.element_ty)
    tl.store(y_ptr + offsets, quantized)
    tl.store(s_ptr + pid, scale)
35
+
36
+ # 把 张量 x 进行 量化
37
def act_quant(x: torch.Tensor, block_size: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
    """Quantize `x` block-wise to FP8 (float8_e4m3fn).

    The last dimension is split into groups of ``block_size`` elements and
    each group gets its own float32 scale, so `s` has shape
    ``(*x.shape[:-1], x.shape[-1] // block_size)``.

    Args:
        x: Contiguous input tensor whose last dimension is divisible by
            ``block_size``.
        block_size: Elements per quantization group (default 128).

    Returns:
        Tuple of (quantized tensor in float8_e4m3fn, float32 scales).
    """
    assert x.is_contiguous(), 'Input tensor must be contiguous'
    assert x.size(-1) % block_size == 0, f'Last dimension size must be divisible by block_size (block_size={block_size})'

    quantized = torch.empty_like(x, dtype=torch.float8_e4m3fn)
    scales = x.new_empty(*x.size()[:-1], x.size(-1) // block_size, dtype=torch.float32)

    # One Triton program per block of BLOCK_SIZE elements over the flat tensor.
    def grid(meta):
        return (triton.cdiv(x.numel(), meta['BLOCK_SIZE']),)

    act_quant_kernel[grid](x, quantized, scales, BLOCK_SIZE=block_size)
    return quantized, scales
65
+
66
+
67
@triton.jit
def weight_dequant_kernel(x_ptr, s_ptr, y_ptr, M, N, BLOCK_SIZE: tl.constexpr):
    """Tile-wise weight dequantization kernel.

    Each program instance handles one BLOCK_SIZE x BLOCK_SIZE tile of the
    (M, N) weight matrix: it loads the quantized tile, multiplies by that
    tile's scalar scale factor, and stores the float32 result.

    Args:
        x_ptr: Pointer to the quantized weights, shape (M, N).
        s_ptr: Pointer to the per-tile scale factors.
        y_ptr: Pointer to the dequantized output, shape (M, N).
        M: Number of rows.
        N: Number of columns.
        BLOCK_SIZE: Tile edge length.
    """
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)
    tiles_per_row = tl.cdiv(N, BLOCK_SIZE)  # scale grid width
    row_offs = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    col_offs = pid_n * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    offs = row_offs[:, None] * N + col_offs[None, :]
    in_bounds = (row_offs[:, None] < M) & (col_offs[None, :] < N)
    tile = tl.load(x_ptr + offs, mask=in_bounds).to(tl.float32)
    scale = tl.load(s_ptr + pid_m * tiles_per_row + pid_n)
    tl.store(y_ptr + offs, tile * scale, mask=in_bounds)
94
+
95
+
96
def weight_dequant(x: torch.Tensor, s: torch.Tensor, block_size: int = 128) -> torch.Tensor:
    """Dequantize a 2-D weight tensor using per-tile scale factors.

    Args:
        x: Quantized weight tensor of shape (M, N), contiguous.
        s: Per-tile scale tensor of shape
            (ceil(M / block_size), ceil(N / block_size)), contiguous.
        block_size: Tile edge used during quantization (default 128).

    Returns:
        Dequantized tensor with the same shape as `x`, in the default dtype.

    Raises:
        AssertionError: If `x` or `s` is not contiguous or not 2-D.
    """
    assert x.is_contiguous() and s.is_contiguous(), 'Input tensors must be contiguous'
    assert x.dim() == 2 and s.dim() == 2, 'Input tensors must have 2 dimensions'

    M, N = x.size()
    out = torch.empty_like(x, dtype=torch.get_default_dtype())

    # One program per BLOCK_SIZE x BLOCK_SIZE tile.
    def grid(meta):
        return (triton.cdiv(M, meta['BLOCK_SIZE']), triton.cdiv(N, meta['BLOCK_SIZE']))

    weight_dequant_kernel[grid](x, s, out, M, N, BLOCK_SIZE=block_size)
    return out
118
+
119
+
120
# Autotuning search space for fp8_gemm_kernel: the K tile is fixed at 128
# (the quantization block size), while M/N tiles and pipeline depth vary.
fp8_gemm_configs = [
    Config({'BLOCK_SIZE_M': block_m, 'BLOCK_SIZE_N': block_n, 'BLOCK_SIZE_K': 128}, num_stages=num_stages, num_warps=8)
    for block_m in [16, 32, 64] for block_n in [32, 64, 128] for num_stages in [3, 4, 5, 6]
]
124
+
125
@triton.autotune(configs=fp8_gemm_configs, key=['N', 'K'])
@triton.jit
def fp8_gemm_kernel(a_ptr, b_ptr, c_ptr,
                    a_s_ptr, b_s_ptr,
                    M, N: tl.constexpr, K: tl.constexpr,
                    BLOCK_SIZE_M: tl.constexpr,
                    BLOCK_SIZE_N: tl.constexpr,
                    BLOCK_SIZE_K: tl.constexpr):
    """
    Performs a matrix multiplication operation on FP8 matrices with scaling factors.

    C = A @ B^T where A is (M, K) and B is (N, K); each K-slice's partial
    product is rescaled by the per-block scale factors of A and B before
    accumulation in float32.

    Args:
        a_ptr (tl.tensor): Pointer to the first input matrix A, row-major (M, K).
        b_ptr (tl.tensor): Pointer to the second input matrix B, row-major (N, K).
        c_ptr (tl.tensor): Pointer to the output matrix C, row-major (M, N).
        a_s_ptr (tl.tensor): Pointer to the scaling factors for matrix A
            (one per row per K-block).
        b_s_ptr (tl.tensor): Pointer to the scaling factors for matrix B
            (one per BLOCK_SIZE_K x BLOCK_SIZE_K tile).
        M (int): Number of rows in matrix A and C.
        N (tl.constexpr): Number of columns in matrix B and C.
        K (tl.constexpr): Number of columns in matrix A and rows in matrix B.
        BLOCK_SIZE_M (tl.constexpr): Block size for the M dimension.
        BLOCK_SIZE_N (tl.constexpr): Block size for the N dimension.
        BLOCK_SIZE_K (tl.constexpr): Block size for the K dimension.

    Returns:
        None
    """
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)
    k = tl.cdiv(K, BLOCK_SIZE_K)  # number of K tiles (= scale entries per row)
    # Modulo keeps out-of-range rows/cols in bounds for the loads; the final
    # store is masked, so any duplicated loads are discarded.
    offs_m = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    # A is read as (BLOCK_M, BLOCK_K); B as (BLOCK_K, BLOCK_N), i.e. B^T tile.
    a_ptrs = a_ptr + offs_m[:, None] * K + offs_k[None, :]
    b_ptrs = b_ptr + offs_n[None, :] * K + offs_k[:, None]
    a_s_ptrs = a_s_ptr + offs_m * k
    b_s_ptrs = b_s_ptr + (offs_n // BLOCK_SIZE_K) * k

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for i in range(k):
        # Mask the tail K tile when K is not a multiple of BLOCK_SIZE_K.
        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - i * BLOCK_SIZE_K, other=0.0)
        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - i * BLOCK_SIZE_K, other=0.0)
        a_s = tl.load(a_s_ptrs)
        b_s = tl.load(b_s_ptrs)
        # Dequantize each partial product with the current K-block's scales.
        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
        a_ptrs += BLOCK_SIZE_K
        b_ptrs += BLOCK_SIZE_K
        a_s_ptrs += 1
        b_s_ptrs += 1
    c = accumulator.to(c_ptr.dtype.element_ty)
    # Recompute unwrapped offsets for the masked store.
    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = c_ptr + offs_m[:, None] * N + offs_n[None, :]
    mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
    tl.store(c_ptrs, c, mask=mask)
180
+
181
+ # FP8通用矩阵乘法
182
def fp8_gemm(a: torch.Tensor, a_s: torch.Tensor, b: torch.Tensor, b_s: torch.Tensor):
    """General matrix multiply in FP8 precision: returns a @ b^T.

    Leading dimensions of `a` are flattened into M; `b` is (N, K) and the
    result has shape (*a.shape[:-1], N) in the default dtype.

    Args:
        a: First input matrix, contiguous, last dim K.
        a_s: Per-block scale factors for `a`, contiguous.
        b: Second input matrix (N, K), contiguous.
        b_s: Per-block scale factors for `b`, contiguous.

    Returns:
        torch.Tensor: The result of the matrix multiplication.
    """
    assert a.is_contiguous() and b.is_contiguous(), 'Input tensors must be contiguous'
    assert a_s.is_contiguous() and b_s.is_contiguous(), 'Scaling factor tensors must be contiguous'

    K = a.size(-1)
    M = a.numel() // K          # all leading dims folded into rows
    N = b.size(0)
    out = a.new_empty(*a.size()[:-1], N, dtype=torch.get_default_dtype())

    def grid(META):
        return (triton.cdiv(M, META['BLOCK_SIZE_M']), triton.cdiv(N, META['BLOCK_SIZE_N']))

    fp8_gemm_kernel[grid](a, b, out, a_s, b_s, M, N, K)
    return out
204
+
205
# Load the CUDA shared library with the fixed-point kernels.
# NOTE(review): path is relative to the working directory — the process must
# be started from the directory containing libint64gemm.so.
lib = ctypes.CDLL("./libint64gemm.so")

# Declare argument types for every exported launcher so ctypes marshals
# pointers and scalars correctly (pointers are passed as tensor.data_ptr()).
lib.int64_64_bmm_broadcast_launcher.argtypes = [
    ctypes.c_void_p,  # A
    ctypes.c_void_p,  # B
    ctypes.c_void_p,  # C
    ctypes.c_void_p,  # R
    ctypes.c_longlong, ctypes.c_longlong, ctypes.c_longlong,
    ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int
]

lib.int64_32_bmm_broadcast_launcher.argtypes = [
    ctypes.c_void_p,  # A
    ctypes.c_void_p,  # B
    ctypes.c_void_p,  # C
    ctypes.c_void_p,  # R
    ctypes.c_longlong, ctypes.c_longlong, ctypes.c_longlong,
    ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int
]

lib.complex_int64_mul.argtypes = [
    ctypes.c_void_p,  # A
    ctypes.c_void_p,  # B
    ctypes.c_void_p,  # C
    ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int
]

lib.rms_norm_32.argtypes = [
    ctypes.c_void_p,  # A
    ctypes.c_void_p,  # W
    ctypes.c_void_p,  # rms
    ctypes.c_void_p,  # C
    ctypes.c_int, ctypes.c_int
]

lib.rms_norm_64.argtypes = [
    ctypes.c_void_p,  # A
    ctypes.c_void_p,  # W
    ctypes.c_void_p,  # rms
    ctypes.c_void_p,  # C
    ctypes.c_int, ctypes.c_int
]

lib.einsum_bshd_hdc_bshc.argtypes = [
    ctypes.c_void_p,  # A
    ctypes.c_void_p,  # B
    ctypes.c_void_p,  # C
    ctypes.c_longlong,
    ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int
]

lib.einsum_bshc_btc_bsht.argtypes = [
    ctypes.c_void_p,  # A
    ctypes.c_void_p,  # B
    ctypes.c_void_p,  # C
    ctypes.c_longlong,
    ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int
]

lib.einsum_bsht_btc_bshc.argtypes = [
    ctypes.c_void_p,  # A
    ctypes.c_void_p,  # B
    ctypes.c_void_p,  # C
    ctypes.c_longlong,
    ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int
]

lib.einsum_bshc_hdc_bshd.argtypes = [
    ctypes.c_void_p,  # A
    ctypes.c_void_p,  # B
    ctypes.c_void_p,  # C
    ctypes.c_longlong,
    ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int
]

lib.softmax_q21.argtypes = [
    ctypes.c_void_p,
    ctypes.c_void_p,
    ctypes.c_void_p,
    ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int
]

lib.softmax_q19.argtypes = [
    ctypes.c_void_p,
    ctypes.c_void_p,
    ctypes.c_void_p,
    ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int
]

lib.softmax_init_q21.argtypes = [
    ctypes.c_void_p
]

lib.softmax_init_q19.argtypes = [
    ctypes.c_void_p
]

lib.silu_q25.argtypes = [
    ctypes.c_void_p,
    ctypes.c_void_p,
    ctypes.c_void_p,
    ctypes.c_int, ctypes.c_int, ctypes.c_int
]

lib.sigmoid_q25.argtypes = [
    ctypes.c_void_p,
    ctypes.c_void_p,
    ctypes.c_void_p,
    ctypes.c_int, ctypes.c_int, ctypes.c_int
]

lib.silu_init_q25.argtypes = [
    ctypes.c_void_p
]

lib.silu_q23.argtypes = [
    ctypes.c_void_p,
    ctypes.c_void_p,
    ctypes.c_void_p,
    ctypes.c_int, ctypes.c_int, ctypes.c_int
]

lib.sigmoid_q23.argtypes = [
    ctypes.c_void_p,
    ctypes.c_void_p,
    ctypes.c_void_p,
    ctypes.c_int, ctypes.c_int, ctypes.c_int
]

lib.silu_init_q23.argtypes = [
    ctypes.c_void_p
]
339
+
340
+
341
def int64_bmm_broadcast(A: torch.Tensor, B: torch.Tensor, a_rescale, b_rescale, c_rescale) -> tuple[torch.Tensor, torch.Tensor]:
    """Batched int64 matmul with broadcast: (B, M, K) x (N, K) -> (B, M, N).

    Dispatches to the CUDA launcher matching B's dtype (int64 or int32).

    Args:
        A: int64 CUDA tensor of shape (B, M, K).
        B: int64 or int32 CUDA tensor of shape (N, K).
        a_rescale, b_rescale, c_rescale: fixed-point rescale factors
            forwarded to the kernel.

    Returns:
        (C, R): quotient and remainder tensors of shape (B, M, N), int64.

    Raises:
        TypeError: if B is neither int64 nor int32.
    """
    assert A.dtype == torch.int64
    assert A.is_cuda and B.is_cuda
    Bdim, M, K = A.shape
    N, K2 = B.shape
    assert K2 == K, f"Inner dimensions must match: {K2} != {K}"

    C = torch.empty((Bdim, M, N), dtype=torch.int64, device="cuda")
    R = torch.empty((Bdim, M, N), dtype=torch.int64, device="cuda")

    if B.dtype == torch.int64:
        lib.int64_64_bmm_broadcast_launcher(
            A.data_ptr(), B.data_ptr(), C.data_ptr(), R.data_ptr(),
            a_rescale, b_rescale, c_rescale,
            Bdim, M, K, N
        )
    elif B.dtype == torch.int32:
        lib.int64_32_bmm_broadcast_launcher(
            A.data_ptr(), B.data_ptr(), C.data_ptr(), R.data_ptr(),
            a_rescale, b_rescale, c_rescale,
            Bdim, M, K, N
        )
    else:
        # Previously this only printed a message and returned the
        # *uninitialized* C/R buffers; fail loudly instead.
        raise TypeError(f'Unsupported B type: {B.dtype}')
    return (C, R)
372
+
373
def complex_int64_mul_broadcast(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
    """Element-wise complex multiply of int64 tensors via the CUDA kernel.

    Both inputs are (batch, seq, head, head_dim) int64 CUDA tensors; the
    result has the same shape and dtype.
    """
    # print(f'A type: {A.dtype}, B type: {B.dtype}')
    assert A.dtype == torch.int64 and B.dtype == torch.int64
    assert A.is_cuda and B.is_cuda

    batch, seq_len, n_head, head_dim = A.shape

    out = torch.zeros(A.shape, dtype=torch.int64, device=A.device)

    lib.complex_int64_mul(
        A.data_ptr(), B.data_ptr(), out.data_ptr(),
        batch, seq_len, n_head, head_dim)

    return out
396
+
397
def einsum_bshd_hdc_bshc(A: torch.Tensor, B: torch.Tensor, rescale) -> torch.Tensor:
    """Int64 contraction 'bshd,hdc->bshc' via the CUDA kernel."""
    assert A.shape[2] == B.shape[0] and A.shape[3] == B.shape[1]
    assert A.is_cuda and B.is_cuda

    b, s, h, d = A.shape
    c = B.shape[2]

    out = torch.zeros([b, s, h, c], dtype=torch.int64, device=A.device)
    lib.einsum_bshd_hdc_bshc(A.data_ptr(), B.data_ptr(), out.data_ptr(),
                             rescale, b, s, h, d, c)
    return out
416
+
417
def einsum_bshc_btc_bsht(A: torch.Tensor, B: torch.Tensor, rescale) -> torch.Tensor:
    """Int64 contraction 'bshc,btc->bsht' via the CUDA kernel."""
    b, s, h, c = A.shape
    t = B.shape[1]

    assert b == B.shape[0] and c == B.shape[2]
    assert A.is_cuda and B.is_cuda

    out = torch.zeros([b, s, h, t], dtype=torch.int64, device=A.device)
    lib.einsum_bshc_btc_bsht(A.data_ptr(), B.data_ptr(), out.data_ptr(),
                             rescale, b, s, h, t, c)
    return out
436
+
437
def einsum_bsht_btc_bshc(A: torch.Tensor, B: torch.Tensor, rescale) -> torch.Tensor:
    """Int64 contraction 'bsht,btc->bshc' via the CUDA kernel."""
    b, s, h, t = A.shape
    c = B.shape[2]

    assert b == B.shape[0] and t == B.shape[1]
    assert A.is_cuda and B.is_cuda

    out = torch.zeros([b, s, h, c], dtype=torch.int64, device=A.device)
    lib.einsum_bsht_btc_bshc(A.data_ptr(), B.data_ptr(), out.data_ptr(),
                             rescale, b, s, h, t, c)
    return out
456
+
457
def einsum_bshc_hdc_bshd(A: torch.Tensor, B: torch.Tensor, rescale) -> torch.Tensor:
    """Int64 contraction 'bshc,hdc->bshd' via the CUDA kernel."""
    b, s, h, c = A.shape
    d = B.shape[1]

    assert h == B.shape[0] and c == B.shape[2]
    assert A.is_cuda and B.is_cuda

    out = torch.zeros([b, s, h, d], dtype=torch.int64, device=A.device)
    lib.einsum_bshc_hdc_bshd(A.data_ptr(), B.data_ptr(), out.data_ptr(),
                             rescale, b, s, h, d, c)
    return out
476
+
477
def int64_RMS0(A: torch.Tensor, eps: int, dim: int) -> int:
    """Exact integer RMS of a 1-D int64 tensor.

    Computes isqrt((eps + sum(a^2)) // dim) using Python integers, which
    are arbitrary precision, so the sum of squares cannot overflow.

    Args:
        A: 1-D int64 tensor (CPU or CUDA).
        eps: integer epsilon added to the sum of squares.
        dim: divisor applied before the integer square root.

    Returns:
        The integer RMS value (Python int, not a tensor — the original
        annotation said torch.Tensor, which was wrong).
    """
    assert A.dtype == torch.int64
    assert A.ndim == 1

    # One bulk host transfer instead of one .item() sync per element.
    vals = A.tolist()
    acc = eps + sum(v * v for v in vals)

    acc //= dim

    return math.isqrt(acc)
495
+
496
# Fixed-point RMS-norm kernel.
#   x      is Q31 (scale 2**31), range 0 .. 2^31
#   weight is Q21 (scale 2**21), range 2^5 .. 2^20
#   rms    is Q31 (scale 2**31)
#   result is Q21: 31 + 21 - 31 = 21
@triton.jit
def int64_rms_norm_kernel(
    A_ptr, W_ptr, C_ptr, RMS_ptr,
    N,
    batch_stride_a, batch_stride_c,
    BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr
):
    # One program instance per row; the row is walked serially element by
    # element (scalar loads/stores, no vectorization).
    pid_m = tl.program_id(0)

    for i in range(0, N):
        a_ptrs = A_ptr + pid_m * batch_stride_a + i
        w_ptrs = W_ptr + i
        rms_ptrs = RMS_ptr + pid_m
        a = tl.load(a_ptrs, mask=None)
        w = tl.load(w_ptrs, mask=None)
        rms = tl.load(rms_ptrs, mask=None)

        # Q31 * Q21 // Q31 -> Q21.
        res = a * w // rms

        # Guard against int64 overflow of the intermediate product.
        # NOTE(review): the check runs after `res` is already computed and
        # only trips in device-assert-enabled builds — confirm intent.
        prod = a * w
        tl.device_assert(prod > -2 ** 62 and prod < 2 ** 62, "Integer overflow risk!!!")

        c_ptrs = C_ptr + pid_m * batch_stride_c + i
        tl.store(c_ptrs, res, mask=None)
524
+
525
# Reusable scratch buffers for per-row RMS values (avoids reallocating on
# every call). Grown on demand by RMS_Norm_int64; previously fixed at 500
# rows, which crashed for larger batches.
rms = torch.empty((500, ), dtype=torch.int64, device='cpu')
rms_gpu = torch.empty((500, ), dtype=torch.int64, device='cuda')

def RMS_Norm_int64(A: torch.Tensor, W: torch.Tensor, eps, dim) -> tuple:
    """Fixed-point RMS-norm over the rows of a 2-D int64 CUDA tensor.

    The per-row RMS is computed exactly on the CPU (arbitrary-precision
    Python ints via int64_RMS0), then the elementwise normalization runs on
    the GPU through rms_norm_32 / rms_norm_64, chosen by W's dtype.

    Returns:
        (C, rms): the normalized (M, N) int64 tensor and the CPU scratch
        tensor of per-row RMS values (only the first M entries are valid).
    """
    global rms
    global rms_gpu

    assert A.dtype == torch.int64
    assert A.is_cuda and W.is_cuda
    assert A.ndim == 2

    M, N = A.shape

    # Grow both scratch buffers together if the batch exceeds their size.
    if M > rms.numel():
        rms = torch.empty((M, ), dtype=torch.int64, device='cpu')
        rms_gpu = torch.empty((M, ), dtype=torch.int64, device='cuda')

    for i in range(M):
        rms[i] = int64_RMS0(A[i], eps, dim)

    rms_gpu.copy_(rms)
    C = torch.empty((M, N), dtype=torch.int64, device=A.device)

    if W.dtype == torch.int32:
        lib.rms_norm_32(A.data_ptr(), W.data_ptr(), rms_gpu.data_ptr(), C.data_ptr(), M, N)
    else:
        lib.rms_norm_64(A.data_ptr(), W.data_ptr(), rms_gpu.data_ptr(), C.data_ptr(), M, N)

    return (C, rms)
551
+
552
+
553
def saveTensor(fileName, t):
    """Serialize tensor `t` to `fileName` as raw C-order bytes.

    The tensor is detached, moved to CPU if necessary, and made contiguous
    before its buffer is dumped verbatim (no header, no dtype metadata).
    """
    t = t.detach()
    if t.device.type != "cpu":
        t = t.cpu()
    t = t.contiguous()
    # Single binary open. The previous implementation opened the file twice
    # (text mode, then binary inside it); the first open only truncated the
    # file and its handle was never used.
    with open(fileName, "wb") as f:
        # .numpy() -> bytes (C-order)
        f.write(t.numpy().tobytes(order="C"))
566
+
567
# GPU lookup tables for the fractional part of exp2, used by the
# fixed-point softmax kernels. Populated by the init functions below.
EXP2_FRAC_LUT_Q21 = None
# LOG_TABLE_SIZE = 10
LOG_TABLE_SIZE = 8

def softmax_init_q21():
    """Fill the Q21 exp2-fraction LUT on the CPU and move it to the GPU."""
    global EXP2_FRAC_LUT_Q21

    table = torch.zeros((2 ** LOG_TABLE_SIZE, ), dtype=torch.int64, device="cpu")
    lib.softmax_init_q21(table.data_ptr())

    EXP2_FRAC_LUT_Q21 = table.cuda()

EXP2_FRAC_LUT_Q19 = None
def softmax_init_q19():
    """Fill the Q19 exp2-fraction LUT on the CPU and move it to the GPU."""
    global EXP2_FRAC_LUT_Q19

    table = torch.zeros((2 ** LOG_TABLE_SIZE, ), dtype=torch.int64, device="cpu")
    lib.softmax_init_q19(table.data_ptr())

    EXP2_FRAC_LUT_Q19 = table.cuda()
    # saveTensor(f'zkdata/softmax_q19_table.bin', table.cpu())
591
+ # saveTensor(f'zkdata/softmax_q19_table.bin', EXP2_FRAC_LUT0.cpu())
592
+
593
+
594
+
595
def softmax_q21(R: torch.Tensor, C: torch.Tensor):
    """Fixed-point (Q21) softmax over the last dim of R, written into C.

    Requires softmax_init_q21() to have populated the LUT first.
    """
    assert R.is_cuda and C.is_cuda

    batch, seq, heads, toks = R.shape
    lib.softmax_q21(R.data_ptr(), C.data_ptr(), EXP2_FRAC_LUT_Q21.data_ptr(),
                    batch, seq, heads, toks)

def softmax_q19(R: torch.Tensor, C: torch.Tensor):
    """Fixed-point (Q19) softmax over the last dim of R, written into C.

    Requires softmax_init_q19() to have populated the LUT first.
    """
    assert R.is_cuda and C.is_cuda

    batch, seq, heads, toks = R.shape
    lib.softmax_q19(R.data_ptr(), C.data_ptr(), EXP2_FRAC_LUT_Q19.data_ptr(),
                    batch, seq, heads, toks)
620
+
621
+
622
# start of silu_q25 ---------------------------------
EXP2_FRAC_LUT_Q25 = None

def silu_init_q25():
    """Fill the Q25 exp2-fraction LUT on the CPU and move it to the GPU."""
    global EXP2_FRAC_LUT_Q25

    table = torch.zeros((2 ** LOG_TABLE_SIZE, ), dtype=torch.int64, device="cpu")
    lib.silu_init_q25(table.data_ptr())

    EXP2_FRAC_LUT_Q25 = table.cuda()

def silu_q25(R: torch.Tensor, C: torch.Tensor):
    """Fixed-point (Q25) SiLU applied elementwise to R, written into C."""
    batch, seq, feat = R.shape
    lib.silu_q25(R.data_ptr(), C.data_ptr(), EXP2_FRAC_LUT_Q25.data_ptr(),
                 batch, seq, feat)

def sigmoid_q25(R: torch.Tensor, C: torch.Tensor):
    """Fixed-point (Q25) sigmoid applied elementwise to R, written into C."""
    batch, seq, feat = R.shape
    lib.sigmoid_q25(R.data_ptr(), C.data_ptr(), EXP2_FRAC_LUT_Q25.data_ptr(),
                    batch, seq, feat)
# end of silu_q25 ---------------------------------
654
+
655
# start of silu_q23 ---------------------------------
EXP2_FRAC_LUT_Q23 = None

def silu_init_q23():
    """Fill the Q23 exp2-fraction LUT on the CPU and move it to the GPU."""
    global EXP2_FRAC_LUT_Q23

    table = torch.zeros((2 ** LOG_TABLE_SIZE, ), dtype=torch.int64, device="cpu")
    lib.silu_init_q23(table.data_ptr())

    EXP2_FRAC_LUT_Q23 = table.cuda()
    # saveTensor(f'zkdata/silu_q23_table.bin', table.cpu())

def silu_q23(R: torch.Tensor, C: torch.Tensor):
    """Fixed-point (Q23) SiLU applied elementwise to R, written into C."""
    batch, seq, feat = R.shape
    lib.silu_q23(R.data_ptr(), C.data_ptr(), EXP2_FRAC_LUT_Q23.data_ptr(),
                 batch, seq, feat)

def sigmoid_q23(R: torch.Tensor, C: torch.Tensor):
    """Fixed-point (Q23) sigmoid applied elementwise to R, written into C."""
    batch, seq, feat = R.shape
    lib.sigmoid_q23(R.data_ptr(), C.data_ptr(), EXP2_FRAC_LUT_Q23.data_ptr(),
                    batch, seq, feat)
# end of silu_q23 ---------------------------------
689
+
690
+
691
if __name__ == "__main__":
    # Smoke test: compare the fixed-point Q21 softmax kernel against
    # torch's float softmax on a small random tensor (requires CUDA and
    # libint64gemm.so).
    softmax_init_q21()

    torch.manual_seed(0)
    device = "cuda"

    Bsz = 1
    S = 1
    H = 2
    T = 10

    A = torch.rand([Bsz, S, H, T], dtype=torch.bfloat16, device=device)
    # Quantize to Q21 fixed point.
    a = (A.to(torch.float32) * (2 ** 21)).to(torch.int64)
    # a = (A * (2 ** 21)).to(torch.int64)

    print('A: ' + str(A))
    print('a: ' + str(a))

    c = torch.zeros([Bsz, S, H, T], dtype=torch.int64, device=device)

    softmax_q21(a, c)

    # Float reference.
    r0 = A.softmax(dim=-1, dtype=torch.float32).type_as(A)
    print('r0: ' + str(r0))

    # Dequantize the fixed-point result for visual comparison.
    r1 = (c.to(torch.float32) * (2 ** -21)).to(torch.bfloat16)
    print('r1: ' + str(r1))


    # Re-quantized reference, to compare integer-vs-integer.
    R0 = (r0.to(torch.float32) * (2 ** 21)).to(torch.int64)
    # R0 = (r0 * (2 ** 21)).to(torch.int64)
    print('R0: ' + str(R0))

    print('R1: ' + str(c))
inference/model.py ADDED
@@ -0,0 +1,1631 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import math
3
+ import datetime
4
+ from dataclasses import dataclass
5
+ from typing import Tuple, Optional, Literal
6
+
7
+ import torch
8
+ from torch import nn
9
+ import torch.nn.functional as F
10
+ import torch.distributed as dist
11
+ from safetensors.torch import load_model
12
+
13
+ from kernel import act_quant, weight_dequant, fp8_gemm, int64_bmm_broadcast, \
14
+ complex_int64_mul_broadcast, einsum_bshd_hdc_bshc, einsum_bshc_btc_bsht, softmax_init_q21, softmax_q21, einsum_bsht_btc_bshc, einsum_bshc_hdc_bshd, \
15
+ silu_init_q25, silu_q25, sigmoid_q25, softmax_init_q19, softmax_q19, silu_init_q23, silu_q23, sigmoid_q23, RMS_Norm_int64
16
+
17
+
18
# Distributed / quantization configuration shared across this module.
world_size = 1   # number of model-parallel processes
rank = 0         # this process's rank
block_size = 128  # quantization block size for fp8 weights
gemm_impl: Literal["bf16", "fp8"] = "bf16"       # GEMM backend for quantized weights
attn_impl: Literal["naive", "absorb"] = "absorb"  # attention implementation

# NOTE(review): presumably toggles SNARK-related instrumentation elsewhere
# in the file — confirm; it is not read in this chunk.
snark = False
25
+
26
@dataclass
class ModelArgs:
    """
    Data class for defining model arguments and hyperparameters.

    Defaults match the 16B configuration (configs/config_16B.json).

    Attributes:
        max_batch_size (int): Maximum batch size.
        max_seq_len (int): Maximum sequence length.
        dtype (Literal["bf16", "fp8"]): Data type for computations.
        vocab_size (int): Vocabulary size.
        dim (int): Model dimension.
        inter_dim (int): Intermediate dimension for MLP layers.
        moe_inter_dim (int): Intermediate dimension for MoE layers.
        n_layers (int): Number of transformer layers.
        n_dense_layers (int): Number of dense layers in the model.
        n_heads (int): Number of attention heads.
        n_routed_experts (int): Number of routed experts for MoE layers.
        n_shared_experts (int): Number of shared experts for MoE layers.
        n_activated_experts (int): Number of activated experts in MoE layers.
        n_expert_groups (int): Number of expert groups.
        n_limited_groups (int): Number of limited groups for MoE routing.
        score_func (Literal["softmax", "sigmoid"]): Scoring function for MoE routing.
        route_scale (float): Scaling factor for routing scores.
        q_lora_rank (int): LoRA rank for query projections.
        kv_lora_rank (int): LoRA rank for key-value projections.
        qk_nope_head_dim (int): Dimension for query-key projections without positional embeddings.
        qk_rope_head_dim (int): Dimension for query-key projections with rotary embeddings.
        v_head_dim (int): Dimension for value projections.
        original_seq_len (int): Original sequence length.
        rope_theta (float): Base for rotary positional encoding.
        rope_factor (float): Scaling factor for extended sequence lengths.
        beta_fast (int): Fast beta correction factor.
        beta_slow (int): Slow beta correction factor.
        mscale (float): Scaling factor for extended attention.
    """
    max_batch_size: int = 8
    max_seq_len: int = 4096 * 4
    dtype: Literal["bf16", "fp8"] = "bf16"
    vocab_size: int = 102400
    dim: int = 2048
    inter_dim: int = 10944
    moe_inter_dim: int = 1408
    n_layers: int = 27
    n_dense_layers: int = 1
    n_heads: int = 16
    # moe
    n_routed_experts: int = 64
    n_shared_experts: int = 2
    n_activated_experts: int = 6
    n_expert_groups: int = 1
    n_limited_groups: int = 1
    score_func: Literal["softmax", "sigmoid"] = "softmax"
    route_scale: float = 1.
    # mla (multi-head latent attention)
    q_lora_rank: int = 0
    kv_lora_rank: int = 512
    qk_nope_head_dim: int = 128
    qk_rope_head_dim: int = 64
    v_head_dim: int = 128
    # yarn (RoPE extension)
    original_seq_len: int = 4096
    rope_theta: float = 10000.0
    rope_factor: float = 40
    beta_fast: int = 32
    beta_slow: int = 1
    mscale: float = 1.
92
+
93
def saveTensor(fileName, t):
    """Serialize tensor `t` to `fileName` as raw C-order bytes.

    The tensor is detached, moved to CPU if necessary, and made contiguous
    before its buffer is dumped verbatim (no header, no dtype metadata).
    """
    t = t.detach()
    if t.device.type != "cpu":
        t = t.cpu()
    t = t.contiguous()
    # Single binary open. The previous implementation opened the file twice
    # (text mode, then binary inside it); the first open only truncated the
    # file and its handle was never used.
    with open(fileName, "wb") as f:
        # .numpy() -> bytes (C-order)
        f.write(t.numpy().tobytes(order="C"))
102
+
103
class ParallelEmbedding(nn.Module):
    """
    Embedding layer with parallelism support across distributed processes.

    The vocabulary is sharded evenly across ranks; each rank holds a
    contiguous slice [vocab_start_idx, vocab_end_idx) of the table.

    Args:
        vocab_size (int): Vocabulary size.
        dim (int): Embedding dimension.
    """
    def __init__(self, vocab_size: int, dim: int):
        super().__init__()
        self.vocab_size = vocab_size
        self.dim = dim
        assert vocab_size % world_size == 0, f"Vocabulary size must be divisible by world size (world_size={world_size})"
        self.part_vocab_size = (vocab_size // world_size)
        self.vocab_start_idx = rank * self.part_vocab_size
        self.vocab_end_idx = self.vocab_start_idx + self.part_vocab_size
        # weight shape: [part_vocab_size, dim]; stored as int64 — the
        # embedding table holds fixed-point values, and is a buffer (loaded
        # from a checkpoint, never trained).
        self.register_buffer("weight", torch.empty(self.part_vocab_size, self.dim, dtype=torch.int64))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass for parallel embedding layer.

        Args:
            x (torch.Tensor): Input tensor containing token indices.

        Returns:
            torch.Tensor: Embedded representations (int64 fixed point).
        """
        # print('aaab ' + str(self.weight[0][0].type()))
        if world_size > 1:
            # Indices outside this rank's vocab shard.
            mask = (x < self.vocab_start_idx) | (x >= self.vocab_end_idx)
            # Shift indices into the local shard's range.
            x = x - self.vocab_start_idx
            # Clamp out-of-shard indices to 0; their rows are zeroed below
            # before the all-reduce combines shards.
            x[mask] = 0
        y = F.embedding(x, self.weight)
        if world_size > 1:
            y[mask] = 0
            dist.all_reduce(y)

        # print(f'ParallelEmbedding x: {x}', flush=True)
        return y
150
+
151
+
152
def linear(x: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Apply y = x @ weight.T + bias, dispatching on weight quantization.

    Dispatch:
        - Unquantized weights (element_size > 1, e.g. bf16/fp32/int64):
          plain F.linear.
        - fp8 weights (element_size == 1) with gemm_impl == "bf16":
          dequantize via weight_dequant, then F.linear.
        - fp8 weights otherwise: block-quantize activations with act_quant
          and run fp8_gemm.

    Args:
        x (torch.Tensor): The input tensor.
        weight (torch.Tensor): The weight tensor; fp8-quantized weights
            carry a per-block `.scale` attribute.
        bias (Optional[torch.Tensor]): The bias tensor to be added. Default is None.

    Returns:
        torch.Tensor: The result of the linear transformation.
    """
    # (Removed unused `element_size` / `typ` locals and dead debug prints.)
    if weight.element_size() > 1:
        return F.linear(x, weight, bias)
    if gemm_impl == "bf16":
        # Dequantize once and fall back to a bf16 GEMM.
        w = weight_dequant(weight, weight.scale)
        return F.linear(x, w, bias)
    # fp8 path: quantize activations block-wise, then fp8 GEMM.
    x_q, scale = act_quant(x, block_size)
    y = fp8_gemm(x_q, scale, weight, weight.scale)
    if bias is not None:
        y += bias
    return y
191
+
192
def linear_int(x: torch.Tensor, weight: torch.Tensor, x_rescale, weight_rescale, res_rescale, bias: Optional[torch.Tensor] = None) -> tuple[torch.Tensor]:
    """Integer linear transform; returns a (quotient, remainder) pair.

    Unquantized weights go through the int64 GEMM; fp8 weights fall back to
    the float paths with a zero remainder placeholder.
    """
    if weight.element_size() > 1:
        return int64_bmm_broadcast(x, weight, x_rescale, weight_rescale, res_rescale)
    if gemm_impl == "bf16":
        w = weight_dequant(weight, weight.scale)
        return (F.linear(x, w, bias), torch.tensor(0, dtype=torch.int64))
    print('linear act_quant', flush=True)
    x_q, scale = act_quant(x, block_size)
    y = fp8_gemm(x_q, scale, weight, weight.scale)
    if bias is not None:
        y += bias
    return (y, torch.tensor(0, dtype=torch.int64))
207
+
208
class Linear_int(nn.Module):
    """
    Integer (fixed-point) linear layer with rescale factors and optional bias.

    Args:
        layer_id: Identifier of the owning layer (kept for debugging).
        in_features (int): Number of input features.
        out_features (int): Number of output features.
        x_rescale: Fixed-point rescale applied to the activations.
        weight_rescale: Fixed-point rescale applied to the weights.
        res_rescale: Fixed-point rescale applied to the result.
        dtype: Storage dtype of the weight buffer (typically torch.int64).
        bias (bool): Whether to include a bias term. Defaults to False.
    """
    # Class-level default dtype for this layer family.
    dtype = torch.int64

    def __init__(self, layer_id, in_features: int, out_features: int, x_rescale, weight_rescale, res_rescale, dtype, bias: bool = False):
        super().__init__()
        self.layer_id = layer_id
        self.in_features = in_features
        self.out_features = out_features

        self.x_rescale = x_rescale
        self.weight_rescale = weight_rescale
        self.res_rescale = res_rescale

        # Weight is a buffer (loaded from a checkpoint, never trained).
        self.register_buffer("weight", torch.empty(out_features, in_features, dtype=dtype))

        if bias:
            self.bias = nn.Parameter(torch.empty(out_features))
        else:
            self.register_parameter("bias", None)

    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor]:
        # Returns (quotient, remainder) from the integer GEMM.
        q, r = linear_int(x, self.weight, self.x_rescale, self.weight_rescale, self.res_rescale, self.bias)
        return q, r
240
+
241
class Linear_rescale_int(nn.Module):
    """
    Integer linear layer whose result rescale is loaded at checkpoint time.

    Unlike Linear_int, the result rescale is not fixed at construction: it
    is read from the "scale" buffer (populated by the state dict) on every
    forward call.

    Args:
        layer_id: Identifier of the owning layer (kept for debugging).
        in_features (int): Number of input features.
        out_features (int): Number of output features.
        x_rescale: Fixed-point rescale applied to the activations.
        weight_rescale: Fixed-point rescale applied to the weights.
        dtype: Storage dtype of the weight buffer (typically torch.int64).
        bias (bool): Whether to include a bias term. Defaults to False.
    """
    # Class-level default dtype for this layer family.
    dtype = torch.int64

    def __init__(self, layer_id, in_features: int, out_features: int, x_rescale, weight_rescale, dtype, bias: bool = False):
        super().__init__()
        self.layer_id = layer_id
        self.in_features = in_features
        self.out_features = out_features

        self.x_rescale = x_rescale
        self.weight_rescale = weight_rescale

        self.register_buffer("weight", torch.empty(out_features, in_features, dtype=dtype))
        # Result rescale, overwritten when the checkpoint is loaded.
        self.register_buffer("scale", torch.tensor(0, dtype=torch.int32))

        if bias:
            self.bias = nn.Parameter(torch.empty(out_features))
        else:
            self.register_parameter("bias", None)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        rescale = self.scale.item()
        y, _r = linear_int(x, self.weight, self.x_rescale, self.weight_rescale, rescale, self.bias)
        return y
274
+
275
class Linear(nn.Module):
    """
    Custom linear layer with support for quantized weights and optional bias.

    Args:
        layer_id: Identifier of the owning layer (kept for debugging).
        in_features (int): Number of input features.
        out_features (int): Number of output features.
        bias (bool): Whether to include a bias term. Defaults to False.
        dtype (optional): Data type for the layer. Defaults to `torch.bfloat16`.
    """
    dtype = torch.bfloat16

    def __init__(self, layer_id, in_features: int, out_features: int, bias: bool = False, dtype = None):
        super().__init__()
        self.layer_id = layer_id
        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.Parameter(torch.empty(out_features, in_features, dtype=dtype or Linear.dtype))

        # Parameter.element_size() is the number of bytes per element:
        #   torch.float32        -> 4 bytes
        #   torch.float64        -> 8 bytes
        #   torch.int64          -> 8 bytes
        #   torch.bfloat16       -> 2 bytes
        #   torch.float8_e4m3fn  -> 1 byte
        # So element_size == 1 identifies fp8-quantized weights, which carry
        # a per-block scale tensor (one scale per block_size x block_size tile).
        if self.weight.element_size() == 1:
            scale_out_features = (out_features + block_size - 1) // block_size
            scale_in_features = (in_features + block_size - 1) // block_size

            self.weight.scale = self.scale = nn.Parameter(torch.empty(scale_out_features, scale_in_features, dtype=torch.float32))
        else:
            self.register_parameter("scale", None)

        if bias:
            self.bias = nn.Parameter(torch.empty(out_features))
        else:
            self.register_parameter("bias", None)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass for the custom linear layer.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Transformed tensor after linear computation.
        """
        return linear(x, self.weight, self.bias)
326
+
327
+
328
class ColumnParallelLinear(Linear):
    """
    Linear layer with column parallelism, splitting output features across distributed processes.

    Args:
        layer_id: Identifier of the owning layer (kept for debugging).
        in_features (int): Number of input features.
        out_features (int): Total number of output features (split evenly
            across world_size ranks).
        bias (bool): Whether to include a bias term. Defaults to False.
        dtype (optional): Data type for the layer. Defaults to `torch.bfloat16`.
    """
    def __init__(self, layer_id, in_features: int, out_features: int, bias: bool = False, dtype = None):
        assert out_features % world_size == 0, f"Output features must be divisible by world size (world_size={world_size})"
        self.part_out_features = out_features // world_size
        super().__init__(layer_id, in_features, self.part_out_features, bias, dtype)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass for column parallel linear layer.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: This rank's slice of the output features (no
            gather is performed here).
        """
        y = linear(x, self.weight, self.bias)
        return y
355
+
356
class ColumnParallelLinear_int(Linear_int):
    """Column-parallel variant of Linear_int: output features are split
    evenly across distributed processes; forward returns only the quotient."""
    def __init__(self, layer_id, in_features: int, out_features: int, x_rescale, weight_rescale, res_rescale, dtype, bias: bool = False):
        assert out_features % world_size == 0, f"Output features must be divisible by world size (world_size={world_size})"
        self.part_out_features = out_features // world_size
        super().__init__(layer_id, in_features, self.part_out_features, x_rescale, weight_rescale, res_rescale, dtype, bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out, _remainder = linear_int(x, self.weight, self.x_rescale, self.weight_rescale, self.res_rescale, self.bias)
        return out
365
+
366
class ColumnParallelLinear_rescale_int(Linear_int):
    """Column-parallel integer linear whose result rescale is loaded from a
    checkpoint "scale" buffer instead of being fixed at construction."""
    def __init__(self, layer_id, in_features: int, out_features: int, x_rescale, weight_rescale, dtype, bias: bool = False):
        assert out_features % world_size == 0, f"Output features must be divisible by world size (world_size={world_size})"
        self.part_out_features = out_features // world_size
        # res_rescale is a placeholder (1); the effective value is read from
        # the "scale" buffer on each forward call.
        super().__init__(layer_id, in_features, self.part_out_features, x_rescale, weight_rescale, 1, dtype, bias)
        self.register_buffer("scale", torch.tensor(0, dtype=torch.int32))
        # self.res_rescale = self.scale

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        rescale = self.scale.item()
        y, _r = linear_int(x, self.weight, self.x_rescale, self.weight_rescale, rescale, self.bias)
        return y
378
+
379
+
380
class RowParallelLinear(Linear):
    """
    Linear layer with row parallelism, splitting input features across distributed processes.

    Args:
        layer_id: Identifier of the owning layer (kept for debugging).
        in_features (int): Total number of input features (split evenly
            across world_size ranks).
        out_features (int): Number of output features.
        bias (bool): Whether to include a bias term. Defaults to False.
        dtype (optional): Data type for the layer. Defaults to `torch.bfloat16`.
    """
    def __init__(self, layer_id, in_features: int, out_features: int, bias: bool = False, dtype = None):
        assert in_features % world_size == 0, f"Input features must be divisible by world size (world_size={world_size})"
        self.part_in_features = in_features // world_size
        super().__init__(layer_id, self.part_in_features, out_features, bias, dtype)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass for row parallel linear layer.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Transformed tensor; partial products are summed
            across ranks, and the bias is added once, after the reduce.
        """
        y = linear(x, self.weight)
        if world_size > 1:
            dist.all_reduce(y)
        if self.bias is not None:
            y += self.bias
        return y
411
+
412
class RowParallelLinear_rescale_int(Linear_int):
    """
    Integer row-parallel linear layer whose result rescale is loaded from a
    checkpoint buffer (``scale``).

    Input features are split across distributed processes; partial products
    are summed with an all-reduce before the bias is added.

    Args:
        in_features (int): Total number of input features (sharded across ranks).
        out_features (int): Number of output features.
        x_rescale: Fixed-point rescale applied to the input.
        weight_rescale: Fixed-point rescale applied to the weight.
        res_rescale: Ignored in practice; the effective rescale comes from `scale`.
        dtype: Storage dtype of the quantized weight.
        bias (bool): Whether to include a bias term. Defaults to False.
    """
    def __init__(self, layer_id, in_features: int, out_features: int, x_rescale, weight_rescale, res_rescale, dtype, bias: bool = False):
        assert in_features % world_size == 0, f"Input features must be divisible by world size (world_size={world_size})"
        self.part_in_features = in_features // world_size
        super().__init__(layer_id, self.part_in_features, out_features, x_rescale, weight_rescale, res_rescale, dtype, bias)
        # Per-layer result shift, populated when the checkpoint is loaded.
        self.register_buffer("scale", torch.tensor(0, dtype=torch.int32))
        self.res_rescale = self.scale  # useless: forward() reads self.scale directly

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass for row parallel linear layer.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Transformed tensor with row-parallel computation.
        """
        # rescale = 2 ** self.scale.item()
        rescale = self.scale.item()
        # print(f'RowParallelLinear_rescale_int forward scale: {self.scale} ' + str(rescale), flush=True)
        y, _ = linear_int(x, self.weight, self.x_rescale, self.weight_rescale, rescale, self.bias)
        # Sum partial products across ranks, then add the bias exactly once.
        if world_size > 1:
            dist.all_reduce(y)
        if self.bias is not None:
            y += self.bias
        return y
448
+
449
+
450
+ class RMSNorm(nn.Module):
451
+ """
452
+ Root Mean Square Layer Normalization (RMSNorm).
453
+
454
+ Args:
455
+ dim (int): Dimension of the input tensor.
456
+ eps (float): Epsilon value for numerical stability. Defaults to 1e-6.
457
+ """
458
+ def __init__(self, dim: int, eps: float = 1e-6):
459
+ super().__init__()
460
+ self.dim = dim
461
+ self.eps = eps
462
+ self.weight = nn.Parameter(torch.ones(dim))
463
+
464
+ def forward(self, x: torch.Tensor):
465
+ """
466
+ Forward pass for RMSNorm.
467
+
468
+ Args:
469
+ x (torch.Tensor): Input tensor.
470
+
471
+ Returns:
472
+ torch.Tensor: Normalized tensor with the same shape as input.
473
+ """
474
+ return F.rms_norm(x, (self.dim,), self.weight, self.eps)
475
+
476
class RMSNorm_int(nn.Module):
    """Fixed-point (integer) RMSNorm backed by the ``RMS_Norm_int64`` kernel.

    Args:
        dim (int): Dimension of the input tensor.
        dtype: Storage dtype of the quantized weight buffer (e.g. torch.int32).
        eps (float): Kept for interface parity with RMSNorm; the integer
            kernel does not receive it.
    """
    def __init__(self, dim: int, dtype, eps: float = 1e-6):
        super().__init__()
        self.dim = dim
        self.eps = eps
        # Quantized gain, loaded from the checkpoint (a buffer, not a Parameter).
        self.register_buffer(
            "weight",
            torch.ones(dim, dtype=dtype))

    def forward(self, x: torch.Tensor):
        # Fixed-point bookkeeping (original author's notes, not verified here):
        #   x has rescale 2^31; weight has rescale 2^15 (values in 2^7..2^14);
        #   rms has rescale 2^28; the returned result has rescale 2^16 because
        #   the kernel divides by (1 << 15): 44 + 15 - 28 - 15 = 16.
        # NOTE(review): only x[0] (the first batch element) is normalized and
        # re-wrapped with c[None, :] — this assumes batch size 1; confirm
        # against the callers before using larger batches.
        (c, rms) = RMS_Norm_int64(x[0], self.weight, 1, self.dim)

        return (c[None, :], rms)
493
+
494
+
495
def precompute_freqs_cis(args: ModelArgs) -> torch.Tensor:
    """
    Precomputes frequency-based complex exponential values for rotary
    positional embeddings, quantized to int64 fixed point (rescale 2^42).

    Args:
        args (ModelArgs): Model arguments containing positional embedding parameters.

    Returns:
        torch.Tensor: int64 tensor of shape [max_seq_len, qk_rope_head_dim]
            holding interleaved (real, imag) values scaled by 2^42.
    """
    # dim = 64 (qk_rope_head_dim) in the shipped configs
    dim = args.qk_rope_head_dim
    seqlen = args.max_seq_len
    beta_fast = args.beta_fast
    beta_slow = args.beta_slow
    base = args.rope_theta
    factor = args.rope_factor

    def find_correction_dim(num_rotations, dim, base, max_seq_len):
        """
        Computes the correction dimension for a given number of rotations in the rotary positional embedding.

        Args:
            num_rotations (float): Number of rotations to compute the correction for.
            dim (int): Dimensionality of the embedding space.
            base (float): Base value for the exponential computation.
            max_seq_len (int): Maximum sequence length.

        Returns:
            float: The correction dimension based on the input parameters.
        """
        return dim * math.log(max_seq_len / (num_rotations * 2 * math.pi)) / (2 * math.log(base))

    def find_correction_range(low_rot, high_rot, dim, base, max_seq_len):
        """
        Computes the range of correction dimensions for rotary positional embeddings.

        Args:
            low_rot (float): Lower bound for the number of rotations.
            high_rot (float): Upper bound for the number of rotations.
            dim (int): Dimensionality of the embedding space.
            base (float): Base value for the exponential computation.
            max_seq_len (int): Maximum sequence length.

        Returns:
            Tuple[int, int]: The range of correction dimensions (low, high), clamped to valid indices.
        """
        low = math.floor(find_correction_dim(low_rot, dim, base, max_seq_len))
        high = math.ceil(find_correction_dim(high_rot, dim, base, max_seq_len))
        return max(low, 0), min(high, dim-1)

    def linear_ramp_factor(min, max, dim):
        """
        Computes a linear ramp function used to smooth values between a minimum and maximum range.

        Args:
            min (float): Minimum value for the ramp function.
            max (float): Maximum value for the ramp function.
            dim (int): Dimensionality of the ramp tensor.

        Returns:
            torch.Tensor: A tensor of shape (dim,) with values linearly interpolated between 0 and 1,
                clamped to the range [0, 1].
        """
        if min == max:
            max += 0.001
        linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
        ramp_func = torch.clamp(linear_func, 0, 1)
        return ramp_func

    # torch.arange(0, dim, 2) yields the even indices 0, 2, ..., dim-2, so
    # freqs[k] = 1 / base^(2k/dim): the classic RoPE frequency ladder,
    # a 1-D tensor of length dim/2.
    freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
    # YaRN-style frequency correction when extrapolating beyond the original
    # training length (original_seq_len = 4096).
    if seqlen > args.original_seq_len:
        low, high = find_correction_range(beta_fast, beta_slow, dim, base, args.original_seq_len)
        smooth = 1 - linear_ramp_factor(low, high, dim // 2)
        freqs = freqs / factor * (1 - smooth) + freqs * smooth

    t = torch.arange(seqlen)
    # torch.outer computes every position/frequency product, e.g.
    #   t = [1, 2, 3], freqs = [10, 20]  ->  [[10, 20], [20, 40], [30, 60]]
    # so freqs becomes shape [seqlen, dim/2].
    freqs = torch.outer(t, freqs)
    # torch.polar(abs, angle) converts polar coordinates (r, theta) into the
    # complex number r*exp(i*theta); with unit magnitude this is exp(i*theta).
    # freqs_cis_0 has shape [seqlen, dim/2].
    freqs_cis_0 = torch.polar(torch.ones_like(freqs), freqs)

    # return freqs_cis_0

    # View each complex value as a (real, imag) pair: shape [seqlen, dim].
    freqs_cis_1 = torch.view_as_real(freqs_cis_0)

    # freqs_cis = torch.empty_like(freqs_cis_1, dtype=torch.int64, device='cuda')

    # Quantize to int64 fixed point with rescale 2^42. (Per the author's
    # notes, the downstream kernel's rescale parameter 19 = 42 - 23
    # compensates for this factor.)
    freqs_cis = (freqs_cis_1 * (2 ** 42)).round().to(torch.int64)

    # Debug: report the magnitude range of the quantized table.
    freqs_cis_abs = freqs_cis.abs()
    min1 = freqs_cis_abs.min()
    max1 = freqs_cis_abs.max()
    print(f'freqs_cis min {min1}, max: {max1}', flush=True)

    # print(f'freqs_cis: {freqs_cis}')
    # freqs_cis carries rescale 2^42.
    return freqs_cis
608
+
609
# x (q_pe) has shape [batch, seq_len, 128, 64]
def apply_rotary_emb(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
    """
    Applies rotary positional embeddings to the input tensor using the
    fixed-point complex-multiply kernel.

    Args:
        x (torch.Tensor): int64 fixed-point input; the last dimension is
            interpreted as interleaved (real, imag) pairs.
        freqs_cis (torch.Tensor): Precomputed int64 complex exponentials
            (rescale 2^42) from ``precompute_freqs_cis``.

    Returns:
        torch.Tensor: Tensor with rotary embeddings applied, same shape as `x`.
    """

    # if x.dtype == torch.int64:
    # Reshape to [batch, seq_len, heads, dim/2, 2] so pairs form complex numbers.
    ### IMPORTANT: the extension (.so) kernel requires contiguous memory.
    x = x.contiguous().view(*x.shape[:-1], -1, 2)
    # Broadcastable frequency table: [1, seq_len, 1, dim/2, 2].
    freqs_cis = freqs_cis.view(1, x.size(1), 1, x.size(-2), 2)
    # freqs_cis = freqs_cis.view(1, x.size(1), 1, x.size(-1))
    # Author's notes on the kernel's fixed-point constants:
    #   4194304 = 1 << (64 - 42); 42 is the rescale, and the high 64 bits of
    #   the int64*int64 product are multiplied by 4194304.
    #   4398046511104 = 1 << 42.
    # print(x)
    # print(f'x shape: {x.shape}, freqs_cis shape: {freqs_cis.shape}')
    # y = complex_int64_mul_broadcast(x, freqs_cis, 4194304, 4398046511104)
    y = complex_int64_mul_broadcast(x, freqs_cis)
    # Fold the (real, imag) pairs back into the last dimension.
    y2 = y.flatten(3)
    return y2
637
+
638
+
639
+ def getBF16PrintStr(ele):
640
+ v = int(ele.cpu().view(torch.uint16).item())
641
+ ex = v >> 7 & 0xFF
642
+ r = '(1+' + str(v & 0x7F) + '/128)'
643
+ rraw = v & 0x7F
644
+
645
+ if v & 0x8000:
646
+ vstr = '-' + r + '*2^' + str(ex - 127)
647
+ else:
648
+ vstr = r + '*2^' + str(ex - 127)
649
+ return vstr
650
+
651
class MLA(nn.Module):
    """
    Multi-Headed Attention Layer (MLA), integer-quantized variant.

    Queries, keys and values go through low-rank down/up projections; all
    matrix multiplies run through fixed-point integer kernels (`linear_int`,
    `einsum_*`, `softmax_q19`) whose results carry explicit power-of-two
    rescales noted in the inline comments.

    Attributes:
        dim (int): Dimensionality of the input features.
        n_heads (int): Number of attention heads.
        n_local_heads (int): Number of local attention heads for distributed systems.
        q_lora_rank (int): Rank for low-rank query projection (0 = no compression).
        kv_lora_rank (int): Rank for low-rank key/value projection.
        qk_nope_head_dim (int): Dimensionality of non-positional query/key projections.
        qk_rope_head_dim (int): Dimensionality of rotary-positional query/key projections.
        qk_head_dim (int): Total dimensionality of query/key projections.
        v_head_dim (int): Dimensionality of value projections.
        softmax_scale1, softmax_scale2 (int): Integer ratio replacing the float
            softmax scale (scores are multiplied by scale1, floor-divided by scale2).
    """
    def __init__(self, layer_id, args: ModelArgs):
        super().__init__()

        # RowParallelLinear / ColumnParallelLinear shard a [in, out] weight by
        # rows ([in/world_size, out]) or columns ([in, out/world_size]) across
        # the world_size devices; each device holds one shard.

        self.layer_id = layer_id

        # Model width (7168 in the 671B config, per the author's notes).
        self.dim = args.dim
        # Total number of attention heads (e.g. 128).
        self.n_heads = args.n_heads
        # Number of heads handled by this rank.
        self.n_local_heads = args.n_heads // world_size
        # Query down-projection rank; 0 disables compression (1536 in practice).
        self.q_lora_rank = args.q_lora_rank
        # Key/value down-projection rank (512 in practice).
        self.kv_lora_rank = args.kv_lora_rank
        # Query/key head dim without positional info (128 in practice).
        self.qk_nope_head_dim = args.qk_nope_head_dim
        # Query/key head dim carrying RoPE positional info (64 in practice).
        self.qk_rope_head_dim = args.qk_rope_head_dim

        # Total per-head query/key dim (192 in practice).
        self.qk_head_dim = args.qk_nope_head_dim + args.qk_rope_head_dim
        # Value head dim (128 in practice).
        self.v_head_dim = args.v_head_dim

        # With q_lora_rank == 0 the query uses a single full projection;
        # otherwise it is factored into down-projection + norm + up-projections.
        if self.q_lora_rank == 0:
            self.wq = ColumnParallelLinear(layer_id, self.dim, self.n_heads * self.qk_head_dim)
        else:
            # Query down-projection, weight shape [dim, q_lora_rank].
            self.wq_a = Linear_int(layer_id, self.dim, self.q_lora_rank, 1, 1, 30, torch.int32)
            self.q_norm = RMSNorm_int(self.q_lora_rank, torch.int32)
            # Query up-projection, split into a "nope" part and a "rope" part
            # instead of one fused [q_lora_rank, n_heads * qk_head_dim] matmul.
            # self.wq_b = ColumnParallelLinear_int(layer_id, self.q_lora_rank, self.n_heads * self.qk_head_dim, 1, 1, (1 << 30), torch.int32)
            self.wq_b1 = ColumnParallelLinear_int(layer_id, self.q_lora_rank, self.n_heads * args.qk_nope_head_dim, 1, 1, 30, torch.int32)
            self.wq_b2 = ColumnParallelLinear_int(layer_id, self.q_lora_rank, self.n_heads * args.qk_rope_head_dim, 1, 1, 30, torch.int32)

        # Key/value down-projection, likewise split into the latent part
        # (kv_lora_rank) and the rope part (qk_rope_head_dim).
        # self.wkv_a = Linear_int(layer_id, self.dim, self.kv_lora_rank + self.qk_rope_head_dim, 1, 1, (1 << 29), torch.int32)
        self.wkv_a1 = Linear_int(layer_id, self.dim, self.kv_lora_rank, 1, 1, 29, torch.int32)
        self.wkv_a2 = Linear_int(layer_id, self.dim, self.qk_rope_head_dim, 1, 1, 29, torch.int32)
        # self.kv_norm = RMSNorm(self.kv_lora_rank)
        self.kv_norm = RMSNorm_int(self.kv_lora_rank, torch.int32)
        # Key/value up-projection, split into the key ("nope") part and the
        # value part; each weight reshapes to [n_heads, head_dim, kv_lora_rank].
        # self.wkv_b = ColumnParallelLinear(layer_id, self.kv_lora_rank, self.n_heads * (self.qk_nope_head_dim + self.v_head_dim))
        self.wkv_b_1 = ColumnParallelLinear_rescale_int(layer_id, self.kv_lora_rank, self.n_heads * self.qk_nope_head_dim, 1, 1, torch.int32)
        self.wkv_b_2 = ColumnParallelLinear_rescale_int(layer_id, self.kv_lora_rank, self.n_heads * self.v_head_dim, 1, 1, torch.int32)

        # Output projection back to the model dimension.
        self.wo = RowParallelLinear_rescale_int(layer_id, self.n_heads * self.v_head_dim, self.dim, 1, 1, 1, torch.int32)
        # The original float softmax scale (below, commented out) is replaced
        # by the integer ratio softmax_scale1 / softmax_scale2.
        # self.softmax_scale = self.qk_head_dim ** -0.5
        # # max_seq_len = 4096 * 4, original_seq_len = 4096
        # if args.max_seq_len > args.original_seq_len:
        #     # mscale = 1.0, rope_factor = 40, math.log is the natural log
        #     mscale = 0.1 * args.mscale * math.log(args.rope_factor) + 1.0
        #     self.softmax_scale = self.softmax_scale * mscale * mscale
        self.softmax_scale1 = 94
        self.softmax_scale2 = 695

        if attn_impl == "naive":
            # Full per-head key/value caches for the naive attention path.
            self.register_buffer("k_cache", torch.zeros(args.max_batch_size, args.max_seq_len, self.n_local_heads, self.qk_head_dim), persistent=False)
            self.register_buffer("v_cache", torch.zeros(args.max_batch_size, args.max_seq_len, self.n_local_heads, self.v_head_dim), persistent=False)
        else:
            # Latent KV cache: stores the normalized down-projected KV...
            # self.register_buffer("kv_cache", torch.zeros(args.max_batch_size, args.max_seq_len, self.kv_lora_rank), persistent=False)
            # self.register_buffer("kv_cache", torch.zeros(1, args.max_seq_len, self.kv_lora_rank), persistent=False)
            self.register_buffer("kv_cache", torch.zeros(1, args.max_seq_len, self.kv_lora_rank, dtype=torch.int64), persistent=False)
            # ...and the rotary-embedded key part. Both are batch-size-1 int64
            # buffers (hard-coded leading dim 1).
            # self.register_buffer("pe_cache", torch.zeros(args.max_batch_size, args.max_seq_len, self.qk_rope_head_dim), persistent=False)
            # self.register_buffer("pe_cache", torch.zeros(1, args.max_seq_len, self.qk_rope_head_dim), persistent=False)
            self.register_buffer("pe_cache", torch.zeros(1, args.max_seq_len, self.qk_rope_head_dim, dtype=torch.int64), persistent=False)

    # x has shape [1, seq_len, dim] and (per the author's notes) rescale 2^21.
    def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]):
        """
        Forward pass for the Multi-Headed Attention Layer (MLA).

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, seq_len, dim).
            start_pos (int): Starting position in the sequence for caching.
            freqs_cis (torch.Tensor): Precomputed complex exponential values for rotary embeddings.
            mask (Optional[torch.Tensor]): Mask tensor to exclude certain positions from attention.

        Returns:
            torch.Tensor: Output tensor with the same shape as the input.
        """

        # end_pos marks where this chunk ends inside the KV caches.
        bsz, seqlen, _ = x.size()
        end_pos = start_pos + seqlen

        # Query projection: with compression (q_lora_rank != 0) the input goes
        # through wq_a (down), q_norm, then wq_b1/wq_b2 (up); otherwise wq.
        # The result is reshaped to [bsz, seqlen, n_local_heads, head_dim].
        if self.q_lora_rank == 0:
            q = self.wq(x)
        else:
            # wq_a weight rescale 2^30; q_down keeps the input's 2^21 rescale.
            q_down, q_down_rem = self.wq_a(x)
            # q_down = self.wq_a(x)

            if snark:
                # Dump inputs/outputs for the zero-knowledge proof pipeline.
                dirStr = f'zkdata/pos_{start_pos}/layer_{self.layer_id}'
                os.makedirs(dirStr, exist_ok=True)
                saveTensor(f'{dirStr}/wq_a_x.bin', x.cpu())
                saveTensor(f'{dirStr}/wq_a_w.bin', self.wq_a.weight.view(torch.uint32).cpu())
                saveTensor(f'{dirStr}/wq_a_y.bin', q_down.cpu())
                saveTensor(f'{dirStr}/q_norm_r.bin', q_down_rem.cpu())
            # q_down = (q_down.detach().to(torch.float32) * (2 ** -23)).to(torch.bfloat16)

            # q_normed carries rescale 2^19 (author's note).
            (q_normed, rms) = self.q_norm(q_down)

            if snark:
                dirStr = f'zkdata/pos_{start_pos}/layer_{self.layer_id}'
                os.makedirs(dirStr, exist_ok=True)
                saveTensor(f'{dirStr}/q_norm_x.bin', q_down.cpu())
                saveTensor(f'{dirStr}/q_norm_weight.bin', self.q_norm.weight.view(torch.uint32).cpu())
                saveTensor(f'{dirStr}/q_norm_rms.bin', rms.cpu())
                saveTensor(f'{dirStr}/q_norm_y.bin', q_normed.cpu())

            # Up-project separately into the "nope" and "rope" query parts.
            # q = self.wq_b(q_normed)
            q_nope = self.wq_b1(q_normed)
            q_pe = self.wq_b2(q_normed)

        # Reshape into per-head layout.
        # q = q.view(bsz, seqlen, self.n_local_heads, self.qk_head_dim)
        q_nope = q_nope.view(bsz, seqlen, self.n_local_heads, self.qk_nope_head_dim)
        q_pe = q_pe.view(bsz, seqlen, self.n_local_heads, self.qk_rope_head_dim)

        # q_nope: [bsz, seqlen, heads, qk_nope_head_dim] (no positional info);
        # q_pe:   [bsz, seqlen, heads, qk_rope_head_dim] gets RoPE below.
        # q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
        # freqs_cis has rescale 2^42; q_pe keeps rescale 2^19 after rotation.

        if snark:
            saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/q_pe_x.bin', q_pe.cpu())
            saveTensor(f'zkdata/freqs_cis.bin', freqs_cis.cpu())

        q_pe = apply_rotary_emb(q_pe, freqs_cis)

        if snark:
            # NOTE(review): this dumps q_norm.weight, not q_pe — looks like a
            # copy/paste slip in the dump path; confirm before relying on it.
            saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/q_pe_y.bin', self.q_norm.weight.view(torch.uint32).cpu())

        # Key/value path: down-project x into the latent kv representation and
        # the rope key part; the rope part gets rotary embeddings.

        # kv: [bsz, seqlen, kv_lora_rank], rescale 2^21 (author's note).
        kv, kv_rem = self.wkv_a1(x)

        if snark:
            dirStr = f'zkdata/pos_{start_pos}/layer_{self.layer_id}'
            os.makedirs(dirStr, exist_ok=True)
            saveTensor(f'{dirStr}/wkv_a1_x.bin', x.cpu())
            saveTensor(f'{dirStr}/wkv_a1_w.bin', self.wkv_a1.weight.view(torch.uint32).cpu())
            saveTensor(f'{dirStr}/wkv_a1_y.bin', kv.cpu())
            saveTensor(f'{dirStr}/wkv_a1_r.bin', kv_rem.cpu())

        k_pe, _ = self.wkv_a2(x)

        # print(f'k_pe 1 shape: {k_pe.shape}', flush=True)
        # unsqueeze(2) inserts a head axis: k_pe becomes [bsz, seqlen, 1, dim]
        # so the rope kernel broadcasts over heads.
        k_pe = apply_rotary_emb(k_pe.unsqueeze(2), freqs_cis)
        # print(f'k_pe 2 shape: {k_pe.shape}', flush=True)

        if attn_impl == "naive":
            # NOTE(review): this branch references self.wkv_b and
            # self.softmax_scale, which are not defined in this quantized
            # __init__ (only the commented-out float versions) — the naive
            # path appears broken as-is; confirm before enabling it.
            q = torch.cat([q_nope, q_pe], dim=-1)
            kv = self.wkv_b(self.kv_norm(kv))
            kv = kv.view(bsz, seqlen, self.n_local_heads, self.qk_nope_head_dim + self.v_head_dim)
            k_nope, v = torch.split(kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
            k = torch.cat([k_nope, k_pe.expand(-1, -1, self.n_local_heads, -1)], dim=-1)
            self.k_cache[:bsz, start_pos:end_pos] = k
            self.v_cache[:bsz, start_pos:end_pos] = v
            scores = torch.einsum("bshd,bthd->bsht", q, self.k_cache[:bsz, :end_pos]) * self.softmax_scale
        else:
            # Absorbed attention: fold wkv_b_1 into the query so scores can be
            # computed directly against the latent kv cache.
            #   score1 = q_nope (x wkv_b_1) . kv_cache
            #   score2 = q_pe . pe_cache
            # then scale by the integer softmax ratio.

            # q_nope: [bsz, seqlen, heads, qk_nope_head_dim];
            # wkv_b_1 reshaped to [heads, qk_nope_head_dim, kv_lora_rank].
            # q_nope rescale 2^19, wkv_b_1 rescale from its scale buffer.
            # q_nope = torch.einsum("bshd,hdc->bshc", q_nope, wkv_b_1)
            # After einsum_bshd_hdc_bshc, q_nope: [bsz, seqlen, heads, kv_lora_rank].
            wkv_b_1 = self.wkv_b_1.weight.view(self.n_local_heads, -1, self.kv_lora_rank)
            q_nope = einsum_bshd_hdc_bshc(q_nope.contiguous(), wkv_b_1.contiguous(), self.wkv_b_1.scale.item())
            # print('q_nope type: ' + str(q_nope.type()))
            # print('q_nope shape: ' + str(q_nope.shape))

            # kv_normed carries rescale 2^23 (author's note).
            (kv_normed, rms) = self.kv_norm(kv)

            # Store this chunk in the latent caches.
            self.kv_cache[:bsz, start_pos:end_pos] = kv_normed
            # self.kv_cache[:bsz, start_pos:end_pos] = kv2

            # kv = (kv.detach().to(torch.float32) * (2 ** -23)).to(torch.bfloat16)
            # pe_cache carries rescale 2^21 (author's note).
            self.pe_cache[:bsz, start_pos:end_pos] = k_pe.squeeze(2)

            # score1: q_nope (rescale 2^19) against kv_cache (rescale 2^23);
            # the kernel shifts by 23 so score1 keeps rescale 2^19.
            # score1 = torch.einsum("bshc,btc->bsht", q_nope, self.kv_cache[:bsz, :end_pos])
            kv_cache1 = self.kv_cache[:bsz, :end_pos]
            # score1 = einsum_bshc_btc_bsht(q_nope.contiguous(), kv_cache1.contiguous(), 25)
            score1 = einsum_bshc_btc_bsht(q_nope.contiguous(), kv_cache1.contiguous(), 23)
            # print(f'kv_cache1 type: {kv_cache1.type()}, shape: {kv_cache1.shape}', flush=True)
            # score1 = (score1.detach().to(torch.float32) * (2 ** -21)).to(torch.bfloat16)

            # score2: q_pe (rescale 2^19) against pe_cache (rescale 2^21);
            # the kernel shifts by 21 so score2 also keeps rescale 2^19.
            # score2 = torch.einsum("bshr,btr->bsht", q_pe, self.pe_cache[:bsz, :end_pos])
            pe_cache1 = self.pe_cache[:bsz, :end_pos]
            # score2 = einsum_bshc_btc_bsht(q_pe.contiguous(), pe_cache1.contiguous(), 23)
            score2 = einsum_bshc_btc_bsht(q_pe.contiguous(), pe_cache1.contiguous(), 21)
            # score2 = (score2.detach().to(torch.float32) * (2 ** -21)).to(torch.bfloat16)

            # Integer replacement for `scores * softmax_scale` (floor division).
            # scores = (score1 + score2) * self.softmax_scale
            scores = (score1 + score2) * self.softmax_scale1 // self.softmax_scale2
            # scores = torch.round(((score1 + score2) * self.softmax_scale1).to(torch.float32) / self.softmax_scale2).to(torch.int64)


        # mask (after unsqueeze(1)) broadcasts as [seqlen, 1, seqlen] over
        # scores of shape [bsz, seqlen, heads, t].
        if mask is not None:
            # print('mask type: ' + str(mask.type()))
            # print('mask shape: ' + str(mask.shape))
            scores += mask.unsqueeze(1)
        # Fixed-point softmax over the last dimension (replaces the float
        # softmax below); both input and output carry rescale 2^19.
        # scores = scores.softmax(dim=-1, dtype=torch.float32).type_as(x)
        scores_new = torch.empty_like(scores, dtype=torch.int64, device='cuda')

        # softmax_q19 clobbers its input (author's note), hence the dump of
        # `scores` happens before the call.
        if snark:
            saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/scores_softmax_x.bin', scores.contiguous().cpu())

        softmax_q19(scores.contiguous(), scores_new)

        if snark:
            saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/scores_softmax_y.bin', scores_new.cpu())

        if attn_impl == "naive":
            x = torch.einsum("bsht,bthd->bshd", scores, self.v_cache[:bsz, :end_pos])
        else:

            kv_cache2 = self.kv_cache[:bsz, :end_pos]
            # kv_cache2 = (kv_cache2.detach().to(torch.float32) * (2 ** -25)).to(torch.bfloat16)

            # x = (x.detach().to(torch.float32) * (2 ** -23)).to(torch.bfloat16)

            # Final output: attention-weight the latent kv cache, up-project
            # with the value matrix wkv_b_2, then apply the output projection.
            # x = torch.einsum("bsht,btc->bshc", scores_new, kv_cache2)
            # scores_new rescale 2^19, kv_cache2 rescale 2^23, bshc rescale 2^19.
            # bshc: [bsz, seqlen, heads, kv_lora_rank].
            # bshc = einsum_bsht_btc_bshc(scores_new.contiguous(), kv_cache2.contiguous(), 25)
            bshc = einsum_bsht_btc_bshc(scores_new.contiguous(), kv_cache2.contiguous(), 23)

            # # v_head_dim = 128, kv_lora_rank = 512, n_local_heads = 128
            # wkv_b_2 = wkv_b[:, -self.v_head_dim:]
            # # print('wkv_b 2 type: ' + str(wkv_b_2.type()))
            # # print('wkv_b 2 shape: ' + str(wkv_b_2.shape))
            wkv_b_2 = self.wkv_b_2.weight
            wkv_b_2 = wkv_b_2.view(self.n_local_heads, -1, self.kv_lora_rank)

            # wkv_b_2 = (wkv_b_2.detach().to(torch.float32) * (2 ** -self.wkv_b_2.scale.item())).to(torch.bfloat16)

            # x = torch.einsum("bshc,hdc->bshd", x, wkv_b_2)
            # bshc rescale 2^19; wkv_b_2 rescale from its scale buffer;
            # x comes out with rescale 2^19 (author's note).
            # bshc: [bsz, seqlen, heads, kv_lora_rank]; wkv_b_2: [heads, v_head_dim, kv_lora_rank].
            x = einsum_bshc_hdc_bshd(bshc.contiguous(), wkv_b_2.contiguous(), self.wkv_b_2.scale.item())
            # x = (x.detach().to(torch.float32) * (2 ** -21)).to(torch.bfloat16)

        # Output projection; the returned shape is [bsz, seqlen, dim].
        x = self.wo(x.flatten(2))

        return x
957
+
958
class MLP(nn.Module):
    """
    Gated feed-forward network (SwiGLU-style MLP).

    Computes ``w2(silu(w1(x)) * w3(x))``: w1 is the gate projection, w3 the
    up projection, and w2 the down projection back to the model dimension.

    Attributes:
        w1 (nn.Module): Column-parallel gate projection (dim -> inter_dim).
        w2 (nn.Module): Row-parallel down projection (inter_dim -> dim).
        w3 (nn.Module): Column-parallel up projection (dim -> inter_dim).
    """
    def __init__(self, layer_id, dim: int, inter_dim: int):
        """
        Initializes the MLP layer.

        Args:
            dim (int): Input and output dimensionality.
            inter_dim (int): Hidden layer dimensionality.
        """
        super().__init__()
        self.w1 = ColumnParallelLinear(layer_id, dim, inter_dim)
        self.w2 = RowParallelLinear(layer_id, inter_dim, dim)
        self.w3 = ColumnParallelLinear(layer_id, dim, inter_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass for the MLP layer.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Output tensor after MLP computation.
        """
        gate = F.silu(self.w1(x))
        up = self.w3(x)
        return self.w2(gate * up)
991
+
992
+
993
class MLP_int(nn.Module):
    """
    Integer-quantized gated feed-forward layer (SwiGLU-style).

    Computes ``w2(silu(w1(x)) * w3(x))`` in int64 fixed point: the SiLU runs
    through the ``silu_q23`` kernel and intermediate activations carry a 2^23
    rescale (per the author's notes).

    Attributes:
        w1 (nn.Module): Column-parallel gate projection.
        w2 (nn.Module): Row-parallel down projection.
        w3 (nn.Module): Column-parallel up projection.
    """
    def __init__(self, layer_id, dim: int, inter_dim: int):
        """
        Initializes the MLP layer.

        Args:
            layer_id: Index of the layer (used for debug-dump paths).
            dim (int): Input and output dimensionality.
            inter_dim (int): Hidden layer dimensionality.
        """
        super().__init__()
        self.layer_id = layer_id
        self.w1 = ColumnParallelLinear_rescale_int(layer_id, dim, inter_dim, 1, 1, torch.int32)
        self.w2 = RowParallelLinear_rescale_int(layer_id, inter_dim, dim, 1, 1, 1, torch.int32)
        self.w3 = ColumnParallelLinear_rescale_int(layer_id, dim, inter_dim, 1, 1, torch.int32)

    # Input x has rescale 2^23, shape [bsz, seq_len, dim].
    def forward(self, start_pos: int, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass for the MLP layer.

        Args:
            start_pos (int): Sequence position, used only for debug-dump paths.
            x (torch.Tensor): Input tensor (int64 fixed point, rescale 2^23).

        Returns:
            torch.Tensor: Output tensor after MLP computation.
        """
        # Gate pre-activation: [bsz, seq_len, inter_dim], rescale 2^23.
        r1 = self.w1(x)

        # s1 = F.silu(r1), computed by the fixed-point kernel; rescale 2^23.
        s1 = torch.empty_like(r1, dtype=torch.int64, device='cuda')
        # silu_q25(r1, s1)

        if snark:
            saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/mlp_silu_x.bin', r1.contiguous().cpu())

        silu_q23(r1, s1)

        if snark:
            saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/mlp_silu_y.bin', s1.cpu())

        # Up projection: rescale 2^23, shape [1, seq_len, inter_dim].
        r2 = self.w3(x)

        # The product of two 2^23 values carries rescale 2^46; shift back to
        # 2^23 before the down projection. Returned shape [bsz, seq_len, dim].
        q = self.w2(s1 * r2 // (1 << 23))
        return q
1049
+
1050
+
1051
+ class Gate(nn.Module):
1052
+ """
1053
+ Gating mechanism for routing inputs in a mixture-of-experts (MoE) model.
1054
+
1055
+ Attributes:
1056
+ dim (int): Dimensionality of input features.
1057
+ topk (int): Number of top experts activated for each input.
1058
+ n_groups (int): Number of groups for routing.
1059
+ topk_groups (int): Number of groups to route inputs to.
1060
+ score_func (str): Scoring function ('softmax' or 'sigmoid').
1061
+ route_scale (float): Scaling factor for routing weights.
1062
+ weight (torch.nn.Parameter): Learnable weights for the gate.
1063
+ bias (Optional[torch.nn.Parameter]): Optional bias term for the gate.
1064
+ """
1065
    def __init__(self, layer_id: int, args: ModelArgs):
        """
        Initializes the Gate module.

        Args:
            layer_id (int): Index of the layer (used for debug-dump paths).
            args (ModelArgs): Model arguments containing gating parameters.
        """
        super().__init__()

        self.layer_id = layer_id

        self.dim = args.dim
        # Number of experts activated per token (n_activated_experts, e.g. 8).
        self.topk = args.n_activated_experts
        # Number of expert groups (n_expert_groups, e.g. 8).
        self.n_groups = args.n_expert_groups
        # Number of groups a token may route into (n_limited_groups, e.g. 4).
        self.topk_groups = args.n_limited_groups
        # Scoring function: 'softmax' or 'sigmoid'.
        self.score_func = args.score_func
        # Scaling factor for routing weights (route_scale, e.g. 2.5).
        self.route_scale = args.route_scale
        # Quantized gate weight [n_routed_experts, dim]; a buffer rather than
        # a Parameter because it is loaded pre-quantized from the checkpoint.
        # self.weight = nn.Parameter(torch.empty(args.n_routed_experts, args.dim))
        self.register_buffer("weight", torch.empty(args.n_routed_experts, args.dim, dtype=torch.int32))
        # Per-layer result shift, populated when the checkpoint is loaded.
        self.register_buffer("scale", torch.tensor(0, dtype=torch.int32))
        # The routed-expert bias only exists in the dim == 7168 config.
        # self.bias = nn.Parameter(torch.empty(args.n_routed_experts, dtype=torch.int32)) if self.dim == 7168 else None
        if self.dim == 7168:
            self.register_buffer("bias", torch.empty(args.n_routed_experts, dtype=torch.int32))
        else:
            self.bias = None
1096
+
1097
+ # x 的 rescale 为 2^23
1098
+ def forward(self, start_pos: int, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
1099
+ """
1100
+ Forward pass for the gating mechanism.
1101
+
1102
+ Args:
1103
+ x (torch.Tensor): Input tensor.
1104
+
1105
+ Returns:
1106
+ Tuple[torch.Tensor, torch.Tensor]: Routing weights and selected expert indices.
1107
+ """
1108
+
1109
+ x = x.view(1, -1, self.dim)
1110
+
1111
+ # scores = linear(x, self.weight)
1112
+ # self.weight shape: [256, 7168]
1113
+ # 当前 scores shape: [1, seqLen, 256]
1114
+ # rescale = 2 ** self.scale.item()
1115
+ rescale = self.scale.item()
1116
+
1117
+ # scores 的 rescale 为 2^23
1118
+ scores, scores_rem = linear_int(x, self.weight, 1, 1, rescale)
1119
+ # scores = int64_bmm_with_bias(x, self.weight, bias, 1, 1, self.scale)
1120
+
1121
+ # x shape: [seqLen, 7168]
1122
+ x = x.view(-1, self.dim)
1123
+
1124
+ if self.score_func == "softmax":
1125
+ scores = scores.softmax(dim=-1, dtype=torch.float32)
1126
+ else:
1127
+ # scores = scores.sigmoid()
1128
+ C = torch.empty_like(scores, dtype=torch.int64, device='cuda')
1129
+
1130
+ if snark:
1131
+ saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/sigmoid_gate_x.bin', scores.cpu())
1132
+ saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/sigmoid_gate_r.bin', scores_rem.cpu())
1133
+
1134
+ sigmoid_q23(scores, C)
1135
+
1136
+ if snark:
1137
+ saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/sigmoid_gate_y.bin', C.cpu())
1138
+
1139
+ # 当前 scores shape: [seqLen, 256]
1140
+ scores = C.squeeze(0)
1141
+
1142
+ # bias的rescale为2^23
1143
+ original_scores = scores
1144
+ if self.bias is not None:
1145
+ # scores = scores + self.bias
1146
+ # 当前 scores shape: [seqLen, 256]
1147
+ scores = scores + self.bias
1148
+
1149
+ if snark:
1150
+ saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/gate_original_scores.bin', original_scores.contiguous().cpu())
1151
+ saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/gate_bias.bin', self.bias.view(torch.uint32).cpu())
1152
+
1153
+ # n_groups = 8
1154
+ if self.n_groups > 1:
1155
+ # x.size(0) = 8,当前 scores shape: [seqLen, 8, 32]
1156
+ scores = scores.view(x.size(0), self.n_groups, -1)
1157
+ # print(f'scores shape 111: {scores.shape}', flush=True)
1158
+ if self.bias is None:
1159
+ group_scores = scores.amax(dim=-1)
1160
+ else:
1161
+ # topk 返回 -1维度上 最大的 前 2 个值,同时返回值和索引,[0] 表示 取值,sum(-1) 再把最大的两个值相加.
1162
+ # 256维,分成8个组,每个组挑最大的两个数相加,得到 [seqLen, 8] 的结果,代表 8 个组的 最大两个值的和。
1163
+ # group_scores 的 shape: [8, 8]
1164
+ group_scores = scores.topk(2, dim=-1)[0].sum(dim=-1)
1165
+ # print(group_scores[0], flush=True)
1166
+ # print(f'group_scores shape: {group_scores.shape}')
1167
+
1168
+ # topk_groups = 4, 从 8 个group中选择最大的 4个,返回其索引,比如返回 [[0, 2, 4, 6], ...]
1169
+ # indices shape: [seqLen, 4]
1170
+ indices = group_scores.topk(self.topk_groups, dim=-1)[1]
1171
+ # print(indices[0], flush=True)
1172
+
1173
+ # mask shape: [seqLen, 8]
1174
+ # scatter_: 按照给定索引,把某个源张量的值写入到目标张量对应位置。 Tensor.scatter_(dim, index, src, reduce=None)
1175
+ # 比如 mask 为[[False, True, False, True, False, True, False, True], ...]
1176
+ # mask: 每一行最大的4个值相对应的 mask 为 False
1177
+ mask = scores.new_ones(x.size(0), self.n_groups, dtype=bool).scatter_(1, indices, False)
1178
+ # print(mask[0], flush=True)
1179
+ # 把满足布尔 mask 的位置替换成 "-inf", mask.unsqueeze(-1) shape: [8, 8, 1]
1180
+ # 把 scores 中 淘汰掉的4个group中的每一个值设置为 "-inf",总共设置 128个 "-inf",占每一行中的一半
1181
+ # scores shape: [seqLen, 256]
1182
+ # scores = scores.masked_fill_(mask.unsqueeze(-1), float("-inf")).flatten(1)
1183
+ scores = scores.masked_fill_(mask.unsqueeze(-1), -(1 << 42)).flatten(1)
1184
+
1185
+ # 没有淘汰掉的group中的 128个值中,选择最大的8个值,返回其下标
1186
+ # self.topk = 8, indices shape: [8, 8]
1187
+ indices = torch.topk(scores, self.topk, dim=-1)[1]
1188
+ # print(indices[0], flush=True)
1189
+
1190
+ # gather 用来按照索引从一个张量中取值,按照8个最大值的下标,获取其值
1191
+ # weights shape: [8, 8]
1192
+ weights = original_scores.gather(1, indices)
1193
+
1194
+ if snark:
1195
+ saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/gate_indices.bin', indices.contiguous().cpu())
1196
+ saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/gate_weights.bin', weights.contiguous().cpu())
1197
+
1198
+ # print(f'weights shape: {weights.shape}')
1199
+ if self.score_func == "sigmoid":
1200
+ sum1 = weights.sum(dim=-1, keepdim=True)
1201
+ # weights = (weights * (2 ** 25) + sum1 // 2) // sum1
1202
+ weights = (weights * (2 ** 23)) // sum1
1203
+ # weights /= weights.sum(dim=-1, keepdim=True)
1204
+
1205
+ #self.route_scale = 2.5
1206
+ # weights *= self.route_scale
1207
+ weights = weights * 5 // 2
1208
+
1209
+ # weights = (weights.to(torch.float32) * (2 ** -23)).to(torch.bfloat16)
1210
+ # return weights.type_as(x), indices
1211
+ return weights, indices
1212
+
1213
+
1214
+ class Expert_int(nn.Module):
1215
+ """
1216
+ Expert layer for Mixture-of-Experts (MoE) models.
1217
+
1218
+ Attributes:
1219
+ w1 (nn.Module): Linear layer for input-to-hidden transformation.
1220
+ w2 (nn.Module): Linear layer for hidden-to-output transformation.
1221
+ w3 (nn.Module): Additional linear layer for feature transformation.
1222
+ """
1223
+ def __init__(self, layer_id, idx, dim: int, inter_dim: int):
1224
+ """
1225
+ Initializes the Expert layer.
1226
+
1227
+ Args:
1228
+ dim (int): Input and output dimensionality.
1229
+ inter_dim (int): Hidden layer dimensionality.
1230
+ """
1231
+ super().__init__()
1232
+ # # w1 shape: [2048, 7168]
1233
+ # self.w1 = Linear(layer_id, dim, inter_dim)
1234
+ # # w2 shape: [7168, 2048]
1235
+ # self.w2 = Linear(layer_id, inter_dim, dim)
1236
+ # # w3 shape: [2048, 7168]
1237
+ # self.w3 = Linear(layer_id, dim, inter_dim)
1238
+
1239
+ self.layer_id = layer_id
1240
+ self.idx = idx
1241
+
1242
+ self.w1 = Linear_rescale_int(layer_id, dim, inter_dim, 1, 1, torch.int32)
1243
+ self.w2 = Linear_rescale_int(layer_id, inter_dim, dim, 1, 1, torch.int32)
1244
+ self.w3 = Linear_rescale_int(layer_id, dim, inter_dim, 1, 1, torch.int32)
1245
+
1246
+ def forward(self, start_pos: int, x: torch.Tensor) -> torch.Tensor:
1247
+ """
1248
+ Forward pass for the Expert layer.
1249
+
1250
+ Args:
1251
+ x (torch.Tensor): Input tensor.
1252
+
1253
+ Returns:
1254
+ torch.Tensor: Output tensor after expert computation.
1255
+ """
1256
+
1257
+ # 返回的 shape [bsz, seqLen, 7168]
1258
+ # return self.w2(F.silu(self.w1(x)) * self.w3(x))
1259
+ # r1 shape: [bsz, seqLen, 18432], r1 rescale: 2^23
1260
+ r1 = self.w1(x)
1261
+
1262
+ # s1 = F.silu(r1)
1263
+ # s1 shape: [bsz, seqLen, 18432], s1 rescale: 2^23
1264
+ s1 = torch.empty_like(r1, dtype=torch.int64, device='cuda')
1265
+ # silu_q25(r1, s1)
1266
+
1267
+ if snark:
1268
+ saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/expert_{self.idx}_silu_x.bin', r1.contiguous().cpu())
1269
+
1270
+ silu_q23(r1, s1)
1271
+
1272
+ if snark:
1273
+ saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/expert_{self.idx}_silu_y.bin', s1.cpu())
1274
+
1275
+ # r2 rescale: 2^23
1276
+ r2 = self.w3(x)
1277
+
1278
+ # 返回的 shape [bsz, seqLen, 7168]
1279
+ q = self.w2((s1 * r2) >> 23)
1280
+ return q
1281
+
1282
+
1283
+ class MoE(nn.Module):
1284
+ """
1285
+ Mixture-of-Experts (MoE) module.
1286
+
1287
+ Attributes:
1288
+ dim (int): Dimensionality of input features.
1289
+ n_routed_experts (int): Total number of experts in the model.
1290
+ n_local_experts (int): Number of experts handled locally in distributed systems.
1291
+ n_activated_experts (int): Number of experts activated for each input.
1292
+ gate (nn.Module): Gating mechanism to route inputs to experts.
1293
+ experts (nn.ModuleList): List of expert modules.
1294
+ shared_experts (nn.Module): Shared experts applied to all inputs.
1295
+ """
1296
+ def __init__(self, layer_id, args: ModelArgs, ckpt_path):
1297
+ """
1298
+ Initializes the MoE module.
1299
+
1300
+ Args:
1301
+ args (ModelArgs): Model arguments containing MoE parameters.
1302
+ """
1303
+ super().__init__()
1304
+ self.layer_id = layer_id
1305
+ self.ckpt_path = ckpt_path
1306
+ self.dim = args.dim
1307
+ self.moe_inter_dim = args.moe_inter_dim
1308
+ assert args.n_routed_experts % world_size == 0, f"Number of experts must be divisible by world size (world_size={world_size})"
1309
+ self.n_routed_experts = args.n_routed_experts
1310
+ self.n_local_experts = args.n_routed_experts // world_size
1311
+ self.n_activated_experts = args.n_activated_experts
1312
+ self.experts_start_idx = rank * self.n_local_experts
1313
+ self.experts_end_idx = self.experts_start_idx + self.n_local_experts
1314
+ self.gate = Gate(layer_id, args)
1315
+ # moe_inter_dim = 2048
1316
+ # self.experts = nn.ModuleList([Expert(layer_id, args.dim, args.moe_inter_dim) if self.experts_start_idx <= i < self.experts_end_idx else None
1317
+ # for i in range(self.n_routed_experts)])
1318
+ # self.experts = torch.nn.ModuleList()
1319
+
1320
+ # dim = 7168, n_shared_experts = 1, moe_inter_dim = 2048
1321
+ self.shared_experts = MLP_int(layer_id, args.dim, args.n_shared_experts * args.moe_inter_dim)
1322
+
1323
+ # x 的 rescale 为 2^23, shape: [1, seqLen, 7168]
1324
+ def forward(self, start_pos: int, x: torch.Tensor) -> torch.Tensor:
1325
+ """
1326
+ Forward pass for the MoE module.
1327
+
1328
+ Args:
1329
+ x (torch.Tensor): Input tensor.
1330
+
1331
+ Returns:
1332
+ torch.Tensor: Output tensor after expert routing and computation.
1333
+ """
1334
+ # ffn_normed 的 rescale 为 2^23
1335
+ # x = (x.to(torch.float32) * (2 ** -23)).to(torch.bfloat16)
1336
+
1337
+ # z rescale: 2^23, z 的 shape [seqLen, 7168]
1338
+ z = self.shared_experts(start_pos, x)
1339
+
1340
+ # x shape 之前为: [bsz, seqLen, 7168], 之后为 [8, 7168]
1341
+ shape = x.size()
1342
+ x = x.view(-1, self.dim)
1343
+
1344
+ # weights shape: [seqLen, 8], indices shape: [seqLen, 8]
1345
+ # weights 的 rescale 为 2^23
1346
+ weights, indices = self.gate(start_pos, x)
1347
+
1348
+ # y shape: [seqLen, 7168]
1349
+ y = torch.zeros_like(x)
1350
+ # torch.bincount 用来统计非负整数张量中各个数值出现的次数,类似于直方图计数
1351
+ # torch.bincount(input, weights=None, minlength=0) -> Tensor, weights: 可选的一维浮点张量,和 input 形状一致。若提供,就不是“次数统计”,而是“权重和”
1352
+ # 统计 256 个 专家 出现的次数
1353
+ counts = torch.bincount(indices.flatten(), minlength=self.n_routed_experts).tolist()
1354
+ for i in range(self.experts_start_idx, self.experts_end_idx):
1355
+ if counts[i] == 0:
1356
+ continue
1357
+ # expert = self.experts[i]
1358
+ with torch.device("cuda"):
1359
+ expert = Expert_int(self.layer_id, i, self.dim, self.moe_inter_dim)
1360
+ # load_model(expert, f'/data3/DeepSeek-V3-Demo1/experts-{self.layer_id}/{i}.safetensors')
1361
+ expertModelPath = os.path.join(self.ckpt_path, f"experts-{self.layer_id}/{i}.safetensors")
1362
+ load_model(expert, expertModelPath)
1363
+
1364
+ # 第 idx 个 token, 专家 i 出现的编号是 top
1365
+ # 比如
1366
+ # [0, 1, 3, 2, 5, 4, 6, 9]
1367
+ # [7, 8, 3, 12, 5, 11, 6, 1]
1368
+ # [16, 10, 3, 2, 15, 4, 6, 9]
1369
+ # [10, 21, 3, 2, 5, 4, 1, 9]
1370
+ # torch.where(indices == 1) 返回的结果是 ([0, 1, 3], [1, 7, 6])
1371
+ idx, top = torch.where(indices == i)
1372
+ # expert(x[idx]) 返回的 shape [seqLen, 2048], weights[idx, top, None] 的 shape 为 [seqLen, 1], 包含一个 weight 值
1373
+ # y[idx] += expert(x[idx]) * weights[idx, top, None]
1374
+ x2 = x[idx].unsqueeze(0)
1375
+ y2 = expert(start_pos, x2)
1376
+ y2 = y2.view(-1, self.dim)
1377
+ # y[idx] += y2 * weights[idx, top, None] // (1 << 25)
1378
+ y[idx] += y2 * weights[idx, top, None] // (1 << 23)
1379
+ # z = self.shared_experts(x)
1380
+ if world_size > 1:
1381
+ dist.all_reduce(y)
1382
+ return (y + z).view(shape)
1383
+
1384
+ def getBF8PrintStr(ele):
1385
+ v = int(ele.cpu().view(torch.uint8).item())
1386
+ ex = v >> 3 & 0xF
1387
+ r = v & 0x7
1388
+
1389
+ if ex == 15 and r == 7:
1390
+ print(f'BF8 Nan: {ex} {r} !!!', flush=True)
1391
+ elif ex == 0:
1392
+ print(f'BF8 subnormal: {ex} {r} !!!', flush=True)
1393
+
1394
+ if v & 0x80:
1395
+ vstr = f'-{ex} {r}'
1396
+ else:
1397
+ vstr = f'{ex} {r}'
1398
+ return vstr
1399
+
1400
+ class Block(nn.Module):
1401
+ """
1402
+ Transformer block combining attention and feed-forward layers.
1403
+
1404
+ Attributes:
1405
+ attn (nn.Module): Attention layer (MLA).
1406
+ ffn (nn.Module): Feed-forward network (MLP or MoE).
1407
+ attn_norm (nn.Module): Layer normalization for attention.
1408
+ ffn_norm (nn.Module): Layer normalization for feed-forward network.
1409
+ """
1410
+ def __init__(self, layer_id: int, args: ModelArgs, ckpt_path):
1411
+ """
1412
+ Initializes the Transformer block.
1413
+
1414
+ Args:
1415
+ layer_id (int): Layer index in the transformer.
1416
+ args (ModelArgs): Model arguments containing block parameters.
1417
+ """
1418
+ super().__init__()
1419
+ self.layer_id = layer_id
1420
+ self.ckpt_path = ckpt_path
1421
+ self.attn = MLA(layer_id, args)
1422
+ self.ffn = MLP_int(layer_id, args.dim, args.inter_dim) if layer_id < args.n_dense_layers else MoE(layer_id, args, ckpt_path)
1423
+ # print('args.dim: ' + str(args.dim))
1424
+ # args.dim = 7168
1425
+ self.attn_norm = RMSNorm_int(args.dim, torch.int32)
1426
+ self.ffn_norm = RMSNorm_int(args.dim, torch.int32)
1427
+ # self.ffn_norm = RMSNorm(args.dim)
1428
+
1429
+ def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]) -> torch.Tensor:
1430
+ """
1431
+ Forward pass for the Transformer block.
1432
+
1433
+ Args:
1434
+ x (torch.Tensor): Input tensor.
1435
+ start_pos (int): Starting position in the sequence.
1436
+ freqs_cis (torch.Tensor): Precomputed complex exponential values for rotary embeddings.
1437
+ mask (Optional[torch.Tensor]): Mask tensor to exclude certain positions from attention.
1438
+
1439
+ Returns:
1440
+ torch.Tensor: Output tensor after block computation.
1441
+ """
1442
+
1443
+ x_abs = x.abs()
1444
+ x_abs_min = x_abs.min().item()
1445
+ x_abs_max = x_abs.max().item()
1446
+ print(f'x abs min: {x_abs_min}, max: {x_abs_max}', flush=True)
1447
+
1448
+ # self.attn_norm(x): 在进行attention之前,先将7168维的embeding 进行 归一化
1449
+ # attn_norm 的 scale 为 2^21, x 的 scale 为 2^31
1450
+ (atten_normed, rms) = self.attn_norm(x)
1451
+
1452
+ if snark:
1453
+ os.makedirs(f'zkdata/pos_{start_pos}/layer_{self.layer_id}', exist_ok=True)
1454
+ saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/attn_norm_x.bin', x.cpu())
1455
+ saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/attn_norm_weight.bin', self.attn_norm.weight.view(torch.uint32).cpu())
1456
+ saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/attn_norm_y.bin', atten_normed.cpu())
1457
+ saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/attn_norm_rms.bin', rms.cpu())
1458
+
1459
+ # attned 的 rescale 是 2^19, shape: [1, seqLen, 7168]
1460
+ attned = self.attn(atten_normed, start_pos, freqs_cis, mask)
1461
+
1462
+ # 调整 rescale,因为 x 的 rescale 是 2^31, attned 的 rescale 是 2^19,因此要乘以 2^12
1463
+ # x = x + attned * (2 ** 10)
1464
+ x = x + attned * (2 ** 12)
1465
+
1466
+ # ffn_normed 的 rescale 为 2^23
1467
+ (ffn_normed, rms) = self.ffn_norm(x)
1468
+
1469
+ ffned = self.ffn(start_pos, ffn_normed)
1470
+ # x = x + ffned * (2 ** 6)
1471
+ x = x + ffned * (2 ** 8)
1472
+
1473
+ # 返回的 x 的rescale 为 2^31
1474
+ return x
1475
+
1476
# The Transformer fixes its process rank at construction time and is built
# from the classic components: an embedding layer (self.embed), a stack of
# decoder blocks (self.layers — 61 for the 671B config), a final RMSNorm
# (self.norm), and an output head (self.head) projecting hidden states
# (dim 7168) onto the vocabulary distribution (129280 entries).
1482
+ class Transformer(nn.Module):
1483
+ """
1484
+ Transformer model with positional embeddings, multiple layers, and output projection.
1485
+
1486
+ Attributes:
1487
+ max_seq_len (int): Maximum sequence length for the transformer.
1488
+ embed (nn.Module): Embedding layer for input tokens.
1489
+ layers (torch.nn.ModuleList): List of transformer blocks.
1490
+ norm (nn.Module): Layer normalization applied after all blocks.
1491
+ head (nn.Module): Output projection layer mapping to vocabulary size.
1492
+ freqs_cis (torch.Tensor): Precomputed complex exponential values for rotary(旋转的) embeddings.
1493
+ """
1494
+ def __init__(self, args: ModelArgs):
1495
+ """
1496
+ Initializes the Transformer model.
1497
+
1498
+ Args:
1499
+ args (ModelArgs): Model arguments containing transformer parameters.
1500
+ """
1501
+ global world_size, rank
1502
+ world_size = dist.get_world_size() if dist.is_initialized() else 1
1503
+ rank = dist.get_rank() if dist.is_initialized() else 0
1504
+ Linear.dtype = torch.float8_e4m3fn if args.dtype == "fp8" else torch.bfloat16
1505
+ super().__init__()
1506
+ self.args = args
1507
+ self.max_seq_len = args.max_seq_len
1508
+ self.embed = ParallelEmbedding(args.vocab_size, args.dim)
1509
+ self.layers = torch.nn.ModuleList()
1510
+ for layer_id in range(args.n_layers):
1511
+ # self.layers.append(Block(layer_id, args))
1512
+ self.layers.append(nn.Module())
1513
+
1514
+ self.norm = RMSNorm_int(args.dim, torch.int64)
1515
+ # self.head = ColumnParallelLinear(-1, args.dim, args.vocab_size, dtype=torch.get_default_dtype())
1516
+ # 模型中的 head 的 rescale 为 2^43, 使用的过程中的rescale为 2^35, head 输入的 rescale为 2^15, 输出的 rescale为 2^21
1517
+ # self.head = ColumnParallelLinear_int(-1, args.dim, args.vocab_size, 1, (1 << 8), (1 << 29), torch.int64)
1518
+ self.head = ColumnParallelLinear_int(-1, args.dim, args.vocab_size, 1, (1 << 8), 29, torch.int64)
1519
+ # self.head = ColumnParallelLinear_int(-1, args.dim, args.vocab_size, 1, (1 << 8), (1 << 31), torch.int64)
1520
+ # self.head = ColumnParallelLinear_int(-1, args.dim, args.vocab_size, (1 << 5), (1 << 11), (1 << 21), torch.int64)
1521
+ # register_buffer()注册了名为 "freqs_cis" 的缓冲区,缓冲区的值由 precompute_freqs_cis(args) 提供,并且由于设置了 persistent=False,
1522
+ # 该缓冲区不会被保存到模型的状态字典中。缓冲区注册的张量是该Transformer类的位置编码。
1523
+ # register_buffer 用于注册一个非参数张量(tensor),这个张量虽然不是模型的可学习参数,但仍然是模型状态的一部分。
1524
+ # 与参数不同,缓冲区不会在反向传播中计算梯度,也不会被优化器更新,但它会随模型一起移动到相应的设备(如 GPU)上。
1525
+ # persistent=False表示这个参数表示该缓冲区不属于持久状态(persistent state)。也就是说,当你调用 model.state_dict() 保存模型时,
1526
+ # 这个缓冲区不会被包含进去。位置编码可以在模型加载后重新计算,不需要存储。
1527
+ self.register_buffer("freqs_cis", precompute_freqs_cis(args), persistent=False)
1528
+
1529
+ @torch.inference_mode()
1530
+ def prep_inference(self, tokens: torch.Tensor, start_pos: int = 0):
1531
+ # softmax_init()
1532
+ softmax_init_q19()
1533
+ softmax_init_q21()
1534
+ silu_init_q23()
1535
+
1536
+ seqlen = tokens.size(1)
1537
+
1538
+ # h 是经过embed之后的结果,embed将文本表达转化为词嵌入,h的形状为 (batch_size, seq_len, 7168)
1539
+ h = self.embed(tokens)
1540
+ # h = h.to(torch.bfloat16) * (1.0 / (1 << 44))
1541
+
1542
+ return (h, start_pos, seqlen)
1543
+
1544
+ @torch.inference_mode()
1545
+ def layer_inference(self, layer_id, h, start_pos, seqlen):
1546
+ freqs_cis = self.freqs_cis[start_pos:start_pos+seqlen]
1547
+ mask = None
1548
+
1549
+ # triu = triangle up
1550
+ # 返回上三角矩阵
1551
+ # 参数 k=0 代表主对角线,k 为正数则从主对角线开始向上数第 k 条,k 为负数则从主对角线开始向下数第 k 条
1552
+ if seqlen > 1:
1553
+ # mask = torch.full((seqlen, seqlen), float("-inf"), device="cuda").triu_(1)
1554
+ mask = torch.full((seqlen, seqlen), -(64 << 36), dtype=torch.int64, device="cuda").triu_(1)
1555
+
1556
+ h = self.layers[layer_id](h, start_pos, freqs_cis, mask)
1557
+
1558
+ h_abs = (h.to(torch.float32) * (2 ** -31)).to(torch.bfloat16).abs()
1559
+ h_abs_max = h_abs.max()
1560
+ h_abs[h_abs < (2 ** -125)] = h_abs_max
1561
+ h_abs_min = h_abs.min()
1562
+ h_abs_min_str = getBF16PrintStr(h_abs_min)
1563
+ h_abs_max_str = getBF16PrintStr(h_abs_max)
1564
+ print(f'h_abs min: {h_abs_min_str}, max: {h_abs_max_str}')
1565
+
1566
+ # 返回的 h 的rescale 为 2^31
1567
+ return h
1568
+
1569
+ @torch.inference_mode()
1570
+ def finish_inference(self, h):
1571
+ # norm的结果的scale = 2^15, h 的 scale = 2^15
1572
+ h = self.norm(h)[0][:, -1]
1573
+
1574
+ # logits 的rescale 为 2^21
1575
+ logits = self.head(h[None, :])
1576
+ if world_size > 1:
1577
+ all_logits = [torch.empty_like(logits) for _ in range(world_size)]
1578
+ dist.all_gather(all_logits, logits)
1579
+ logits = torch.cat(all_logits, dim=-1)
1580
+
1581
+ # logits 的 scale = 2^21
1582
+ return logits
1583
+
1584
+ # # 这里开始推理了,torch.inference_mode 这句话 关闭梯度计算 并 禁止 autograd 构建计算图,同时比 torch.no_grad() 还高效,专门为推理场景优化
1585
+ # @torch.inference_mode()
1586
+ # def forward(self, tokens: torch.Tensor, start_pos: int = 0):
1587
+ # """
1588
+ # Forward pass for the Transformer model.
1589
+
1590
+ # Args:
1591
+ # tokens (torch.Tensor): Input tensor of token IDs with shape (batch_size, seq_len).
1592
+ # start_pos (int, optional): Starting position in the sequence for rotary(旋转的) embeddings. Defaults to 0.
1593
+
1594
+ # Returns:
1595
+ # torch.Tensor: Logits tensor of shape (batch_size, vocab_size).
1596
+ # """
1597
+ # seqlen = tokens.size(1)
1598
+ # # h 是经过embed之后的结果,embed将文本表达转化为词嵌入,h的形状为 (batch_size, seq_len, 7168)
1599
+ # h = self.embed(tokens)
1600
+ # freqs_cis = self.freqs_cis[start_pos:start_pos+seqlen]
1601
+ # print('freqs_cis: ' + str(freqs_cis.tolist()))
1602
+
1603
+ # mask = None
1604
+
1605
+ # # triu = triangle up
1606
+ # # 返回上三角矩阵
1607
+ # # 参数 k=0 代表主对角线,k 为正数则从主对角线开始向上数第 k 条,k 为负数则从主对角线开始向下数第 k 条
1608
+ # if seqlen > 1:
1609
+ # mask = torch.full((seqlen, seqlen), float("-inf"), device=tokens.device).triu_(1)
1610
+
1611
+ # for layer in self.layers:
1612
+ # h = layer(h, start_pos, freqs_cis, mask)
1613
+
1614
+ # # 只取最后一个 token
1615
+ # h = self.norm(h)[:, -1]
1616
+ # logits = self.head(h)
1617
+ # if world_size > 1:
1618
+ # all_logits = [torch.empty_like(logits) for _ in range(world_size)]
1619
+ # dist.all_gather(all_logits, logits)
1620
+ # logits = torch.cat(all_logits, dim=-1)
1621
+ # return logits
1622
+
1623
+
1624
+ if __name__ == "__main__":
1625
+ torch.set_default_dtype(torch.bfloat16)
1626
+ torch.set_default_device("cuda")
1627
+ torch.manual_seed(0)
1628
+ args = ModelArgs()
1629
+ x = torch.randint(0, args.vocab_size, (2, 128))
1630
+ model = Transformer(0, args)
1631
+ print(model(x).size())
inference/requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch==2.4.1
2
+ triton==3.0.0
3
+ transformers==4.46.3
4
+ safetensors==0.4.5