import os
import shutil
from argparse import ArgumentParser
from glob import glob

import torch
from safetensors.torch import safe_open, save_file
from tqdm import tqdm, trange

from kernel import weight_dequant

# Maps HF parameter names to the short names used in the converted files.
# The second element is the model-parallel sharding dim (None = replicated).
mapping = {
    "embed_tokens": ("embed", 0),
    "input_layernorm": ("attn_norm", None),
    "post_attention_layernorm": ("ffn_norm", None),
    "q_proj": ("wq", 0),
    "q_a_proj": ("wq_a", None),
    "q_a_layernorm": ("q_norm", None),
    "q_b_proj": ("wq_b", 0),
    "kv_a_proj_with_mqa": ("wkv_a", None),
    "kv_a_layernorm": ("kv_norm", None),
    "kv_b_proj": ("wkv_b", 0),
    "o_proj": ("wo", 1),
    "gate": ("gate", None),
    "gate_proj": ("w1", 0),
    "down_proj": ("w2", 1),
    "up_proj": ("w3", 0),
    "norm": ("norm", None),
    "lm_head": ("head", 0),
    "scale": ("scale", None),
}

EmbedsInOneFile = 256
EmbedsZKDir = "../zkdata/embeds/"

# Per-layer fixed-point exponents: a tensor is stored as
# round(float_value * 2 ** rescale) in int32. The short w1/w2/w3 lists cover
# the three dense FFN layers (0-2); the gate/shared lists hold 0 for those.
wkv_b_1_rescales = [32, 34, 37, 36, 33, 32, 33, 33, 30, 32, 32, 30, 31, 30, 29, 30, 29, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 30, 30, 29, 29, 30, 30, 30, 30, 29, 30, 30, 29, 30]
wkv_b_2_rescales = [31, 32, 32, 31, 32, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 30, 29, 29, 29, 29, 29, 29, 30, 30, 30, 29, 29, 29, 29, 29, 30, 29, 30, 29, 30, 29, 29, 29, 30, 29, 29, 29, 29, 30, 29, 30, 30, 30, 29, 29, 29, 30, 30, 29, 29, 29, 30, 30, 30]
wo_rescales = [31, 32, 32, 32, 32, 31, 32, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 32, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 30, 31, 30, 30, 31, 31, 31, 30, 31, 31, 31, 30, 31, 31, 31, 31, 32]
gate_rescales = [0, 0, 0, 33, 32, 32, 32, 31, 32, 31, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 31, 32, 32, 32, 32, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32]
w1_rescales = [32, 32, 32]
w2_rescales = [31, 32, 31]
w3_rescales = [32, 33, 32]
shared_w1_rescales = [0, 0, 0, 30, 30, 29, 29, 29, 28, 29, 29, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 30, 29, 29, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29]
shared_w2_rescales = [0, 0, 0, 30, 30, 30, 30, 30, 29, 29, 30, 29, 29, 29, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30]
shared_w3_rescales = [0, 0, 0, 30, 30, 30, 30, 30, 29, 29, 30, 29, 29, 29, 30, 30, 30, 29, 30, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 30, 30, 29, 30, 29, 29, 29, 29, 30, 29, 29, 30, 30, 29, 30, 30, 30, 29, 29, 30, 30, 30, 29, 28]

layer_state_dict0 = [{} for _ in range(61)]
layer_state_dict = [{} for _ in range(61)]
experts = [[{} for _j in range(256)] for _i in range(61)]


def getF32PrintStr(ele):
    # Decode an fp32 scalar into an exact "(1+m/2^23)*2^e" string.
    v = int(ele.cpu().view(torch.uint32).item())
    ex = str((v >> 23 & 0xFF) - 127)
    r = '(1+' + str(v & 0x7FFFFF) + '/8388608)'
    if v & 0x80000000:
        vstr = '-' + r + '*2^' + ex
    else:
        vstr = r + '*2^' + ex
    return vstr


def getBF16PrintStr(ele):
    # Decode a bfloat16 scalar into an exact "(1+m/128)*2^e" string.
    v = int(ele.cpu().view(torch.uint16).item())
    ex = v >> 7 & 0xFF
    r = '(1+' + str(v & 0x7F) + '/128)'
    if v & 0x8000:
        vstr = '-' + r + '*2^' + str(ex - 127)
    else:
        vstr = r + '*2^' + str(ex - 127)
    return vstr


def getBF8PrintStr(ele):
    # Decode an 8-bit float (4-bit exponent, 3-bit mantissa, bias 7) into an
    # exact "(1+m/8)*2^e" string.
    v = int(ele.cpu().view(torch.uint8).item())
    ex = (v >> 3 & 0xF) - 7  # keep as int so the range check below works
    r = '(1+' + str(v & 0x7) + '/8)'
    if v & 0x80:
        vstr = '-' + r + '*2^' + str(ex)
    else:
        vstr = r + '*2^' + str(ex)
    if ex == -7 or ex == 8:
        # Flag values at the exponent extremes.
        print(vstr)
    return vstr
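
# A minimal self-check for the decomposition helpers above (illustrative
# only; nothing in the conversion calls it). Assumes the bfloat16 layout of
# 1 sign, 8 exponent (bias 127) and 7 mantissa bits.
def _check_bf16_print_str():
    x = torch.tensor(1.5, dtype=torch.bfloat16)  # 1.5 = (1 + 64/128) * 2^0
    assert getBF16PrintStr(x) == '(1+64/128)*2^0'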


def mem(i):
    # Print a tagged snapshot of CUDA memory usage.
    a = torch.cuda.memory_allocated() / 1024 ** 2
    r = torch.cuda.memory_reserved() / 1024 ** 2
    m = torch.cuda.max_memory_allocated() / 1024 ** 2
    print(f"{i} allocated={a:.1f}MB, reserved={r:.1f}MB, max={m:.1f}MB", flush=True)


def handle_expert_w(layer_id, expert_id, idx, param_weight, weight_name, scale, typ, shape, experts_save_path):
    global layer_state_dict0
    global experts
    scale_name = weight_name.replace('weight', 'scale')
    param_scale = layer_state_dict0[layer_id][scale_name]
    weight = weight_dequant(param_weight.cuda(), param_scale.cuda())
    rescale = 2 ** scale
    param_int = (weight.to(torch.float32) * rescale).round().to(torch.int32)
    weight_name2 = f'w{idx}.weight'
    scale_name2 = f'w{idx}.scale'
    experts[layer_id][expert_id][weight_name2] = param_int
    experts[layer_id][expert_id][scale_name2] = torch.tensor(scale, dtype=torch.int32)
    if len(experts[layer_id][expert_id]) == 6:  # w1, w2, w3 plus their scales
        save_file(experts[layer_id][expert_id], os.path.join(experts_save_path, f"{expert_id}.safetensors"))
        experts[layer_id][expert_id] = {}
    print(f'layer {layer_id} expert {expert_id} w{idx} type: {typ}, shape: {shape}, weight_name: {weight_name}, scale_name: {scale_name}')


def saveTensor(fileName, t):
    # Write the tensor's raw bytes in C (row-major) order.
    t = t.detach()
    if t.device.type != "cpu":
        t = t.cpu()
    t = t.contiguous()
    with open(fileName, "wb") as f:
        f.write(t.numpy().tobytes(order="C"))
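
# Sketch of reading back one embeds shard written via saveTensor below
# (illustrative; assumes the int64 dtype used for embeddings and a hidden
# size of 7168, matching the view(576, 7168) used elsewhere in this script).
def _load_embed_shard(path, hidden_dim=7168):
    import numpy as np
    data = np.fromfile(path, dtype=np.int64)
    return torch.from_numpy(data.reshape(EmbedsInOneFile, hidden_dim))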


def main(hf_ckpt_path, save_path, n_experts, mp):
    """
    Converts and saves model checkpoint files into a specified format.

    Args:
        hf_ckpt_path (str): Path to the directory containing the input checkpoint files.
        save_path (str): Path to the directory where the converted checkpoint files will be saved.
        n_experts (int): Total number of experts in the model.
        mp (int): Model parallelism factor.

    Returns:
        None
    """
    torch.cuda.set_device(0)
    # Set PyTorch's default compute dtype; BF16 is used here.
    torch.set_default_dtype(torch.bfloat16)
    # Cap CPU-side computation at 8 threads to avoid thread contention.
    torch.set_num_threads(8)
    # Fix the random seed so every process initializes identically.
    torch.manual_seed(965)
    # Create the output directory up front so every save below can rely on it.
    os.makedirs(save_path, exist_ok=True)
    # n_local_experts = n_experts // mp
    # state_dicts = [{} for _ in range(mp)]
    head_state_dict = {}
    norm_state_dict = {}
    embed_state_dict = {}

    # Per-expert fixed-point exponents, one whitespace-separated row per layer.
    experts_w1_rescales = []
    experts_w2_rescales = []
    experts_w3_rescales = []
    with open("w1.txt", "r", encoding="utf-8") as f1:
        for line in f1:
            experts_w1_rescales.append([int(s) for s in line.strip().split()])
    with open("w2.txt", "r", encoding="utf-8") as f2:
        for line in f2:
            experts_w2_rescales.append([int(s) for s in line.strip().split()])
    with open("w3.txt", "r", encoding="utf-8") as f3:
        for line in f3:
            experts_w3_rescales.append([int(s) for s in line.strip().split()])

    # tqdm wraps an iterable with a progress bar; glob expands the
    # *.safetensors pattern into the list of checkpoint shards.
    for file_path in tqdm(glob(os.path.join(hf_ckpt_path, "*.safetensors"))):
        with safe_open(file_path, framework="pt", device="cpu") as f:
            print('Opening ' + file_path, flush=True)
            for name in f.keys():
                # Skip the checkpoint's extra layer 61 (beyond decoder layers 0-60).
                if "model.layers.61" in name:
                    continue
                param: torch.Tensor = f.get_tensor(name)
                if name.startswith("model."):
                    name = name[len("model."):]
                name = name.replace("self_attn", "attn")
                name = name.replace("mlp", "ffn")
                name = name.replace("weight_scale_inv", "scale")
                name = name.replace("e_score_correction_bias", "bias")
                key = name.split(".")[-2]
                assert key in mapping, f"Key {key} not found in mapping"
                new_key, dim = mapping[key]
                name = name.replace(key, new_key)
                ns = name.split(".")
                comp = ns[0]
                if comp == 'head':
                    name2 = name[len('head.'):]
                    print('head: ' + name2)
                    param_int = (param.to(torch.float32) * (2 ** 43)).round().to(torch.int64)
                    head_state_dict[name2] = param_int
                elif comp == 'norm':
                    name2 = name[len('norm.'):]
                    print('norm: ' + name2)
                    param_int = (param.to(torch.float32) * (2 ** 15)).round().to(torch.int64)
                    norm_state_dict[name2] = param_int
                elif comp == 'embed':
                    name2 = name[len('embed.'):]
                    print('embed: ' + name2)
                    param_int = (param.to(torch.float32) * (2 ** 31)).round().to(torch.int64)
                    embed_state_dict[name2] = param_int
                    # Also shard the embedding table into raw binary files of
                    # EmbedsInOneFile rows each.
                    os.makedirs(EmbedsZKDir, exist_ok=True)
                    fileCount = param_int.shape[0] // EmbedsInOneFile
                    for i in range(0, fileCount):
                        saveTensor(EmbedsZKDir + str(i) + '.bin', param_int[i * EmbedsInOneFile: (i + 1) * EmbedsInOneFile].cpu())
                elif comp == 'layers':
                    layer_id = int(ns[1])
                    name2 = '.'.join(ns[2:])
                    layer_state_dict0[layer_id][name2] = param

    print('Finished loading state dicts from disk! ++++++++++')
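
    # Everything below is converted to fixed point: q = round(x * 2**scale),
    # stored as an integer tensor next to its integer exponent. A minimal
    # sketch of the encode/decode pair this implies (illustrative local
    # helpers; nothing below calls them):
    def _fp_encode(x: torch.Tensor, scale: int) -> torch.Tensor:
        return (x.to(torch.float32) * (2 ** scale)).round().to(torch.int32)

    def _fp_decode(q: torch.Tensor, scale: int) -> torch.Tensor:
        return q.to(torch.float32) / (2 ** scale)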

    # for layer_id, states in enumerate(layer_state_dict0):
    for layer_id in range(len(layer_state_dict0)):
        os.makedirs(f'{save_path}/experts-{layer_id}', exist_ok=True)
        states = layer_state_dict0[layer_id]
        for name, param in states.items():
            ns = name.split(".")
            typ = param.type()
            shape = param.shape
            if ns[0] == 'attn_norm':
                print(f'layer {layer_id} {name}, type: {typ}', flush=True)
                if ns[1] == 'weight':
                    param_int = (param.to(torch.float32) * (2 ** 21)).round().to(torch.int32)
                    layer_state_dict[layer_id][name] = param_int
            elif ns[0] == 'ffn_norm':
                print(f'layer {layer_id} {name}, type: {typ}', flush=True)
                if ns[1] == 'weight':
                    param_int2 = (param.to(torch.float32) * (2 ** 23)).round().to(torch.int32)
                    layer_state_dict[layer_id][name] = param_int2
            elif ns[0] == 'ffn':
                if len(ns) == 3:
                    # Dense FFN (layers 0-2): dequantize the FP8 weight, then
                    # store it in fixed point with the per-layer exponent.
                    if ns[1] == 'w1' and ns[2] == 'scale':
                        continue
                    elif ns[1] == 'w1' and ns[2] == 'weight':
                        param_weight = param.cuda()
                        weight_name = name
                        scale_name = name.replace('weight', 'scale')
                        param_scale = states[scale_name]
                        weight = weight_dequant(param_weight, param_scale.cuda())
                        scale = w1_rescales[layer_id]
                        rescale = 2 ** scale
                        param_int = (weight.to(torch.float32) * rescale).round().to(torch.int32)
                        layer_state_dict[layer_id][weight_name] = param_int.cpu()
                        layer_state_dict[layer_id][scale_name] = torch.tensor(scale, dtype=torch.int32)
                        print(f'layer {layer_id} w1 weight, type: {typ}, shape: {shape}, weight_name: {weight_name}, scale_name: {scale_name}', flush=True)
                    elif ns[1] == 'w2' and ns[2] == 'scale':
                        continue
                    elif ns[1] == 'w2' and ns[2] == 'weight':
                        param_weight = param.cuda()
                        weight_name = name
                        scale_name = name.replace('weight', 'scale')
                        param_scale = states[scale_name]
                        weight = weight_dequant(param_weight, param_scale.cuda())
                        scale = w2_rescales[layer_id]
                        rescale = 2 ** scale
                        param_int = (weight.to(torch.float32) * rescale).round().to(torch.int32)
                        layer_state_dict[layer_id][weight_name] = param_int.cpu()
                        layer_state_dict[layer_id][scale_name] = torch.tensor(scale, dtype=torch.int32)
                        print(f'layer {layer_id} w2 weight, type: {typ}, shape: {shape}, weight_name: {weight_name}, scale_name: {scale_name}', flush=True)
                    elif ns[1] == 'w3' and ns[2] == 'scale':
                        continue
                    elif ns[1] == 'w3' and ns[2] == 'weight':
                        param_weight = param.cuda()
                        weight_name = name
                        scale_name = name.replace('weight', 'scale')
                        param_scale = states[scale_name]
                        weight = weight_dequant(param_weight, param_scale.cuda())
                        scale = w3_rescales[layer_id]
                        rescale = 2 ** scale
                        param_int = (weight.to(torch.float32) * rescale).round().to(torch.int32)
                        layer_state_dict[layer_id][weight_name] = param_int.cpu()
                        layer_state_dict[layer_id][scale_name] = torch.tensor(scale, dtype=torch.int32)
                        print(f'layer {layer_id} w3 weight, type: {typ}, shape: {shape}, weight_name: {weight_name}, scale_name: {scale_name}', flush=True)
                    elif ns[1] == 'gate' and ns[2] == 'weight':
                        gate_rescale = 2 ** gate_rescales[layer_id]
                        gate_int = (param.to(torch.float32) * gate_rescale).round().to(torch.int32)
                        layer_state_dict[layer_id][name] = gate_int.cpu()
                        rescale_name = name.replace('weight', 'scale')
                        layer_state_dict[layer_id][rescale_name] = torch.tensor(gate_rescales[layer_id], dtype=torch.int32)
                        print(f'layer {layer_id}: gate_weight_name: {name}, gate_scale_name: {rescale_name}')
                    elif ns[1] == 'gate' and ns[2] == 'bias':
                        bias_int = (param.to(torch.float32) * (2 ** 23)).round().to(torch.int32)
                        layer_state_dict[layer_id][name] = bias_int.cpu()
                        print(f'layer {layer_id} bias: {name}')
                    else:
                        layer_state_dict[layer_id][name] = param
                elif len(ns) == 4:
                    if ns[1] == 'shared_experts':
                        if (ns[2] == 'w1' or ns[2] == 'w2' or ns[2] == 'w3') and ns[3] == 'scale':
                            continue
                        elif ns[2] == 'w1' and ns[3] == 'weight':
                            param_weight = param.cuda()
                            weight_name = name
                            scale_name = name.replace('weight', 'scale')
                            param_scale = states[scale_name]
                            weight = weight_dequant(param_weight, param_scale.cuda())
                            scale = shared_w1_rescales[layer_id]
                            rescale = 2 ** scale
                            param_int = (weight.to(torch.float32) * rescale).round().to(torch.int32)
                            layer_state_dict[layer_id][weight_name] = param_int.cpu()
                            layer_state_dict[layer_id][scale_name] = torch.tensor(scale, dtype=torch.int32)
                            print(f'layer {layer_id} shared_expert w1 type: {typ}, shape: {shape}, weight_name: {weight_name}, scale_name: {scale_name}')
                        elif ns[2] == 'w2' and ns[3] == 'weight':
                            param_weight = param.cuda()
                            weight_name = name
                            scale_name = name.replace('weight', 'scale')
                            param_scale = states[scale_name]
                            weight = weight_dequant(param_weight, param_scale.cuda())
                            scale = shared_w2_rescales[layer_id]
                            rescale = 2 ** scale
                            param_int = (weight.to(torch.float32) * rescale).round().to(torch.int32)
                            layer_state_dict[layer_id][weight_name] = param_int.cpu()
                            layer_state_dict[layer_id][scale_name] = torch.tensor(scale, dtype=torch.int32)
                            print(f'layer {layer_id} shared_expert w2 type: {typ}, shape: {shape}, weight_name: {weight_name}, scale_name: {scale_name}')
                        elif ns[2] == 'w3' and ns[3] == 'weight':
                            param_weight = param.cuda()
                            weight_name = name
                            scale_name = name.replace('weight', 'scale')
                            param_scale = states[scale_name]
                            weight = weight_dequant(param_weight, param_scale.cuda())
                            scale = shared_w3_rescales[layer_id]
                            rescale = 2 ** scale
                            param_int = (weight.to(torch.float32) * rescale).round().to(torch.int32)
                            layer_state_dict[layer_id][weight_name] = param_int.cpu()
                            layer_state_dict[layer_id][scale_name] = torch.tensor(scale, dtype=torch.int32)
                            print(f'layer {layer_id} shared_expert w3 type: {typ}, shape: {shape}, weight_name: {weight_name}, scale_name: {scale_name}')
                        else:
                            layer_state_dict[layer_id][name] = param
                    else:
                        layer_state_dict[layer_id][name] = param
                elif len(ns) == 5:
                    if ns[1] == 'experts':
                        expert_id = int(ns[2])
                        if (ns[3] == 'w1' or ns[3] == 'w2' or ns[3] == 'w3') and ns[4] == 'scale':
                            continue
                        elif ns[3] == 'w1' and ns[4] == 'weight':
                            scale = experts_w1_rescales[layer_id][expert_id]
                            handle_expert_w(layer_id, expert_id, 1, param, name, scale, typ, shape, f'{save_path}/experts-{layer_id}')
                        elif ns[3] == 'w2' and ns[4] == 'weight':
                            scale = experts_w2_rescales[layer_id][expert_id]
                            handle_expert_w(layer_id, expert_id, 2, param, name, scale, typ, shape, f'{save_path}/experts-{layer_id}')
                        elif ns[3] == 'w3' and ns[4] == 'weight':
                            scale = experts_w3_rescales[layer_id][expert_id]
                            handle_expert_w(layer_id, expert_id, 3, param, name, scale, typ, shape, f'{save_path}/experts-{layer_id}')
                        else:
                            layer_state_dict[layer_id][name] = param
                    else:
                        layer_state_dict[layer_id][name] = param
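            # Routed experts above are flushed to disk one expert at a time
            # (see handle_expert_w): each experts-{layer_id}/{expert_id}.safetensors
            # holds six tensors, w{1,2,3}.weight (int32) plus w{1,2,3}.scale
            # (the fixed-point exponent), instead of accumulating in
            # layer_state_dict. This keeps peak host memory bounded.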
            elif ns[0] == 'attn':
                if len(ns) == 3:
                    if ns[1] == 'wq_a' and ns[2] == 'scale':
                        continue
                    elif ns[1] == 'wq_a' and ns[2] == 'weight':
                        param_weight = param.cuda()
                        weight_name = name
                        scale_name = name.replace('weight', 'scale')
                        param_scale = states[scale_name]
                        weight = weight_dequant(param_weight, param_scale.cuda())
                        weight_int = (weight.to(torch.float32) * (2 ** 30)).round().to(torch.int32)
                        layer_state_dict[layer_id][weight_name] = weight_int.cpu()
                        print(f'layer {layer_id} wq_a weight, type: {typ}, shape: {shape}', flush=True)
                    elif ns[1] == 'q_norm':
                        print(f'layer {layer_id} q_norm, type: {typ}, shape: {shape}', flush=True)
                        param_int3 = (param.to(torch.float32) * (2 ** 19)).round().to(torch.int32)
                        layer_state_dict[layer_id][name] = param_int3
                    elif ns[1] == 'kv_norm':
                        print(f'layer {layer_id} kv_norm, type: {typ}, shape: {shape}', flush=True)
                        param_int4 = (param.to(torch.float32) * (2 ** 23)).round().to(torch.int32)
                        layer_state_dict[layer_id][name] = param_int4
                    elif ns[1] == 'wq_b' and ns[2] == 'scale':
                        continue
                    elif ns[1] == 'wq_b' and ns[2] == 'weight':
                        param_weight = param.cuda()
                        weight_name = name
                        scale_name = name.replace('weight', 'scale')
                        param_scale = states[scale_name]
                        weight = weight_dequant(param_weight, param_scale.cuda())
                        weight_int = (weight.to(torch.float32) * (2 ** 30)).round().to(torch.int32)
                        weight_int = weight_int.view(128, 192, 1536)
                        wq_b1, wq_b2 = torch.split(weight_int, [128, 64], dim=-2)
                        print(f'layer {layer_id} wq_b1 weight, shape: {wq_b1.shape}, wq_b2 weight, shape: {wq_b2.shape}', flush=True)
                        wq_b1 = wq_b1.reshape(128 * 128, 1536)
                        wq_b2 = wq_b2.reshape(128 * 64, 1536)
                        wq_b1_name = weight_name.replace('wq_b', 'wq_b1')
                        wq_b2_name = weight_name.replace('wq_b', 'wq_b2')
                        layer_state_dict[layer_id][wq_b1_name] = wq_b1.cpu()
                        layer_state_dict[layer_id][wq_b2_name] = wq_b2.cpu()
                        print(f'layer {layer_id} wq_b weight, type: {typ}, shape: {shape}', flush=True)
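                    # The wq_b split above separates each of the 128 query
                    # heads' 192-dim output into a 128-dim non-positional part
                    # (wq_b1) and a 64-dim rotary part (wq_b2); 1536 is the
                    # query LoRA rank. All dimensions follow the view() call
                    # above.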
                    elif ns[1] == 'wkv_a' and ns[2] == 'scale':
                        continue
                    elif ns[1] == 'wkv_a' and ns[2] == 'weight':
                        param_weight = param.cuda()
                        weight_name = name
                        scale_name = name.replace('weight', 'scale')
                        param_scale = states[scale_name]
                        weight = weight_dequant(param_weight, param_scale.cuda())
                        weight_int = (weight.to(torch.float32) * (2 ** 29)).round().to(torch.int32)
                        # Split the 576 output rows into the 512-dim KV latent
                        # projection (wkv_a1) and the 64-dim rotary key part (wkv_a2).
                        weight_int = weight_int.view(576, 7168)
                        wkv_a1, wkv_a2 = torch.split(weight_int, [512, 64], dim=-2)
                        print(f'layer {layer_id} wkv_a1 weight, shape: {wkv_a1.shape}, wkv_a2 weight, shape: {wkv_a2.shape}', flush=True)
                        wkv_a1_name = weight_name.replace('wkv_a', 'wkv_a1')
                        wkv_a2_name = weight_name.replace('wkv_a', 'wkv_a2')
                        layer_state_dict[layer_id][wkv_a1_name] = wkv_a1.cpu()
                        layer_state_dict[layer_id][wkv_a2_name] = wkv_a2.cpu()
                        print(f'layer {layer_id} wkv_a weight, type: {typ}, shape: {shape}', flush=True)
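                    # wkv_b below is viewed as [128 heads, 256, 512]: per head,
                    # rows 0..127 form the key up-projection and rows 128..255
                    # the value up-projection (512 is the KV LoRA rank). The
                    # halves are stored separately because each gets its own
                    # per-layer fixed-point exponent (wkv_b_1_rescales /
                    # wkv_b_2_rescales above).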
                    elif ns[1] == 'wkv_b' and ns[2] == 'scale':
                        continue
                    elif ns[1] == 'wkv_b' and ns[2] == 'weight':
                        param_weight = param.cuda()
                        weight_name = name
                        scale_name = name.replace('weight', 'scale')
                        param_scale = states[scale_name]
                        weight = weight_dequant(param_weight, param_scale.cuda())
                        wkv_b = weight.view(128, 256, 512)
                        wkv_b_1 = wkv_b[:, :128]
                        wkv_b_1 = wkv_b_1.reshape(128 * 128, 512)
                        scale1 = wkv_b_1_rescales[layer_id]
                        wkv_b_1_rescale = 2 ** scale1
                        wkv_b_1_int = torch.round(wkv_b_1.to(torch.float32) * wkv_b_1_rescale).to(torch.int32)
                        wkv_b_2 = wkv_b[:, -128:]
                        wkv_b_2 = wkv_b_2.reshape(128 * 128, 512)
                        scale2 = wkv_b_2_rescales[layer_id]
                        wkv_b_2_rescale = 2 ** scale2
                        wkv_b_2_int = torch.round(wkv_b_2.to(torch.float32) * wkv_b_2_rescale).to(torch.int32)
                        wkv_b_1_name = weight_name.replace("wkv_b", "wkv_b_1")
                        wkv_b_1_scale_name = scale_name.replace("wkv_b", "wkv_b_1")
                        layer_state_dict[layer_id][wkv_b_1_name] = wkv_b_1_int.cpu()
                        layer_state_dict[layer_id][wkv_b_1_scale_name] = torch.tensor(scale1, dtype=torch.int32)
                        wkv_b_2_name = weight_name.replace("wkv_b", "wkv_b_2")
                        wkv_b_2_scale_name = scale_name.replace("wkv_b", "wkv_b_2")
                        layer_state_dict[layer_id][wkv_b_2_name] = wkv_b_2_int.cpu()
                        layer_state_dict[layer_id][wkv_b_2_scale_name] = torch.tensor(scale2, dtype=torch.int32)
                        print(f'layer {layer_id} wkv_b, type: {typ}, shape: {shape}, wkv_b_1 weight: {wkv_b_1_name}, wkv_b_1 scale: {wkv_b_1_scale_name}, wkv_b_2 weight: {wkv_b_2_name}, wkv_b_2 scale: {wkv_b_2_scale_name}', flush=True)
                    elif ns[1] == 'wo' and ns[2] == 'scale':
                        continue
                    elif ns[1] == 'wo' and ns[2] == 'weight':
                        param_weight = param.cuda()
                        weight_name = name
                        scale_name = name.replace('weight', 'scale')
                        param_scale = states[scale_name]
                        weight = weight_dequant(param_weight, param_scale.cuda())
                        scale = wo_rescales[layer_id]
                        rescale = 2 ** scale
                        if layer_id != 58:
                            param_int = (weight.to(torch.float32) * rescale).round().to(torch.int32)
                        else:
                            # Layer 58's wo holds one extreme outlier: locate the
                            # largest-magnitude entry, print its exact bfloat16
                            # decomposition, zero it before quantizing, then store
                            # the int32 minimum at that position as a sentinel.
                            wo_abs = weight.abs().cpu()
                            maxpos = wo_abs.argmax()
                            row, col = divmod(maxpos.item(), weight.size(1))
                            print(f'maxpos: {maxpos}, {row} {col}', flush=True)
                            vstr = getBF16PrintStr(weight[row][col])
                            print(f'weight[{row}][{col}]: {vstr}', flush=True)
                            weight[row][col] = 0
                            param_int = (weight.to(torch.float32) * rescale).round().to(torch.int32)
                            param_int[row][col] = -(2 ** 31)
                        layer_state_dict[layer_id][weight_name] = param_int.cpu()
                        layer_state_dict[layer_id][scale_name] = torch.tensor(scale, dtype=torch.int32)
                        print(f'layer {layer_id} wo weight, type: {typ}, shape: {shape}, weight: {weight_name}, scale: {scale_name}', flush=True)
                    else:
                        layer_state_dict[layer_id][name] = param
                else:
                    layer_state_dict[layer_id][name] = param
            else:
                layer_state_dict[layer_id][name] = param
        save_file(layer_state_dict[layer_id], os.path.join(save_path, f"layer-{layer_id}.safetensors"))
        print(f'Finished saving layer {layer_id}', flush=True)
        # Drop both copies of this layer's tensors to bound host memory.
        layer_state_dict0[layer_id] = {}
        layer_state_dict[layer_id] = {}

    print('Finished converting all layers', flush=True)
    # print(layer_state_dict)  # debug: both lists are empty at this point
    # print(experts)
    save_file(head_state_dict, os.path.join(save_path, "head_int.safetensors"))
    save_file(norm_state_dict, os.path.join(save_path, "norm_int.safetensors"))
    save_file(embed_state_dict, os.path.join(save_path, "embed_int.safetensors"))
    # for i, st in enumerate(layer_state_dict):
    #     save_file(st, os.path.join(save_path, f"layer-{i}.safetensors"))
    #     print(f'Finish saving layer {i}', flush=True)
    # for i in trange(mp):
    #     save_file(state_dicts[i], os.path.join(save_path, f"model{i}-mp{mp}.safetensors"))
    # print('Finish saving files')

    # Copy the tokenizer files over unchanged.
    for file_path in glob(os.path.join(hf_ckpt_path, "*token*")):
        new_file_path = os.path.join(save_path, os.path.basename(file_path))
        shutil.copyfile(file_path, new_file_path)


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--hf-ckpt-path", type=str, required=True)
    parser.add_argument("--save-path", type=str, required=True)
    parser.add_argument("--n-experts", type=int, required=True)
    parser.add_argument("--model-parallel", type=int, required=True)
    args = parser.parse_args()
    assert args.n_experts % args.model_parallel == 0, "Number of experts must be divisible by model parallelism"
    main(args.hf_ckpt_path, args.save_path, args.n_experts, args.model_parallel)
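
# Example invocation (the script name and paths are illustrative):
#
#   python convert.py --hf-ckpt-path /path/to/hf-checkpoint \
#       --save-path ./converted --n-experts 256 --model-parallel 1
#
# Sketch of recovering a float weight from a converted layer file, using the
# weight/scale naming written above (the helper and example key are
# illustrative, not part of the pipeline):
def _load_converted_weight(save_path, layer_id, key="attn.wo.weight"):
    with safe_open(os.path.join(save_path, f"layer-{layer_id}.safetensors"),
                   framework="pt", device="cpu") as f:
        q = f.get_tensor(key)
        s = int(f.get_tensor(key.replace("weight", "scale")).item())
    return q.to(torch.float64) / (2 ** s)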