File size: 6,751 Bytes
ee3e701 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
import argparse
import json
import math
import os
import re
import tempfile
import sys
import torch
# Extend the module search path BEFORE importing the project-local `model_hf`
# package: an entry added to sys.path only affects imports executed after it.
# The entry is appended (lowest priority), so any import that already resolved
# keeps resolving to the same module.
# NOTE(review): '../' is resolved against the current working directory, not
# this file's location -- confirm that is the intended lookup root.
sys.path.append('../')
from model_hf.modeling_internlm import InternLMConfig, InternLMForCausalLM  # noqa: E402
def convert2hf(model_config, states_tp_pps):
    """Build an ``InternLMForCausalLM`` from pipeline-sharded checkpoint states.

    The shards are merged with ``merge_pp`` and only the first tensor-parallel
    entry of the result is used. Parameter names are remapped to the
    ``model.*`` / ``lm_head.*`` naming, the remapped weights plus a fresh
    ``InternLMConfig`` are written to a temporary directory, and the model is
    loaded back from there in ``torch.float16``.

    Args:
        model_config: mapping read for ``hidden_size``,
            ``num_attention_heads``, ``num_layers`` and ``vocab_size``
            (``-1`` keeps the config's default vocabulary size).
        states_tp_pps: nested list of state dicts indexed ``[tp][pp]``, as
            consumed by ``merge_pp``.

    Returns:
        Tuple ``(config, model)``: the ``InternLMConfig`` created here and the
        model loaded from the temporary checkpoint.

    Raises:
        KeyError: if an expected parameter or config key is missing.
    """
    # (target suffix, source suffix) pairs copied verbatim for every layer,
    # in the order they are inserted into the new state dict.
    direct_renames = (
        ("self_attn.o_proj.weight", "mixer.out_proj.weight"),
        ("self_attn.o_proj.bias", "mixer.out_proj.bias"),
        ("mlp.gate_proj.weight", "mlp.w1.weight"),
        ("mlp.down_proj.weight", "mlp.w3.weight"),
        ("mlp.up_proj.weight", "mlp.w2.weight"),
        ("input_layernorm.weight", "norm1.weight"),
        ("post_attention_layernorm.weight", "norm2.weight"),
    )

    with tempfile.TemporaryDirectory() as folder:
        states = merge_pp(states_tp_pps)[0]

        hidden_size = model_config["hidden_size"]
        num_heads = model_config["num_attention_heads"]
        head_dim = hidden_size // num_heads

        # Rotary inverse frequencies, shared by every layer.
        base = 10000.0
        exponents = torch.arange(0, head_dim, 2).float() / head_dim
        inv_freq = 1.0 / (base ** exponents)

        current_states = {}

        # The token embedding is the matrix product of the VQ codebook
        # embedding and the transposed projection weight.
        codebook = states.pop('embedding.vq_model.quantize.embedding.weight')
        projection = states.pop('embedding.embed_proj.weight')
        current_states["model.embed_tokens.weight"] = codebook.mm(projection.T)
        current_states["model.norm.weight"] = states.pop("norm.weight")
        current_states["lm_head.weight"] = states.pop("head.weight")

        for i in range(model_config["num_layers"]):
            src = f"blocks.{i}"
            dst = f"model.layers.{i}"

            # Any stored rotary buffer is discarded; the one computed above is used.
            states.pop(f"{src}.mixer.rotary_emb.inv_freq", None)

            # The fused QKV tensors are viewed as (3, heads, per-head rows, ...)
            # so that index 0/1/2 along the first axis selects q/k/v.
            wqkv = states.pop(f"{src}.mixer.Wqkv.weight").reshape(3, num_heads, -1, hidden_size)
            bqkv = states.pop(f"{src}.mixer.Wqkv.bias").reshape(3, num_heads, -1)
            for part, proj in enumerate(("q_proj", "k_proj", "v_proj")):
                current_states[f"{dst}.self_attn.{proj}.weight"] = wqkv[part].reshape(-1, hidden_size)
                current_states[f"{dst}.self_attn.{proj}.bias"] = bqkv[part].reshape(-1)

            for target_suffix, source_suffix in direct_renames:
                current_states[f"{dst}.{target_suffix}"] = states.pop(f"{src}.{source_suffix}")

            current_states[f"{dst}.self_attn.rotary_emb.inv_freq"] = inv_freq

        config = InternLMConfig(
            hidden_size=hidden_size,
            intermediate_size=compute_intermediate_size(hidden_size),
            num_attention_heads=num_heads,
            num_hidden_layers=model_config["num_layers"],
            rms_norm_eps=1e-06,
            bias=True,
        )
        vocab_size = model_config["vocab_size"]
        if vocab_size != -1:
            config.vocab_size = vocab_size

        # Round-trip through disk so the model is built by from_pretrained.
        config.save_pretrained(folder)
        torch.save(current_states, os.path.join(folder, "pytorch_model.bin"))
        model = InternLMForCausalLM.from_pretrained(folder, torch_dtype=torch.float16)
        # Presumably removed so the temporary directory path is not kept in
        # the model config -- TODO confirm.
        del model.config._name_or_path

    return config, model
def compute_intermediate_size(n):
    """Return ``ceil(n * 8 / 3)`` rounded up to the next multiple of 256."""
    multiple = 256
    target = math.ceil(n * 8 / 3)
    # Adding (multiple - 1) before the floor division rounds upwards.
    padded = int(target + (multiple - 1))
    return padded // multiple * multiple
def merge_pp(states_tp_pp):
    """Merge the pipeline-parallel shards of every tensor-parallel rank.

    Each shard numbers its layers locally; the first ``.<digits>.`` component
    of a key is treated as that local layer index. While merging, the index is
    shifted by the number of layers contributed by the preceding shards of the
    same rank (taken as the largest local index seen plus one). Keys without
    such a component are copied unchanged. Finally a leading ``"model."`` is
    stripped from every key.

    Args:
        states_tp_pp: nested list of state dicts indexed ``[tp][pp]``. The
            number of pipeline stages is taken from the first rank.

    Returns:
        A list with one merged dict per tensor-parallel rank; an empty list
        when ``states_tp_pp`` is empty.
    """
    if not states_tp_pp:
        return []

    # Raw string: "\." and "\d" are not valid Python string escapes.
    layer_pattern = re.compile(r"\.\d+\.")
    prefix = "model."
    max_pp = len(states_tp_pp[0])

    full_states = []
    for tp_shards in states_tp_pp:
        layer_shift = 0
        tp_states = {}
        for pp in range(max_pp):
            states = tp_shards[pp]
            # -1 means "no numbered layer seen", so a shard that holds no
            # layers leaves the running offset untouched.
            max_local_idx = -1
            for key, value in states.items():
                match = layer_pattern.search(key)
                if match is None:
                    tp_states[key] = value
                    continue
                start, end = match.span()
                local_idx = int(key[start + 1:end - 1])
                max_local_idx = max(max_local_idx, local_idx)
                name = f"{key[:start]}.{local_idx + layer_shift}.{key[end:]}"
                tp_states[name] = value
            layer_shift += max_local_idx + 1
        full_states.append(
            {
                (key[len(prefix):] if key.startswith(prefix) else key): value
                for key, value in tp_states.items()
            }
        )
    return full_states
if __name__ == "__main__":
    # Command-line entry point: read the pipeline-parallel checkpoint shards
    # from --src_folder, convert them with convert2hf, and write the resulting
    # model to --tgt_folder.
    parser = argparse.ArgumentParser()
    parser.add_argument('--src_folder', type=str, default='/path/to/intermlm_model/')  # internlm model folder
    parser.add_argument('--tgt_folder', type=str, default='/path/to/hf_model/')  # hf model folder
    args = parser.parse_args()

    def load(fp):
        """Load a torch-serialized object from ``fp`` onto the CPU."""
        # SECURITY: torch.load unpickles its input and can execute arbitrary
        # code; only point this script at checkpoints you trust.
        with open(fp, "rb") as f:
            return torch.load(f, map_location="cpu")

    folder = args.src_folder
    target_folder = args.tgt_folder
    model_config = load(os.path.join(folder, "model_config.pt"))

    # Shard files are named model_tp<T>_pp<P>.<ext>; md5 side files are skipped.
    model_fns = [
        fn for fn in os.listdir(folder)
        if fn.startswith("model_t") and not fn.endswith("md5")
    ]
    if not model_fns:
        # Fail here with a clear message instead of a KeyError on a missing
        # parameter name later in the conversion.
        raise FileNotFoundError(f"No model_t* checkpoint shards found in {folder!r}")

    max_tp, max_pp = -1, -1
    for fn in model_fns:
        _, tp, pp = os.path.splitext(fn)[0].split("_")
        max_pp = max(max_pp, int(pp[2:]) + 1)
        max_tp = max(max_tp, int(tp[2:]) + 1)

    if max_tp > 1:
        print(
            f"Warning: found {max_tp} tensor-parallel ranks, "
            "but only the tp0 shards are loaded and converted.",
            file=sys.stderr,
        )

    # Only tensor-parallel rank 0 is loaded, one state dict per pipeline stage.
    states_tp_pps = [[]]
    for pp in range(max_pp):
        model_name = f"model_tp0_pp{pp}.pt"
        states = load(os.path.join(folder, model_name))
        states_tp_pps[0].append(states)

    config, model = convert2hf(model_config, states_tp_pps)

    os.makedirs(target_folder, exist_ok=True)
    model.save_pretrained(target_folder, max_shard_size="20GB")

    # TODO There should be a better way to add this.
    config_path = os.path.join(target_folder, "config.json")
    with open(config_path, encoding="utf-8") as fp:
        config_dict = json.load(fp)
    # setdefault: the saved config is not guaranteed to carry an "auto_map".
    config_dict.setdefault("auto_map", {})["AutoModel"] = "modeling_internlm.InternLMForCausalLM"
    with open(config_path, "w", encoding="utf-8") as fp:
        json.dump(config_dict, fp, indent=2)
|