| | |
| | |
| | import argparse, re, torch |
| | from safetensors.torch import load_file, save_file |
| |
|
| | def cat_if_exists(keys, state, dim=0): |
| | """Helper: concatenate a list of keys if they all exist, else return None.""" |
| | if all(k in state for k in keys): |
| | return torch.cat([state[k] for k in keys], dim=dim) |
| | return None |
| |
|
def convert(path_in: str, path_out: str, dtype: torch.dtype):
    """Convert a GPT-style safetensors checkpoint to the Phi-1.5 key layout.

    Remaps ``transformer.h.<i>.*`` keys to ``model.layers.<i>.*`` and casts
    every parameter to *dtype*. The source checkpoint has no q/k/v/dense
    biases, so zero biases are synthesized in the target dtype.

    Args:
        path_in: source ``.safetensors`` file (``transformer.h.N.*`` naming).
        path_out: destination ``.safetensors`` file (``model.layers.N.*`` naming).
        dtype: torch dtype for all output tensors, e.g. ``torch.float16``.
            (Fixed: annotation previously claimed ``str``, but the caller
            passes a real ``torch.dtype`` via ``getattr(torch, args.dtype)``.)
    """
    src = load_file(path_in)
    tgt = {}

    # Embeddings, final layer norm and LM head map 1:1.
    tgt["model.embed_tokens.weight"] = src["transformer.wte.weight"].to(dtype)
    tgt["model.final_layernorm.weight"] = src["transformer.ln_f.weight"].to(dtype)
    tgt["model.final_layernorm.bias"] = src["transformer.ln_f.bias"].to(dtype)
    tgt["lm_head.weight"] = src["lm_head.weight"].to(dtype)
    tgt["lm_head.bias"] = src["lm_head.bias"].to(dtype)

    # Discover layer indices from the transformer.h.<i>.* key prefixes.
    pat = re.compile(r"transformer\.h\.(\d+)\.")
    layer_ids = sorted({int(pat.match(k).group(1)) for k in src if pat.match(k)})

    for i in layer_ids:
        p_old = f"transformer.h.{i}"
        p_new = f"model.layers.{i}"

        # Attention projections. BUG FIX: q/k/v weights were previously
        # copied without the .to(dtype) cast (mixing dtypes in the output),
        # and the synthesized biases were built as int64 tensors via
        # torch.tensor([0] * n); torch.zeros(n, dtype=dtype) is correct.
        for new_name, old_name in (("q_proj", "q_proj"),
                                   ("k_proj", "k_proj"),
                                   ("v_proj", "v_proj"),
                                   ("dense", "out_proj")):
            w = src[f"{p_old}.attn.{old_name}.weight"].to(dtype)
            tgt[f"{p_new}.self_attn.{new_name}.weight"] = w
            tgt[f"{p_new}.self_attn.{new_name}.bias"] = torch.zeros(
                w.shape[0], dtype=dtype)

        # Pre-attention layer norm.
        tgt[f"{p_new}.input_layernorm.weight"] = src[f"{p_old}.ln_1.weight"].to(dtype)
        tgt[f"{p_new}.input_layernorm.bias"] = src[f"{p_old}.ln_1.bias"].to(dtype)

        # MLP: fc_in/fc_out -> fc1/fc2.
        tgt[f"{p_new}.mlp.fc1.weight"] = src[f"{p_old}.mlp.fc_in.weight"].to(dtype)
        tgt[f"{p_new}.mlp.fc1.bias"] = src[f"{p_old}.mlp.fc_in.bias"].to(dtype)
        tgt[f"{p_new}.mlp.fc2.weight"] = src[f"{p_old}.mlp.fc_out.weight"].to(dtype)
        tgt[f"{p_new}.mlp.fc2.bias"] = src[f"{p_old}.mlp.fc_out.bias"].to(dtype)

    save_file(tgt, path_out)
    print(f"✓ wrote {len(tgt):,} tensors to {path_out}")
| |
|
| | if __name__ == "__main__": |
| | ap = argparse.ArgumentParser(description="convert GPT‑2/3 style safetensors to Phi1.5 layout") |
| | ap.add_argument("--in", dest="inp", required=True, help="source .safetensors") |
| | ap.add_argument("--out", dest="outp", required=True, help="destination .safetensors") |
| | ap.add_argument("--dtype", default="float16", |
| | choices=["float16","bfloat16","float32"], |
| | help="cast parameters to this dtype in the output file") |
| | args = ap.parse_args() |
| | convert(args.inp, args.outp, getattr(torch, args.dtype)) |
| |
|