Commit c2f6063 (verified) · 0 parent(s)
Duplicate from allura-forge/phi-j-6b
Files changed:
- .gitattributes +35 -0
- README.md +25 -0
- config.json +26 -0
- convert.py +83 -0
- merges.txt +0 -0
- model.safetensors +3 -0
- tokenizer.json +0 -0
- tokenizer_config.json +1 -0
- vocab.json +0 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,25 @@
+---
+license: apache-2.0
+datasets:
+- EleutherAI/pile
+language:
+- en
+base_model:
+- EleutherAI/gpt-j-6b
+tags:
+- gptj
+- causal-lm
+---
+
+<div style="background-color: #ff6961; padding: 10px 15px; display: flex; align-items: center; max-width: 100%; box-sizing: border-box; border-radius: 5px;">
+<span style="color: yellow; font-size: 1.6em; margin-right: 10px; line-height: 1;">⚠️</span>
+<span style="color: black; font-family: sans-serif; font-size: 1em;">
+This model is subtly busted in 20 different ways compared to the original. It is mostly intended for further training (which should implicitly heal these subtle breakages). You have been warned.
+</span>
+</div>
+
+This is a conversion of GPT-J-6b by EleutherAI into a more modern architecture that it still maps closely onto (in this case, the Phi 1/1.5/2 architecture). This primarily enables RoPE scaling, and also makes it possible to create GGUFs (the GGUF tooling does not currently support GPT-J's original architecture). See `convert.py` for the script used to convert the weights.
+
+Note that I was originally going to use the GPT-NeoX architecture because it felt more befitting, but there appears to be [a bug](https://github.com/huggingface/transformers/pull/35610#issuecomment-3111538020) in the most recent versions of Transformers, so Phi it is!
+
+Also, the `partial_rotary_factor` here is set to `0.5` even though it logically *should* be `0.25` (`rotary_dim / head_dim` = `64 / 256` = `0.25`): in practice `0.25` produces completely incoherent babble, while `0.5` behaves essentially the same as the original. Whatever.
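
Because the converted checkpoint advertises the stock Phi architecture, it should load with vanilla Transformers. A minimal sketch of loading it and applying the RoPE scaling the README mentions (the rope_scaling override is illustrative, not part of this commit, and the accepted keys vary across Transformers versions):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    repo = "allura-forge/phi-j-6b"
    tok = AutoTokenizer.from_pretrained(repo)
    model = AutoModelForCausalLM.from_pretrained(
        repo,
        torch_dtype=torch.float16,
        # hypothetical override: stretch the usable context 2x via dynamic RoPE scaling
        rope_scaling={"type": "dynamic", "factor": 2.0},
    )
    out = model.generate(**tok("GPT-J was trained on", return_tensors="pt"), max_new_tokens=32)
    print(tok.decode(out[0]))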
config.json
ADDED
@@ -0,0 +1,26 @@
+{
+  "architectures": [
+    "PhiForCausalLM"
+  ],
+  "attention_probs_dropout_prob": 0,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "gelu_new",
+  "hidden_dropout_prob": 0,
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 16384,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 2048,
+  "model_type": "phi",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "rotary_emb_base": 10000,
+  "partial_rotary_factor": 0.5,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.19.0.dev0",
+  "use_cache": true,
+  "vocab_size": 50400
+}
+
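
As a quick sanity check of the `partial_rotary_factor` discussion in the README, the relevant dimensions can be derived straight from this config (a sketch; the local path is illustrative):

    from transformers import PhiConfig

    cfg = PhiConfig.from_pretrained(".")                      # reads config.json
    head_dim = cfg.hidden_size // cfg.num_attention_heads     # 4096 // 16 = 256
    rotary_dim = int(cfg.partial_rotary_factor * head_dim)    # 0.5 * 256 = 128, vs. GPT-J's original 64
    print(head_dim, rotary_dim)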
convert.py
ADDED
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+# convert.py - convert GPT-J-6b safetensors weights to the Phi layout
+import argparse, re, torch
+from safetensors.torch import load_file, save_file
+
+def cat_if_exists(keys, state, dim=0):
+    """Helper: concatenate a list of keys if they all exist, else return None."""
+    if all(k in state for k in keys):
+        return torch.cat([state[k] for k in keys], dim=dim)
+    return None
+
+def convert(path_in: str, path_out: str, dtype: torch.dtype):
+    src = load_file(path_in)
+    tgt = {}
+
+    # --- top-level tensors ---------------------------------------------------
+    tgt["model.embed_tokens.weight"] = src["transformer.wte.weight"].to(dtype)
+    tgt["model.final_layernorm.weight"] = src["transformer.ln_f.weight"].to(dtype)
+    tgt["model.final_layernorm.bias"] = src["transformer.ln_f.bias"].to(dtype)
+    tgt["lm_head.weight"] = src["lm_head.weight"].to(dtype)
+    tgt["lm_head.bias"] = src["lm_head.bias"].to(dtype)
+
+    # --- per-layer tensors ---------------------------------------------------
+    pat = re.compile(r"transformer\.h\.(\d+)\.")
+    layer_ids = sorted({int(pat.match(k).group(1)) for k in src if pat.match(k)})
+
+    for i in layer_ids:
+        p_old = f"transformer.h.{i}"
+        p_new = f"model.layers.{i}"
+
+        ## attention — fuse QKV (leftover from the abandoned GPT-NeoX target)
+        #qkv_w = cat_if_exists(
+        #    [f"{p_old}.attn.q_proj.weight",
+        #     f"{p_old}.attn.k_proj.weight",
+        #     f"{p_old}.attn.v_proj.weight"],
+        #    src)
+        #qkv_b = cat_if_exists(
+        #    [f"{p_old}.attn.q_proj.bias",
+        #     f"{p_old}.attn.k_proj.bias",
+        #     f"{p_old}.attn.v_proj.bias"],
+        #    src)
+        #
+        #tgt[f"{p_new}.attention.query_key_value.weight"] = qkv_w.to(dtype)
+        #if qkv_b is not None:
+        #    tgt[f"{p_new}.attention.query_key_value.bias"] = qkv_b.to(dtype)
+        #else:
+        #    tgt[f"{p_new}.attention.query_key_value.bias"] = torch.zeros(qkv_w.shape[0], dtype=dtype)
+
+        # attention: Phi keeps separate q/k/v projections; GPT-J's attention
+        # projections have no biases, so synthesize zero biases in the target dtype
+        tgt[f"{p_new}.self_attn.k_proj.weight"] = src[f"{p_old}.attn.k_proj.weight"].to(dtype)
+        tgt[f"{p_new}.self_attn.k_proj.bias"] = torch.zeros(tgt[f"{p_new}.self_attn.k_proj.weight"].shape[0], dtype=dtype)
+
+        tgt[f"{p_new}.self_attn.q_proj.weight"] = src[f"{p_old}.attn.q_proj.weight"].to(dtype)
+        tgt[f"{p_new}.self_attn.q_proj.bias"] = torch.zeros(tgt[f"{p_new}.self_attn.q_proj.weight"].shape[0], dtype=dtype)
+
+        tgt[f"{p_new}.self_attn.v_proj.weight"] = src[f"{p_old}.attn.v_proj.weight"].to(dtype)
+        tgt[f"{p_new}.self_attn.v_proj.bias"] = torch.zeros(tgt[f"{p_new}.self_attn.v_proj.weight"].shape[0], dtype=dtype)
+
+        tgt[f"{p_new}.self_attn.dense.weight"] = src[f"{p_old}.attn.out_proj.weight"].to(dtype)
+        tgt[f"{p_new}.self_attn.dense.bias"] = torch.zeros(tgt[f"{p_new}.self_attn.dense.weight"].shape[0], dtype=dtype)
+
+        # layer norms (GPT-J's single pre-attention ln_1 maps to Phi's input_layernorm)
+        tgt[f"{p_new}.input_layernorm.weight"] = src[f"{p_old}.ln_1.weight"].to(dtype)
+        tgt[f"{p_new}.input_layernorm.bias"] = src[f"{p_old}.ln_1.bias"].to(dtype)
+
+        # MLP
+        tgt[f"{p_new}.mlp.fc1.weight"] = src[f"{p_old}.mlp.fc_in.weight"].to(dtype)
+        tgt[f"{p_new}.mlp.fc1.bias"] = src[f"{p_old}.mlp.fc_in.bias"].to(dtype)
+        tgt[f"{p_new}.mlp.fc2.weight"] = src[f"{p_old}.mlp.fc_out.weight"].to(dtype)
+        tgt[f"{p_new}.mlp.fc2.bias"] = src[f"{p_old}.mlp.fc_out.bias"].to(dtype)
+
+    # ------------------------------------------------------------------------
+    save_file(tgt, path_out)
+    print(f"✓ wrote {len(tgt):,} tensors to {path_out}")
+
+if __name__ == "__main__":
+    ap = argparse.ArgumentParser(description="convert GPT-J safetensors to the Phi 1/1.5 layout")
+    ap.add_argument("--in", dest="inp", required=True, help="source .safetensors")
+    ap.add_argument("--out", dest="outp", required=True, help="destination .safetensors")
+    ap.add_argument("--dtype", default="float16",
+                    choices=["float16", "bfloat16", "float32"],
+                    help="cast parameters to this dtype in the output file")
+    args = ap.parse_args()
+    convert(args.inp, args.outp, getattr(torch, args.dtype))
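
A typical invocation, assuming the original GPT-J-6b checkpoint has already been consolidated into a single safetensors file (the input file name here is illustrative):

    python convert.py --in gpt-j-6b.safetensors --out model.safetensors --dtype float16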
merges.txt
ADDED
The diff for this file is too large to render.
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ffac8bf9bf940e422949610ee79efa8f332e5171a5b2dac9705be7f132f6a7e
+size 14923363064
tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
+{"unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "errors": "replace", "model_max_length": 2048, "special_tokens_map_file": null, "name_or_path": "gpt-j-6B", "from_slow": true, "tokenizer_class": "GPT2Tokenizer"}
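
This is GPT-J's stock GPT2-style BPE tokenizer, with <|endoftext|> serving as bos, eos, and unk token alike. A quick sketch of loading and inspecting it (local path is illustrative):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(".")            # reads tokenizer_config.json + tokenizer.json
    print(tok.bos_token, tok.eos_token, tok.unk_token)  # all "<|endoftext|>"
    print(tok.model_max_length)                         # 2048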
vocab.json
ADDED
The diff for this file is too large to render.