Fizzarolli commited on
Commit
c2f6063
·
verified ·
0 Parent(s):

Duplicate from allura-forge/phi-j-6b

Browse files
Files changed (9) hide show
  1. .gitattributes +35 -0
  2. README.md +25 -0
  3. config.json +26 -0
  4. convert.py +83 -0
  5. merges.txt +0 -0
  6. model.safetensors +3 -0
  7. tokenizer.json +0 -0
  8. tokenizer_config.json +1 -0
  9. vocab.json +0 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ datasets:
4
+ - EleutherAI/pile
5
+ language:
6
+ - en
7
+ base_model:
8
+ - EleutherAI/gpt-j-6b
9
+ tags:
10
+ - gptj
11
+ - causal-lm
12
+ ---
13
+
14
+ <div style="background-color: #ff6961; padding: 10px 15px; display: flex; align-items: center; max-width: 100%; box-sizing: border-box; border-radius: 5px;">
15
+ <span style="color: yellow; font-size: 1.6em; margin-right: 10px; line-height: 1;">⚠️</span>
16
+ <span style="color: black; font-family: sans-serif; font-size: 1em;">
17
+ This model is lightly subtly busted in 20 different ways compared to the original. It is mostly designed for further training (that will implicitly heal it from these subtle busts). You have been warned.
18
+ </span>
19
+ </div>
20
+
21
+ This is a conversion of GPT-J-6b by EleutherAI into a more modern architecture that it still closely maps to (in this case, the Phi 1/1.5/2 architecture). This allows for, primarily, rope scaling, as well as for creating GGUFs (it does not currently support GPT-J's original arch.) See `convert.py` for the file used to convert the weights.
22
+
23
+ Note that I was originally going to use the GPT-NeoX architecture because it felt more befitting, but there appears to be [a bug](https://github.com/huggingface/transformers/pull/35610#issuecomment-3111538020) in the most recent versions of Transformers, so Phi it is!
24
+
25
+ Also, the `partial_rotary_factor` is selected to be `0.5` here, despite the fact that this makes no logical sense, as even though it *should* be `0.25` (`rotary_dim / head_dim` = `64 / 256` = `0.25`), `0.25` is completely babblingly incoherent and `0.5` is basically the same as the original. Whatever.
config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "PhiForCausalLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0,
6
+ "bos_token_id": 50256,
7
+ "eos_token_id": 50256,
8
+ "hidden_act": "gelu_new",
9
+ "hidden_dropout_prob": 0,
10
+ "hidden_size": 4096,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 16384,
13
+ "layer_norm_eps": 1e-05,
14
+ "max_position_embeddings": 2048,
15
+ "model_type": "phi",
16
+ "num_attention_heads": 16,
17
+ "num_hidden_layers": 28,
18
+ "rotary_emb_base": 10000,
19
+ "partial_rotary_factor": 0.5,
20
+ "tie_word_embeddings": false,
21
+ "torch_dtype": "float16",
22
+ "transformers_version": "4.19.0.dev0",
23
+ "use_cache": true,
24
+ "vocab_size": 50400
25
+ }
26
+
convert.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # convert.py — convert a GPT-J safetensors checkpoint to the Phi layout
3
+ import argparse, re, torch
4
+ from safetensors.torch import load_file, save_file
5
+
6
def cat_if_exists(keys, state, dim=0):
    """Concatenate ``state[k]`` for every key in *keys* along *dim*.

    Returns the concatenated tensor, or ``None`` as soon as any key is
    absent from *state* (used to probe for optional fused-QKV layouts).
    """
    for key in keys:
        if key not in state:
            return None
    return torch.cat([state[key] for key in keys], dim=dim)
11
+
12
def convert(path_in: str, path_out: str, dtype: torch.dtype):
    """Convert a GPT-J-style safetensors checkpoint to the Phi layout.

    Parameters
    ----------
    path_in : str
        Source ``.safetensors`` file with GPT-J key names
        (``transformer.wte.*``, ``transformer.h.<i>.*``, ``lm_head.*``).
    path_out : str
        Destination ``.safetensors`` file with Phi key names
        (``model.embed_tokens.*``, ``model.layers.<i>.*``).
    dtype : torch.dtype
        Target dtype every tensor is cast to (e.g. ``torch.float16``);
        the CLI passes ``getattr(torch, args.dtype)``.
    """
    src = load_file(path_in)
    tgt = {}

    # --- top-level tensors ---------------------------------------------------
    tgt["model.embed_tokens.weight"] = src["transformer.wte.weight"].to(dtype)
    tgt["model.final_layernorm.weight"] = src["transformer.ln_f.weight"].to(dtype)
    tgt["model.final_layernorm.bias"] = src["transformer.ln_f.bias"].to(dtype)
    tgt["lm_head.weight"] = src["lm_head.weight"].to(dtype)
    tgt["lm_head.bias"] = src["lm_head.bias"].to(dtype)

    # --- per-layer tensors ---------------------------------------------------
    pat = re.compile(r"transformer\.h\.(\d+)\.")
    layer_ids = sorted({int(pat.match(k).group(1)) for k in src if pat.match(k)})

    for i in layer_ids:
        p_old = f"transformer.h.{i}"
        p_new = f"model.layers.{i}"

        # Attention projections. GPT-J checkpoints carry no q/k/v/out biases,
        # but the Phi modelling code expects them, so synthesize zero biases
        # of matching width. Everything is cast to `dtype`: the previous
        # version left q/k/v weights in the source dtype and built the zero
        # biases via torch.tensor([0] * n) (int64), yielding a mixed-dtype
        # checkpoint that a float16 Phi model cannot load cleanly.
        for proj in ("q", "k", "v"):
            w = src[f"{p_old}.attn.{proj}_proj.weight"].to(dtype)
            tgt[f"{p_new}.self_attn.{proj}_proj.weight"] = w
            tgt[f"{p_new}.self_attn.{proj}_proj.bias"] = torch.zeros(
                w.shape[0], dtype=dtype
            )

        dense_w = src[f"{p_old}.attn.out_proj.weight"].to(dtype)
        tgt[f"{p_new}.self_attn.dense.weight"] = dense_w
        tgt[f"{p_new}.self_attn.dense.bias"] = torch.zeros(
            dense_w.shape[0], dtype=dtype
        )

        # Layer norm (GPT-J blocks have a single pre-attention ln_1, which
        # maps onto Phi's input_layernorm).
        tgt[f"{p_new}.input_layernorm.weight"] = src[f"{p_old}.ln_1.weight"].to(dtype)
        tgt[f"{p_new}.input_layernorm.bias"] = src[f"{p_old}.ln_1.bias"].to(dtype)

        # MLP: fc_in/fc_out -> fc1/fc2.
        tgt[f"{p_new}.mlp.fc1.weight"] = src[f"{p_old}.mlp.fc_in.weight"].to(dtype)
        tgt[f"{p_new}.mlp.fc1.bias"] = src[f"{p_old}.mlp.fc_in.bias"].to(dtype)
        tgt[f"{p_new}.mlp.fc2.weight"] = src[f"{p_old}.mlp.fc_out.weight"].to(dtype)
        tgt[f"{p_new}.mlp.fc2.bias"] = src[f"{p_old}.mlp.fc_out.bias"].to(dtype)

    # ------------------------------------------------------------------------
    save_file(tgt, path_out)
    print(f"✓ wrote {len(tgt):,} tensors to {path_out}")
74
+
75
if __name__ == "__main__":
    # CLI entry point: parse source/destination paths plus the target dtype,
    # then hand off to convert(). The dtype string is resolved to the actual
    # torch.dtype object via getattr.
    parser = argparse.ArgumentParser(
        description="convert GPT‑2/3 style safetensors to Phi1.5 layout"
    )
    parser.add_argument("--in", dest="inp", required=True,
                        help="source .safetensors")
    parser.add_argument("--out", dest="outp", required=True,
                        help="destination .safetensors")
    parser.add_argument("--dtype", default="float16",
                        choices=["float16", "bfloat16", "float32"],
                        help="cast parameters to this dtype in the output file")
    cli = parser.parse_args()
    convert(cli.inp, cli.outp, getattr(torch, cli.dtype))
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ffac8bf9bf940e422949610ee79efa8f332e5171a5b2dac9705be7f132f6a7e
3
+ size 14923363064
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "errors": "replace", "model_max_length": 2048, "special_tokens_map_file": null, "name_or_path": "gpt-j-6B", "from_slow": true, "tokenizer_class": "GPT2Tokenizer"}
vocab.json ADDED
The diff for this file is too large to render. See raw diff