kacperwikiel committed on
Commit
4012ebc
·
verified ·
1 Parent(s): 78c54ec

Upload Slayer GPT tokenizer model archive

Browse files
README.md CHANGED
@@ -39,6 +39,56 @@ pip install -r requirements.txt
39
  python scripts/sample_mac.py "Polska jest" 80
40
  ```
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  ## What Is Included
43
 
44
  - `model/ckpt.pt` - runnable nanoGPT-style checkpoint from `/Users/kacper/Local/Ventures/Slayer/gpt2-pl-mac/ckpt.pt`.
 
39
  python scripts/sample_mac.py "Polska jest" 80
40
  ```
41
 
42
+ ## Inference From Hugging Face
43
+
44
+ This is a custom PyTorch checkpoint, so use the included model code instead of `AutoModelForCausalLM`.
45
+
46
+ Option 1: clone the model repo and run the bundled sampler:
47
+
48
+ ```bash
49
+ git lfs install
50
+ git clone https://huggingface.co/SlayerLab/slayer-gpt-tokenizer-model
51
+ cd slayer-gpt-tokenizer-model
52
+ python3 -m venv .venv
53
+ source .venv/bin/activate
54
+ pip install -r requirements.txt
55
+ python scripts/sample_mac.py "Polska jest" 80
56
+ ```
57
+
58
+ Option 2: download only the needed files via `huggingface_hub`:
59
+
60
+ ```bash
61
+ pip install torch tokenizers huggingface-hub
62
+ python examples/inference_from_hf.py "Polska jest" 80
63
+ ```
64
+
65
+ Minimal Python pattern:
66
+
67
+ ```python
68
+ import importlib.util
69
+ import sys
70
+ import torch
71
+ from huggingface_hub import hf_hub_download
72
+ from tokenizers import Tokenizer
73
+
74
+ repo_id = "SlayerLab/slayer-gpt-tokenizer-model"
75
+
76
+ model_py = hf_hub_download(repo_id, "scripts/model.py")
77
+ ckpt_path = hf_hub_download(repo_id, "model/ckpt.pt")
78
+ tok_path = hf_hub_download(repo_id, "tokenizers/polish_bpe_32k.json")
79
+
80
+ spec = importlib.util.spec_from_file_location("slayer_gpt_model", model_py)
81
+ module = importlib.util.module_from_spec(spec)
82
+ sys.modules[spec.name] = module
83
+ spec.loader.exec_module(module)
84
+
85
+ ckpt = torch.load(ckpt_path, map_location="cpu")
86
+ model = module.GPT(module.GPTConfig(**ckpt["model_args"]))
87
+ model.load_state_dict(ckpt["model"])
88
+ model.eval()
89
+ tok = Tokenizer.from_file(tok_path)
90
+ ```
91
+
92
  ## What Is Included
93
 
94
  - `model/ckpt.pt` - runnable nanoGPT-style checkpoint from `/Users/kacper/Local/Ventures/Slayer/gpt2-pl-mac/ckpt.pt`.
examples/inference_from_hf.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Run inference from the Hugging Face model repo without cloning it.
3
+
4
+ Usage:
5
+ pip install torch tokenizers huggingface-hub
6
+ python examples/inference_from_hf.py "Polska jest" 80
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import importlib.util
12
+ import sys
13
+ import time
14
+ from pathlib import Path
15
+
16
+ import torch
17
+ import torch.nn.functional as F
18
+ from huggingface_hub import hf_hub_download
19
+ from tokenizers import Tokenizer
20
+
21
+
22
+ REPO_ID = "SlayerLab/slayer-gpt-tokenizer-model"
23
+ TEMP = 0.7
24
+ TOP_K = 40
25
+ TOP_P = 0.92
26
+ REP_PEN = 1.15
27
+ NGRAM = 3
28
+ EOT = 0
29
+
30
+
31
def load_model_module(path: str):
    """Import the Python source file at ``path`` as module ``slayer_gpt_model``.

    The module object is registered in ``sys.modules`` *before* its code is
    executed, so anything that looks the module up by name while it is being
    executed can find it.

    Args:
        path: Filesystem path of the Python source file to import.

    Returns:
        The executed module object.

    Raises:
        RuntimeError: If no import spec/loader can be built for ``path``.
        Exception: Whatever the module's own top-level code raises; in that
            case ``sys.modules`` is left as it was before the call.
    """
    spec = importlib.util.spec_from_file_location("slayer_gpt_model", path)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Could not load model module from {path}")

    module = importlib.util.module_from_spec(spec)
    previous = sys.modules.get(spec.name)
    sys.modules[spec.name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module importable by name: put back
        # whatever was registered before, or drop the entry entirely.
        if previous is None:
            sys.modules.pop(spec.name, None)
        else:
            sys.modules[spec.name] = previous
        raise
    return module
39
+
40
+
41
def banned_next_tokens(seq: list[int], n: int) -> set[int]:
    """Return the tokens that would repeat an n-gram already present in ``seq``.

    The last ``n - 1`` tokens of ``seq`` form the current context. Every
    earlier position where that same context occurs contributes the token
    that followed it; emitting any of those tokens next would recreate an
    existing n-gram.

    Args:
        seq: Token ids generated so far (prompt included).
        n: Size of the n-gram that must not repeat. ``n == 1`` bans every
            token already in ``seq``; ``n < 1`` disables banning.

    Returns:
        The set of token ids that must not be sampled next. Empty when
        ``seq`` is shorter than ``n - 1`` or when banning is disabled.
    """
    if n < 1:
        return set()

    context_len = n - 1
    if len(seq) < context_len:
        return set()

    # Slice from an explicit start index: ``seq[-0:]`` would be the whole
    # sequence instead of the empty context that n == 1 requires.
    context = seq[len(seq) - context_len:]
    return {
        seq[start + context_len]
        for start in range(len(seq) - context_len)
        if seq[start:start + context_len] == context
    }
50
+
51
+
52
@torch.no_grad()
def generate(model, tokenizer: Tokenizer, prompt: str, max_new_tokens: int, block_size: int, device: str) -> tuple[str, float]:
    """Sample a continuation of ``prompt`` and measure generation speed.

    Each step applies, in order: a repetition penalty (``REP_PEN``) on every
    token already in the sequence, a hard ban on tokens that would repeat an
    ``NGRAM``-gram, temperature scaling (``TEMP``), top-k filtering
    (``TOP_K``) and nucleus filtering (``TOP_P``), then draws one token.
    Generation stops early when ``EOT`` is drawn; that token is not appended.

    Args:
        model: Callable returning a 2-tuple whose first element is the logits
            tensor; the second element is ignored.
        tokenizer: Tokenizer used to encode the prompt and decode the result.
        prompt: Text to continue.
        max_new_tokens: Upper bound on the number of sampling steps.
        block_size: Number of most recent tokens fed to the model per step.
        device: Torch device string on which the token tensor is created.

    Returns:
        ``(text, tokens_per_second)`` where ``text`` is the decoded prompt
        plus continuation and the rate counts every sampling step, including
        one that drew ``EOT``.

    Raises:
        ValueError: If ``prompt`` encodes to zero tokens, since the model
            would have nothing to condition on.
    """
    prompt_ids = tokenizer.encode(prompt).ids
    if not prompt_ids:
        raise ValueError("prompt must encode to at least one token")

    # Leading [None] adds the batch dimension: shape (1, prompt_len).
    idx = torch.tensor(prompt_ids, dtype=torch.long, device=device)[None]
    start = time.time()
    generated = 0

    for _ in range(max_new_tokens):
        # Only the trailing window is fed to the model.
        logits, _unused = model(idx[:, -block_size:])
        logits = logits[:, -1, :].float()

        # One device-to-host copy per step, shared by both penalties below.
        history = idx[0].tolist()

        # Repetition penalty: shrink positive logits, push negative ones
        # further down, for every token seen so far.
        for token_id in set(history):
            logits[0, token_id] /= REP_PEN if logits[0, token_id] > 0 else 1 / REP_PEN

        for token_id in banned_next_tokens(history, NGRAM):
            logits[0, token_id] = -float("inf")

        logits /= TEMP

        # Top-k: clamp k so a vocabulary smaller than TOP_K does not make
        # torch.topk raise.
        k = min(TOP_K, logits.size(-1))
        kth = torch.topk(logits, k)[0][..., -1, None]
        logits[logits < kth] = -float("inf")

        # Nucleus (top-p): drop the tail once cumulative probability exceeds
        # TOP_P. The mask is shifted right by one so the token that crosses
        # the threshold is kept, and the best token is never removed.
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        remove = cumulative > TOP_P
        remove[..., 1:] = remove[..., :-1].clone()
        remove[..., 0] = False
        logits[0, sorted_indices[0][remove[0]]] = -float("inf")

        next_id = torch.multinomial(F.softmax(logits, dim=-1), 1)
        generated += 1
        if next_id.item() == EOT:
            break
        idx = torch.cat([idx, next_id], dim=1)

    # Guard against a zero elapsed time on very short runs.
    tokens_per_second = generated / max(time.time() - start, 1e-6)
    return tokenizer.decode(idx[0].tolist()), tokens_per_second
87
+
88
+
89
def main() -> None:
    """Download the model files from the Hub, generate text and print it.

    Command line: ``[prompt] [max_new_tokens]``; defaults are
    ``"Polska jest"`` and ``80``. The device is chosen in the order
    MPS, CUDA, CPU.
    """
    cli_args = sys.argv[1:]
    prompt = cli_args[0] if cli_args else "Polska jest"
    max_new_tokens = int(cli_args[1]) if len(cli_args) > 1 else 80

    if torch.backends.mps.is_available():
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    # Fetch only the three files needed for inference.
    model_py = hf_hub_download(REPO_ID, "scripts/model.py")
    ckpt_path = hf_hub_download(REPO_ID, "model/ckpt.pt")
    tokenizer_path = hf_hub_download(REPO_ID, "tokenizers/polish_bpe_32k.json")

    model_module = load_model_module(model_py)

    # NOTE(review): torch.load unpickles a downloaded file; only use this
    # with a repository you trust.
    ckpt = torch.load(ckpt_path, map_location="cpu")
    model_args = ckpt["model_args"]
    model = model_module.GPT(model_module.GPTConfig(**model_args))

    # Strip the "_orig_mod." key prefix (presumably left by a compiled-model
    # wrapper at save time) so the keys match the plain model.
    state_dict = ckpt["model"]
    wrapper_prefix = "_orig_mod."
    for name in [k for k in state_dict if k.startswith(wrapper_prefix)]:
        state_dict[name[len(wrapper_prefix):]] = state_dict.pop(name)

    model.load_state_dict(state_dict)
    model.eval()
    model.to(device)
    tokenizer = Tokenizer.from_file(tokenizer_path)

    text, tps = generate(
        model,
        tokenizer,
        prompt,
        max_new_tokens,
        model_args["block_size"],
        device,
    )
    print(f"[repo={REPO_ID} device={device} {tps:.1f} tok/s]\n")
    print(text)
120
+
121
+
122
+ if __name__ == "__main__":
123
+ main()
124
+
metadata/loss_train.csv CHANGED
@@ -1,161 +1,3 @@
1
- 0,10.5432
2
- 10,8.9096
3
- 20,8.3818
4
- 30,7.7284
5
- 40,7.3585
6
- 50,7.2123
7
- 60,6.8500
8
- 70,6.7956
9
- 80,6.4644
10
- 90,6.4187
11
- 100,6.4978
12
- 110,6.2566
13
- 120,6.3528
14
- 130,6.0993
15
- 140,6.0455
16
- 150,6.0754
17
- 160,5.8164
18
- 170,5.8069
19
- 180,5.7759
20
- 190,5.6976
21
- 200,5.6238
22
- 210,5.6289
23
- 220,5.5407
24
- 230,5.4001
25
- 240,5.4523
26
- 250,5.4548
27
- 260,5.2646
28
- 270,5.2490
29
- 280,5.2464
30
- 290,5.2203
31
- 300,5.2502
32
- 310,5.1590
33
- 320,5.0595
34
- 330,5.1221
35
- 340,5.0980
36
- 350,4.9837
37
- 360,4.9870
38
- 370,4.8676
39
- 380,5.0423
40
- 390,4.8983
41
- 400,4.8116
42
- 410,4.7852
43
- 420,4.7880
44
- 430,4.7554
45
- 440,4.7762
46
- 450,4.7746
47
- 460,4.8073
48
- 470,4.5162
49
- 480,4.5992
50
- 490,4.6830
51
- 500,4.6345
52
- 510,4.3883
53
- 520,4.6188
54
- 530,4.4315
55
- 540,4.4713
56
- 550,4.4083
57
- 560,4.3543
58
- 570,4.3069
59
- 580,4.2223
60
- 590,4.3264
61
- 600,4.3473
62
- 610,4.1376
63
- 620,4.2780
64
- 630,4.2489
65
- 640,4.1217
66
- 650,4.1767
67
- 660,4.0496
68
- 670,4.0011
69
- 680,4.0010
70
- 690,4.0702
71
- 700,4.0163
72
- 710,4.0544
73
- 720,4.1402
74
- 730,4.0240
75
- 740,4.1338
76
- 750,4.0968
77
- 760,3.9717
78
- 770,3.8710
79
- 780,3.9123
80
- 790,3.9936
81
- 800,3.9854
82
- 810,3.9391
83
- 820,3.8748
84
- 830,3.9396
85
- 840,4.0900
86
- 850,3.9185
87
- 860,3.9237
88
- 870,3.9972
89
- 880,3.8443
90
- 890,3.8706
91
- 900,3.9335
92
- 910,3.8034
93
- 920,3.8431
94
- 930,3.8501
95
- 940,3.9286
96
- 950,3.8670
97
- 960,3.8986
98
- 970,3.6916
99
- 980,3.7584
100
- 990,3.7107
101
- 1000,3.5749
102
- 1010,3.7844
103
- 1020,3.8467
104
- 1030,3.6829
105
- 1040,3.7354
106
- 1050,3.9265
107
- 1060,3.7477
108
- 1070,3.6859
109
- 1080,3.7451
110
- 1090,3.8840
111
- 1100,3.7716
112
- 1110,3.6441
113
- 1120,3.7806
114
- 1130,3.6817
115
- 1140,3.7985
116
- 1150,3.7247
117
- 1160,3.7286
118
- 1170,3.7495
119
- 1180,3.7451
120
- 1190,3.7496
121
- 1200,3.7041
122
- 1210,3.7436
123
- 1220,3.5851
124
- 1230,3.6694
125
- 1240,3.5732
126
- 1250,3.7169
127
- 1260,3.7615
128
- 1270,3.7332
129
- 1280,3.6454
130
- 1290,3.7745
131
- 1300,3.5835
132
- 1310,3.6660
133
- 1320,3.7584
134
- 1330,3.6219
135
- 1340,3.6977
136
- 1350,3.5445
137
- 1360,3.6224
138
- 1370,3.6865
139
- 1380,3.6163
140
- 1390,3.8143
141
- 1400,3.6447
142
- 1410,3.6732
143
- 1420,3.5276
144
- 1430,3.6848
145
- 1440,3.7317
146
- 1450,3.7915
147
- 1460,3.6741
148
- 1470,3.6490
149
- 1480,3.6448
150
- 1490,3.5571
151
- 1500,3.6427
152
- 1510,3.7507
153
- 1520,3.6749
154
- 1530,3.7123
155
- 1540,3.7059
156
- 1550,3.5544
157
- 1560,3.6306
158
- 1570,3.7105
159
- 1580,3.7773
160
- 1590,3.7557
161
- 1600,3.6184
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12d715d0539c84fdd11f8bfa468da7f31227a29820ce1e8af26b22a5511c081f
3
+ size 1822
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
metadata/traj.csv CHANGED
@@ -1,8 +1,3 @@
1
- iter,gram,know
2
- 1000,90.9,28.4
3
- 1100,90.9,32.6
4
- 1200,90.9,25.3
5
- 1300,90.9,26.3
6
- 1400,90.9,30.5
7
- 1500,90.9,28.4
8
- 1600,90.9,27.4
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43663d07e0c15205490602554477168c2ca65f8bd232d8ad536707e4c4eb4631
3
+ size 120
 
 
 
 
 
tokenizers/polish_bpe_32k.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizers/rxlm_polish_bpe_65k.json CHANGED
The diff for this file is too large to render. See raw diff