Upload Slayer GPT tokenizer model archive

Browse files

Files changed (6) hide show

README.md +50 -0
examples/inference_from_hf.py +124 -0
metadata/loss_train.csv +3 -161
metadata/traj.csv +3 -8
tokenizers/polish_bpe_32k.json +0 -0
tokenizers/rxlm_polish_bpe_65k.json +0 -0

README.md CHANGED Viewed

@@ -39,6 +39,56 @@ pip install -r requirements.txt
 python scripts/sample_mac.py "Polska jest" 80
 ```
 ## What Is Included
 - `model/ckpt.pt` - runnable nanoGPT-style checkpoint from `/Users/kacper/Local/Ventures/Slayer/gpt2-pl-mac/ckpt.pt`.

 python scripts/sample_mac.py "Polska jest" 80
 ```
+## Inference From Hugging Face
+This is a custom PyTorch checkpoint, so use the included model code instead of `AutoModelForCausalLM`.
+Option 1: clone the model repo and run the bundled sampler:
+```bash
+git lfs install
+git clone https://huggingface.co/SlayerLab/slayer-gpt-tokenizer-model
+cd slayer-gpt-tokenizer-model
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+python scripts/sample_mac.py "Polska jest" 80
+```
+Option 2: download only the needed files via `huggingface_hub`:
+```bash
+pip install torch tokenizers huggingface-hub
+python examples/inference_from_hf.py "Polska jest" 80
+```
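+Minimal Python pattern (loading only; `examples/inference_from_hf.py` adds the sampling loop and strips any `_orig_mod.` prefix from the checkpoint keys before `load_state_dict`):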
+```python
+import importlib.util
+import sys
+import torch
+from huggingface_hub import hf_hub_download
+from tokenizers import Tokenizer
+repo_id = "SlayerLab/slayer-gpt-tokenizer-model"
+model_py = hf_hub_download(repo_id, "scripts/model.py")
+ckpt_path = hf_hub_download(repo_id, "model/ckpt.pt")
+tok_path = hf_hub_download(repo_id, "tokenizers/polish_bpe_32k.json")
+spec = importlib.util.spec_from_file_location("slayer_gpt_model", model_py)
+module = importlib.util.module_from_spec(spec)
+sys.modules[spec.name] = module
+spec.loader.exec_module(module)
+ckpt = torch.load(ckpt_path, map_location="cpu")
+model = module.GPT(module.GPTConfig(**ckpt["model_args"]))
+model.load_state_dict(ckpt["model"])
+model.eval()
+tok = Tokenizer.from_file(tok_path)
+```
 ## What Is Included
 - `model/ckpt.pt` - runnable nanoGPT-style checkpoint from `/Users/kacper/Local/Ventures/Slayer/gpt2-pl-mac/ckpt.pt`.

examples/inference_from_hf.py ADDED Viewed

	@@ -0,0 +1,124 @@

+#!/usr/bin/env python3
+"""Run inference from the Hugging Face model repo without cloning it.
+Usage:
+    pip install torch tokenizers huggingface-hub
+    python examples/inference_from_hf.py "Polska jest" 80
+"""
+from __future__ import annotations
+import importlib.util
+import sys
+import time
+from pathlib import Path
+import torch
+import torch.nn.functional as F
+from huggingface_hub import hf_hub_download
+from tokenizers import Tokenizer
+# Hugging Face repo from which main() downloads the model code, checkpoint and tokenizer.
+REPO_ID = "SlayerLab/slayer-gpt-tokenizer-model"
+# Sampling settings read by generate():
+# temperature - the last-step logits are divided by this value before filtering.
+TEMP = 0.7
+# top-k - only the TOP_K largest logits stay finite at each step.
+TOP_K = 40
+# top-p - cumulative-probability threshold applied to the sorted candidates.
+TOP_P = 0.92
+# repetition penalty applied to the logits of tokens already present in the sequence.
+REP_PEN = 1.15
+# n-gram size passed to banned_next_tokens() to forbid repeating an existing n-gram.
+NGRAM = 3
+# token id that ends generation when sampled (presumably the tokenizer's end-of-text id - confirm).
+EOT = 0
+def load_model_module(path: str):
+    """Import the Python file at ``path`` as the module ``slayer_gpt_model``.
+    The module is registered in ``sys.modules`` before its body runs. If the
+    body raises, the registration is undone (a previously registered module of
+    the same name is put back) so a broken file leaves no half-initialised
+    module behind.
+    Args:
+        path: Filesystem path of the model source file.
+    Returns:
+        The executed module object.
+    Raises:
+        RuntimeError: If no import spec or loader can be created for ``path``.
+        Exception: Whatever the module body itself raises is propagated unchanged.
+    """
+    spec = importlib.util.spec_from_file_location("slayer_gpt_model", path)
+    if spec is None or spec.loader is None:
+        raise RuntimeError(f"Could not load model module from {path}")
+    module = importlib.util.module_from_spec(spec)
+    missing = object()
+    previous = sys.modules.get(spec.name, missing)
+    # Register before executing so the module can be looked up by name while its body runs.
+    sys.modules[spec.name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Roll back the registration; the exception itself is re-raised untouched.
+        if previous is missing:
+            sys.modules.pop(spec.name, None)
+        else:
+            sys.modules[spec.name] = previous
+        raise
+    return module
+def banned_next_tokens(seq: list[int], n: int) -> set[int]:
+    """Return the tokens that would repeat an n-gram already present in ``seq``.
+    The last ``n - 1`` tokens of ``seq`` are taken as a prefix; every token that
+    followed an earlier occurrence of that prefix is returned.
+    Args:
+        seq: Token ids produced so far.
+        n: N-gram size. ``n == 1`` bans every token already in ``seq``;
+            ``n <= 0`` disables the check.
+    Returns:
+        The set of token ids to exclude at the next step (possibly empty).
+    """
+    if n <= 0:
+        return set()
+    if n == 1:
+        # The prefix is empty, so any token seen before would repeat a 1-gram.
+        # (Slicing with seq[-0:] would wrongly yield the whole sequence.)
+        return set(seq)
+    width = n - 1
+    if len(seq) < width:
+        return set()
+    prefix = seq[-width:]
+    # i + width is the position right after the candidate window; the final window
+    # (the prefix itself) has no follower, hence the range stops at len(seq) - width.
+    return {seq[i + width] for i in range(len(seq) - width) if seq[i:i + width] == prefix}
+@torch.no_grad()
+def generate(model, tokenizer: Tokenizer, prompt: str, max_new_tokens: int, block_size: int, device: str) -> tuple[str, float]:
+    """Sample a continuation of ``prompt`` and report the generation speed.
+    Each step applies, in order: the repetition penalty (REP_PEN), the
+    no-repeat n-gram ban (NGRAM), temperature (TEMP), top-k (TOP_K) and top-p
+    (TOP_P) filtering, then draws one token. Generation stops early when EOT
+    is drawn; the EOT token is not appended to the output.
+    Args:
+        model: Callable returning ``(logits, _)`` for a token tensor. Assumed to
+            be the repo's GPT with logits shaped (1, time, vocab) - confirm
+            against scripts/model.py.
+        tokenizer: Tokenizer used to encode the prompt and decode the result.
+        prompt: Text to continue; must encode to at least one token.
+        max_new_tokens: Upper bound on the number of sampling steps.
+        block_size: Number of trailing tokens passed to the model at each step.
+        device: Torch device string on which the token tensor is created.
+    Returns:
+        ``(text, tokens_per_second)``: the decoded prompt plus continuation, and
+        the number of sampling steps divided by the elapsed wall-clock time.
+    Raises:
+        ValueError: If ``prompt`` encodes to no tokens.
+    """
+    prompt_ids = tokenizer.encode(prompt).ids
+    if not prompt_ids:
+        # An empty context gives the model nothing to condition on and fails deep inside it.
+        raise ValueError("Prompt must encode to at least one token")
+    idx = torch.tensor(prompt_ids, dtype=torch.long, device=device)[None]
+    start = time.time()
+    generated = 0
+    for _ in range(max_new_tokens):
+        logits, _ = model(idx[:, -block_size:])
+        logits = logits[:, -1, :].float()
+        # Repetition penalty, applied to all previously seen tokens in one tensor op:
+        # positive logits are divided by REP_PEN, the others by 1 / REP_PEN.
+        seen = torch.unique(idx[0])
+        seen_logits = logits[0, seen]
+        logits[0, seen] = torch.where(seen_logits > 0, seen_logits / REP_PEN, seen_logits / (1 / REP_PEN))
+        banned = banned_next_tokens(idx[0].tolist(), NGRAM)
+        if banned:
+            logits[0, list(banned)] = -float("inf")
+        logits /= TEMP
+        # Clamp k so a vocabulary smaller than TOP_K does not make torch.topk raise.
+        k = min(TOP_K, logits.size(-1))
+        kth = torch.topk(logits, k)[0][..., -1, None]
+        logits[logits < kth] = -float("inf")
+        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+        cumulative = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+        remove = cumulative > TOP_P
+        # Shift the mask right by one so the token that crosses TOP_P is kept,
+        # and never remove the most likely token.
+        remove[..., 1:] = remove[..., :-1].clone()
+        remove[..., 0] = False
+        logits[0, sorted_indices[0][remove[0]]] = -float("inf")
+        next_id = torch.multinomial(F.softmax(logits, dim=-1), 1)
+        generated += 1
+        if next_id.item() == EOT:
+            break
+        idx = torch.cat([idx, next_id], dim=1)
+    # Guard against a zero elapsed time when nothing (or very little) was generated.
+    tokens_per_second = generated / max(time.time() - start, 1e-6)
+    return tokenizer.decode(idx[0].tolist()), tokens_per_second
+def main() -> None:
+    """Download the model files from REPO_ID, load the checkpoint and print one sample.
+    Command line: ``[prompt] [max_new_tokens]``. The prompt defaults to
+    "Polska jest" and the token budget to 80. The device is chosen in the order
+    mps, cuda, cpu.
+    Raises:
+        SystemExit: If the second argument is not an integer.
+    """
+    prompt = sys.argv[1] if len(sys.argv) > 1 else "Polska jest"
+    try:
+        max_new_tokens = int(sys.argv[2]) if len(sys.argv) > 2 else 80
+    except ValueError:
+        raise SystemExit(f"max_new_tokens must be an integer, got {sys.argv[2]!r}") from None
+    # Look the MPS backend up defensively in case this PyTorch build does not expose it.
+    mps_backend = getattr(torch.backends, "mps", None)
+    if mps_backend is not None and mps_backend.is_available():
+        device = "mps"
+    elif torch.cuda.is_available():
+        device = "cuda"
+    else:
+        device = "cpu"
+    model_py = hf_hub_download(REPO_ID, "scripts/model.py")
+    ckpt_path = hf_hub_download(REPO_ID, "model/ckpt.pt")
+    tokenizer_path = hf_hub_download(REPO_ID, "tokenizers/polish_bpe_32k.json")
+    model_module = load_model_module(model_py)
+    # NOTE(review): torch.load unpickles the file and load_model_module executes downloaded
+    # code, so only run this against a repo you trust. Consider weights_only=True if the
+    # checkpoint is confirmed to hold only tensors and plain containers.
+    ckpt = torch.load(ckpt_path, map_location="cpu")
+    model = model_module.GPT(model_module.GPTConfig(**ckpt["model_args"]))
+    # Drop the "_orig_mod." key prefix (presumably left by torch.compile - confirm) without
+    # mutating the loaded checkpoint dict.
+    prefix = "_orig_mod."
+    state_dict = {
+        (key[len(prefix):] if key.startswith(prefix) else key): value
+        for key, value in ckpt["model"].items()
+    }
+    model.load_state_dict(state_dict)
+    model.eval().to(device)
+    tokenizer = Tokenizer.from_file(tokenizer_path)
+    text, tps = generate(
+        model,
+        tokenizer,
+        prompt,
+        max_new_tokens,
+        ckpt["model_args"]["block_size"],
+        device,
+    )
+    print(f"[repo={REPO_ID} device={device} {tps:.1f} tok/s]\n")
+    print(text)
+if __name__ == "__main__":
+    main()

metadata/loss_train.csv CHANGED Viewed

@@ -1,161 +1,3 @@
-0,10.5432
-10,8.9096
-20,8.3818
-30,7.7284
-40,7.3585
-50,7.2123
-60,6.8500
-70,6.7956
-80,6.4644
-90,6.4187
-100,6.4978
-110,6.2566
-120,6.3528
-130,6.0993
-140,6.0455
-150,6.0754
-160,5.8164
-170,5.8069
-180,5.7759
-190,5.6976
-200,5.6238
-210,5.6289
-220,5.5407
-230,5.4001
-240,5.4523
-250,5.4548
-260,5.2646
-270,5.2490
-280,5.2464
-290,5.2203
-300,5.2502
-310,5.1590
-320,5.0595
-330,5.1221
-340,5.0980
-350,4.9837
-360,4.9870
-370,4.8676
-380,5.0423
-390,4.8983
-400,4.8116
-410,4.7852
-420,4.7880
-430,4.7554
-440,4.7762
-450,4.7746
-460,4.8073
-470,4.5162
-480,4.5992
-490,4.6830
-500,4.6345
-510,4.3883
-520,4.6188
-530,4.4315
-540,4.4713
-550,4.4083
-560,4.3543
-570,4.3069
-580,4.2223
-590,4.3264
-600,4.3473
-610,4.1376
-620,4.2780
-630,4.2489
-640,4.1217
-650,4.1767
-660,4.0496
-670,4.0011
-680,4.0010
-690,4.0702
-700,4.0163
-710,4.0544
-720,4.1402
-730,4.0240
-740,4.1338
-750,4.0968
-760,3.9717
-770,3.8710
-780,3.9123
-790,3.9936
-800,3.9854
-810,3.9391
-820,3.8748
-830,3.9396
-840,4.0900
-850,3.9185
-860,3.9237
-870,3.9972
-880,3.8443
-890,3.8706
-900,3.9335
-910,3.8034
-920,3.8431
-930,3.8501
-940,3.9286
-950,3.8670
-960,3.8986
-970,3.6916
-980,3.7584
-990,3.7107
-1000,3.5749
-1010,3.7844
-1020,3.8467
-1030,3.6829
-1040,3.7354
-1050,3.9265
-1060,3.7477
-1070,3.6859
-1080,3.7451
-1090,3.8840
-1100,3.7716
-1110,3.6441
-1120,3.7806
-1130,3.6817
-1140,3.7985
-1150,3.7247
-1160,3.7286
-1170,3.7495
-1180,3.7451
-1190,3.7496
-1200,3.7041
-1210,3.7436
-1220,3.5851
-1230,3.6694
-1240,3.5732
-1250,3.7169
-1260,3.7615
-1270,3.7332
-1280,3.6454
-1290,3.7745
-1300,3.5835
-1310,3.6660
-1320,3.7584
-1330,3.6219
-1340,3.6977
-1350,3.5445
-1360,3.6224
-1370,3.6865
-1380,3.6163
-1390,3.8143
-1400,3.6447
-1410,3.6732
-1420,3.5276
-1430,3.6848
-1440,3.7317
-1450,3.7915
-1460,3.6741
-1470,3.6490
-1480,3.6448
-1490,3.5571
-1500,3.6427
-1510,3.7507
-1520,3.6749
-1530,3.7123
-1540,3.7059
-1550,3.5544
-1560,3.6306
-1570,3.7105
-1580,3.7773
-1590,3.7557
-1600,3.6184

+version https://git-lfs.github.com/spec/v1
+oid sha256:12d715d0539c84fdd11f8bfa468da7f31227a29820ce1e8af26b22a5511c081f
+size 1822

metadata/traj.csv CHANGED Viewed

@@ -1,8 +1,3 @@
-iter,gram,know
-1000,90.9,28.4
-1100,90.9,32.6
-1200,90.9,25.3
-1300,90.9,26.3
-1400,90.9,30.5
-1500,90.9,28.4
-1600,90.9,27.4

+version https://git-lfs.github.com/spec/v1
+oid sha256:43663d07e0c15205490602554477168c2ca65f8bd232d8ad536707e4c4eb4631
+size 120

tokenizers/polish_bpe_32k.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

tokenizers/rxlm_polish_bpe_65k.json CHANGED Viewed

The diff for this file is too large to render. See raw diff