init bpess.py
Browse files
bpess.py
CHANGED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compact Chess BPE Tokenizer: Train, Upload, Load, Inference"""
|
| 2 |
+
import os, json, rustbpe, tiktoken
|
| 3 |
+
from datasets import load_dataset
|
| 4 |
+
from huggingface_hub import HfApi, create_repo, upload_folder, hf_hub_download
|
| 5 |
+
|
| 6 |
+
REPO_ID = "ItsMaxNorm/bpess"
|
| 7 |
+
|
| 8 |
+
def train(vocab_size=4096, split="train[0:10000]"):
    """Train a BPE tokenizer on chess move sequences.

    Args:
        vocab_size: target size of the BPE merge table.
        split: HF ``datasets`` split spec selecting which games to train on.

    Returns:
        A trained ``rustbpe.Tokenizer``.
    """
    games = load_dataset('angeluriot/chess_games', split=split)
    # One training document per game: its moves joined into a single string.
    corpus = (
        ' '.join(game['moves_custom'])
        for game in games
        if game['moves_custom']
    )
    trainer = rustbpe.Tokenizer()
    trainer.train_from_iterator(corpus, vocab_size)
    return trainer
|
| 14 |
+
|
| 15 |
+
def save(tok, path="./tokenizer"):
    """Save tokenizer files (``vocab.json``, ``config.json``) under *path*.

    Args:
        tok: trained tokenizer exposing ``get_mergeable_ranks()``,
            ``get_pattern()`` and ``vocab_size``.
        path: output directory; created if it does not exist.

    Returns:
        The output directory path (same object as *path*).
    """
    os.makedirs(path, exist_ok=True)
    ranks = tok.get_mergeable_ranks()
    # NOTE(review): decoding raw merge bytes with errors='replace' is lossy —
    # byte sequences that are not valid UTF-8 collapse onto the replacement
    # character, so distinct ranks could collide. Confirm every rank in this
    # vocab round-trips through UTF-8.
    vocab = {bytes(k).decode('utf-8', errors='replace'): v for k, v in ranks}
    # Context managers ensure the files are flushed and closed even on error;
    # the original passed bare open() handles to json.dump and leaked them.
    with open(f"{path}/vocab.json", 'w') as f:
        json.dump(vocab, f, indent=2)
    with open(f"{path}/config.json", 'w') as f:
        json.dump({"pattern": tok.get_pattern(), "vocab_size": tok.vocab_size}, f)
    return path
|
| 24 |
+
|
| 25 |
+
def upload(tok, repo_id=REPO_ID, private=False):
    """Save *tok* locally and upload its files to the HuggingFace Hub.

    Args:
        tok: trained tokenizer (see :func:`save`).
        repo_id: target Hub repository id.
        private: create the repo as private if it does not exist yet.
    """
    path = save(tok)
    # exist_ok=True tolerates only "repo already exists". The original wrapped
    # this call in a bare `except: pass`, which also silently swallowed auth
    # and network failures — those now surface to the caller.
    create_repo(repo_id, private=private, exist_ok=True)
    HfApi().upload_folder(folder_path=path, repo_id=repo_id)
    print(f"Uploaded: https://huggingface.co/{repo_id}")
|
| 32 |
+
|
| 33 |
+
def load_tiktoken(repo_id=REPO_ID):
    """Download tokenizer files from the Hub and build a tiktoken Encoding.

    Args:
        repo_id: Hub repository holding ``config.json`` and ``vocab.json``.

    Returns:
        A ``tiktoken.Encoding`` named ``"chess"`` with no special tokens.
    """
    # Context managers close the downloaded files promptly; the original
    # relied on the GC to reclaim the bare open() handles.
    with open(hf_hub_download(repo_id, "config.json")) as f:
        config = json.load(f)
    with open(hf_hub_download(repo_id, "vocab.json")) as f:
        vocab = json.load(f)
    # NOTE(review): keys were written with errors='replace' on save, so any
    # rank whose bytes were not valid UTF-8 cannot be reconstructed exactly —
    # confirm the vocab round-trips (see save()).
    mergeable_ranks = {
        k.encode('utf-8', errors='replace'): v for k, v in vocab.items()
    }
    return tiktoken.Encoding(
        name="chess",
        pat_str=config["pattern"],
        mergeable_ranks=mergeable_ranks,
        special_tokens={},
    )
|
| 42 |
+
|
| 43 |
+
if __name__ == "__main__":
    # Train on a sample of games and push the result to the Hub.
    tokenizer = train(vocab_size=4096, split="train[0:10000]")
    print(f"Trained: {tokenizer.vocab_size} tokens")
    upload(tokenizer, REPO_ID)

    # Round-trip check: reload via tiktoken, then encode/decode a sample line.
    enc = load_tiktoken(REPO_ID)
    test = "w.♘g1♘f3.. b.♟c7♟c5.. w.♙d2♙d4.."
    ids = enc.encode(test)
    print(f"Encoded: {ids[:10]}... ({len(ids)} tokens)")
    print(f"Decoded: {enc.decode(ids)}")
|