ItsMaxNorm committed
Commit a2b525b · verified
1 Parent(s): 9dca0cf

init bpess.py

Files changed (1)
  1. bpess.py +54 -0
bpess.py CHANGED
@@ -0,0 +1,54 @@
+"""Compact Chess BPE Tokenizer: Train, Upload, Load, Inference"""
+import os, json, rustbpe, tiktoken
+from datasets import load_dataset
+from huggingface_hub import HfApi, create_repo, upload_folder, hf_hub_download
+
+REPO_ID = "ItsMaxNorm/bpess"
+
+def train(vocab_size=4096, split="train[0:10000]"):
+    """Train BPE tokenizer on chess moves."""
+    ds = load_dataset('angeluriot/chess_games', split=split)
+    tok = rustbpe.Tokenizer()
+    tok.train_from_iterator((' '.join(g['moves_custom']) for g in ds if g['moves_custom']), vocab_size)
+    return tok
+
+def save(tok, path="./tokenizer"):
+    """Save tokenizer files locally."""
+    os.makedirs(path, exist_ok=True)
+    ranks = tok.get_mergeable_ranks()
+    json.dump({bytes(k).decode('utf-8', errors='replace'): v for k, v in ranks},
+              open(f"{path}/vocab.json", 'w'), indent=2)
+    json.dump({"pattern": tok.get_pattern(), "vocab_size": tok.vocab_size},
+              open(f"{path}/config.json", 'w'))
+    return path
+
+def upload(tok, repo_id=REPO_ID, private=False):
+    """Upload tokenizer to HuggingFace Hub."""
+    path = save(tok)
+    try: create_repo(repo_id, private=private)
+    except Exception: pass  # repo may already exist
+    HfApi().upload_folder(folder_path=path, repo_id=repo_id)
+    print(f"Uploaded: https://huggingface.co/{repo_id}")
+
+def load_tiktoken(repo_id=REPO_ID):
+    """Load tokenizer from HuggingFace as tiktoken Encoding."""
+    config = json.load(open(hf_hub_download(repo_id, "config.json")))
+    vocab = json.load(open(hf_hub_download(repo_id, "vocab.json")))
+    return tiktoken.Encoding(
+        name="chess", pat_str=config["pattern"],
+        mergeable_ranks={k.encode('utf-8', errors='replace'): v for k, v in vocab.items()},
+        special_tokens={}
+    )
+
+if __name__ == "__main__":
+    # Train & Upload
+    tok = train(vocab_size=4096, split="train[0:10000]")
+    print(f"Trained: {tok.vocab_size} tokens")
+    upload(tok, REPO_ID)
+
+    # Load & Inference
+    enc = load_tiktoken(REPO_ID)
+    test = "w.♘g1♘f3.. b.♟c7♟c5.. w.♙d2♙d4.."
+    ids = enc.encode(test)
+    print(f"Encoded: {ids[:10]}... ({len(ids)} tokens)")
+    print(f"Decoded: {enc.decode(ids)}")