Spaces:
No application file
No application file
File size: 904 Bytes
342ab6d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
import pickle
import json
import sys
import string
class SimpleTokenizer:
    """Minimal tokenizer wrapper holding a token -> id vocabulary dict."""

    def __init__(self, vocab=None):
        """Initialize with *vocab* (dict mapping token -> id) or an empty dict.

        Uses an explicit ``is None`` test instead of ``vocab or {}`` so a
        caller-supplied *empty* dict is kept (and stays aliased to the
        caller's object) rather than being silently replaced by a new one.
        """
        self.vocab = {} if vocab is None else vocab
def is_clean_token(t):
    """Return True when *t* is a printable str containing no NUL or U+FFFD."""
    if not isinstance(t, str):
        return False
    forbidden = "\u0000\uFFFD"
    return t.isprintable() and all(ch not in t for ch in forbidden)
# Script body: load a pickled tokenizer, keep only clean string tokens,
# and dump the filtered vocabulary to tokenizer_vocab.json.
# NOTE(review): pickle.load executes arbitrary code embedded in the file —
# only run this against a tokenizer.pkl from a trusted source.
try:
    with open("tokenizer.pkl", "rb") as f:
        tokenizer_obj = pickle.load(f)
    # The pickle may contain either a tokenizer object exposing .vocab
    # or a bare vocab dict itself; handle both.
    vocab = tokenizer_obj.vocab if hasattr(tokenizer_obj, "vocab") else tokenizer_obj
    # Keep only printable str keys free of NUL / U+FFFD (see is_clean_token).
    clean_vocab = {
        k: v for k, v in vocab.items()
        if is_clean_token(k)
    }
    # ensure_ascii=True escapes non-ASCII tokens as \uXXXX in the JSON output.
    with open("tokenizer_vocab.json", "w", encoding="utf-8") as f:
        json.dump(clean_vocab, f, indent=2, ensure_ascii=True)
    # NOTE(review): the leading "β" in these messages looks like a
    # mojibake'd status emoji (e.g. a check mark) — confirm intended text.
    print("β Clean vocab extracted")
    print(f"β Original size: {len(vocab)}")
    print(f"β Clean size: {len(clean_vocab)}")
except Exception as e:
    # Top-level boundary: report any failure and exit non-zero.
    print(f"β Error: {e}")
    sys.exit(1)
|