movie_nerd / extract_tokenizer.py
Soumalya Das
Upload 4 files
342ab6d verified
raw
history blame contribute delete
904 Bytes
import pickle
import json
import sys
import string
class SimpleTokenizer:
    """Minimal tokenizer shell that only carries a token→id vocab mapping.

    Serves as a stand-in class so a pickled tokenizer object can be
    unpickled / inspected without the original library.
    """

    def __init__(self, vocab=None):
        # A falsy vocab (None or an empty mapping) falls back to a fresh dict;
        # a truthy mapping is stored as-is (no copy), matching `vocab or {}`.
        if vocab:
            self.vocab = vocab
        else:
            self.vocab = {}
def is_clean_token(t):
    """Return True if *t* is a printable str free of NUL and U+FFFD.

    Non-string values, strings with unprintable characters (newlines,
    control chars, ...), and strings containing the NUL or Unicode
    replacement character are all rejected.
    """
    if not isinstance(t, str):
        return False
    if not t.isprintable():
        return False
    return "\u0000" not in t and "\uFFFD" not in t
# Extract a JSON-serializable vocab from a pickled tokenizer, dropping any
# tokens with unprintable / corrupt characters, and write it to
# tokenizer_vocab.json.
#
# SECURITY NOTE(review): pickle.load executes arbitrary code embedded in the
# file — only run this against a tokenizer.pkl from a trusted source.
try:
    with open("tokenizer.pkl", "rb") as f:
        tokenizer_obj = pickle.load(f)

    # The pickle may hold either a tokenizer object exposing .vocab, or the
    # raw vocab dict itself.
    vocab = tokenizer_obj.vocab if hasattr(tokenizer_obj, "vocab") else tokenizer_obj

    # Keep only tokens that pass the printable/no-NUL/no-U+FFFD filter.
    clean_vocab = {
        k: v for k, v in vocab.items()
        if is_clean_token(k)
    }

    # ensure_ascii=True escapes any remaining non-ASCII so the JSON file is
    # pure ASCII regardless of the declared encoding.
    with open("tokenizer_vocab.json", "w", encoding="utf-8") as f:
        json.dump(clean_vocab, f, indent=2, ensure_ascii=True)

    # Fixed mojibake: the original printed "βœ“"/"βœ—", which are the UTF-8
    # bytes of "✓"/"✗" mis-decoded through a single-byte codepage.
    print("✓ Clean vocab extracted")
    print(f"✓ Original size: {len(vocab)}")
    print(f"✓ Clean size: {len(clean_vocab)}")
except Exception as e:
    # Top-level script boundary: report the failure and exit non-zero.
    print(f"✗ Error: {e}")
    sys.exit(1)