import json
import pickle
import sys


class SimpleTokenizer:
    """Minimal class definition so pickle can reconstruct tokenizer objects of this type."""

    def __init__(self, vocab=None):
        self.vocab = vocab or {}


def is_clean_token(t):
    """Keep only printable string tokens containing no NUL or U+FFFD replacement characters."""
    return (
        isinstance(t, str)
        and t.isprintable()
        and not any(c in t for c in "\u0000\uFFFD")
    )


try:
    with open("tokenizer.pkl", "rb") as f:
        tokenizer_obj = pickle.load(f)

    # The pickle may hold either a tokenizer object with a .vocab attribute
    # or the vocab dict itself.
    vocab = tokenizer_obj.vocab if hasattr(tokenizer_obj, "vocab") else tokenizer_obj

    # Drop entries whose token contains unprintable or replacement characters.
    clean_vocab = {k: v for k, v in vocab.items() if is_clean_token(k)}

    with open("tokenizer_vocab.json", "w", encoding="utf-8") as f:
        json.dump(clean_vocab, f, indent=2, ensure_ascii=True)

    print("✓ Clean vocab extracted")
    print(f"✓ Original size: {len(vocab)}")
    print(f"✓ Clean size: {len(clean_vocab)}")
except Exception as e:
    print(f"✗ Error: {e}")
    sys.exit(1)