Spaces:
No application file
No application file
| import pickle | |
| import json | |
| import sys | |
| import string | |
class SimpleTokenizer:
    """Minimal tokenizer wrapper holding a token -> id mapping."""

    def __init__(self, vocab=None):
        # Fall back to a fresh empty dict when no (or an empty/falsy)
        # vocab is supplied, so instances never share a mutable default.
        self.vocab = vocab if vocab else {}
def is_clean_token(t):
    """Return True when *t* is a printable string free of NUL (U+0000)
    and the Unicode replacement character (U+FFFD)."""
    if not isinstance(t, str):
        return False
    forbidden = "\u0000\uFFFD"
    return t.isprintable() and all(ch not in t for ch in forbidden)
try:
    # SECURITY: pickle.load executes arbitrary code during deserialization.
    # Only ever run this against a tokenizer.pkl from a trusted source.
    with open("tokenizer.pkl", "rb") as f:
        tokenizer_obj = pickle.load(f)
    # Accept either a tokenizer object exposing .vocab or a bare vocab dict.
    vocab = tokenizer_obj.vocab if hasattr(tokenizer_obj, "vocab") else tokenizer_obj
    # Keep only entries whose token (key) is printable and mojibake-free.
    clean_vocab = {
        k: v for k, v in vocab.items()
        if is_clean_token(k)
    }
    # ensure_ascii=False writes non-ASCII tokens as real UTF-8 characters,
    # matching the explicit encoding="utf-8" on the output file (previously
    # ensure_ascii=True escaped them to \uXXXX, defeating that intent).
    with open("tokenizer_vocab.json", "w", encoding="utf-8") as f:
        json.dump(clean_vocab, f, indent=2, ensure_ascii=False)
    # NOTE(review): the "β " prefix below looks like a mojibaked status
    # glyph (e.g. a checkmark) — confirm the intended character.
    print("β Clean vocab extracted")
    print(f"β Original size: {len(vocab)}")
    print(f"β Clean size: {len(clean_vocab)}")
except Exception as e:
    # Top-level boundary: report the failure and exit non-zero.
    print(f"β Error: {e}")
    sys.exit(1)