File size: 904 Bytes
342ab6d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import pickle
import json
import sys
import string

class SimpleTokenizer:
    """Minimal holder for a token -> id vocabulary mapping.

    NOTE(review): presumably this class exists so pickled tokenizer
    objects of the same class can be unpickled below — confirm against
    how ``tokenizer.pkl`` was produced.
    """

    def __init__(self, vocab=None):
        """Store *vocab*; defaults to a new empty dict when omitted.

        Uses an explicit ``is None`` check rather than ``vocab or {}`` so
        that a caller-supplied empty dict is kept (the truthiness version
        silently replaced it with a fresh dict).
        """
        self.vocab = {} if vocab is None else vocab

def is_clean_token(t):
    """Return True if *t* is a printable str containing neither NUL nor U+FFFD.

    Non-string inputs are rejected outright. ``str.isprintable`` already
    excludes control characters such as NUL, but U+FFFD (the replacement
    character) is printable, so it needs the explicit membership check.
    """
    if not isinstance(t, str):
        return False
    if not t.isprintable():
        return False
    return "\u0000" not in t and "\uFFFD" not in t

try:
    # NOTE(review): pickle.load executes arbitrary code on load — only run
    # this against a tokenizer.pkl from a trusted source.
    with open("tokenizer.pkl", "rb") as f:
        loaded = pickle.load(f)

    # Accept either a tokenizer object exposing .vocab or a bare mapping.
    raw_vocab = getattr(loaded, "vocab", loaded)

    # Keep only tokens that pass the printable/NUL/U+FFFD filter.
    clean_vocab = {}
    for token, token_id in raw_vocab.items():
        if is_clean_token(token):
            clean_vocab[token] = token_id

    with open("tokenizer_vocab.json", "w", encoding="utf-8") as out:
        json.dump(clean_vocab, out, indent=2, ensure_ascii=True)

    print("βœ“ Clean vocab extracted")
    print(f"βœ“ Original size: {len(raw_vocab)}")
    print(f"βœ“ Clean size: {len(clean_vocab)}")

except Exception as e:
    # Top-level script boundary: report the failure and exit non-zero.
    print(f"βœ— Error: {e}")
    sys.exit(1)