import unicodedata
import torch
class VocabCompressor:
    """Collapse a tokenizer vocabulary onto canonical ids.

    Tokens whose decoded text is identical after NFKC normalization,
    lower-casing, and stripping (e.g. "Apple" vs " apple") share a single
    canonical id, shrinking the effective vocabulary.
    """

    def __init__(self, tokenizer):
        # tokenizer must expose get_vocab() -> {token_str: raw_id}
        # and decode([raw_id]) -> str (HF tokenizer interface).
        self.tokenizer = tokenizer
        self.mapping_tensor = None  # torch.LongTensor: raw id -> canonical id
        self.num_canonical = 0      # number of distinct canonical ids

    def build_mapping(self):
        """Build and return the raw-id -> canonical-id lookup tensor.

        Returns:
            torch.LongTensor of length max(raw ids) + 1. Entries for raw
            ids not present in the vocabulary are -1.
        """
        vocab = self.tokenizer.get_vocab()
        normalized_to_id = {}
        # Size the table by the largest raw id, NOT len(vocab): vocabularies
        # with added/special tokens can have gaps, i.e. max(id) >= len(vocab),
        # which would IndexError with a len(vocab)-sized list.
        table_size = max(vocab.values()) + 1 if vocab else 0
        raw_to_canonical = [-1] * table_size
        for token_str, raw_id in vocab.items():
            text = self.tokenizer.decode([raw_id])
            text = unicodedata.normalize("NFKC", text)
            text = text.lower().strip()
            if not text:
                # Whitespace-only / empty decodes all collapse into one bucket.
                text = "<empty>"
            if text not in normalized_to_id:
                # Canonical ids are assigned in first-seen order.
                normalized_to_id[text] = len(normalized_to_id)
            raw_to_canonical[raw_id] = normalized_to_id[text]
        self.mapping_tensor = torch.tensor(raw_to_canonical, dtype=torch.long)
        self.num_canonical = len(normalized_to_id)
        print(f"Original Vocab Size: {len(vocab)}")
        print(f"Canonical Vocab Size: {self.num_canonical}")
        if vocab:
            # Guard: an empty vocab would divide by zero here.
            print(f"Reduction: {100 * (1 - self.num_canonical/len(vocab)):.2f}%")
        return self.mapping_tensor
if __name__ == "__main__":
    # Demo: case/whitespace variants of the same word map to distinct raw
    # token ids but collapse to one canonical id under the compressor.
    from transformers import AutoTokenizer

    gpt2_tok = AutoTokenizer.from_pretrained("gpt2")
    mapping = VocabCompressor(gpt2_tok).build_mapping()

    raw_apple = gpt2_tok.encode("Apple")[0]
    raw_space_apple = gpt2_tok.encode(" apple")[0]
    print(f"\nRaw ID for 'Apple': {raw_apple}")
    print(f"Raw ID for ' apple': {raw_space_apple}")

    canon_apple = mapping[raw_apple].item()
    canon_space_apple = mapping[raw_space_apple].item()
    print(f"Canonical ID for 'Apple': {canon_apple}")
    print(f"Canonical ID for ' apple': {canon_space_apple}")