"""
Script to explore CodeGen model vocabulary
"""
from transformers import AutoTokenizer

# Load the tokenizer (which contains the vocabulary)
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono")

print("=" * 80)
print("CODEGEN VOCABULARY EXPLORATION")
print("=" * 80)

# 1. Vocabulary size (len(tokenizer) counts the base vocab plus any added tokens)
vocab_size = len(tokenizer)
print(f"\n1. Vocabulary Size: {vocab_size:,} tokens")

# 2. Get the vocabulary as a dictionary (token -> id)
vocab = tokenizer.get_vocab()
print(f"\n2. Vocabulary type: {type(vocab)}")

# 3. Show some example tokens (sorted by ID; the dict's iteration order is arbitrary)
print("\n3. Sample tokens from vocabulary:")
sample_tokens = sorted(vocab.items(), key=lambda kv: kv[1])[:20]
for token, token_id in sample_tokens:
    print(f"   ID {token_id:5d}: '{token}'")

# 4. Search for specific tokens
print("\n4. Programming-related tokens:")
search_terms = ["length", "def", "class", "function", "return", "import", "for", "while"]
for term in search_terms:
    if term in vocab:
        token_id = vocab[term]
        print(f"   '{term}' -> Token ID: {token_id}")
    else:
        print(f"   '{term}' -> NOT found as single token")

# 5. Show how a word gets tokenized
print("\n5. Tokenization examples:")
examples = ["length", "quicksort", "def", "uncommon_variable_name", "print"]
for example in examples:
    tokens = tokenizer.tokenize(example)
    token_ids = tokenizer.encode(example, add_special_tokens=False)
    print(f"   '{example}':")
    print(f"      Tokens: {tokens}")
    print(f"      IDs: {token_ids}")

# 6. Reverse lookup - get token from ID
print("\n6. Reverse lookup (ID -> token):")
interesting_ids = [0, 1, 2, 100, 1000, 5000, 10000]
for token_id in interesting_ids:
    token = tokenizer.decode([token_id])
    print(f"   ID {token_id:5d} -> '{token}'")

# 7. Special tokens
print("\n7. Special tokens:")
print(f"   BOS (beginning of sequence): {tokenizer.bos_token} (ID: {tokenizer.bos_token_id})")
print(f"   EOS (end of sequence): {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})")
print(f"   PAD (padding): {tokenizer.pad_token} (ID: {tokenizer.pad_token_id})")
print(f"   UNK (unknown): {tokenizer.unk_token} (ID: {tokenizer.unk_token_id})")

# 8. Export vocabulary to file (optional)
print("\n8. Export options:")
print("   To export full vocabulary to JSON:")
print("   import json")
print("   with open('codegen_vocabulary.json', 'w') as f:")
print("       json.dump(vocab, f, indent=2)")

print("\n" + "=" * 80)
print("TIP: The vocabulary is fixed - you cannot add new tokens at inference time!")
print("=" * 80)