# api/explore_vocabulary.py
# Author: gary-boon — commit 37ed739: "Add research attention analysis endpoints with Q/K/V extraction"
"""
Script to explore CodeGen model vocabulary
"""
from transformers import AutoTokenizer
# Load the tokenizer (which contains the vocabulary)
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono")
print("=" * 80)
print("CODEGEN VOCABULARY EXPLORATION")
print("=" * 80)
# 1. Vocabulary size
vocab_size = len(tokenizer)
print(f"\n1. Vocabulary Size: {vocab_size:,} tokens")
# 2. Get the vocabulary as a dictionary (token -> id)
vocab = tokenizer.get_vocab()
print(f"\n2. Vocabulary type: {type(vocab)}")
# 3. Show some example tokens
print("\n3. Sample tokens from vocabulary:")
sample_tokens = list(vocab.items())[:20]
for token, token_id in sample_tokens:
print(f" ID {token_id:5d}: '{token}'")
# 4. Search for specific tokens
print("\n4. Programming-related tokens:")
search_terms = ["length", "def", "class", "function", "return", "import", "for", "while"]
for term in search_terms:
if term in vocab:
token_id = vocab[term]
print(f" '{term}' -> Token ID: {token_id}")
else:
print(f" '{term}' -> NOT found as single token")
# 5. Show how a word gets tokenized
print("\n5. Tokenization examples:")
examples = ["length", "quicksort", "def", "uncommon_variable_name", "print"]
for example in examples:
tokens = tokenizer.tokenize(example)
token_ids = tokenizer.encode(example, add_special_tokens=False)
print(f" '{example}':")
print(f" Tokens: {tokens}")
print(f" IDs: {token_ids}")
# 6. Reverse lookup - get token from ID
print("\n6. Reverse lookup (ID -> token):")
interesting_ids = [0, 1, 2, 100, 1000, 5000, 10000]
for token_id in interesting_ids:
token = tokenizer.decode([token_id])
print(f" ID {token_id:5d} -> '{token}'")
# 7. Special tokens
print("\n7. Special tokens:")
print(f" BOS (beginning of sequence): {tokenizer.bos_token} (ID: {tokenizer.bos_token_id})")
print(f" EOS (end of sequence): {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})")
print(f" PAD (padding): {tokenizer.pad_token} (ID: {tokenizer.pad_token_id})")
print(f" UNK (unknown): {tokenizer.unk_token} (ID: {tokenizer.unk_token_id})")
# 8. Export vocabulary to file (optional)
print("\n8. Export options:")
print(" To export full vocabulary to JSON:")
print(" import json")
print(" with open('codegen_vocabulary.json', 'w') as f:")
print(" json.dump(vocab, f, indent=2)")
print("\n" + "=" * 80)
print("TIP: The vocabulary is fixed - you cannot add new tokens at inference time!")
print("=" * 80)