Spaces:
Sleeping
Sleeping
"""
Script to explore CodeGen model vocabulary
"""
from transformers import AutoTokenizer


def _print_banner(title: str) -> None:
    """Print *title* framed by 80-character '=' rules."""
    print("=" * 80)
    print(title)
    print("=" * 80)


def _show_size_and_samples(tokenizer, vocab: dict) -> None:
    """Sections 1-3: vocabulary size, mapping type, and the first 20 entries."""
    print(f"\n1. Vocabulary Size: {len(tokenizer):,} tokens")
    print(f"\n2. Vocabulary type: {type(vocab)}")
    print("\n3. Sample tokens from vocabulary:")
    # dict order is insertion order, so this is a stable "first 20" sample
    for token, token_id in list(vocab.items())[:20]:
        print(f" ID {token_id:5d}: '{token}'")


def _show_single_token_search(vocab: dict) -> None:
    """Section 4: report which common programming words exist as single tokens."""
    print("\n4. Programming-related tokens:")
    search_terms = ["length", "def", "class", "function", "return", "import", "for", "while"]
    for term in search_terms:
        if term in vocab:
            print(f" '{term}' -> Token ID: {vocab[term]}")
        else:
            print(f" '{term}' -> NOT found as single token")


def _show_tokenization_examples(tokenizer) -> None:
    """Section 5: show how example words split into subword tokens and ids."""
    print("\n5. Tokenization examples:")
    examples = ["length", "quicksort", "def", "uncommon_variable_name", "print"]
    for example in examples:
        tokens = tokenizer.tokenize(example)
        token_ids = tokenizer.encode(example, add_special_tokens=False)
        print(f" '{example}':")
        print(f" Tokens: {tokens}")
        print(f" IDs: {token_ids}")


def _show_reverse_lookup(tokenizer) -> None:
    """Section 6: decode a handful of interesting ids back to token strings."""
    print("\n6. Reverse lookup (ID -> token):")
    for token_id in [0, 1, 2, 100, 1000, 5000, 10000]:
        token = tokenizer.decode([token_id])
        print(f" ID {token_id:5d} -> '{token}'")


def _show_special_tokens(tokenizer) -> None:
    """Section 7: print BOS/EOS/PAD/UNK tokens (prints None when unset)."""
    print("\n7. Special tokens:")
    print(f" BOS (beginning of sequence): {tokenizer.bos_token} (ID: {tokenizer.bos_token_id})")
    print(f" EOS (end of sequence): {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})")
    print(f" PAD (padding): {tokenizer.pad_token} (ID: {tokenizer.pad_token_id})")
    print(f" UNK (unknown): {tokenizer.unk_token} (ID: {tokenizer.unk_token_id})")


def _show_export_hint() -> None:
    """Section 8: display (but do not execute) code that dumps the vocab to JSON."""
    print("\n8. Export options:")
    print(" To export full vocabulary to JSON:")
    print(" import json")
    print(" with open('codegen_vocabulary.json', 'w') as f:")
    print(" json.dump(vocab, f, indent=2)")


def main() -> None:
    """Download the CodeGen tokenizer and walk through its vocabulary.

    Requires network access on first run (the tokenizer is fetched from the
    Hugging Face hub and cached locally afterwards).
    """
    # Load the tokenizer (which contains the vocabulary)
    tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono")
    _print_banner("CODEGEN VOCABULARY EXPLORATION")
    # Get the vocabulary as a dictionary (token -> id)
    vocab = tokenizer.get_vocab()
    _show_size_and_samples(tokenizer, vocab)
    _show_single_token_search(vocab)
    _show_tokenization_examples(tokenizer)
    _show_reverse_lookup(tokenizer)
    _show_special_tokens(tokenizer)
    _show_export_hint()
    print("\n" + "=" * 80)
    print("TIP: The vocabulary is fixed - you cannot add new tokens at inference time!")
    print("=" * 80)


if __name__ == "__main__":
    main()