# api/explore_vocabulary.py
# Author: gary-boon — commit 37ed739: "Add research attention analysis endpoints with Q/K/V extraction"
"""
Script to explore CodeGen model vocabulary
"""
from transformers import AutoTokenizer
# Load the tokenizer (which contains the vocabulary)
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono")
print("=" * 80)
print("CODEGEN VOCABULARY EXPLORATION")
print("=" * 80)
# 1. Vocabulary size
vocab_size = len(tokenizer)
print(f"\n1. Vocabulary Size: {vocab_size:,} tokens")
# 2. Get the vocabulary as a dictionary (token -> id)
vocab = tokenizer.get_vocab()
print(f"\n2. Vocabulary type: {type(vocab)}")
# 3. Show some example tokens
print("\n3. Sample tokens from vocabulary:")
sample_tokens = list(vocab.items())[:20]
for token, token_id in sample_tokens:
print(f" ID {token_id:5d}: '{token}'")
# 4. Search for specific tokens
print("\n4. Programming-related tokens:")
search_terms = ["length", "def", "class", "function", "return", "import", "for", "while"]
for term in search_terms:
if term in vocab:
token_id = vocab[term]
print(f" '{term}' -> Token ID: {token_id}")
else:
print(f" '{term}' -> NOT found as single token")
# 5. Show how a word gets tokenized
print("\n5. Tokenization examples:")
examples = ["length", "quicksort", "def", "uncommon_variable_name", "print"]
for example in examples:
tokens = tokenizer.tokenize(example)
token_ids = tokenizer.encode(example, add_special_tokens=False)
print(f" '{example}':")
print(f" Tokens: {tokens}")
print(f" IDs: {token_ids}")
# 6. Reverse lookup - get token from ID
print("\n6. Reverse lookup (ID -> token):")
interesting_ids = [0, 1, 2, 100, 1000, 5000, 10000]
for token_id in interesting_ids:
token = tokenizer.decode([token_id])
print(f" ID {token_id:5d} -> '{token}'")
# 7. Special tokens
print("\n7. Special tokens:")
print(f" BOS (beginning of sequence): {tokenizer.bos_token} (ID: {tokenizer.bos_token_id})")
print(f" EOS (end of sequence): {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})")
print(f" PAD (padding): {tokenizer.pad_token} (ID: {tokenizer.pad_token_id})")
print(f" UNK (unknown): {tokenizer.unk_token} (ID: {tokenizer.unk_token_id})")
# 8. Export vocabulary to file (optional)
print("\n8. Export options:")
print(" To export full vocabulary to JSON:")
print(" import json")
print(" with open('codegen_vocabulary.json', 'w') as f:")
print(" json.dump(vocab, f, indent=2)")
print("\n" + "=" * 80)
print("TIP: The vocabulary is fixed - you cannot add new tokens at inference time!")
print("=" * 80)