MINDI-1.5-Vision-Coder / scripts /add_special_tokens.py

Faaz

Day 1 Complete: Tokenizer setup — Qwen2.5-Coder-7B base + 22 MINDI special tokens (vocab 151,685), wrapper class, full format test

11e0d89 about 1 month ago

raw

history blame contribute delete

3.28 kB

	"""
	MINDI 1.5 Vision-Coder — Step 4: Add MINDI Special Tokens

	Loads the base Qwen2.5-Coder tokenizer, adds 22 MINDI-specific
	special tokens, saves the updated tokenizer, and reports vocab changes.
	"""

	import sys
	from pathlib import Path

	# Directory one level above the folder that contains this script
	# (parents[0] is the script's own folder, parents[1] its parent).
	PROJECT_ROOT = Path(__file__).resolve().parents[1]
	# Prepend the project root to the module search path — presumably so
	# project-local packages import cleanly when the script is run directly.
	sys.path.insert(0, str(PROJECT_ROOT))


	# The 22 MINDI delimiter tokens: 11 sections, each with a matching
	# ``_start`` / ``_end`` marker, written in the ``<|name|>`` form.
	#
	# The pipes must stay unescaped. A backslash before ``|`` is not a Python
	# escape sequence, so it would be kept verbatim inside the token text (and
	# trigger an invalid-escape warning), producing tokens that no longer
	# match the ``<|...|>`` pattern.
	MINDI_SPECIAL_TOKENS = [
	    "<|mindi_start|>",
	    "<|mindi_end|>",
	    "<|code_start|>",
	    "<|code_end|>",
	    "<|vision_start|>",
	    "<|vision_end|>",
	    "<|critique_start|>",
	    "<|critique_end|>",
	    "<|suggest_start|>",
	    "<|suggest_end|>",
	    "<|think_start|>",
	    "<|think_end|>",
	    "<|file_start|>",
	    "<|file_end|>",
	    "<|search_start|>",
	    "<|search_end|>",
	    "<|sandbox_start|>",
	    "<|sandbox_end|>",
	    "<|error_start|>",
	    "<|error_end|>",
	    "<|fix_start|>",
	    "<|fix_end|>",
	]


	def main() -> int:
	    """Add the MINDI special tokens to the base tokenizer and verify them.

	    Loads the tokenizer stored in ``data/tokenizer/base_tokenizer`` under
	    the project root, registers every entry of ``MINDI_SPECIAL_TOKENS`` as
	    an additional special token, saves the result to
	    ``data/tokenizer/mindi_tokenizer`` and prints the vocabulary change,
	    the token-to-ID mapping and a per-token round-trip check
	    (token -> id -> decode).

	    Returns:
	        0 when every token survives the round-trip; 1 when the base
	        tokenizer directory is missing or at least one token fails.
	    """
	    # Imported inside the function so that merely importing this module
	    # does not require transformers.
	    from transformers import AutoTokenizer

	    tokenizer_root = PROJECT_ROOT / "data" / "tokenizer"
	    base_dir = tokenizer_root / "base_tokenizer"
	    save_dir = tokenizer_root / "mindi_tokenizer"
	    rule = "=" * 60

	    print(f"\n{rule}")
	    print(" Step 4: Adding MINDI Special Tokens")
	    print(rule)

	    # Fail early with a clear message. from_pretrained also accepts Hub
	    # model ids, so a missing local directory would otherwise surface as
	    # an unrelated-looking lookup error.
	    if not base_dir.is_dir():
	        print(f"\n ❌ Base tokenizer directory not found: {base_dir}")
	        return 1

	    # Load base tokenizer
	    print(f"\n Loading base tokenizer from: {base_dir}")
	    # NOTE(review): trust_remote_code=True allows Python code shipped with
	    # the tokenizer files to run — confirm it is really needed here.
	    tokenizer = AutoTokenizer.from_pretrained(str(base_dir), trust_remote_code=True)
	    original_vocab_size = len(tokenizer)
	    print(f" ✅ Base vocab size: {original_vocab_size:,}")

	    # Add special tokens
	    requested = len(MINDI_SPECIAL_TOKENS)
	    print(f"\n Adding {requested} MINDI special tokens...")
	    # NOTE(review): by default add_special_tokens replaces the tokenizer's
	    # existing "additional_special_tokens" list rather than extending it —
	    # confirm the base tokenizer's own entries do not need to be preserved.
	    num_added = tokenizer.add_special_tokens(
	        {"additional_special_tokens": MINDI_SPECIAL_TOKENS}
	    )
	    new_vocab_size = len(tokenizer)
	    print(f" ✅ Tokens added: {num_added}")
	    print(f" ✅ New vocab size: {new_vocab_size:,}")
	    print(f" ✅ Delta: +{new_vocab_size - original_vocab_size}")
	    if num_added != requested:
	        # Not an error: the return value counts only tokens that were new
	        # to the vocabulary, so make the difference visible.
	        print(
	            f" ℹ️ {requested - num_added} of {requested} requested tokens "
	            "were not newly added (already known to the tokenizer)"
	        )

	    # Save updated tokenizer
	    save_dir.mkdir(parents=True, exist_ok=True)
	    tokenizer.save_pretrained(str(save_dir))
	    print(f"\n ✅ Saved MINDI tokenizer to: {save_dir}")

	    # Resolve each token's ID once; both reports below reuse the result.
	    token_ids = [
	        (token, tokenizer.convert_tokens_to_ids(token))
	        for token in MINDI_SPECIAL_TOKENS
	    ]

	    # Show token ID mapping
	    print(f"\n{rule}")
	    print(" Special Token ID Mapping")
	    print(rule)
	    for token, token_id in token_ids:
	        print(f" {token:<25} → ID {token_id}")

	    # Verify round-trip for each special token
	    print(f"\n{rule}")
	    print(" Round-trip Verification")
	    print(rule)
	    all_pass = True
	    for token, token_id in token_ids:
	        # Presumably an unknown token can come back as None; count that as
	        # a failed round-trip instead of handing None to decode().
	        decoded = tokenizer.decode([token_id]) if token_id is not None else None
	        match = decoded == token
	        all_pass = all_pass and match
	        status = "✅" if match else "❌"
	        print(f' {status} {token} → {token_id} → "{decoded}"')

	    # Summary
	    print(f"\n{rule}")
	    print(" SUMMARY")
	    print(rule)
	    print(f" Original vocab size: {original_vocab_size:,}")
	    print(f" New vocab size: {new_vocab_size:,}")
	    print(f" Special tokens added: {num_added}")
	    if all_pass:
	        print(f" Round-trip test: ✅ ALL {requested} PASSED")
	    else:
	        print(" Round-trip test: ❌ SOME FAILED")
	    print(f"{rule}\n")

	    # Report the verification outcome to the caller as an exit status.
	    return 0 if all_pass else 1


	if __name__ == "__main__":
	    # Use main()'s return value as the process exit status so shells and
	    # pipelines can react to it; a None result maps to exit status 0.
	    sys.exit(main())