# python tokeninspector.py "B:\12B\models--mistralai--Mistral-Nemo-Instruct-2407" "B:\12B\models--aixonlab--Aether-12b.backup" "B:\12B\models--aixonlab--Aether-12b"

import os

import click
import torch
import transformers
from mergekit.io.lazy_tensor_loader import LazyTensorLoader
def get_embed_tensor(model_path):
    """Return the input-embedding weight tensor stored under *model_path*.

    Scans the lazy tensor index for the first key containing
    ``embed_tokens.weight`` (Llama/Mistral style) or ``wte.weight``
    (GPT-2 style) and materialises only that tensor.  Returns ``None``
    when no such key exists or the model directory cannot be read; any
    loader failure is reported on stdout rather than raised, since this
    is a best-effort audit helper.
    """
    try:
        loader = LazyTensorLoader.from_disk(model_path)
        embed_key = next(
            (
                k
                for k in loader.index.tensor_paths.keys()
                if "embed_tokens.weight" in k or "wte.weight" in k
            ),
            None,
        )
        if embed_key is not None:
            return loader.get_tensor(embed_key)
    except Exception as e:
        print(f" [!] Error loading tensors from {model_path}: {e}")
    return None
@click.command()
@click.argument("base_model")
@click.argument("donor_model")
@click.argument("output_model")
def main(base_model, donor_model, output_model):
    """Audit the result of a tokenizer transplant ("token surgery").

    BASE_MODEL   directory of the original model.
    DONOR_MODEL  directory of the model whose tokenizer was grafted on.
    OUTPUT_MODEL directory of the surgery result being audited.

    Runs five read-only checks: vocab sizes, embedding-matrix shapes,
    exact transfer of a shared token's vector, non-zero approximation of
    a donor-only token, and identical tokenizer encoding behavior.

    Bug fix: the ``@click`` decorators were missing, so the bare
    ``main()`` call in the ``__main__`` guard raised ``TypeError``;
    ``click`` was imported but unused.  As a click command, ``main()``
    now parses the three paths from ``sys.argv`` itself.
    """
    print("=" * 60)
    print("🔍 TOKEN SURGEON AUDIT TOOL")
    print("=" * 60)

    # --- [1] Vocab sizes ------------------------------------------------
    print("\n[1] Loading Tokenizers...")
    tok_base = transformers.AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    tok_donor = transformers.AutoTokenizer.from_pretrained(donor_model, trust_remote_code=True)
    tok_out = transformers.AutoTokenizer.from_pretrained(output_model, trust_remote_code=True)
    print(f" Base: {len(tok_base)} tokens")
    print(f" Donor: {len(tok_donor)} tokens")
    print(f" Output: {len(tok_out)} tokens")
    # The whole point of the surgery is that the output adopts the donor vocab.
    if len(tok_out) != len(tok_donor):
        print(" ❌ FAIL: Output vocab size does not match Donor vocab size!")
    else:
        print(" ✅ PASS: Output vocab size matches Donor.")

    # --- [2] Embedding matrix shapes ------------------------------------
    print("\n[2] Loading Embedding Tensors (Lazy Load)...")
    emb_base = get_embed_tensor(base_model)
    emb_donor = get_embed_tensor(donor_model)
    emb_out = get_embed_tensor(output_model)
    print(f" Base Matrix: {emb_base.shape if emb_base is not None else 'Not found'}")
    print(f" Donor Matrix: {emb_donor.shape if emb_donor is not None else 'Not found'}")
    print(f" Output Matrix: {emb_out.shape if emb_out is not None else 'Not found'}")
    if emb_out is not None and emb_donor is not None:
        # Row count may exceed the vocab (e.g. padded to a multiple), so >= not ==.
        if emb_out.shape[0] >= len(tok_donor):
            print(" ✅ PASS: Output embedding matrix size is sufficient for Donor vocab.")
        else:
            print(" ❌ FAIL: Output embedding matrix is smaller than Donor vocab!")

    vocab_base = tok_base.get_vocab()
    vocab_donor = tok_donor.get_vocab()
    shared_tokens = set(vocab_base.keys()).intersection(set(vocab_donor.keys()))
    donor_only_tokens = set(vocab_donor.keys()) - set(vocab_base.keys())

    # --- [3] A shared token must carry its base vector to its new ID ----
    print("\n[3] Testing a Shared Token (Verifying exact transfer)...")
    if shared_tokens:
        # Pick a common word that is likely to exist in both
        # ("Ġ" is the BPE space marker used by GPT-2-style vocabularies).
        test_shared = None
        for candidate in [" the", " hello", "The", "Hello", "Ġthe", "Ġhello", "the", "hello"]:
            if candidate in shared_tokens:
                test_shared = candidate
                break
        if not test_shared:
            # Fall back to an arbitrary shared token (set order is not deterministic).
            test_shared = list(shared_tokens)[len(shared_tokens) // 2]
        id_base = vocab_base[test_shared]
        id_out = vocab_donor[test_shared]  # output uses donor vocab
        print(f" Token: '{test_shared}'")
        print(f" ID in Base: {id_base} | ID in Output: {id_out}")
        if emb_base is not None and emb_out is not None:
            vec_base = emb_base[id_base].float()
            vec_out = emb_out[id_out].float()
            cos_sim = torch.nn.functional.cosine_similarity(vec_base, vec_out, dim=0).item()
            print(f" Cosine similarity between Base and Output vectors: {cos_sim:.6f}")
            if cos_sim > 0.999:
                print(" ✅ PASS: Embeddings match perfectly. The vector was successfully moved to the new ID.")
            else:
                print(" ❌ FAIL: Embeddings for shared token do not match!")
    else:
        print(" ⚠️ No shared tokens found between vocabularies.")

    # --- [4] A donor-only token should have a synthesised, non-zero row -
    print("\n[4] Testing a New Token (Verifying OMP approximation)...")
    if donor_only_tokens:
        # Try to find a special token or a distinct word
        test_new = list(donor_only_tokens)[0]
        for t in donor_only_tokens:
            if "<" in t or "[" in t or "im_start" in t:
                test_new = t
                break
        id_out = vocab_donor[test_new]
        print(f" Token: '{test_new}' (Only exists in Donor)")
        print(f" ID in Output: {id_out}")
        if emb_out is not None:
            vec_out = emb_out[id_out].float()
            norm = vec_out.norm().item()
            print(f" Vector L2 Norm: {norm:.4f}")
            # A (near-)zero row means the token never received a real embedding.
            if norm > 0.01:
                print(" ✅ PASS: Vector is non-zero. OMP successfully approximated a new embedding.")
            else:
                print(" ⚠️ WARN: Vector is zero or very close to zero. It may have been treated as a junk token.")
    else:
        print(" ⚠️ No donor-only tokens found. Vocabularies are identical.")

    # --- [5] Output tokenizer must behave exactly like the donor's ------
    print("\n[5] Testing Tokenizer Encoding Behavior...")
    test_text = "Hello world! This is a test of the new tokenizer. <|im_start|>system\n12345<|im_end|>"
    enc_donor = tok_donor.encode(test_text)
    enc_out = tok_out.encode(test_text)
    if enc_donor == enc_out:
        print(" ✅ PASS: Output model encodes text exactly identically to the Donor model.")
    else:
        print(" ❌ FAIL: Output model encoding differs from Donor model!")
        print(f" Donor: {enc_donor[:10]}...")
        print(f" Output: {enc_out[:10]}...")

    print("\n" + "=" * 60)
    print("Audit Complete.")
    print("=" * 60)
if __name__ == '__main__':
    # Script entry point.
    main()