import os
import json
import sys

# ==============================================================================
# CONFIGURATION
# ==============================================================================
SEARCH_ROOTS = [
    r"B:\LLM\.cache\huggingface\hub",
    r"C:\.cache",
]

# Mistral Nemo Baseline (Tekken Tokenizer)
BASELINE_VOCAB = 131072
BASELINE_ROPE = 1000000.0

# ==============================================================================
# SCRIPT
# ==============================================================================


def find_model_paths(roots):
    """Recursively scan each root directory for folders containing a config.json.

    Args:
        roots: Iterable of directory paths to crawl. Roots that do not exist
            are skipped with a warning rather than raising.

    Returns:
        List of directory paths, one per discovered model (any folder that
        directly contains a config.json file).
    """
    model_paths = []
    print("šŸ” Scanning directories for config.json...")
    for root_dir in roots:
        if not os.path.exists(root_dir):
            print(f"āš ļø Warning: Directory not found: {root_dir}")
            continue
        print(f" -> Crawling {root_dir} (this may take a moment)...")
        count = 0
        for root, dirs, files in os.walk(root_dir):
            if "config.json" in files:
                model_paths.append(root)
                count += 1
                # Optional: Optimization to stop diving deeper if we found a model root
                # (Commented out to ensure we find snapshots in HF cache structure)
                # dirs[:] = []
        print(f" Found {count} models in {root_dir}")
    return model_paths


def check_models():
    """Audit every discovered model config against the Mistral Nemo baseline.

    Prints a table of (name, vocab_size, rope_theta, eos_token_id, status)
    for each model found under SEARCH_ROOTS, then a summary of models whose
    vocab size or RoPE theta deviates from the baseline (likely merge
    incompatibilities). Returns None; all output goes to stdout.
    """
    paths = find_model_paths(SEARCH_ROOTS)
    if not paths:
        print("\nāŒ No models found in the specified directories.")
        return

    print("\n" + "=" * 110)
    print(f"{'Model Name (Short)':<45} | {'Vocab':<8} | {'RoPE Theta':<12} | {'EOS ID':<8} | {'Status'}")
    print("=" * 110)

    suspects = []

    for path in paths:
        config_path = os.path.join(path, "config.json")
        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                cfg = json.load(f)
        except Exception as e:
            # Unreadable/corrupt config: report and keep auditing the rest.
            print(f"āŒ Error reading {path}: {e}")
            continue

        # Extract Metadata (guard against explicit nulls in the JSON, which
        # would crash float()/int() below).
        vocab_size = cfg.get("vocab_size", 0)
        rope_theta = cfg.get("rope_theta", 0.0) or 0.0
        eos_id = cfg.get("eos_token_id", "N/A")

        # Clean up the name for display (handle HF cache paths).
        # HF cache layout is .../models--org--repo/snapshots/<hash>, so the
        # readable name lives two directory levels above the snapshot folder.
        name = os.path.basename(path)
        if "snapshots" in path:
            parent = os.path.dirname(os.path.dirname(path))
            name = os.path.basename(parent).replace("models--", "")

        # --- THE AUDIT LOGIC ---
        flags = []
        is_suspect = False

        # Check Vocab (the most likely cause of the EOS bug).
        # Mistral Nemo is 131072. Llama 3 is 128256. Old Mistral is 32000.
        if vocab_size != BASELINE_VOCAB:
            flags.append(f"VOCAB({vocab_size})")
            is_suspect = True

        # Check RoPE (Nemo is 1,000,000. Standard is 10,000).
        if float(rope_theta) != float(BASELINE_ROPE):
            flags.append(f"ROPE({int(rope_theta)})")
            is_suspect = True

        # Check EOS (multi-EOS can confuse mergekit). Informational only:
        # it does not mark the model as a suspect, but it IS surfaced in the
        # status column (previously the flag was collected and then dropped).
        if isinstance(eos_id, list) and len(eos_id) > 1:
            flags.append("MULTI-EOS")

        if is_suspect:
            status = f"🚩 {' '.join(flags)}"
        elif flags:
            status = f"āœ… OK ({' '.join(flags)})"
        else:
            status = "āœ… OK"

        # Print Row
        print(f"{name[:45]:<45} | {str(vocab_size):<8} | {str(rope_theta):<12} | {str(eos_id):<8} | {status}")

        if is_suspect:
            suspects.append((name, path, flags))

    print("\n" + "=" * 110)
    if suspects:
        print(f"🚨 DETECTED {len(suspects)} POTENTIALLY INCOMPATIBLE MODELS:")
        print("These models do not match the Mistral Nemo baseline (Vocab 131k, RoPE 1M).")
        print("Including them in the merge is likely causing the 'One Sentence' bug. Use vocab_resizer.py to fix.\n")
        for s_name, s_path, s_flags in suspects:
            print(f"āŒ {s_name}")
            print(f"   Path: {s_path}")
            print(f"   Issues: {', '.join(s_flags)}\n")
    else:
        print("āœ… All scanned models match the Mistral Nemo baseline specs.")


if __name__ == "__main__":
    check_models()