Spaces:

Naphula
/

model_tools

Running

App Files Files Community

Naphula committed on Dec 24, 2025

Commit

9f4bde3

verified ·

1 Parent(s): 4b54cb2

Upload metadata_audit.py

Browse files

Files changed (1) hide show

metadata_audit.py +123 -0

metadata_audit.py ADDED Viewed

	@@ -0,0 +1,123 @@

import os
import json
import sys

# ==============================================================================
# CONFIGURATION
# ==============================================================================
# Directories that are crawled recursively for config.json files.
# Raw strings keep the Windows backslashes literal.
SEARCH_ROOTS = [
    r"B:\LLM\.cache\huggingface\hub",
    r"C:\.cache",
]

# Mistral Nemo Baseline (Tekken Tokenizer)
# Every scanned config is compared against these two reference values.
BASELINE_VOCAB = 131072
BASELINE_ROPE = 1000000.0

# ==============================================================================
# SCRIPT
# ==============================================================================
def find_model_paths(roots):
    """Return every directory under *roots* that directly contains a config.json.

    Args:
        roots: Iterable of directory paths to crawl recursively.

    Returns:
        A list of directory paths in discovery order. Each directory appears
        at most once, even when the given roots overlap. Roots that are not
        existing directories are skipped with a printed warning.
    """
    model_paths = []
    seen = set()
    print("🔍 Scanning directories for config.json...")
    for root_dir in roots:
        # isdir rather than exists: os.walk yields nothing for a plain file,
        # which would otherwise be reported as a directory with 0 models.
        if not os.path.isdir(root_dir):
            print(f"⚠️  Warning: Directory not found: {root_dir}")
            continue
        print(f"   -> Crawling {root_dir} (this may take a moment)...")
        count = 0
        for current_dir, dirs, files in os.walk(root_dir):
            # Sorting in place makes the traversal (and the report) deterministic.
            dirs.sort()
            if "config.json" not in files:
                continue
            # Normalised key so overlapping roots do not list a model twice.
            key = os.path.normcase(os.path.normpath(current_dir))
            if key in seen:
                continue
            seen.add(key)
            model_paths.append(current_dir)
            count += 1
            # Deliberately keep descending after a hit: pruning `dirs` here
            # could miss snapshots nested inside the HF cache structure.
        print(f"      Found {count} models in {root_dir}")
    return model_paths
def _display_name(path):
    """Return a short, readable model name for a directory holding config.json."""
    parts = os.path.normpath(path).split(os.sep)
    # HF cache paths look like <models--org--name>/snapshots/<revision>[/sub].
    # Use the component just before the right-most "snapshots" directory so
    # configs in snapshot subfolders still resolve to the model folder.
    for idx in range(len(parts) - 2, 0, -1):
        if parts[idx] == "snapshots":
            return parts[idx - 1].replace("models--", "")
    return os.path.basename(path)


def _audit_config(cfg):
    """Compare one parsed config dict against the baseline constants.

    Returns:
        (vocab_size, rope_theta, eos_id, flags, is_suspect) where *flags* is a
        list of short labels and *is_suspect* is True when the vocab size or
        RoPE theta differs from BASELINE_VOCAB / BASELINE_ROPE.
    """
    vocab_size = cfg.get("vocab_size", 0)
    rope_theta = cfg.get("rope_theta", 0.0)
    eos_id = cfg.get("eos_token_id", "N/A")

    flags = []
    is_suspect = False

    # Check Vocab
    # Mistral Nemo is 131072. Llama 3 is 128256. Old Mistral is 32000.
    if vocab_size != BASELINE_VOCAB:
        flags.append(f"VOCAB({vocab_size})")
        is_suspect = True

    # Check RoPE (Nemo is 1,000,000. Standard is 10,000)
    # A null / non-numeric / non-finite value is reported as a mismatch
    # instead of aborting the whole scan.
    try:
        rope_value = float(rope_theta)
        rope_label = int(rope_value)
    except (TypeError, ValueError, OverflowError):
        rope_value = None
        rope_label = rope_theta
    if rope_value != float(BASELINE_ROPE):
        flags.append(f"ROPE({rope_label})")
        is_suspect = True

    # Check EOS (Multi-EOS can confuse mergekit)
    # Informational only: it does not mark the model as suspect.
    if isinstance(eos_id, list) and len(eos_id) > 1:
        flags.append("MULTI-EOS")

    return vocab_size, rope_theta, eos_id, flags, is_suspect


def check_models():
    """Audit every model found under SEARCH_ROOTS against the baseline.

    Prints one table row per discovered config.json (name, vocab size, RoPE
    theta, EOS id, status), followed by a summary of the models whose vocab
    size or RoPE theta differ from the baseline. Unreadable or malformed
    config files are reported and skipped. Returns None.
    """
    paths = find_model_paths(SEARCH_ROOTS)
    if not paths:
        print("\n❌ No models found in the specified directories.")
        return

    print("\n" + "="*110)
    print(f"{'Model Name (Short)':<45} | {'Vocab':<8} | {'RoPE Theta':<12} | {'EOS ID':<8} | {'Status'}")
    print("="*110)

    suspects = []
    for path in paths:
        config_path = os.path.join(path, "config.json")
        try:
            with open(config_path, "r", encoding="utf-8") as f:
                cfg = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers JSON decode errors and bad UTF-8.
            print(f"❌ Error reading {path}: {e}")
            continue
        if not isinstance(cfg, dict):
            # Any file named config.json is picked up by the crawl; only a
            # JSON object can carry the fields audited below.
            print(f"❌ Error reading {path}: config.json is not a JSON object")
            continue

        name = _display_name(path)
        vocab_size, rope_theta, eos_id, flags, is_suspect = _audit_config(cfg)

        if is_suspect:
            status = f"🚩 {' '.join(flags)}"
        elif flags:
            # Surface informational flags (e.g. MULTI-EOS) instead of hiding them.
            status = f"✅ OK ({' '.join(flags)})"
        else:
            status = "✅ OK"

        # Print Row
        print(f"{name[:45]:<45} | {str(vocab_size):<8} | {str(rope_theta):<12} | {str(eos_id):<8} | {status}")
        if is_suspect:
            suspects.append((name, path, flags))

    print("\n" + "="*110)
    if suspects:
        print(f"🚨 DETECTED {len(suspects)} POTENTIALLY INCOMPATIBLE MODELS:")
        print("These models do not match the Mistral Nemo baseline (Vocab 131k, RoPE 1M).")
        print("Including them in the merge is likely causing the 'One Sentence' bug.\n")
        for s_name, s_path, s_flags in suspects:
            print(f"❌ {s_name}")
            print(f"   Path: {s_path}")
            print(f"   Issues: {', '.join(s_flags)}\n")
    else:
        print("✅ All scanned models match the Mistral Nemo baseline specs.")
# Run the audit only when executed as a script, not when imported.
if __name__ == "__main__":
    check_models()