Naphula committed on
Commit
9f4bde3
Β·
verified Β·
1 Parent(s): 4b54cb2

Upload metadata_audit.py

Browse files
Files changed (1) hide show
  1. metadata_audit.py +123 -0
metadata_audit.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import json
import sys

# ==============================================================================
# CONFIGURATION
# ==============================================================================
# Directories crawled recursively for model folders (any dir holding config.json).
SEARCH_ROOTS = [
    r"B:\LLM\.cache\huggingface\hub",
    r"C:\.cache",
]

# Mistral Nemo Baseline (Tekken Tokenizer)
BASELINE_VOCAB = 131072      # Nemo vocab size; Llama 3 is 128256, old Mistral 32000
BASELINE_ROPE = 1000000.0    # Nemo RoPE theta; classic models use 10000

# ==============================================================================
# SCRIPT
# ==============================================================================
+
def find_model_paths(roots):
    """Return every directory under *roots* that contains a ``config.json``.

    Args:
        roots: iterable of directory paths to crawl recursively. Paths that
            do not exist are skipped with a warning rather than raising.

    Returns:
        list[str]: directories (in ``os.walk`` order) holding a config.json.
    """
    found = []
    print(f"🔍 Scanning directories for config.json...")

    for base in roots:
        if not os.path.exists(base):
            print(f"⚠️ Warning: Directory not found: {base}")
            continue

        print(f" -> Crawling {base} (this may take a moment)...")
        # One pass per root; HF cache keeps real configs under snapshots/,
        # so we deliberately keep descending instead of pruning dirs.
        hits = [
            walk_dir
            for walk_dir, _subdirs, names in os.walk(base)
            if "config.json" in names
        ]
        found.extend(hits)
        print(f" Found {len(hits)} models in {base}")

    return found
+
def _short_name(path):
    """Derive a readable model name from *path*.

    For HF cache layouts (.../models--org--name/snapshots/<hash>) the folder
    two levels above the snapshot is used, with the ``models--`` prefix
    stripped; otherwise the directory's own basename is returned.
    """
    name = os.path.basename(path)
    if "snapshots" in path:
        try:
            parent = os.path.dirname(os.path.dirname(path))
            name = os.path.basename(parent).replace("models--", "")
        except Exception:
            # FIX: was a bare `except:` which also swallowed SystemExit /
            # KeyboardInterrupt; narrowed so only ordinary errors fall back.
            pass
    return name


def check_models():
    """Audit every discovered model config against the Mistral Nemo baseline.

    Scans SEARCH_ROOTS for config.json files, prints a table of
    vocab_size / rope_theta / eos_token_id per model, flags any model whose
    vocab or RoPE theta deviates from BASELINE_VOCAB / BASELINE_ROPE (and
    notes multi-EOS configs), then prints a summary of suspects.

    Returns:
        None. Output is printed to stdout; unreadable configs are skipped.
    """
    paths = find_model_paths(SEARCH_ROOTS)

    if not paths:
        print("\n❌ No models found in the specified directories.")
        return

    print("\n" + "=" * 110)
    print(f"{'Model Name (Short)':<45} | {'Vocab':<8} | {'RoPE Theta':<12} | {'EOS ID':<8} | {'Status'}")
    print("=" * 110)

    suspects = []

    for path in paths:
        config_path = os.path.join(path, "config.json")

        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                cfg = json.load(f)
        except Exception as e:
            # Best-effort: a corrupt/unreadable config should not stop the audit.
            print(f"❌ Error reading {path}: {e}")
            continue

        # Extract Metadata (defaults chosen so missing keys read as mismatches)
        vocab_size = cfg.get("vocab_size", 0)
        rope_theta = cfg.get("rope_theta", 0.0)
        eos_id = cfg.get("eos_token_id", "N/A")
        arch = cfg.get("architectures", ["Unknown"])[0]

        # Clean up the name for display (handle HF cache paths)
        name = _short_name(path)

        # --- THE AUDIT LOGIC ---
        flags = []
        is_suspect = False

        # Check Vocab (the most likely cause of the EOS bug).
        # Mistral Nemo is 131072. Llama 3 is 128256. Old Mistral is 32000.
        if vocab_size != BASELINE_VOCAB:
            flags.append(f"VOCAB({vocab_size})")
            is_suspect = True

        # Check RoPE (Nemo is 1,000,000. Standard is 10,000)
        if float(rope_theta) != float(BASELINE_ROPE):
            flags.append(f"ROPE({int(rope_theta)})")
            is_suspect = True

        # Check EOS (Multi-EOS can confuse mergekit).
        # Deliberately does NOT set is_suspect: not always fatal, just noted.
        if isinstance(eos_id, list) and len(eos_id) > 1:
            flags.append("MULTI-EOS")

        status = "✅ OK" if not is_suspect else f"🚩 {' '.join(flags)}"

        # Print Row
        print(f"{name[:45]:<45} | {str(vocab_size):<8} | {str(rope_theta):<12} | {str(eos_id):<8} | {status}")

        if is_suspect:
            suspects.append((name, path, flags))

    print("\n" + "=" * 110)
    if suspects:
        print(f"🚨 DETECTED {len(suspects)} POTENTIALLY INCOMPATIBLE MODELS:")
        print("These models do not match the Mistral Nemo baseline (Vocab 131k, RoPE 1M).")
        print("Including them in the merge is likely causing the 'One Sentence' bug.\n")
        for s_name, s_path, s_flags in suspects:
            print(f"❌ {s_name}")
            print(f"   Path: {s_path}")
            print(f"   Issues: {', '.join(s_flags)}\n")
    else:
        print("✅ All scanned models match the Mistral Nemo baseline specs.")
+
# Script entry point: run the audit only when executed directly, not on import.
if __name__ == "__main__":
    check_models()