AndreasThinks committed on
Commit 1da5816 · verified · 1 Parent(s): 7b1d688

Upload eval_standard_benchmarks.py with huggingface_hub
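The commit message indicates the script was pushed with the huggingface_hub client. As a rough sketch only (not the author's actual upload call; the repo_id below is a placeholder, since the target repository is not shown on this page), such an upload typically looks like this:

    from huggingface_hub import HfApi

    api = HfApi()  # reads the token from HF_TOKEN or the cached login
    api.upload_file(
        path_or_fileobj="eval_standard_benchmarks.py",
        path_in_repo="eval_standard_benchmarks.py",
        repo_id="AndreasThinks/<target-repo>",  # placeholder: actual repo not shown in this commit view
        commit_message="Upload eval_standard_benchmarks.py with huggingface_hub",
    )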

Files changed (1)
  1. eval_standard_benchmarks.py +215 -0
eval_standard_benchmarks.py ADDED
@@ -0,0 +1,215 @@
+ """Evaluate fine-tuned model on standard LLM benchmarks.
+
+ This script runs as a Hugging Face Job to evaluate the model on standard
+ benchmarks (MMLU, HellaSwag, ARC, etc.) using lm-evaluation-harness.
+ """
+ # /// script
+ # requires-python = ">=3.11"
+ # dependencies = [
+ #     "lm-eval>=0.4.0",
+ #     "transformers>=4.40.0",
+ #     "torch>=2.0.0",
+ #     "peft>=0.7.0",
+ #     "huggingface-hub>=0.20.0",
+ #     "accelerate>=0.20.0",
+ # ]
+ # ///
+
+ import json
+ import os
+ import subprocess
+ from datetime import datetime
+ from pathlib import Path
+ from huggingface_hub import HfApi
+
+ def run_benchmarks(model_id: str, output_dir: str, use_adapter: bool = False, base_model: str | None = None):
+     """Run standard benchmarks using lm-eval."""
+     # Define benchmark tasks
+     tasks = [
+         "mmlu",            # General knowledge
+         "hellaswag",       # Common sense reasoning
+         "arc_challenge",   # Science reasoning
+         "truthfulqa_mc2",  # Truthfulness
+         "gsm8k",           # Math reasoning
+         "winogrande",      # Pronoun resolution
+     ]
+
+     # Build command
+     cmd = [
+         "lm_eval",
+         "--model", "hf",
+         "--tasks", ",".join(tasks),
+         "--device", "cuda:0",
+         "--batch_size", "8",
+         "--output_path", output_dir,
+         "--log_samples"
+     ]
+
+     # Add model args
+     if use_adapter and base_model:
+         model_args = f"pretrained={base_model},peft={model_id},dtype=float16"
+     else:
+         model_args = f"pretrained={model_id},dtype=float16"
+
+     cmd.extend(["--model_args", model_args])
+
+     print(f"\nRunning benchmarks on: {model_id}")
+     print(f"Tasks: {', '.join(tasks)}")
+     print(f"Output: {output_dir}\n")
+     print("Command:", " ".join(cmd), "\n")
+
+     # Run benchmarks
+     try:
+         result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+         print(result.stdout)
+         if result.stderr:
+             print("STDERR:", result.stderr)
+         return True
+     except subprocess.CalledProcessError as e:
+         print(f"✗ Benchmark failed: {e}")
+         print("STDOUT:", e.stdout)
+         print("STDERR:", e.stderr)
+         return False
+
+ def extract_results(results_dir: Path) -> dict:
+     """Extract results from lm-eval output."""
+     results_file = results_dir / "results.json"
+
+     if not results_file.exists():
+         print(f"⚠️ Results file not found: {results_file}")
+         return {}
+
+     with open(results_file, 'r') as f:
+         data = json.load(f)
+
+     # Extract key metrics
+     results = data.get("results", {})
+     summary = {}
+
+     for task, metrics in results.items():
+         # Get the main accuracy metric (varies by task)
+         if "acc,none" in metrics:
+             summary[task] = metrics["acc,none"]
+         elif "acc_norm,none" in metrics:
+             summary[task] = metrics["acc_norm,none"]
+         elif "exact_match,none" in metrics:
+             summary[task] = metrics["exact_match,none"]
+         else:
+             # Take first available metric
+             summary[task] = list(metrics.values())[0] if metrics else 0
+
+     return summary
+
+ def main():
+     """Run standard benchmark evaluation."""
+     print("=" * 70)
+     print("NATO Doctrine Model - Standard LLM Benchmarks")
+     print("=" * 70)
+
+     # Configuration
+     adapter_model = "AndreasThinks/mistral-7b-nato-doctrine"
+     base_model = "mistralai/Mistral-7B-Instruct-v0.3"
+
+     # Create output directories
+     results_dir = Path("benchmark_results")
+     results_dir.mkdir(exist_ok=True)
+
+     base_output = results_dir / "base_model"
+     ft_output = results_dir / "finetuned_model"
+
+     # Run benchmarks on base model
+     print("\n[1/2] Running benchmarks on BASE model...")
+     print("=" * 70)
+     base_success = run_benchmarks(
+         model_id=base_model,
+         output_dir=str(base_output),
+         use_adapter=False
+     )
+
+     # Run benchmarks on fine-tuned model
+     print("\n[2/2] Running benchmarks on FINE-TUNED model...")
+     print("=" * 70)
+     ft_success = run_benchmarks(
+         model_id=adapter_model,
+         output_dir=str(ft_output),
+         use_adapter=True,
+         base_model=base_model
+     )
+
+     # Extract and compare results
+     if base_success and ft_success:
+         print("\n" + "=" * 70)
+         print("BENCHMARK COMPARISON")
+         print("=" * 70)
+
+         base_results = extract_results(base_output)
+         ft_results = extract_results(ft_output)
+
+         print(f"\n{'Benchmark':<20} {'Base':<12} {'Fine-tuned':<12} {'Change':<12} {'Status'}")
+         print("-" * 70)
+
+         comparison = {}
+         for task in base_results:
+             if task in ft_results:
+                 base_score = base_results[task] * 100
+                 ft_score = ft_results[task] * 100
+                 delta = ft_score - base_score
+                 delta_pct = (delta / base_score * 100) if base_score > 0 else 0
+
+                 # Status indicator
+                 if abs(delta_pct) < 5:
+                     status = "✅"
+                 elif abs(delta_pct) < 15:
+                     status = "⚠️"
+                 else:
+                     status = "❌"
+
+                 print(f"{task:<20} {base_score:>10.2f}% {ft_score:>11.2f}% {delta_pct:>+10.1f}% {status}")
+
+                 comparison[task] = {
+                     "base_score": round(base_score, 2),
+                     "finetuned_score": round(ft_score, 2),
+                     "delta": round(delta, 2),
+                     "delta_percent": round(delta_pct, 2)
+                 }
+
+         print("\n" + "=" * 70)
+         print("Legend: ✅ <5% change | ⚠️ 5-15% change | ❌ >15% change")
+         print("=" * 70)
+
+         # Save comparison
+         comparison_data = {
+             "model": adapter_model,
+             "base_model": base_model,
+             "evaluation_date": datetime.now().isoformat(),
+             "benchmarks": comparison,
+             "base_results": base_results,
+             "finetuned_results": ft_results
+         }
+
+         comparison_file = results_dir / "benchmark_comparison.json"
+         with open(comparison_file, 'w') as f:
+             json.dump(comparison_data, f, indent=2)
+
+         print(f"\nComparison saved to: {comparison_file}")
+
+         # Upload results to Hub
+         token = os.environ.get("HF_TOKEN")
+         if token:
+             print("\nUploading results to Hub...")
+             try:
+                 api = HfApi(token=token)
+                 api.upload_file(
+                     path_or_fileobj=str(comparison_file),
+                     path_in_repo=f"results/standard_benchmarks_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
+                     repo_id=adapter_model,
+                     repo_type="model"
+                 )
+                 print("✅ Results uploaded to model repository")
+             except Exception as e:
+                 print(f"⚠️ Could not upload results: {e}")
+
+     print("\n✅ Standard benchmark evaluation complete!")
+
+ if __name__ == "__main__":
+     main()
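After a run completes, the comparison file written by this script (benchmark_results/benchmark_comparison.json) can be read back for further analysis. A minimal sketch, assuming the default output paths used above:

    import json
    from pathlib import Path

    # Load the comparison file produced by eval_standard_benchmarks.py
    data = json.loads((Path("benchmark_results") / "benchmark_comparison.json").read_text())

    # Print per-task base vs. fine-tuned scores and the relative change
    for task, scores in data["benchmarks"].items():
        print(f"{task}: base {scores['base_score']}%, "
              f"fine-tuned {scores['finetuned_score']}%, "
              f"change {scores['delta_percent']:+.1f}%")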