Upload eval_standard_benchmarks.py with huggingface_hub
eval_standard_benchmarks.py (ADDED, +215 -0)
@@ -0,0 +1,215 @@
"""Evaluate fine-tuned model on standard LLM benchmarks.

This script runs as a Hugging Face Job to evaluate the model on standard
benchmarks (MMLU, HellaSwag, ARC, etc.) using lm-evaluation-harness.
"""
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "lm-eval>=0.4.0",
#     "transformers>=4.40.0",
#     "torch>=2.0.0",
#     "peft>=0.7.0",
#     "huggingface-hub>=0.20.0",
#     "accelerate>=0.20.0",
# ]
# ///

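# The "# /// script" block above is PEP 723 inline metadata: a PEP 723-aware
# runner such as `uv run eval_standard_benchmarks.py` resolves these
# dependencies automatically before executing the script.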
import json
import os
import subprocess
from datetime import datetime
from pathlib import Path

from huggingface_hub import HfApi


def run_benchmarks(model_id: str, output_dir: str, use_adapter: bool = False, base_model: str | None = None) -> bool:
    """Run standard benchmarks using lm-eval."""
    # Define benchmark tasks
    tasks = [
        "mmlu",            # General knowledge
        "hellaswag",       # Common sense reasoning
        "arc_challenge",   # Science reasoning
        "truthfulqa_mc2",  # Truthfulness
        "gsm8k",           # Math reasoning
        "winogrande",      # Pronoun resolution
    ]

    # Build command
    cmd = [
        "lm_eval",
        "--model", "hf",
        "--tasks", ",".join(tasks),
        "--device", "cuda:0",
        "--batch_size", "8",
        "--output_path", output_dir,
        "--log_samples",
    ]

    # Add model args: evaluate either a LoRA adapter on top of its base
    # model, or a standalone model, via lm-eval's model_args string.
    if use_adapter and base_model:
        model_args = f"pretrained={base_model},peft={model_id},dtype=float16"
    else:
        model_args = f"pretrained={model_id},dtype=float16"

    cmd.extend(["--model_args", model_args])

    print(f"\nRunning benchmarks on: {model_id}")
    print(f"Tasks: {', '.join(tasks)}")
    print(f"Output: {output_dir}\n")
    print("Command:", " ".join(cmd), "\n")

    # Run benchmarks as a subprocess and surface its output
    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
        print(result.stdout)
        if result.stderr:
            print("STDERR:", result.stderr)
        return True
    except subprocess.CalledProcessError as e:
        print(f"✗ Benchmark failed: {e}")
        print("STDOUT:", e.stdout)
        print("STDERR:", e.stderr)
        return False

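# For the adapter case, the assembled command above is equivalent to running:
#   lm_eval --model hf \
#     --model_args pretrained=<base_model>,peft=<adapter_repo>,dtype=float16 \
#     --tasks mmlu,hellaswag,arc_challenge,truthfulqa_mc2,gsm8k,winogrande \
#     --device cuda:0 --batch_size 8 --output_path <output_dir> --log_samples
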
def extract_results(results_dir: Path) -> dict:
    """Extract per-task scores from lm-eval output."""
    # Depending on the lm-eval version, results are written either to
    # <output_dir>/results.json or nested under a sanitized model-name
    # subdirectory as results_<timestamp>.json, so search recursively
    # and take the most recently written file.
    candidates = list(results_dir.rglob("results*.json"))
    if not candidates:
        print(f"⚠️ No results file found under: {results_dir}")
        return {}
    results_file = max(candidates, key=lambda p: p.stat().st_mtime)

    with open(results_file, "r") as f:
        data = json.load(f)

    # Extract key metrics
    results = data.get("results", {})
    summary = {}

    for task, metrics in results.items():
        # Get the main accuracy metric (varies by task)
        if "acc,none" in metrics:
            summary[task] = metrics["acc,none"]
        elif "acc_norm,none" in metrics:
            summary[task] = metrics["acc_norm,none"]
        elif "exact_match,none" in metrics:
            summary[task] = metrics["exact_match,none"]
        else:
            # Fall back to the first numeric metric; non-numeric entries
            # (such as the task's "alias" string) are skipped.
            numeric = [v for v in metrics.values() if isinstance(v, (int, float))]
            summary[task] = numeric[0] if numeric else 0.0

    return summary

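# For reference, the slice of the results file this parses looks roughly like
# the following (lm-eval >= 0.4; the ",none" suffix is the metric's filter name):
#   {"results": {"hellaswag": {"acc,none": 0.57, "acc_norm,none": 0.76, ...},
#                "gsm8k":     {"exact_match,none": 0.41, ...}, ...}}
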
def main():
    """Run standard benchmark evaluation."""
    print("=" * 70)
    print("NATO Doctrine Model - Standard LLM Benchmarks")
    print("=" * 70)

    # Configuration
    adapter_model = "AndreasThinks/mistral-7b-nato-doctrine"
    base_model = "mistralai/Mistral-7B-Instruct-v0.3"

    # Create output directories
    results_dir = Path("benchmark_results")
    results_dir.mkdir(exist_ok=True)

    base_output = results_dir / "base_model"
    ft_output = results_dir / "finetuned_model"

    # Run benchmarks on base model
    print("\n[1/2] Running benchmarks on BASE model...")
    print("=" * 70)
    base_success = run_benchmarks(
        model_id=base_model,
        output_dir=str(base_output),
        use_adapter=False,
    )

    # Run benchmarks on fine-tuned model
    print("\n[2/2] Running benchmarks on FINE-TUNED model...")
    print("=" * 70)
    ft_success = run_benchmarks(
        model_id=adapter_model,
        output_dir=str(ft_output),
        use_adapter=True,
        base_model=base_model,
    )

    # Extract and compare results
    if base_success and ft_success:
        print("\n" + "=" * 70)
        print("BENCHMARK COMPARISON")
        print("=" * 70)

        base_results = extract_results(base_output)
        ft_results = extract_results(ft_output)

        print(f"\n{'Benchmark':<20} {'Base':<12} {'Fine-tuned':<12} {'Change':<12} {'Status'}")
        print("-" * 70)

        comparison = {}
        for task in base_results:
            if task in ft_results:
                base_score = base_results[task] * 100
                ft_score = ft_results[task] * 100
                delta = ft_score - base_score
                delta_pct = (delta / base_score * 100) if base_score > 0 else 0

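                # Worked example of this computation (illustrative numbers,
                # not real results): base 62.00, fine-tuned 59.50 gives
                # delta = -2.50 points and
                # delta_pct = -2.50 / 62.00 * 100 = -4.03% relative change,
                # which falls in the "✅ <5%" band below.
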
                # Status indicator: flag large relative changes vs the base
                if abs(delta_pct) < 5:
                    status = "✅"
                elif abs(delta_pct) < 15:
                    status = "⚠️"
                else:
                    status = "❌"

                print(f"{task:<20} {base_score:>10.2f}% {ft_score:>11.2f}% {delta_pct:>+10.1f}% {status}")

                comparison[task] = {
                    "base_score": round(base_score, 2),
                    "finetuned_score": round(ft_score, 2),
                    "delta": round(delta, 2),
                    "delta_percent": round(delta_pct, 2),
                }

        print("\n" + "=" * 70)
        print("Legend: ✅ <5% change | ⚠️ 5-15% change | ❌ >15% change")
        print("=" * 70)

        # Save comparison alongside the raw per-task scores
        comparison_data = {
            "model": adapter_model,
            "base_model": base_model,
            "evaluation_date": datetime.now().isoformat(),
            "benchmarks": comparison,
            "base_results": base_results,
            "finetuned_results": ft_results,
        }

        comparison_file = results_dir / "benchmark_comparison.json"
        with open(comparison_file, "w") as f:
            json.dump(comparison_data, f, indent=2)

        print(f"\nComparison saved to: {comparison_file}")

        # Upload results to Hub; HF_TOKEN is expected in the environment
        # (e.g. supplied as a secret when the Job is launched)
        token = os.environ.get("HF_TOKEN")
        if token:
            print("\nUploading results to Hub...")
            try:
                api = HfApi(token=token)
                api.upload_file(
                    path_or_fileobj=str(comparison_file),
                    path_in_repo=f"results/standard_benchmarks_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                    repo_id=adapter_model,
                    repo_type="model",
                )
                print("✅ Results uploaded to model repository")
            except Exception as e:
                print(f"⚠️ Could not upload results: {e}")

    print("\n✅ Standard benchmark evaluation complete!")


if __name__ == "__main__":
    main()
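
# To retrieve an uploaded comparison later (hypothetical filename shown;
# the actual name embeds the run's timestamp):
#   from huggingface_hub import hf_hub_download
#   path = hf_hub_download(
#       repo_id="AndreasThinks/mistral-7b-nato-doctrine",
#       filename="results/standard_benchmarks_<timestamp>.json",
#   )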