"""Evaluate fine-tuned model on standard LLM benchmarks.
This script runs as a Hugging Face Job to evaluate the model on standard
benchmarks (MMLU, HellaSwag, ARC, etc.) using lm-evaluation-harness.
"""
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "lm-eval>=0.4.0",
# "transformers>=4.40.0",
# "torch>=2.0.0",
# "peft>=0.7.0",
# "huggingface-hub>=0.20.0",
# "accelerate>=0.20.0",
# "protobuf>=3.20.0",
# "sentencepiece>=0.1.99",
# ]
# ///
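# The block above is PEP 723 inline script metadata: a PEP 723-aware runner
# such as `uv run eval_standard_benchmarks.py` can resolve these dependencies
# into an ephemeral environment (assumes such a runner is available on the
# Job image; otherwise install the packages manually).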
import json
import os
import subprocess
from datetime import datetime
from pathlib import Path
from huggingface_hub import HfApi
def run_benchmarks(model_id: str, output_dir: str, use_adapter: bool = False, base_model: str | None = None):
"""Run standard benchmarks using lm-eval."""
# Define benchmark tasks
tasks = [
"mmlu", # General knowledge
"hellaswag", # Common sense reasoning
"arc_challenge", # Science reasoning
"truthfulqa_mc2", # Truthfulness
"gsm8k", # Math reasoning
"winogrande", # Pronoun resolution
]
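    # These are lm-evaluation-harness task names; `lm_eval --tasks list` shows
    # what is registered in the installed version (names occasionally change
    # between releases).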
# Build command
cmd = [
"lm_eval",
"--model", "hf",
"--tasks", ",".join(tasks),
"--device", "cuda:0",
"--batch_size", "8",
"--output_path", output_dir,
"--log_samples"
]
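    # --log_samples stores per-example predictions alongside the aggregate
    # scores; "--batch_size", "auto" could be used instead to let lm-eval pick
    # the largest batch that fits in memory.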
# Add model args
if use_adapter and base_model:
model_args = f"pretrained={base_model},peft={model_id},dtype=float16"
else:
model_args = f"pretrained={model_id},dtype=float16"
cmd.extend(["--model_args", model_args])
print(f"\nRunning benchmarks on: {model_id}")
print(f"Tasks: {', '.join(tasks)}")
print(f"Output: {output_dir}\n")
print("Command:", " ".join(cmd), "\n")
# Run benchmarks
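    # Note: capture_output buffers lm-eval's progress output until the process
    # exits, so nothing is streamed to the Job log while the run is in flight.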
try:
result = subprocess.run(cmd, check=True, capture_output=True, text=True)
print(result.stdout)
if result.stderr:
print("STDERR:", result.stderr)
return True
except subprocess.CalledProcessError as e:
print(f"✗ Benchmark failed: {e}")
print("STDOUT:", e.stdout)
print("STDERR:", e.stderr)
return False
def extract_results(results_dir: Path) -> dict:
    """Extract results from lm-eval output."""
    # Depending on the lm-eval version, results land either in results.json
    # directly or in a timestamped results_*.json under a per-model subfolder.
    results_file = results_dir / "results.json"
    if not results_file.exists():
        candidates = sorted(results_dir.rglob("results*.json"))
        if not candidates:
            print(f"⚠️ No results file found under: {results_dir}")
            return {}
        results_file = candidates[-1]
    with open(results_file, 'r') as f:
        data = json.load(f)
# Extract key metrics
results = data.get("results", {})
summary = {}
for task, metrics in results.items():
# Get the main accuracy metric (varies by task)
if "acc,none" in metrics:
summary[task] = metrics["acc,none"]
elif "acc_norm,none" in metrics:
summary[task] = metrics["acc_norm,none"]
elif "exact_match,none" in metrics:
summary[task] = metrics["exact_match,none"]
        else:
            # Fall back to the first numeric metric; skip non-numeric entries
            # such as task aliases.
            numeric_values = [v for v in metrics.values() if isinstance(v, (int, float))]
            summary[task] = numeric_values[0] if numeric_values else 0
return summary
def main():
"""Run standard benchmark evaluation."""
print("=" * 70)
print("NATO Doctrine Model - Standard LLM Benchmarks")
print("=" * 70)
# Configuration
adapter_model = "AndreasThinks/mistral-7b-nato-doctrine"
base_model = "mistralai/Mistral-7B-Instruct-v0.3"
# Create output directories
results_dir = Path("benchmark_results")
results_dir.mkdir(exist_ok=True)
base_output = results_dir / "base_model"
ft_output = results_dir / "finetuned_model"
# Run benchmarks on base model
print("\n[1/2] Running benchmarks on BASE model...")
print("=" * 70)
base_success = run_benchmarks(
model_id=base_model,
output_dir=str(base_output),
use_adapter=False
)
# Run benchmarks on fine-tuned model
print("\n[2/2] Running benchmarks on FINE-TUNED model...")
print("=" * 70)
ft_success = run_benchmarks(
model_id=adapter_model,
output_dir=str(ft_output),
use_adapter=True,
base_model=base_model
)
# Extract and compare results
if base_success and ft_success:
print("\n" + "=" * 70)
print("BENCHMARK COMPARISON")
print("=" * 70)
base_results = extract_results(base_output)
ft_results = extract_results(ft_output)
print(f"\n{'Benchmark':<20} {'Base':<12} {'Fine-tuned':<12} {'Change':<12} {'Status'}")
print("-" * 70)
comparison = {}
for task in base_results:
if task in ft_results:
base_score = base_results[task] * 100
ft_score = ft_results[task] * 100
delta = ft_score - base_score
delta_pct = (delta / base_score * 100) if base_score > 0 else 0
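                # delta is the absolute change in percentage points; delta_pct
                # is the relative change, which drives the status thresholds below.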
# Status indicator
if abs(delta_pct) < 5:
status = "✅"
elif abs(delta_pct) < 15:
status = "⚠️"
else:
status = "❌"
print(f"{task:<20} {base_score:>10.2f}% {ft_score:>11.2f}% {delta_pct:>+10.1f}% {status}")
comparison[task] = {
"base_score": round(base_score, 2),
"finetuned_score": round(ft_score, 2),
"delta": round(delta, 2),
"delta_percent": round(delta_pct, 2)
}
print("\n" + "=" * 70)
print("Legend: ✅ <5% change | ⚠️ 5-15% change | ❌ >15% change")
print("=" * 70)
# Save comparison
comparison_data = {
"model": adapter_model,
"base_model": base_model,
"evaluation_date": datetime.now().isoformat(),
"benchmarks": comparison,
"base_results": base_results,
"finetuned_results": ft_results
}
comparison_file = results_dir / "benchmark_comparison.json"
with open(comparison_file, 'w') as f:
json.dump(comparison_data, f, indent=2)
print(f"\nComparison saved to: {comparison_file}")
# Upload results to Hub
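        # Upload requires a token with write access to the adapter repo.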
token = os.environ.get("HF_TOKEN")
if token:
print("\nUploading results to Hub...")
try:
api = HfApi(token=token)
api.upload_file(
path_or_fileobj=str(comparison_file),
path_in_repo=f"results/standard_benchmarks_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
repo_id=adapter_model,
repo_type="model"
)
print("✅ Results uploaded to model repository")
except Exception as e:
print(f"⚠️ Could not upload results: {e}")
print("\n✅ Standard benchmark evaluation complete!")
if __name__ == "__main__":
main()