"""Evaluate fine-tuned model on standard LLM benchmarks.
This script runs as a Hugging Face Job to evaluate the model on standard
benchmarks (MMLU, HellaSwag, ARC, etc.) using lm-evaluation-harness.
"""
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "lm-eval>=0.4.0",
# "transformers>=4.40.0",
# "torch>=2.0.0",
# "peft>=0.7.0",
# "huggingface-hub>=0.20.0",
# "accelerate>=0.20.0",
# "protobuf>=3.20.0",
# "sentencepiece>=0.1.99",
# ]
# ///
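# The block above is PEP 723 inline script metadata: a PEP 723-aware runner
# such as `uv run eval_standard_benchmarks.py` can resolve these dependencies
# into an ephemeral environment (assumes such a runner is available on the
# Job image; otherwise install the packages manually).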
import json
import os
import subprocess
from datetime import datetime
from pathlib import Path
from huggingface_hub import HfApi
def run_benchmarks(model_id: str, output_dir: str, use_adapter: bool = False, base_model: str | None = None):
"""Run standard benchmarks using lm-eval."""
# Define benchmark tasks
tasks = [
"mmlu", # General knowledge
"hellaswag", # Common sense reasoning
"arc_challenge", # Science reasoning
"truthfulqa_mc2", # Truthfulness
"gsm8k", # Math reasoning
"winogrande", # Pronoun resolution
]
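    # These are lm-evaluation-harness task names; `lm_eval --tasks list` shows
    # what is registered in the installed version (names occasionally change
    # between releases).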
# Build command
cmd = [
"lm_eval",
"--model", "hf",
"--tasks", ",".join(tasks),
"--device", "cuda:0",
"--batch_size", "8",
"--output_path", output_dir,
"--log_samples"
]
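    # --log_samples stores per-example predictions alongside the aggregate
    # scores; "--batch_size", "auto" could be used instead to let lm-eval pick
    # the largest batch that fits in memory.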
# Add model args
if use_adapter and base_model:
model_args = f"pretrained={base_model},peft={model_id},dtype=float16"
else:
model_args = f"pretrained={model_id},dtype=float16"
cmd.extend(["--model_args", model_args])
print(f"\nRunning benchmarks on: {model_id}")
print(f"Tasks: {', '.join(tasks)}")
print(f"Output: {output_dir}\n")
print("Command:", " ".join(cmd), "\n")
# Run benchmarks
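    # Note: capture_output buffers lm-eval's progress output until the process
    # exits, so nothing is streamed to the Job log while the run is in flight.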
try:
result = subprocess.run(cmd, check=True, capture_output=True, text=True)
print(result.stdout)
if result.stderr:
print("STDERR:", result.stderr)
return True
except subprocess.CalledProcessError as e:
print(f"✗ Benchmark failed: {e}")
print("STDOUT:", e.stdout)
print("STDERR:", e.stderr)
return False
def extract_results(results_dir: Path) -> dict:
    """Extract results from lm-eval output."""
    # Depending on the lm-eval version, results land either in results.json
    # directly or in a timestamped results_*.json under a per-model subfolder.
    results_file = results_dir / "results.json"
    if not results_file.exists():
        candidates = sorted(results_dir.rglob("results*.json"))
        if not candidates:
            print(f"⚠️ No results file found under: {results_dir}")
            return {}
        results_file = candidates[-1]
    with open(results_file, 'r') as f:
        data = json.load(f)
# Extract key metrics
results = data.get("results", {})
summary = {}
for task, metrics in results.items():
# Get the main accuracy metric (varies by task)
if "acc,none" in metrics:
summary[task] = metrics["acc,none"]
elif "acc_norm,none" in metrics:
summary[task] = metrics["acc_norm,none"]
elif "exact_match,none" in metrics:
summary[task] = metrics["exact_match,none"]
        else:
            # Fall back to the first numeric metric; skip non-numeric entries
            # such as task aliases.
            numeric_values = [v for v in metrics.values() if isinstance(v, (int, float))]
            summary[task] = numeric_values[0] if numeric_values else 0
return summary
def main():
"""Run standard benchmark evaluation."""
print("=" * 70)
print("NATO Doctrine Model - Standard LLM Benchmarks")
print("=" * 70)
# Configuration
adapter_model = "AndreasThinks/mistral-7b-nato-doctrine"
base_model = "mistralai/Mistral-7B-Instruct-v0.3"
# Create output directories
results_dir = Path("benchmark_results")
results_dir.mkdir(exist_ok=True)
base_output = results_dir / "base_model"
ft_output = results_dir / "finetuned_model"
# Run benchmarks on base model
print("\n[1/2] Running benchmarks on BASE model...")
print("=" * 70)
base_success = run_benchmarks(
model_id=base_model,
output_dir=str(base_output),
use_adapter=False
)
# Run benchmarks on fine-tuned model
print("\n[2/2] Running benchmarks on FINE-TUNED model...")
print("=" * 70)
ft_success = run_benchmarks(
model_id=adapter_model,
output_dir=str(ft_output),
use_adapter=True,
base_model=base_model
)
# Extract and compare results
if base_success and ft_success:
print("\n" + "=" * 70)
print("BENCHMARK COMPARISON")
print("=" * 70)
base_results = extract_results(base_output)
ft_results = extract_results(ft_output)
print(f"\n{'Benchmark':<20} {'Base':<12} {'Fine-tuned':<12} {'Change':<12} {'Status'}")
print("-" * 70)
comparison = {}
for task in base_results:
if task in ft_results:
base_score = base_results[task] * 100
ft_score = ft_results[task] * 100
delta = ft_score - base_score
delta_pct = (delta / base_score * 100) if base_score > 0 else 0
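                # delta is the absolute change in percentage points; delta_pct
                # is the relative change, which drives the status thresholds below.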
# Status indicator
if abs(delta_pct) < 5:
status = "✅"
elif abs(delta_pct) < 15:
status = "⚠️"
else:
status = "❌"
print(f"{task:<20} {base_score:>10.2f}% {ft_score:>11.2f}% {delta_pct:>+10.1f}% {status}")
comparison[task] = {
"base_score": round(base_score, 2),
"finetuned_score": round(ft_score, 2),
"delta": round(delta, 2),
"delta_percent": round(delta_pct, 2)
}
print("\n" + "=" * 70)
print("Legend: ✅ <5% change | ⚠️ 5-15% change | ❌ >15% change")
print("=" * 70)
# Save comparison
comparison_data = {
"model": adapter_model,
"base_model": base_model,
"evaluation_date": datetime.now().isoformat(),
"benchmarks": comparison,
"base_results": base_results,
"finetuned_results": ft_results
}
comparison_file = results_dir / "benchmark_comparison.json"
with open(comparison_file, 'w') as f:
json.dump(comparison_data, f, indent=2)
print(f"\nComparison saved to: {comparison_file}")
# Upload results to Hub
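        # Upload requires a token with write access to the adapter repo.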
token = os.environ.get("HF_TOKEN")
if token:
print("\nUploading results to Hub...")
try:
api = HfApi(token=token)
api.upload_file(
path_or_fileobj=str(comparison_file),
path_in_repo=f"results/standard_benchmarks_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
repo_id=adapter_model,
repo_type="model"
)
print("✅ Results uploaded to model repository")
except Exception as e:
print(f"⚠️ Could not upload results: {e}")
print("\n✅ Standard benchmark evaluation complete!")
if __name__ == "__main__":
main()