"""Evaluate fine-tuned model on standard LLM benchmarks. This script runs as a Hugging Face Job to evaluate the model on standard benchmarks (MMLU, HellaSwag, ARC, etc.) using lm-evaluation-harness. """ # /// script # requires-python = ">=3.11" # dependencies = [ # "lm-eval>=0.4.0", # "transformers>=4.40.0", # "torch>=2.0.0", # "peft>=0.7.0", # "huggingface-hub>=0.20.0", # "accelerate>=0.20.0", # "protobuf>=3.20.0", # "sentencepiece>=0.1.99", # ] # /// import json import os import subprocess from datetime import datetime from pathlib import Path from huggingface_hub import HfApi def run_benchmarks(model_id: str, output_dir: str, use_adapter: bool = False, base_model: str = None): """Run standard benchmarks using lm-eval.""" # Define benchmark tasks tasks = [ "mmlu", # General knowledge "hellaswag", # Common sense reasoning "arc_challenge", # Science reasoning "truthfulqa_mc2", # Truthfulness "gsm8k", # Math reasoning "winogrande", # Pronoun resolution ] # Build command cmd = [ "lm_eval", "--model", "hf", "--tasks", ",".join(tasks), "--device", "cuda:0", "--batch_size", "8", "--output_path", output_dir, "--log_samples" ] # Add model args if use_adapter and base_model: model_args = f"pretrained={base_model},peft={model_id},dtype=float16" else: model_args = f"pretrained={model_id},dtype=float16" cmd.extend(["--model_args", model_args]) print(f"\nRunning benchmarks on: {model_id}") print(f"Tasks: {', '.join(tasks)}") print(f"Output: {output_dir}\n") print("Command:", " ".join(cmd), "\n") # Run benchmarks try: result = subprocess.run(cmd, check=True, capture_output=True, text=True) print(result.stdout) if result.stderr: print("STDERR:", result.stderr) return True except subprocess.CalledProcessError as e: print(f"✗ Benchmark failed: {e}") print("STDOUT:", e.stdout) print("STDERR:", e.stderr) return False def extract_results(results_dir: Path) -> dict: """Extract results from lm-eval output.""" results_file = results_dir / "results.json" if not results_file.exists(): print(f"⚠️ Results file not found: {results_file}") return {} with open(results_file, 'r') as f: data = json.load(f) # Extract key metrics results = data.get("results", {}) summary = {} for task, metrics in results.items(): # Get the main accuracy metric (varies by task) if "acc,none" in metrics: summary[task] = metrics["acc,none"] elif "acc_norm,none" in metrics: summary[task] = metrics["acc_norm,none"] elif "exact_match,none" in metrics: summary[task] = metrics["exact_match,none"] else: # Take first available metric summary[task] = list(metrics.values())[0] if metrics else 0 return summary def main(): """Run standard benchmark evaluation.""" print("=" * 70) print("NATO Doctrine Model - Standard LLM Benchmarks") print("=" * 70) # Configuration adapter_model = "AndreasThinks/mistral-7b-nato-doctrine" base_model = "mistralai/Mistral-7B-Instruct-v0.3" # Create output directories results_dir = Path("benchmark_results") results_dir.mkdir(exist_ok=True) base_output = results_dir / "base_model" ft_output = results_dir / "finetuned_model" # Run benchmarks on base model print("\n[1/2] Running benchmarks on BASE model...") print("=" * 70) base_success = run_benchmarks( model_id=base_model, output_dir=str(base_output), use_adapter=False ) # Run benchmarks on fine-tuned model print("\n[2/2] Running benchmarks on FINE-TUNED model...") print("=" * 70) ft_success = run_benchmarks( model_id=adapter_model, output_dir=str(ft_output), use_adapter=True, base_model=base_model ) # Extract and compare results if base_success and ft_success: print("\n" + "=" * 70) print("BENCHMARK COMPARISON") print("=" * 70) base_results = extract_results(base_output) ft_results = extract_results(ft_output) print(f"\n{'Benchmark':<20} {'Base':<12} {'Fine-tuned':<12} {'Change':<12} {'Status'}") print("-" * 70) comparison = {} for task in base_results: if task in ft_results: base_score = base_results[task] * 100 ft_score = ft_results[task] * 100 delta = ft_score - base_score delta_pct = (delta / base_score * 100) if base_score > 0 else 0 # Status indicator if abs(delta_pct) < 5: status = "✅" elif abs(delta_pct) < 15: status = "⚠️" else: status = "❌" print(f"{task:<20} {base_score:>10.2f}% {ft_score:>11.2f}% {delta_pct:>+10.1f}% {status}") comparison[task] = { "base_score": round(base_score, 2), "finetuned_score": round(ft_score, 2), "delta": round(delta, 2), "delta_percent": round(delta_pct, 2) } print("\n" + "=" * 70) print("Legend: ✅ <5% change | ⚠️ 5-15% change | ❌ >15% change") print("=" * 70) # Save comparison comparison_data = { "model": adapter_model, "base_model": base_model, "evaluation_date": datetime.now().isoformat(), "benchmarks": comparison, "base_results": base_results, "finetuned_results": ft_results } comparison_file = results_dir / "benchmark_comparison.json" with open(comparison_file, 'w') as f: json.dump(comparison_data, f, indent=2) print(f"\nComparison saved to: {comparison_file}") # Upload results to Hub token = os.environ.get("HF_TOKEN") if token: print("\nUploading results to Hub...") try: api = HfApi(token=token) api.upload_file( path_or_fileobj=str(comparison_file), path_in_repo=f"results/standard_benchmarks_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json", repo_id=adapter_model, repo_type="model" ) print("✅ Results uploaded to model repository") except Exception as e: print(f"⚠️ Could not upload results: {e}") print("\n✅ Standard benchmark evaluation complete!") if __name__ == "__main__": main()