"""Evaluate fine-tuned model on standard LLM benchmarks.

This script runs as a Hugging Face Job to evaluate the model on standard
benchmarks (MMLU, HellaSwag, ARC, etc.) using lm-evaluation-harness.
"""
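
# This script is meant to run as a GPU-backed Hugging Face Job; launching it might look
# roughly like the sketch below (the CLI subcommand, flavor name, and script filename are
# assumptions -- check the HF Jobs documentation for the exact invocation):
#
#     hf jobs uv run --flavor a10g-large evaluate_standard_benchmarks.py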

import json
import os
import subprocess
from datetime import datetime
from pathlib import Path
from typing import Optional

from huggingface_hub import HfApi
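
# NOTE: the job environment is assumed to provide the `lm_eval` CLI from
# lm-evaluation-harness plus `peft` (e.g. `pip install lm-eval peft`) and a visible
# CUDA device, since the harness is invoked below with --device cuda:0.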


def run_benchmarks(model_id: str, output_dir: str, use_adapter: bool = False, base_model: Optional[str] = None) -> bool:
    """Run standard benchmarks using lm-eval."""
    # These tasks match the original Open LLM Leaderboard selection and give a broad
    # check for regressions in general capability after fine-tuning.
    tasks = [
        "mmlu",
        "hellaswag",
        "arc_challenge",
        "truthfulqa_mc2",
        "gsm8k",
        "winogrande",
    ]

    # Base lm-eval invocation; --model_args is appended below once we know whether
    # we are evaluating a full model or a PEFT adapter.
    cmd = [
        "lm_eval",
        "--model", "hf",
        "--tasks", ",".join(tasks),
        "--device", "cuda:0",
        "--batch_size", "8",
        "--output_path", output_dir,
        "--log_samples",
    ]

    # A PEFT adapter cannot be evaluated on its own: load the base weights and apply
    # the adapter on top. Otherwise point lm-eval at the model repository directly.
    if use_adapter and base_model:
        model_args = f"pretrained={base_model},peft={model_id},dtype=float16"
    else:
        model_args = f"pretrained={model_id},dtype=float16"

    cmd.extend(["--model_args", model_args])
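
    # For reference (not executed here), the adapter branch above is roughly what
    # lm-eval does internally via transformers + peft. A minimal sketch, assuming
    # the peft package is installed:
    #
    #     import torch
    #     from transformers import AutoModelForCausalLM
    #     from peft import PeftModel
    #     base = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.float16)
    #     model = PeftModel.from_pretrained(base, model_id)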

    print(f"\nRunning benchmarks on: {model_id}")
    print(f"Tasks: {', '.join(tasks)}")
    print(f"Output: {output_dir}\n")
    print("Command:", " ".join(cmd), "\n")

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
        print(result.stdout)
        if result.stderr:
            print("STDERR:", result.stderr)
        return True
    except subprocess.CalledProcessError as e:
        print(f"✗ Benchmark failed: {e}")
        print("STDOUT:", e.stdout)
        print("STDERR:", e.stderr)
        return False


def extract_results(results_dir: Path) -> dict:
    """Extract results from lm-eval output."""
    results_file = results_dir / "results.json"

    # Depending on the lm-eval version, results are written either as results.json directly
    # in output_path or as a timestamped results_*.json inside a per-model subdirectory;
    # fall back to the latest match if the flat path does not exist.
    if not results_file.exists():
        candidates = sorted(results_dir.rglob("results*.json"))
        if not candidates:
            print(f"⚠️ Results file not found: {results_file}")
            return {}
        results_file = candidates[-1]  # timestamped names sort chronologically

    with open(results_file, 'r') as f:
        data = json.load(f)
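
    # Typical shape of the loaded JSON (exact fields vary by lm-eval version):
    #   {"results": {"hellaswag": {"acc,none": <float>, "acc_norm,none": <float>,
    #                              "alias": "hellaswag", ...}, ...}, ...}
    # Metric keys follow lm-eval's "<metric>,<filter>" naming, hence the ",none" suffixes.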

    results = data.get("results", {})
    summary = {}

    for task, metrics in results.items():
        # Prefer plain accuracy, then normalised accuracy, then exact match (used by gsm8k).
        if "acc,none" in metrics:
            summary[task] = metrics["acc,none"]
        elif "acc_norm,none" in metrics:
            summary[task] = metrics["acc_norm,none"]
        elif "exact_match,none" in metrics:
            summary[task] = metrics["exact_match,none"]
        else:
            # Fall back to the first numeric value; entries such as "alias" are strings.
            numeric = [v for v in metrics.values() if isinstance(v, (int, float))]
            summary[task] = numeric[0] if numeric else 0

    return summary


def main():
    """Run standard benchmark evaluation."""
    print("=" * 70)
    print("NATO Doctrine Model - Standard LLM Benchmarks")
    print("=" * 70)

    # Adapter repository produced by fine-tuning, and the base model it was trained from.
    adapter_model = "AndreasThinks/mistral-7b-nato-doctrine"
    base_model = "mistralai/Mistral-7B-Instruct-v0.3"

    results_dir = Path("benchmark_results")
    results_dir.mkdir(exist_ok=True)

    base_output = results_dir / "base_model"
    ft_output = results_dir / "finetuned_model"
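
    # Layout produced under benchmark_results/:
    #   base_model/                 lm-eval output for the untouched base model
    #   finetuned_model/            lm-eval output for the adapter applied to the base
    #   benchmark_comparison.json   side-by-side summary written at the end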

    # Evaluate the untouched base model first to establish a reference point.
    print("\n[1/2] Running benchmarks on BASE model...")
    print("=" * 70)
    base_success = run_benchmarks(
        model_id=base_model,
        output_dir=str(base_output),
        use_adapter=False,
    )

    # Then evaluate the fine-tuned adapter applied on top of the same base model.
    print("\n[2/2] Running benchmarks on FINE-TUNED model...")
    print("=" * 70)
    ft_success = run_benchmarks(
        model_id=adapter_model,
        output_dir=str(ft_output),
        use_adapter=True,
        base_model=base_model,
    )

    # Build the side-by-side comparison only if both evaluation runs succeeded.
    if base_success and ft_success:
        print("\n" + "=" * 70)
        print("BENCHMARK COMPARISON")
        print("=" * 70)

        base_results = extract_results(base_output)
        ft_results = extract_results(ft_output)

        print(f"\n{'Benchmark':<20} {'Base':<12} {'Fine-tuned':<12} {'Change':<12} {'Status'}")
        print("-" * 70)

        comparison = {}
        for task in base_results:
            if task in ft_results:
                base_score = base_results[task] * 100
                ft_score = ft_results[task] * 100
                delta = ft_score - base_score  # absolute change, in percentage points
                delta_pct = (delta / base_score * 100) if base_score > 0 else 0  # relative change

                # Flag relative changes that may indicate capability drift from fine-tuning.
                if abs(delta_pct) < 5:
                    status = "✅"
                elif abs(delta_pct) < 15:
                    status = "⚠️"
                else:
                    status = "❌"

                print(f"{task:<20} {base_score:>10.2f}% {ft_score:>11.2f}% {delta_pct:>+10.1f}% {status}")

                comparison[task] = {
                    "base_score": round(base_score, 2),
                    "finetuned_score": round(ft_score, 2),
                    "delta": round(delta, 2),
                    "delta_percent": round(delta_pct, 2),
                }
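
        # Example: a task at 60.00% on the base model and 54.00% after fine-tuning gives
        # delta = -6.00 points and delta_pct = -10.0%, which lands in the ⚠️ band.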

        print("\n" + "=" * 70)
        print("Legend (relative change vs base): ✅ <5% | ⚠️ 5-15% | ❌ >15%")
        print("=" * 70)

        # Persist the full comparison alongside the raw per-task numbers.
        comparison_data = {
            "model": adapter_model,
            "base_model": base_model,
            "evaluation_date": datetime.now().isoformat(),
            "benchmarks": comparison,
            "base_results": base_results,
            "finetuned_results": ft_results,
        }

        comparison_file = results_dir / "benchmark_comparison.json"
        with open(comparison_file, 'w') as f:
            json.dump(comparison_data, f, indent=2)

        print(f"\nComparison saved to: {comparison_file}")

        # Optionally attach the comparison JSON to the adapter's model repository on the Hub.
        token = os.environ.get("HF_TOKEN")
        if token:
            print("\nUploading results to Hub...")
            try:
                api = HfApi(token=token)
                api.upload_file(
                    path_or_fileobj=str(comparison_file),
                    path_in_repo=f"results/standard_benchmarks_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                    repo_id=adapter_model,
                    repo_type="model",
                )
                print("✅ Results uploaded to model repository")
            except Exception as e:
                print(f"⚠️ Could not upload results: {e}")

    print("\n✅ Standard benchmark evaluation complete!")


if __name__ == "__main__":
    main()