nato-llm-scripts / eval_nato_benchmark.py
"""Evaluate fine-tuned model on NATO-specific benchmark.
This script runs as a Hugging Face Job to evaluate the NATO doctrine model
on domain-specific questions and saves results to the Hub.
"""
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "transformers>=4.40.0",
# "torch>=2.0.0",
# "peft>=0.7.0",
# "datasets>=2.16.0",
# "huggingface-hub>=0.20.0",
# "accelerate>=0.20.0",
# "protobuf>=3.20.0",
# "sentencepiece>=0.1.99",
# ]
# ///
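# The "# /// script" block above is PEP 723 inline metadata, so the script can
# be run directly by any PEP 723-aware runner. A minimal local invocation,
# assuming uv is installed (shown as an example, not the only option):
#
#   uv run eval_nato_benchmark.py
#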
import json
import os
from datetime import datetime

import torch
from huggingface_hub import HfApi
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_model(adapter_path: str, base_model_path: str = "mistralai/Mistral-7B-Instruct-v0.3"):
"""Load the fine-tuned model with adapter."""
print(f"Loading base model: {base_model_path}")
base_model = AutoModelForCausalLM.from_pretrained(
base_model_path,
device_map="auto",
torch_dtype=torch.float16
)
print(f"Loading LoRA adapter: {adapter_path}")
model = PeftModel.from_pretrained(base_model, adapter_path)
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
tokenizer.pad_token = tokenizer.eos_token
return model, tokenizer
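
# Memory note: the 7B base model in float16 needs on the order of 14 GB of GPU
# memory for weights alone (7e9 params * 2 bytes), plus headroom for the KV
# cache during generation, so a suitably sized GPU is assumed.
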
def generate_response(model, tokenizer, question: str, max_new_tokens: int = 300):
"""Generate a response to a question."""
messages = [{"role": "user", "content": question}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Sampling (do_sample=True) makes benchmark scores non-deterministic; for a
    # strictly reproducible eval, consider greedy decoding or a fixed seed.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,  # avoid the missing-pad-token warning
        )
    # Decode only the newly generated tokens; splitting on "[/INST]" would be
    # fragile and specific to the Mistral chat template.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
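
# Each entry of nato_test_questions.json must carry the fields read below:
# "id", "ajp_source", "difficulty", "question", and "key_concepts". An
# illustrative entry (field values invented for documentation only):
#
#   {
#     "id": 1,
#     "ajp_source": "AJP-01",
#     "difficulty": "basic",
#     "question": "What is the purpose of AJP-01?",
#     "key_concepts": ["capstone doctrine", "allied operations"]
#   }
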
def evaluate_question(model, tokenizer, question_data: dict) -> dict:
"""Evaluate a single question."""
question = question_data["question"]
key_concepts = question_data["key_concepts"]
print(f"\nEvaluating: {question}")
response = generate_response(model, tokenizer, question)
    # Score = 10 * (fraction of key concepts that appear, case-insensitively,
    # as substrings of the response). Pure keyword matching: paraphrases of a
    # concept earn no credit.
    concepts_found = [
        concept for concept in key_concepts
        if concept.lower() in response.lower()
    ]
    score = (len(concepts_found) / len(key_concepts)) * 10 if key_concepts else 0
return {
"id": question_data["id"],
"ajp_source": question_data["ajp_source"],
"difficulty": question_data["difficulty"],
"question": question,
"response": response,
"key_concepts": key_concepts,
"concepts_found": concepts_found,
"score": round(score, 2)
}

def main():
"""Run NATO benchmark evaluation."""
print("=" * 70)
print("NATO Doctrine Model - Benchmark Evaluation")
print("=" * 70)
# Configuration
adapter_model = "AndreasThinks/mistral-7b-nato-doctrine"
base_model = "mistralai/Mistral-7B-Instruct-v0.3"
# Load NATO test questions
print("\nLoading NATO test questions...")
api = HfApi()
# Download test questions from the repo
questions_file = api.hf_hub_download(
repo_id="AndreasThinks/nato-llm-scripts",
filename="nato_test_questions.json",
repo_type="model"
)
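    # hf_hub_download fetches the file into the local HF cache and returns the
    # local filesystem path, so the open() below reads from disk.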
with open(questions_file, 'r') as f:
test_questions = json.load(f)
print(f"Loaded {len(test_questions)} test questions")
# Load model
model, tokenizer = load_model(adapter_model, base_model)
# Run evaluation
print("\n" + "=" * 70)
print("Running Evaluation")
print("=" * 70)
results = []
for question_data in test_questions:
result = evaluate_question(model, tokenizer, question_data)
results.append(result)
print(f" Question {result['id']}: Score = {result['score']}/10")
# Calculate summary statistics
total_score = sum(r['score'] for r in results)
max_score = len(results) * 10
percentage = (total_score / max_score) * 100
# Group by difficulty
basic_results = [r for r in results if r['difficulty'] == 'basic']
intermediate_results = [r for r in results if r['difficulty'] == 'intermediate']
basic_score = sum(r['score'] for r in basic_results) / len(basic_results) if basic_results else 0
intermediate_score = sum(r['score'] for r in intermediate_results) / len(intermediate_results) if intermediate_results else 0
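    # Only "basic" and "intermediate" get separate averages here; any other
    # difficulty label would still count toward the overall totals above.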
# Create summary
summary = {
"model": adapter_model,
"base_model": base_model,
"evaluation_date": datetime.now().isoformat(),
"total_questions": len(results),
"total_score": round(total_score, 2),
"max_score": max_score,
"percentage": round(percentage, 2),
"average_score": round(total_score / len(results), 2),
"basic_avg_score": round(basic_score, 2),
"intermediate_avg_score": round(intermediate_score, 2),
"results": results
}
# Print summary
print("\n" + "=" * 70)
print("EVALUATION SUMMARY")
print("=" * 70)
print(f"Total Score: {total_score:.2f}/{max_score} ({percentage:.1f}%)")
print(f"Average Score: {summary['average_score']:.2f}/10")
print(f"Basic Questions: {basic_score:.2f}/10")
print(f"Intermediate Questions: {intermediate_score:.2f}/10")
print("=" * 70)
# Save results
output_file = "nato_benchmark_results.json"
with open(output_file, 'w') as f:
json.dump(summary, f, indent=2)
print(f"\nResults saved to: {output_file}")
# Upload results to Hub
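    # HF_TOKEN must be present in the environment (e.g. supplied as a secret
    # when the Job is launched); without it, results stay local only.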
token = os.environ.get("HF_TOKEN")
if token:
print("\nUploading results to Hub...")
try:
api = HfApi(token=token)
api.upload_file(
path_or_fileobj=output_file,
path_in_repo=f"results/nato_benchmark_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
repo_id=adapter_model,
repo_type="model"
)
print("✅ Results uploaded to model repository")
except Exception as e:
print(f"⚠️ Could not upload results: {e}")
print("\n✅ NATO benchmark evaluation complete!")
return summary

if __name__ == "__main__":
main()