nato-llm-scripts / eval_nato_benchmark.py
"""Evaluate fine-tuned model on NATO-specific benchmark.
This script runs as a Hugging Face Job to evaluate the NATO doctrine model
on domain-specific questions and saves results to the Hub.
"""
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "transformers>=4.40.0",
# "torch>=2.0.0",
# "peft>=0.7.0",
# "datasets>=2.16.0",
# "huggingface-hub>=0.20.0",
# "accelerate>=0.20.0",
# "protobuf>=3.20.0",
# "sentencepiece>=0.1.99",
# ]
# ///
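# The "# /// script" block above is PEP 723 inline metadata, so the script can
# be run directly by any PEP 723-aware runner. A minimal local invocation,
# assuming uv is installed (shown as an example, not the only option):
#
#   uv run eval_nato_benchmark.py
#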
import json
import os
from datetime import datetime

import torch
from huggingface_hub import HfApi
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_model(adapter_path: str, base_model_path: str = "mistralai/Mistral-7B-Instruct-v0.3"):
"""Load the fine-tuned model with adapter."""
print(f"Loading base model: {base_model_path}")
base_model = AutoModelForCausalLM.from_pretrained(
base_model_path,
device_map="auto",
torch_dtype=torch.float16
)
print(f"Loading LoRA adapter: {adapter_path}")
model = PeftModel.from_pretrained(base_model, adapter_path)
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
tokenizer.pad_token = tokenizer.eos_token
return model, tokenizer
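
# Memory note: the 7B base model in float16 needs on the order of 14 GB of GPU
# memory for weights alone (7e9 params * 2 bytes), plus headroom for the KV
# cache during generation, so a suitably sized GPU is assumed.
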
def generate_response(model, tokenizer, question: str, max_new_tokens: int = 300):
"""Generate a response to a question."""
messages = [{"role": "user", "content": question}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Sampling (do_sample=True) makes benchmark scores non-deterministic; for a
    # strictly reproducible eval, consider greedy decoding or a fixed seed.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,  # avoid the missing-pad-token warning
        )
    # Decode only the newly generated tokens; splitting on "[/INST]" would be
    # fragile and specific to the Mistral chat template.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
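
# Each entry of nato_test_questions.json must carry the fields read below:
# "id", "ajp_source", "difficulty", "question", and "key_concepts". An
# illustrative entry (field values invented for documentation only):
#
#   {
#     "id": 1,
#     "ajp_source": "AJP-01",
#     "difficulty": "basic",
#     "question": "What is the purpose of AJP-01?",
#     "key_concepts": ["capstone doctrine", "allied operations"]
#   }
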
def evaluate_question(model, tokenizer, question_data: dict) -> dict:
"""Evaluate a single question."""
question = question_data["question"]
key_concepts = question_data["key_concepts"]
print(f"\nEvaluating: {question}")
response = generate_response(model, tokenizer, question)
    # Score = 10 * (fraction of key concepts that appear, case-insensitively,
    # as substrings of the response). Pure keyword matching: paraphrases of a
    # concept earn no credit.
    concepts_found = [
        concept for concept in key_concepts
        if concept.lower() in response.lower()
    ]
    score = (len(concepts_found) / len(key_concepts)) * 10 if key_concepts else 0
return {
"id": question_data["id"],
"ajp_source": question_data["ajp_source"],
"difficulty": question_data["difficulty"],
"question": question,
"response": response,
"key_concepts": key_concepts,
"concepts_found": concepts_found,
"score": round(score, 2)
}

def main():
"""Run NATO benchmark evaluation."""
print("=" * 70)
print("NATO Doctrine Model - Benchmark Evaluation")
print("=" * 70)
# Configuration
adapter_model = "AndreasThinks/mistral-7b-nato-doctrine"
base_model = "mistralai/Mistral-7B-Instruct-v0.3"
# Load NATO test questions
print("\nLoading NATO test questions...")
api = HfApi()
# Download test questions from the repo
questions_file = api.hf_hub_download(
repo_id="AndreasThinks/nato-llm-scripts",
filename="nato_test_questions.json",
repo_type="model"
)
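    # hf_hub_download fetches the file into the local HF cache and returns the
    # local filesystem path, so the open() below reads from disk.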
with open(questions_file, 'r') as f:
test_questions = json.load(f)
print(f"Loaded {len(test_questions)} test questions")
# Load model
model, tokenizer = load_model(adapter_model, base_model)
# Run evaluation
print("\n" + "=" * 70)
print("Running Evaluation")
print("=" * 70)
results = []
for question_data in test_questions:
result = evaluate_question(model, tokenizer, question_data)
results.append(result)
print(f" Question {result['id']}: Score = {result['score']}/10")
# Calculate summary statistics
total_score = sum(r['score'] for r in results)
max_score = len(results) * 10
percentage = (total_score / max_score) * 100
# Group by difficulty
basic_results = [r for r in results if r['difficulty'] == 'basic']
intermediate_results = [r for r in results if r['difficulty'] == 'intermediate']
basic_score = sum(r['score'] for r in basic_results) / len(basic_results) if basic_results else 0
intermediate_score = sum(r['score'] for r in intermediate_results) / len(intermediate_results) if intermediate_results else 0
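    # Only "basic" and "intermediate" get separate averages here; any other
    # difficulty label would still count toward the overall totals above.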
# Create summary
summary = {
"model": adapter_model,
"base_model": base_model,
"evaluation_date": datetime.now().isoformat(),
"total_questions": len(results),
"total_score": round(total_score, 2),
"max_score": max_score,
"percentage": round(percentage, 2),
"average_score": round(total_score / len(results), 2),
"basic_avg_score": round(basic_score, 2),
"intermediate_avg_score": round(intermediate_score, 2),
"results": results
}
# Print summary
print("\n" + "=" * 70)
print("EVALUATION SUMMARY")
print("=" * 70)
print(f"Total Score: {total_score:.2f}/{max_score} ({percentage:.1f}%)")
print(f"Average Score: {summary['average_score']:.2f}/10")
print(f"Basic Questions: {basic_score:.2f}/10")
print(f"Intermediate Questions: {intermediate_score:.2f}/10")
print("=" * 70)
# Save results
output_file = "nato_benchmark_results.json"
with open(output_file, 'w') as f:
json.dump(summary, f, indent=2)
print(f"\nResults saved to: {output_file}")
# Upload results to Hub
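    # HF_TOKEN must be present in the environment (e.g. supplied as a secret
    # when the Job is launched); without it, results stay local only.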
token = os.environ.get("HF_TOKEN")
if token:
print("\nUploading results to Hub...")
try:
api = HfApi(token=token)
api.upload_file(
path_or_fileobj=output_file,
path_in_repo=f"results/nato_benchmark_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
repo_id=adapter_model,
repo_type="model"
)
print("✅ Results uploaded to model repository")
except Exception as e:
print(f"⚠️ Could not upload results: {e}")
print("\n✅ NATO benchmark evaluation complete!")
return summary

if __name__ == "__main__":
main()