"""Evaluate fine-tuned model on NATO-specific benchmark. |
|
|
|
|
|
This script runs as a Hugging Face Job to evaluate the NATO doctrine model |
|
|
on domain-specific questions and saves results to the Hub. |
|
|
""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json |
|
|
import os |
|
|
from datetime import datetime |
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer |
|
|
from peft import PeftModel |
|
|
from huggingface_hub import HfApi |
|
|
import torch |
|
|
|
|
|
def load_model(adapter_path: str, base_model_path: str = "mistralai/Mistral-7B-Instruct-v0.3"):
    """Load the base model in fp16 and attach the fine-tuned LoRA adapter."""
    print(f"Loading base model: {base_model_path}")
    # device_map="auto" (which requires the accelerate package) places layers
    # on the available GPU(s); fp16 halves memory use versus fp32, which
    # matters for a 7B model.
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        device_map="auto",
        torch_dtype=torch.float16,
    )

    print(f"Loading LoRA adapter: {adapter_path}")
    model = PeftModel.from_pretrained(base_model, adapter_path)

    tokenizer = AutoTokenizer.from_pretrained(base_model_path)
    # Mistral's tokenizer ships without a pad token; reuse EOS so padding works.
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

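# Example usage (assumes a GPU with enough memory for a 7B model in fp16):
#   model, tokenizer = load_model("AndreasThinks/mistral-7b-nato-doctrine")
# For somewhat faster inference one could merge the adapter into the base
# weights afterwards via `model = model.merge_and_unload()` (a PEFT method);
# it is kept separate here so the adapter repo remains the source of truth.
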
def generate_response(model, tokenizer, question: str, max_new_tokens: int = 300):
    """Generate a response to a question."""
    messages = [{"role": "user", "content": question}]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
        )

    # The decoded output still contains the prompt; Mistral's chat template
    # closes the user turn with "[/INST]", so keep only the text after it.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if "[/INST]" in response:
        response = response.split("[/INST]")[-1].strip()

    return response

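# Note: with do_sample=True and temperature=0.7, responses (and therefore
# benchmark scores) vary between runs. For reproducible comparisons one could
# pass do_sample=False (greedy decoding) or call torch.manual_seed() with a
# fixed value before generating.
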
def evaluate_question(model, tokenizer, question_data: dict) -> dict:
    """Evaluate a single question against its expected key concepts."""
    question = question_data["question"]
    key_concepts = question_data["key_concepts"]

    print(f"\nEvaluating: {question}")
    response = generate_response(model, tokenizer, question)

    # Simple keyword-coverage scoring: a concept counts as covered if it
    # appears verbatim (case-insensitively) anywhere in the response.
    concepts_found = [
        concept for concept in key_concepts
        if concept.lower() in response.lower()
    ]

    # Score is the fraction of key concepts covered, scaled to 0-10.
    score = (len(concepts_found) / len(key_concepts)) * 10 if key_concepts else 0

    return {
        "id": question_data["id"],
        "ajp_source": question_data["ajp_source"],
        "difficulty": question_data["difficulty"],
        "question": question,
        "response": response,
        "key_concepts": key_concepts,
        "concepts_found": concepts_found,
        "score": round(score, 2),
    }

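# Each entry in nato_test_questions.json is expected to carry the fields read
# above; an illustrative (not actual) entry might look like:
# {
#   "id": 1,
#   "ajp_source": "AJP-01",
#   "difficulty": "basic",
#   "question": "What is the purpose of allied joint doctrine?",
#   "key_concepts": ["interoperability", "common framework"]
# }
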
def main():
    """Run NATO benchmark evaluation."""
    print("=" * 70)
    print("NATO Doctrine Model - Benchmark Evaluation")
    print("=" * 70)

    adapter_model = "AndreasThinks/mistral-7b-nato-doctrine"
    base_model = "mistralai/Mistral-7B-Instruct-v0.3"

    # Fetch the benchmark questions from the scripts repo on the Hub.
    print("\nLoading NATO test questions...")
    api = HfApi()

    questions_file = api.hf_hub_download(
        repo_id="AndreasThinks/nato-llm-scripts",
        filename="nato_test_questions.json",
        repo_type="model"
    )

    with open(questions_file, 'r') as f:
        test_questions = json.load(f)

    print(f"Loaded {len(test_questions)} test questions")
    if not test_questions:
        print("No test questions found; aborting.")
        return None

    model, tokenizer = load_model(adapter_model, base_model)

    print("\n" + "=" * 70)
    print("Running Evaluation")
    print("=" * 70)

    results = []
    for question_data in test_questions:
        result = evaluate_question(model, tokenizer, question_data)
        results.append(result)
        print(f"  Question {result['id']}: Score = {result['score']}/10")

    # Aggregate scores overall and per difficulty level.
    total_score = sum(r['score'] for r in results)
    max_score = len(results) * 10
    percentage = (total_score / max_score) * 100

    basic_results = [r for r in results if r['difficulty'] == 'basic']
    intermediate_results = [r for r in results if r['difficulty'] == 'intermediate']

    basic_score = sum(r['score'] for r in basic_results) / len(basic_results) if basic_results else 0
    intermediate_score = sum(r['score'] for r in intermediate_results) / len(intermediate_results) if intermediate_results else 0

    summary = {
        "model": adapter_model,
        "base_model": base_model,
        "evaluation_date": datetime.now().isoformat(),
        "total_questions": len(results),
        "total_score": round(total_score, 2),
        "max_score": max_score,
        "percentage": round(percentage, 2),
        "average_score": round(total_score / len(results), 2),
        "basic_avg_score": round(basic_score, 2),
        "intermediate_avg_score": round(intermediate_score, 2),
        "results": results,
    }

    print("\n" + "=" * 70)
    print("EVALUATION SUMMARY")
    print("=" * 70)
    print(f"Total Score: {total_score:.2f}/{max_score} ({percentage:.1f}%)")
    print(f"Average Score: {summary['average_score']:.2f}/10")
    print(f"Basic Questions: {basic_score:.2f}/10")
    print(f"Intermediate Questions: {intermediate_score:.2f}/10")
    print("=" * 70)

    # Persist results locally, then upload them to the adapter's model repo
    # if a write token is available via the HF_TOKEN environment variable.
    output_file = "nato_benchmark_results.json"
    with open(output_file, 'w') as f:
        json.dump(summary, f, indent=2)

    print(f"\nResults saved to: {output_file}")

    token = os.environ.get("HF_TOKEN")
    if token:
        print("\nUploading results to Hub...")
        try:
            api = HfApi(token=token)
            api.upload_file(
                path_or_fileobj=output_file,
                path_in_repo=f"results/nato_benchmark_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                repo_id=adapter_model,
                repo_type="model"
            )
            print("✅ Results uploaded to model repository")
        except Exception as e:
            print(f"⚠️ Could not upload results: {e}")

    print("\n✅ NATO benchmark evaluation complete!")
    return summary

if __name__ == "__main__":
    main()
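
# One possible way to launch this as a Hugging Face Job (assuming the
# `hf jobs` CLI from a recent huggingface_hub release; the script filename
# and flavor below are illustrative, and flag names may differ by version):
#   hf jobs uv run evaluate_nato_benchmark.py --flavor a10g-small --secrets HF_TOKEN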