| """Evaluate fine-tuned model on NATO-specific benchmark. |
| |
| This script runs as a Hugging Face Job to evaluate the NATO doctrine model |
| on domain-specific questions and saves results to the Hub. |
| """ |
import json
import os
from datetime import datetime

import torch
from huggingface_hub import HfApi
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
def load_model(adapter_path: str, base_model_path: str = "mistralai/Mistral-7B-Instruct-v0.3"):
    """Load the fine-tuned model with its LoRA adapter."""
    print(f"Loading base model: {base_model_path}")
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        device_map="auto",
        torch_dtype=torch.float16
    )

    print(f"Loading LoRA adapter: {adapter_path}")
    model = PeftModel.from_pretrained(base_model, adapter_path)

    tokenizer = AutoTokenizer.from_pretrained(base_model_path)
    # Mistral's tokenizer ships without a pad token, so reuse EOS for padding.
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer
|
|
def generate_response(model, tokenizer, question: str, max_new_tokens: int = 300):
    """Generate a response to a single question."""
    messages = [{"role": "user", "content": question}]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )

    # model.generate returns the prompt followed by the completion, so decode
    # only the newly generated tokens to avoid echoing the question back.
    generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    return response
|
|
def evaluate_question(model, tokenizer, question_data: dict) -> dict:
    """Evaluate a single question by checking key-concept coverage."""
    question = question_data["question"]
    key_concepts = question_data["key_concepts"]

    print(f"\nEvaluating: {question}")
    response = generate_response(model, tokenizer, question)

    # Simple keyword match: which expected concepts appear in the response?
    concepts_found = [
        concept for concept in key_concepts
        if concept.lower() in response.lower()
    ]

    # Score out of 10, proportional to the fraction of key concepts covered.
    score = (len(concepts_found) / len(key_concepts)) * 10 if key_concepts else 0

    return {
        "id": question_data["id"],
        "ajp_source": question_data["ajp_source"],
        "difficulty": question_data["difficulty"],
        "question": question,
        "response": response,
        "key_concepts": key_concepts,
        "concepts_found": concepts_found,
        "score": round(score, 2)
    }
|
|
def main():
    """Run the NATO benchmark evaluation."""
    print("=" * 70)
    print("NATO Doctrine Model - Benchmark Evaluation")
    print("=" * 70)

    # The LoRA adapter repo and the base model it was trained from.
    adapter_model = "AndreasThinks/mistral-7b-nato-doctrine"
    base_model = "mistralai/Mistral-7B-Instruct-v0.3"

    # Download the benchmark questions from the Hub.
    print("\nLoading NATO test questions...")
    api = HfApi()

    questions_file = api.hf_hub_download(
        repo_id="AndreasThinks/nato-llm-scripts",
        filename="nato_test_questions.json",
        repo_type="model"
    )

    with open(questions_file, 'r') as f:
        test_questions = json.load(f)

    print(f"Loaded {len(test_questions)} test questions")

    # Load the fine-tuned model with its adapter.
    model, tokenizer = load_model(adapter_model, base_model)

    # Evaluate every question in the benchmark.
    print("\n" + "=" * 70)
    print("Running Evaluation")
    print("=" * 70)

    results = []
    for question_data in test_questions:
        result = evaluate_question(model, tokenizer, question_data)
        results.append(result)
        print(f" Question {result['id']}: Score = {result['score']}/10")

    # Aggregate scores across all questions.
    total_score = sum(r['score'] for r in results)
    max_score = len(results) * 10
    percentage = (total_score / max_score) * 100

    # Break the results down by question difficulty.
    basic_results = [r for r in results if r['difficulty'] == 'basic']
    intermediate_results = [r for r in results if r['difficulty'] == 'intermediate']

    basic_score = sum(r['score'] for r in basic_results) / len(basic_results) if basic_results else 0
    intermediate_score = sum(r['score'] for r in intermediate_results) / len(intermediate_results) if intermediate_results else 0

    # Build the summary payload that is saved locally and uploaded to the Hub.
    summary = {
        "model": adapter_model,
        "base_model": base_model,
        "evaluation_date": datetime.now().isoformat(),
        "total_questions": len(results),
        "total_score": round(total_score, 2),
        "max_score": max_score,
        "percentage": round(percentage, 2),
        "average_score": round(total_score / len(results), 2),
        "basic_avg_score": round(basic_score, 2),
        "intermediate_avg_score": round(intermediate_score, 2),
        "results": results
    }

    # Print a human-readable summary.
    print("\n" + "=" * 70)
    print("EVALUATION SUMMARY")
    print("=" * 70)
    print(f"Total Score: {total_score:.2f}/{max_score} ({percentage:.1f}%)")
    print(f"Average Score: {summary['average_score']:.2f}/10")
    print(f"Basic Questions: {basic_score:.2f}/10")
    print(f"Intermediate Questions: {intermediate_score:.2f}/10")
    print("=" * 70)

    # Save the full results locally.
    output_file = "nato_benchmark_results.json"
    with open(output_file, 'w') as f:
        json.dump(summary, f, indent=2)

    print(f"\nResults saved to: {output_file}")

    # Upload the results to the adapter's model repository if a token is available.
    token = os.environ.get("HF_TOKEN")
    if token:
        print("\nUploading results to Hub...")
        try:
            api = HfApi(token=token)
            api.upload_file(
                path_or_fileobj=output_file,
                path_in_repo=f"results/nato_benchmark_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                repo_id=adapter_model,
                repo_type="model"
            )
            print("✅ Results uploaded to model repository")
        except Exception as e:
            print(f"⚠️ Could not upload results: {e}")

    print("\n✅ NATO benchmark evaluation complete!")
    return summary
|
|
if __name__ == "__main__":
    main()
|
|