"""Evaluate fine-tuned model on NATO-specific benchmark. This script runs as a Hugging Face Job to evaluate the NATO doctrine model on domain-specific questions and saves results to the Hub. """ # /// script # requires-python = ">=3.11" # dependencies = [ # "transformers>=4.40.0", # "torch>=2.0.0", # "peft>=0.7.0", # "datasets>=2.16.0", # "huggingface-hub>=0.20.0", # "accelerate>=0.20.0", # "protobuf>=3.20.0", # "sentencepiece>=0.1.99", # ] # /// import json import os from datetime import datetime from transformers import AutoModelForCausalLM, AutoTokenizer from peft import PeftModel from huggingface_hub import HfApi import torch def load_model(adapter_path: str, base_model_path: str = "mistralai/Mistral-7B-Instruct-v0.3"): """Load the fine-tuned model with adapter.""" print(f"Loading base model: {base_model_path}") base_model = AutoModelForCausalLM.from_pretrained( base_model_path, device_map="auto", torch_dtype=torch.float16 ) print(f"Loading LoRA adapter: {adapter_path}") model = PeftModel.from_pretrained(base_model, adapter_path) tokenizer = AutoTokenizer.from_pretrained(base_model_path) tokenizer.pad_token = tokenizer.eos_token return model, tokenizer def generate_response(model, tokenizer, question: str, max_new_tokens: int = 300): """Generate a response to a question.""" messages = [{"role": "user", "content": question}] prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) inputs = tokenizer(prompt, return_tensors="pt").to(model.device) with torch.no_grad(): outputs = model.generate( **inputs, max_new_tokens=max_new_tokens, temperature=0.7, top_p=0.9, do_sample=True ) response = tokenizer.decode(outputs[0], skip_special_tokens=True) if "[/INST]" in response: response = response.split("[/INST]")[-1].strip() return response def evaluate_question(model, tokenizer, question_data: dict) -> dict: """Evaluate a single question.""" question = question_data["question"] key_concepts = question_data["key_concepts"] print(f"\nEvaluating: {question}") response = generate_response(model, tokenizer, question) # Check for key concepts concepts_found = [ concept for concept in key_concepts if concept.lower() in response.lower() ] score = (len(concepts_found) / len(key_concepts)) * 10 if key_concepts else 0 return { "id": question_data["id"], "ajp_source": question_data["ajp_source"], "difficulty": question_data["difficulty"], "question": question, "response": response, "key_concepts": key_concepts, "concepts_found": concepts_found, "score": round(score, 2) } def main(): """Run NATO benchmark evaluation.""" print("=" * 70) print("NATO Doctrine Model - Benchmark Evaluation") print("=" * 70) # Configuration adapter_model = "AndreasThinks/mistral-7b-nato-doctrine" base_model = "mistralai/Mistral-7B-Instruct-v0.3" # Load NATO test questions print("\nLoading NATO test questions...") api = HfApi() # Download test questions from the repo questions_file = api.hf_hub_download( repo_id="AndreasThinks/nato-llm-scripts", filename="nato_test_questions.json", repo_type="model" ) with open(questions_file, 'r') as f: test_questions = json.load(f) print(f"Loaded {len(test_questions)} test questions") # Load model model, tokenizer = load_model(adapter_model, base_model) # Run evaluation print("\n" + "=" * 70) print("Running Evaluation") print("=" * 70) results = [] for question_data in test_questions: result = evaluate_question(model, tokenizer, question_data) results.append(result) print(f" Question {result['id']}: Score = {result['score']}/10") # Calculate summary statistics 
total_score = sum(r['score'] for r in results) max_score = len(results) * 10 percentage = (total_score / max_score) * 100 # Group by difficulty basic_results = [r for r in results if r['difficulty'] == 'basic'] intermediate_results = [r for r in results if r['difficulty'] == 'intermediate'] basic_score = sum(r['score'] for r in basic_results) / len(basic_results) if basic_results else 0 intermediate_score = sum(r['score'] for r in intermediate_results) / len(intermediate_results) if intermediate_results else 0 # Create summary summary = { "model": adapter_model, "base_model": base_model, "evaluation_date": datetime.now().isoformat(), "total_questions": len(results), "total_score": round(total_score, 2), "max_score": max_score, "percentage": round(percentage, 2), "average_score": round(total_score / len(results), 2), "basic_avg_score": round(basic_score, 2), "intermediate_avg_score": round(intermediate_score, 2), "results": results } # Print summary print("\n" + "=" * 70) print("EVALUATION SUMMARY") print("=" * 70) print(f"Total Score: {total_score:.2f}/{max_score} ({percentage:.1f}%)") print(f"Average Score: {summary['average_score']:.2f}/10") print(f"Basic Questions: {basic_score:.2f}/10") print(f"Intermediate Questions: {intermediate_score:.2f}/10") print("=" * 70) # Save results output_file = "nato_benchmark_results.json" with open(output_file, 'w') as f: json.dump(summary, f, indent=2) print(f"\nResults saved to: {output_file}") # Upload results to Hub token = os.environ.get("HF_TOKEN") if token: print("\nUploading results to Hub...") try: api = HfApi(token=token) api.upload_file( path_or_fileobj=output_file, path_in_repo=f"results/nato_benchmark_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json", repo_id=adapter_model, repo_type="model" ) print("✅ Results uploaded to model repository") except Exception as e: print(f"⚠️ Could not upload results: {e}") print("\n✅ NATO benchmark evaluation complete!") return summary if __name__ == "__main__": main()
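
# ---------------------------------------------------------------------------
# Expected shape of nato_test_questions.json, inferred from the fields this
# script reads in evaluate_question(); the example values below are
# illustrative, not taken from the actual benchmark file:
#
# [
#   {
#     "id": 1,
#     "ajp_source": "AJP-01",
#     "difficulty": "basic",
#     "question": "What is the purpose of allied joint doctrine?",
#     "key_concepts": ["interoperability", "unity of effort"]
#   }
# ]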
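#
# ---------------------------------------------------------------------------
# Launch sketch: the docstring says this runs as a Hugging Face Job. One way
# to start it is the `hf jobs` CLI that ships with recent huggingface_hub
# releases; this exact invocation, the script filename, and the hardware
# flavor are assumptions, not taken from this repo:
#
#   hf jobs uv run --flavor a10g-large --secrets HF_TOKEN evaluate_nato_benchmark.py
#
# The inline `# /// script` metadata at the top of this file lets uv resolve
# the pinned dependencies, so no separate requirements file is needed.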