Upload eval_nato_benchmark.py with huggingface_hub
eval_nato_benchmark.py
ADDED
@@ -0,0 +1,196 @@
"""Evaluate fine-tuned model on NATO-specific benchmark.

This script runs as a Hugging Face Job to evaluate the NATO doctrine model
on domain-specific questions and saves results to the Hub.
"""
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "transformers>=4.40.0",
#     "torch>=2.0.0",
#     "peft>=0.7.0",
#     "datasets>=2.16.0",
#     "huggingface-hub>=0.20.0",
#     "accelerate>=0.20.0",
# ]
# ///
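# The block above is PEP 723 inline script metadata: a compatible runner such
# as uv can install the listed dependencies automatically before executing
# this file.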

import json
import os
from datetime import datetime
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from huggingface_hub import HfApi
import torch

def load_model(adapter_path: str, base_model_path: str = "mistralai/Mistral-7B-Instruct-v0.3"):
    """Load the fine-tuned model with adapter."""
    print(f"Loading base model: {base_model_path}")
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        device_map="auto",
        torch_dtype=torch.float16
    )

    print(f"Loading LoRA adapter: {adapter_path}")
    model = PeftModel.from_pretrained(base_model, adapter_path)

    tokenizer = AutoTokenizer.from_pretrained(base_model_path)
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

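# Note on load_model() above: the base model is loaded in float16 (roughly
# 15 GB of GPU memory for a 7B model) and device_map="auto" lets accelerate
# place it on the available GPU(s); the LoRA adapter weights are kept separate
# rather than merged into the base weights.
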
def generate_response(model, tokenizer, question: str, max_new_tokens: int = 300):
    """Generate a response to a question."""
    messages = [{"role": "user", "content": question}]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # The Mistral-Instruct chat template wraps the user turn in [INST] ... [/INST],
    # and the decoded output still contains the prompt, so everything after the
    # last [/INST] marker is the newly generated answer.
    if "[/INST]" in response:
        response = response.split("[/INST]")[-1].strip()

    return response

def evaluate_question(model, tokenizer, question_data: dict) -> dict:
    """Evaluate a single question."""
    question = question_data["question"]
    key_concepts = question_data["key_concepts"]

    print(f"\nEvaluating: {question}")
    response = generate_response(model, tokenizer, question)

    # Check for key concepts
    concepts_found = [
        concept for concept in key_concepts
        if concept.lower() in response.lower()
    ]

    score = (len(concepts_found) / len(key_concepts)) * 10 if key_concepts else 0

    return {
        "id": question_data["id"],
        "ajp_source": question_data["ajp_source"],
        "difficulty": question_data["difficulty"],
        "question": question,
        "response": response,
        "key_concepts": key_concepts,
        "concepts_found": concepts_found,
        "score": round(score, 2)
    }
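
# Scoring is simple keyword coverage: the fraction of key concepts that appear
# verbatim (case-insensitively) in the response, scaled to 10. For example, a
# response that mentions 3 of 4 key concepts scores (3 / 4) * 10 = 7.5.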

def main():
    """Run NATO benchmark evaluation."""
    print("=" * 70)
    print("NATO Doctrine Model - Benchmark Evaluation")
    print("=" * 70)

    # Configuration
    adapter_model = "AndreasThinks/mistral-7b-nato-doctrine"
    base_model = "mistralai/Mistral-7B-Instruct-v0.3"
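    # The adapter is applied on top of base_model, so this must be the same
    # checkpoint the LoRA adapter was fine-tuned from.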

    # Load NATO test questions
    print("\nLoading NATO test questions...")
    api = HfApi()

    # Download test questions from the repo
    questions_file = api.hf_hub_download(
        repo_id="AndreasThinks/nato-llm-scripts",
        filename="nato_test_questions.json",
        repo_type="dataset"
    )

    with open(questions_file, 'r') as f:
        test_questions = json.load(f)

    print(f"Loaded {len(test_questions)} test questions")

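    # Each record in nato_test_questions.json is expected to look roughly like
    # this sketch (field names inferred from evaluate_question(); the real
    # values live in the dataset repo):
    #   {
    #     "id": 1,
    #     "ajp_source": "...",
    #     "difficulty": "basic",   # or "intermediate"
    #     "question": "...",
    #     "key_concepts": ["...", "..."]
    #   }
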
    # Load model
    model, tokenizer = load_model(adapter_model, base_model)

    # Run evaluation
    print("\n" + "=" * 70)
    print("Running Evaluation")
    print("=" * 70)

    results = []
    for question_data in test_questions:
        result = evaluate_question(model, tokenizer, question_data)
        results.append(result)
        print(f" Question {result['id']}: Score = {result['score']}/10")

    # Calculate summary statistics
    total_score = sum(r['score'] for r in results)
    max_score = len(results) * 10
    percentage = (total_score / max_score) * 100

    # Group by difficulty
    basic_results = [r for r in results if r['difficulty'] == 'basic']
    intermediate_results = [r for r in results if r['difficulty'] == 'intermediate']

    basic_score = sum(r['score'] for r in basic_results) / len(basic_results) if basic_results else 0
    intermediate_score = sum(r['score'] for r in intermediate_results) / len(intermediate_results) if intermediate_results else 0

    # Create summary
    summary = {
        "model": adapter_model,
        "base_model": base_model,
        "evaluation_date": datetime.now().isoformat(),
        "total_questions": len(results),
        "total_score": round(total_score, 2),
        "max_score": max_score,
        "percentage": round(percentage, 2),
        "average_score": round(total_score / len(results), 2),
        "basic_avg_score": round(basic_score, 2),
        "intermediate_avg_score": round(intermediate_score, 2),
        "results": results
    }

    # Print summary
    print("\n" + "=" * 70)
    print("EVALUATION SUMMARY")
    print("=" * 70)
    print(f"Total Score: {total_score:.2f}/{max_score} ({percentage:.1f}%)")
    print(f"Average Score: {summary['average_score']:.2f}/10")
    print(f"Basic Questions: {basic_score:.2f}/10")
    print(f"Intermediate Questions: {intermediate_score:.2f}/10")
    print("=" * 70)

    # Save results
    output_file = "nato_benchmark_results.json"
    with open(output_file, 'w') as f:
        json.dump(summary, f, indent=2)

    print(f"\nResults saved to: {output_file}")

    # Upload results to Hub
    # HF_TOKEN is read from the environment; when this script runs as a
    # Hugging Face Job the token is typically supplied as a Job secret.
    token = os.environ.get("HF_TOKEN")
    if token:
        print("\nUploading results to Hub...")
        try:
            api = HfApi(token=token)
            api.upload_file(
                path_or_fileobj=output_file,
                path_in_repo=f"results/nato_benchmark_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                repo_id=adapter_model,
                repo_type="model"
            )
            print("✅ Results uploaded to model repository")
        except Exception as e:
            print(f"⚠️ Could not upload results: {e}")

    print("\n✅ NATO benchmark evaluation complete!")
    return summary

if __name__ == "__main__":
    main()
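
A quick way to try the script outside of a Job (a sketch, assuming a CUDA GPU with enough memory and a Hub token that can write to the model repo; the actual Job submission command is not part of this commit):

    HF_TOKEN=<your-token> uv run eval_nato_benchmark.py

uv resolves the dependency block at the top of the file into an ephemeral environment before running the evaluation.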