"""Evaluate fine-tuned model on NATO-specific benchmark.

This script runs as a Hugging Face Job to evaluate the NATO doctrine model
on domain-specific questions and saves results to the Hub.
"""
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "transformers>=4.40.0",
#     "torch>=2.0.0",
#     "peft>=0.7.0",
#     "datasets>=2.16.0",
#     "huggingface-hub>=0.20.0",
#     "accelerate>=0.20.0",
#     "protobuf>=3.20.0",
#     "sentencepiece>=0.1.99",
# ]
# ///
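
# The block above is PEP 723 inline script metadata, which lets uv resolve the
# dependencies on the fly. For a quick local test before submitting the script
# as a Hugging Face Job, something like the following should work (illustrative
# only -- the filename is an assumption, not fixed anywhere in this repo):
#
#   HF_TOKEN=<your token> uv run nato_benchmark_eval.py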

import json
import os
from datetime import datetime
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from huggingface_hub import HfApi
import torch

def load_model(adapter_path: str, base_model_path: str = "mistralai/Mistral-7B-Instruct-v0.3"):
    """Load the fine-tuned model with adapter."""
    print(f"Loading base model: {base_model_path}")
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        device_map="auto",
        torch_dtype=torch.float16
    )

    print(f"Loading LoRA adapter: {adapter_path}")
    model = PeftModel.from_pretrained(base_model, adapter_path)
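
    # Optional tweak (not part of the original flow): merging the LoRA weights
    # into the base model can speed up generation, at the cost of no longer
    # being able to detach the adapter afterwards:
    # model = model.merge_and_unload()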

    tokenizer = AutoTokenizer.from_pretrained(base_model_path)
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

def generate_response(model, tokenizer, question: str, max_new_tokens: int = 300):
    """Generate a response to a question."""
    messages = [{"role": "user", "content": question}]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # The chat template already prepends the BOS token, so skip adding special tokens again
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id  # avoid the missing pad_token_id warning
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
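    # The Mistral instruct template wraps the user turn in [INST]...[/INST];
    # keep only the text generated after the final [/INST] marker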
    if "[/INST]" in response:
        response = response.split("[/INST]")[-1].strip()

    return response
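
# Each entry in nato_test_questions.json is expected to look roughly like the
# example below -- an illustrative record inferred from the fields accessed in
# evaluate_question(), not a verbatim entry from the dataset:
#
# {
#   "id": 1,
#   "ajp_source": "AJP-01",
#   "difficulty": "basic",
#   "question": "What is the purpose of NATO joint doctrine?",
#   "key_concepts": ["interoperability", "common framework"]
# }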

def evaluate_question(model, tokenizer, question_data: dict) -> dict:
    """Evaluate a single question."""
    question = question_data["question"]
    key_concepts = question_data["key_concepts"]

    print(f"\nEvaluating: {question}")
    response = generate_response(model, tokenizer, question)

    # Check for key concepts
    concepts_found = [
        concept for concept in key_concepts
        if concept.lower() in response.lower()
    ]

    score = (len(concepts_found) / len(key_concepts)) * 10 if key_concepts else 0
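    # Worked example: if 3 of 4 key concepts appear in the response,
    # the score is (3 / 4) * 10 = 7.5 out of 10.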

    return {
        "id": question_data["id"],
        "ajp_source": question_data["ajp_source"],
        "difficulty": question_data["difficulty"],
        "question": question,
        "response": response,
        "key_concepts": key_concepts,
        "concepts_found": concepts_found,
        "score": round(score, 2)
    }

def main():
    """Run NATO benchmark evaluation."""
    print("=" * 70)
    print("NATO Doctrine Model - Benchmark Evaluation")
    print("=" * 70)

    # Configuration
    adapter_model = "AndreasThinks/mistral-7b-nato-doctrine"
    base_model = "mistralai/Mistral-7B-Instruct-v0.3"

    # Load NATO test questions
    print("\nLoading NATO test questions...")
    api = HfApi()

    # Download test questions from the repo
    questions_file = api.hf_hub_download(
        repo_id="AndreasThinks/nato-llm-scripts",
        filename="nato_test_questions.json",
        repo_type="model"
    )

    with open(questions_file, 'r') as f:
        test_questions = json.load(f)

    print(f"Loaded {len(test_questions)} test questions")

    # Load model
    model, tokenizer = load_model(adapter_model, base_model)

    # Run evaluation
    print("\n" + "=" * 70)
    print("Running Evaluation")
    print("=" * 70)

    results = []
    for question_data in test_questions:
        result = evaluate_question(model, tokenizer, question_data)
        results.append(result)
        print(f"  Question {result['id']}: Score = {result['score']}/10")

    # Calculate summary statistics
    total_score = sum(r['score'] for r in results)
    max_score = len(results) * 10
    percentage = (total_score / max_score) * 100

    # Group by difficulty
    basic_results = [r for r in results if r['difficulty'] == 'basic']
    intermediate_results = [r for r in results if r['difficulty'] == 'intermediate']

    basic_score = sum(r['score'] for r in basic_results) / len(basic_results) if basic_results else 0
    intermediate_score = sum(r['score'] for r in intermediate_results) / len(intermediate_results) if intermediate_results else 0

    # Create summary
    summary = {
        "model": adapter_model,
        "base_model": base_model,
        "evaluation_date": datetime.now().isoformat(),
        "total_questions": len(results),
        "total_score": round(total_score, 2),
        "max_score": max_score,
        "percentage": round(percentage, 2),
        "average_score": round(total_score / len(results), 2),
        "basic_avg_score": round(basic_score, 2),
        "intermediate_avg_score": round(intermediate_score, 2),
        "results": results
    }

    # Print summary
    print("\n" + "=" * 70)
    print("EVALUATION SUMMARY")
    print("=" * 70)
    print(f"Total Score: {total_score:.2f}/{max_score} ({percentage:.1f}%)")
    print(f"Average Score: {summary['average_score']:.2f}/10")
    print(f"Basic Questions: {basic_score:.2f}/10")
    print(f"Intermediate Questions: {intermediate_score:.2f}/10")
    print("=" * 70)

    # Save results
    output_file = "nato_benchmark_results.json"
    with open(output_file, 'w') as f:
        json.dump(summary, f, indent=2)

    print(f"\nResults saved to: {output_file}")

    # Upload results to Hub
    token = os.environ.get("HF_TOKEN")
    if token:
        print("\nUploading results to Hub...")
        try:
            api = HfApi(token=token)
            api.upload_file(
                path_or_fileobj=output_file,
                path_in_repo=f"results/nato_benchmark_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                repo_id=adapter_model,
                repo_type="model"
            )
            print("✅ Results uploaded to model repository")
        except Exception as e:
            print(f"⚠️ Could not upload results: {e}")

    print("\n✅ NATO benchmark evaluation complete!")
    return summary

if __name__ == "__main__":
    main()