AndreasThinks committed on
Commit 7b1d688 · verified · 1 Parent(s): 2460d23

Upload eval_nato_benchmark.py with huggingface_hub

Files changed (1)
  1. eval_nato_benchmark.py +196 -0
eval_nato_benchmark.py ADDED
@@ -0,0 +1,196 @@
"""Evaluate fine-tuned model on NATO-specific benchmark.

This script runs as a Hugging Face Job to evaluate the NATO doctrine model
on domain-specific questions and saves results to the Hub.
"""
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "transformers>=4.40.0",
#     "torch>=2.0.0",
#     "peft>=0.7.0",
#     "datasets>=2.16.0",
#     "huggingface-hub>=0.20.0",
#     "accelerate>=0.20.0",
# ]
# ///
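# The block above is PEP 723 inline script metadata, so a compatible runner
# (for example `uv run eval_nato_benchmark.py`) can resolve these dependencies
# before executing the script.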

import json
import os
from datetime import datetime
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from huggingface_hub import HfApi
import torch

def load_model(adapter_path: str, base_model_path: str = "mistralai/Mistral-7B-Instruct-v0.3"):
    """Load the fine-tuned model with adapter."""
    print(f"Loading base model: {base_model_path}")
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        device_map="auto",
        torch_dtype=torch.float16
    )

    print(f"Loading LoRA adapter: {adapter_path}")
    # Attach the LoRA adapter on top of the frozen base weights
    model = PeftModel.from_pretrained(base_model, adapter_path)

    tokenizer = AutoTokenizer.from_pretrained(base_model_path)
    # The Mistral tokenizer has no pad token by default; reuse EOS for padding
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

def generate_response(model, tokenizer, question: str, max_new_tokens: int = 300):
    """Generate a response to a question."""
    messages = [{"role": "user", "content": question}]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Mistral's chat template wraps the user turn in [INST] ... [/INST];
    # keep only the generated text after the final [/INST] marker
    if "[/INST]" in response:
        response = response.split("[/INST]")[-1].strip()

    return response

def evaluate_question(model, tokenizer, question_data: dict) -> dict:
    """Evaluate a single question."""
    question = question_data["question"]
    key_concepts = question_data["key_concepts"]

    print(f"\nEvaluating: {question}")
    response = generate_response(model, tokenizer, question)

    # Check for key concepts
    concepts_found = [
        concept for concept in key_concepts
        if concept.lower() in response.lower()
    ]

    score = (len(concepts_found) / len(key_concepts)) * 10 if key_concepts else 0
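    # Scoring is simple keyword coverage: e.g. 3 of 5 key concepts found in the
    # response gives (3 / 5) * 10 = 6.0; a question with no key concepts scores 0.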

    return {
        "id": question_data["id"],
        "ajp_source": question_data["ajp_source"],
        "difficulty": question_data["difficulty"],
        "question": question,
        "response": response,
        "key_concepts": key_concepts,
        "concepts_found": concepts_found,
        "score": round(score, 2)
    }

def main():
    """Run NATO benchmark evaluation."""
    print("=" * 70)
    print("NATO Doctrine Model - Benchmark Evaluation")
    print("=" * 70)

    # Configuration
    adapter_model = "AndreasThinks/mistral-7b-nato-doctrine"
    base_model = "mistralai/Mistral-7B-Instruct-v0.3"

    # Load NATO test questions
    print("\nLoading NATO test questions...")
    api = HfApi()

    # Download test questions from the repo
    questions_file = api.hf_hub_download(
        repo_id="AndreasThinks/nato-llm-scripts",
        filename="nato_test_questions.json",
        repo_type="dataset"
    )
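    # Each entry in nato_test_questions.json is assumed (from the fields read in
    # evaluate_question) to look roughly like:
    #   {"id": 1, "ajp_source": "AJP-01", "difficulty": "basic",
    #    "question": "...", "key_concepts": ["...", "..."]}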

    with open(questions_file, 'r') as f:
        test_questions = json.load(f)

    print(f"Loaded {len(test_questions)} test questions")

    # Load model
    model, tokenizer = load_model(adapter_model, base_model)

    # Run evaluation
    print("\n" + "=" * 70)
    print("Running Evaluation")
    print("=" * 70)

    results = []
    for question_data in test_questions:
        result = evaluate_question(model, tokenizer, question_data)
        results.append(result)
        print(f"  Question {result['id']}: Score = {result['score']}/10")

    # Calculate summary statistics
    total_score = sum(r['score'] for r in results)
    max_score = len(results) * 10
    percentage = (total_score / max_score) * 100

    # Group by difficulty
    basic_results = [r for r in results if r['difficulty'] == 'basic']
    intermediate_results = [r for r in results if r['difficulty'] == 'intermediate']

    basic_score = sum(r['score'] for r in basic_results) / len(basic_results) if basic_results else 0
    intermediate_score = sum(r['score'] for r in intermediate_results) / len(intermediate_results) if intermediate_results else 0

    # Create summary
    summary = {
        "model": adapter_model,
        "base_model": base_model,
        "evaluation_date": datetime.now().isoformat(),
        "total_questions": len(results),
        "total_score": round(total_score, 2),
        "max_score": max_score,
        "percentage": round(percentage, 2),
        "average_score": round(total_score / len(results), 2),
        "basic_avg_score": round(basic_score, 2),
        "intermediate_avg_score": round(intermediate_score, 2),
        "results": results
    }

    # Print summary
    print("\n" + "=" * 70)
    print("EVALUATION SUMMARY")
    print("=" * 70)
    print(f"Total Score: {total_score:.2f}/{max_score} ({percentage:.1f}%)")
    print(f"Average Score: {summary['average_score']:.2f}/10")
    print(f"Basic Questions: {basic_score:.2f}/10")
    print(f"Intermediate Questions: {intermediate_score:.2f}/10")
    print("=" * 70)

    # Save results
    output_file = "nato_benchmark_results.json"
    with open(output_file, 'w') as f:
        json.dump(summary, f, indent=2)

    print(f"\nResults saved to: {output_file}")

    # Upload results to Hub (requires a write token in the HF_TOKEN environment variable)
    token = os.environ.get("HF_TOKEN")
    if token:
        print("\nUploading results to Hub...")
        try:
            api = HfApi(token=token)
            api.upload_file(
                path_or_fileobj=output_file,
                path_in_repo=f"results/nato_benchmark_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                repo_id=adapter_model,
                repo_type="model"
            )
            print("✅ Results uploaded to model repository")
        except Exception as e:
            print(f"⚠️ Could not upload results: {e}")

    print("\n✅ NATO benchmark evaluation complete!")
    return summary

if __name__ == "__main__":
    main()
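A quick way to sanity-check a finished run is to load the results file the script writes. This is only a usage sketch, assuming the field names from the summary dict above and a local copy of nato_benchmark_results.json:

    import json

    # Load the summary produced by eval_nato_benchmark.py
    with open("nato_benchmark_results.json") as f:
        summary = json.load(f)

    print(f"Overall: {summary['percentage']}% ({summary['total_score']}/{summary['max_score']})")
    for r in summary["results"]:
        print(r["id"], r["difficulty"], r["score"], r["concepts_found"])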