teolm30 commited on
Commit
f7a5fb7
·
verified ·
1 Parent(s): 5aa2508

Add evaluate.py: Benchmark evaluation on HumanEval + MBPP

Browse files
Files changed (1) hide show
  1. evaluate.py +241 -0
evaluate.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Fox1.3 Evaluation Script
Evaluates on HumanEval and MBPP benchmarks
"""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from datasets import load_dataset
import json
import logging
from typing import List, Dict

# Module-level logger used by all benchmark functions for progress reporting.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hugging Face Hub id of the model under evaluation.
MODEL_NAME = "teolm30/fox1.3"
# Informational only: actual placement is handled by device_map="auto" below.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
20
def load_model():
    """Fetch the evaluation model and its tokenizer from the Hub.

    Returns:
        tuple: ``(model, tokenizer)`` — the model loaded in fp16 and
        sharded across available devices via ``device_map="auto"``.
    """
    logger.info(f"Loading model: {MODEL_NAME}")

    tok = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    # The checkpoint ships without a pad token; reuse EOS so the
    # generation pipeline can pad batches.
    tok.pad_token = tok.eos_token

    lm = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )

    return lm, tok
33
+
34
def run_humaneval(model, tokenizer) -> Dict:
    """Run the HumanEval benchmark and compute pass@1 with greedy decoding.

    Args:
        model: Causal LM to evaluate.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        Dict with keys ``benchmark``, ``pass_at_1``, ``correct``, ``total``
        and ``results`` (per-task ``{"task_id", "status"}`` entries).
    """
    logger.info("Loading HumanEval dataset...")
    dataset = load_dataset("openai/openai_humaneval", split="test")

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        do_sample=False,
        temperature=None,
        top_p=None,
        device_map="auto"
    )

    correct = 0
    total = len(dataset)
    results = []

    for i, item in enumerate(dataset):
        # HumanEval's "prompt" already IS the task: the function signature
        # plus docstring the model must complete. (The previous version
        # truncated the prompt at the first "def ", deleting the task.)
        prompt = item["prompt"]
        test = item["test"]
        entry_point = item["entry_point"]

        status = "ERROR: no output"
        try:
            output = pipe(prompt, pad_token_id=tokenizer.eos_token_id)
            generated = output[0]["generated_text"]

            # The completion is whatever the model appended to the prompt.
            completion = generated[len(prompt):]

            # Strip markdown fences if the model wrapped its answer.
            if "```python" in completion:
                completion = completion.split("```python")[1].split("```")[0]
            elif "```" in completion:
                completion = completion.split("```")[1].split("```")[0]

            # A bare function body does not parse on its own — it must be
            # appended to the prompt (signature + docstring) to form a
            # complete program.
            program = prompt + completion

            # SECURITY: exec() of model-generated code is inherently unsafe.
            # Run this script only inside a sandbox/container.
            env = {}
            exec(program, env)
            exec(test, env)
            # HumanEval's test string only *defines* check(candidate); it
            # must be invoked explicitly for any assertion to run.
            exec(f"check({entry_point})", env)
            correct += 1
            status = "PASS"
        except AssertionError:
            status = "FAIL"
        except Exception as e:
            status = f"ERROR: {str(e)[:50]}"

        results.append({
            "task_id": item.get("task_id", i),
            "status": status
        })

        if (i + 1) % 10 == 0:
            logger.info(f"HumanEval progress: {i+1}/{total} | Running pass@1: {correct}/{i+1}")

    # Guard against an empty split so we never divide by zero.
    pass_at_1 = correct / total if total else 0.0
    logger.info(f"HumanEval PASS@1: {pass_at_1:.4f} ({correct}/{total})")

    return {
        "benchmark": "HumanEval",
        "pass_at_1": pass_at_1,
        "correct": correct,
        "total": total,
        "results": results
    }
117
+
118
def run_mbpp(model, tokenizer) -> Dict:
    """Run the MBPP (sanitized) benchmark and compute pass@1.

    Args:
        model: Causal LM to evaluate.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        Dict with keys ``benchmark``, ``pass_at_1``, ``correct``, ``total``
        and ``results`` (per-task ``{"task_id", "status"}`` entries).
    """
    logger.info("Loading MBPP dataset...")
    # NOTE(review): the canonical hub id is now
    # "google-research-datasets/mbpp"; this id relies on a Hub redirect —
    # confirm it still resolves.
    dataset = load_dataset("google-research/mbpp", "sanitized", split="test")

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        do_sample=False,
        temperature=None,
        top_p=None,
        device_map="auto"
    )

    correct = 0
    total = min(len(dataset), 374)  # Standard subset size
    results = []

    # Slicing a HF Dataset (dataset[:total]) returns a dict of *columns*,
    # so the old loop iterated over column names and crashed on
    # item["prompt"]. select() keeps row-wise access.
    for i, item in enumerate(dataset.select(range(total))):
        prompt = item["prompt"]
        test_list = item["test_list"]

        full_prompt = f"### Instruction:\nWrite a Python function.\n\n### Input:\n{prompt}\n\n### Response:\n"

        status = "ERROR: no output"
        try:
            output = pipe(full_prompt, pad_token_id=tokenizer.eos_token_id)
            generated = output[0]["generated_text"]

            # Extract code from response
            response = generated[len(full_prompt):].strip()

            if "```python" in response:
                response = response.split("```python")[1].split("```")[0].strip()
            elif "```" in response:
                response = response.split("```")[1].split("```")[0].strip()

            # SECURITY: exec() of model-generated code is inherently unsafe.
            # Run this script only inside a sandbox/container.
            exec_globals = {}
            # The sanitized split provides "test_imports" that the asserts
            # depend on; execute them before the generated code and tests.
            for setup in item.get("test_imports", []):
                exec(setup, exec_globals)
            exec(response, exec_globals)

            all_passed = True
            for test_code in test_list:
                try:
                    exec(test_code, exec_globals)
                except Exception:
                    # AssertionError (wrong answer) and any runtime error
                    # both count as a failure for this task.
                    all_passed = False
                    break

            if all_passed:
                correct += 1
                status = "PASS"
            else:
                status = "FAIL"

        except Exception as e:
            status = f"ERROR: {str(e)[:50]}"

        results.append({
            # MBPP rows carry "task_id"; the old "prompts_id" key does not
            # exist and always fell back to the loop index.
            "task_id": item.get("task_id", i),
            "status": status
        })

        if (i + 1) % 50 == 0:
            logger.info(f"MBPP progress: {i+1}/{total} | Running pass@1: {correct}/{i+1}")

    # Guard against an empty split so we never divide by zero.
    pass_at_1 = correct / total if total else 0.0
    logger.info(f"MBPP PASS@1: {pass_at_1:.4f} ({correct}/{total})")

    return {
        "benchmark": "MBPP",
        "pass_at_1": pass_at_1,
        "correct": correct,
        "total": total,
        "results": results
    }
199
+
200
def main():
    """Evaluate the model on both benchmarks, log a summary, save JSON.

    Returns:
        Dict summary with per-benchmark pass@1 and counts.
    """
    logger.info(f"Using device: {DEVICE}")

    model, tokenizer = load_model()

    # Run both benchmarks sequentially on the same loaded model.
    humaneval_results = run_humaneval(model, tokenizer)
    mbpp_results = run_mbpp(model, tokenizer)

    # Collapse each full report into the compact summary entry, keyed by
    # the benchmark's own name (insertion order: HumanEval, then MBPP).
    summary = {
        "model": MODEL_NAME,
        "benchmarks": {
            report["benchmark"]: {
                "pass_at_1": report["pass_at_1"],
                "correct": report["correct"],
                "total": report["total"],
            }
            for report in (humaneval_results, mbpp_results)
        },
    }

    logger.info("\n" + "="*50)
    logger.info("BENCHMARK RESULTS SUMMARY")
    logger.info("="*50)
    logger.info(f"HumanEval: {humaneval_results['pass_at_1']:.4f} ({humaneval_results['correct']}/{humaneval_results['total']})")
    logger.info(f"MBPP: {mbpp_results['pass_at_1']:.4f} ({mbpp_results['correct']}/{mbpp_results['total']})")

    # Persist the summary for later comparison between runs.
    output_file = "benchmark_results.json"
    with open(output_file, "w") as f:
        json.dump(summary, f, indent=2)
    logger.info(f"Results saved to {output_file}")

    return summary

if __name__ == "__main__":
    main()