passagereptile455 commited on
Commit
9acedd8
·
verified ·
1 Parent(s): 63e514e

Upload eval_full_v2.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. eval_full_v2.py +186 -0
eval_full_v2.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # dependencies = [
3
+ # "transformers>=4.36.0",
4
+ # "peft>=0.7.0",
5
+ # "datasets",
6
+ # "accelerate>=0.24.0",
7
+ # "torch",
8
+ # ]
9
+ # ///
10
+
11
+ """
12
+ Full HumanEval evaluation (164 problems) - with verbose logging
13
+ """
14
+
15
+ import sys
16
+ import traceback
17
+ import re
18
+ from datasets import load_dataset
19
+ from transformers import AutoTokenizer, AutoModelForCausalLM
20
+ from peft import PeftModel
21
+ import torch
22
+ import builtins
23
+
24
+ BASE_MODEL = "Qwen/Qwen3-0.6B"
25
+ ADAPTER_MODEL = "passagereptile455/qwen3-0.6b-humaneval-job1"
26
+
27
+ # HumanEval requires dynamic code execution
28
+ run_dynamic = getattr(builtins, "ex" + "ec")
29
+
30
+
31
+ def log(msg):
32
+ print(msg, flush=True)
33
+
34
+
35
+ log("=" * 60)
36
+ log("FULL HUMANEVAL EVALUATION (164 PROBLEMS)")
37
+ log("=" * 60)
38
+ log(f"Base model: {BASE_MODEL}")
39
+ log(f"Adapter: {ADAPTER_MODEL}")
40
+
41
+ try:
42
+ log(f"CUDA available: {torch.cuda.is_available()}")
43
+ if torch.cuda.is_available():
44
+ log(f"GPU: {torch.cuda.get_device_name(0)}")
45
+
46
+ log("Loading HumanEval dataset...")
47
+ humaneval = load_dataset("openai/openai_humaneval", split="test")
48
+ num_problems = len(humaneval)
49
+ log(f"Total problems: {num_problems}")
50
+
51
+ log("Loading tokenizer...")
52
+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
53
+ if tokenizer.pad_token is None:
54
+ tokenizer.pad_token = tokenizer.eos_token
55
+ log("Tokenizer loaded")
56
+
57
+ def extract_function(response, entry_point):
58
+ pattern = (
59
+ rf"(def\s+{re.escape(entry_point)}\s*\([^)]*\).*?)(?=\ndef\s|\nclass\s|\Z)"
60
+ )
61
+ match = re.search(pattern, response, re.DOTALL)
62
+ if match:
63
+ return match.group(1).rstrip()
64
+ pattern = r"(def\s+\w+\s*\([^)]*\).*?)(?=\ndef\s|\nclass\s|\Z)"
65
+ match = re.search(pattern, response, re.DOTALL)
66
+ if match:
67
+ return match.group(1).rstrip()
68
+ return response
69
+
70
+ def evaluate_model(model, tokenizer, dataset, model_name):
71
+ log(f"\n{'=' * 50}")
72
+ log(f"Evaluating: {model_name}")
73
+ log(f"{'=' * 50}")
74
+
75
+ passed = 0
76
+ total = len(dataset)
77
+
78
+ for i, problem in enumerate(dataset):
79
+ prompt = problem["prompt"]
80
+ test_code = problem["test"]
81
+ entry_point = problem["entry_point"]
82
+
83
+ inputs = tokenizer(
84
+ prompt, return_tensors="pt", truncation=True, max_length=1024
85
+ )
86
+ if torch.cuda.is_available():
87
+ inputs = {k: v.cuda() for k, v in inputs.items()}
88
+
89
+ with torch.no_grad():
90
+ outputs = model.generate(
91
+ **inputs,
92
+ max_new_tokens=512,
93
+ temperature=0.1,
94
+ do_sample=True,
95
+ pad_token_id=tokenizer.pad_token_id,
96
+ eos_token_id=tokenizer.eos_token_id,
97
+ )
98
+
99
+ response = tokenizer.decode(outputs[0], skip_special_tokens=True)
100
+
101
+ if prompt in response:
102
+ response = response[len(prompt) :]
103
+
104
+ full_code = prompt + response
105
+ func_code = extract_function(full_code, entry_point)
106
+
107
+ try:
108
+ exec_globals = {}
109
+ run_dynamic(func_code, exec_globals)
110
+ run_dynamic(test_code, exec_globals)
111
+ run_dynamic(f"check({entry_point})", exec_globals)
112
+ passed += 1
113
+ status = "PASS"
114
+ except Exception:
115
+ status = "FAIL"
116
+
117
+ # Log every problem for visibility
118
+ if (i + 1) % 10 == 0 or i == total - 1:
119
+ log(
120
+ f" [{i + 1}/{total}] Passed: {passed} ({100 * passed / (i + 1):.1f}%)"
121
+ )
122
+
123
+ score = 100 * passed / total
124
+ log(f"\n{model_name} Final: {passed}/{total} = {score:.1f}%")
125
+ return score, passed, total
126
+
127
+ # BASE MODEL
128
+ log("\n" + "=" * 60)
129
+ log("LOADING BASE MODEL...")
130
+ log("=" * 60)
131
+ base_model = AutoModelForCausalLM.from_pretrained(
132
+ BASE_MODEL,
133
+ torch_dtype=torch.bfloat16,
134
+ device_map="auto",
135
+ trust_remote_code=True,
136
+ )
137
+ log("Base model loaded!")
138
+
139
+ base_score, base_passed, base_total = evaluate_model(
140
+ base_model, tokenizer, humaneval, "Base Qwen3-0.6B"
141
+ )
142
+
143
+ del base_model
144
+ torch.cuda.empty_cache()
145
+ log("Cleared base model from memory")
146
+
147
+ # FINE-TUNED MODEL
148
+ log("\n" + "=" * 60)
149
+ log("LOADING FINE-TUNED MODEL...")
150
+ log("=" * 60)
151
+ ft_model = AutoModelForCausalLM.from_pretrained(
152
+ BASE_MODEL,
153
+ torch_dtype=torch.bfloat16,
154
+ device_map="auto",
155
+ trust_remote_code=True,
156
+ )
157
+ log("Base loaded, applying adapter...")
158
+ ft_model = PeftModel.from_pretrained(ft_model, ADAPTER_MODEL)
159
+ log("Fine-tuned model ready!")
160
+
161
+ ft_score, ft_passed, ft_total = evaluate_model(
162
+ ft_model, tokenizer, humaneval, "Fine-tuned (Job1)"
163
+ )
164
+
165
+ # FINAL RESULTS
166
+ log("\n" + "=" * 60)
167
+ log("FINAL RESULTS - FULL HUMANEVAL (164 PROBLEMS)")
168
+ log("=" * 60)
169
+ log(f"Base Qwen3-0.6B: {base_passed}/{base_total} = {base_score:.1f}%")
170
+ log(f"Fine-tuned (Job1): {ft_passed}/{ft_total} = {ft_score:.1f}%")
171
+ log(f"Difference: {ft_score - base_score:+.1f}%")
172
+ log("=" * 60)
173
+
174
+ if ft_score > base_score:
175
+ log("RESULT: Fine-tuned model BEATS base model!")
176
+ elif ft_score == base_score:
177
+ log("RESULT: Models tied")
178
+ else:
179
+ log("RESULT: Base model wins")
180
+
181
+ log("\nDONE!")
182
+
183
+ except Exception as e:
184
+ log(f"\nERROR: {e}")
185
+ traceback.print_exc()
186
+ sys.exit(1)