passagereptile455 commited on
Commit
e9a1071
·
verified ·
1 Parent(s): e7d1dc3

Upload eval_job1.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. eval_job1.py +180 -0
eval_job1.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # dependencies = [
3
+ # "transformers>=4.36.0",
4
+ # "peft>=0.7.0",
5
+ # "accelerate>=0.24.0",
6
+ # "torch",
7
+ # "datasets",
8
+ # "tqdm",
9
+ # ]
10
+ # ///
11
+
12
+ """
13
+ Evaluate models on HumanEval with proper pass@1 execution.
14
+ Compares base model vs fine-tuned adapter.
15
+ """
16
+
17
+ import subprocess
18
+ import tempfile
19
+ import os
20
+ import sys
21
+ import torch
22
+ from datasets import load_dataset
23
+ from transformers import AutoModelForCausalLM, AutoTokenizer
24
+ from peft import PeftModel
25
+ from tqdm import tqdm
26
+
27
# Configuration
# BASE_MODEL is the pretrained checkpoint; ADAPTER_MODEL is the LoRA/PEFT
# adapter produced by fine-tuning, applied on top of the same base.
BASE_MODEL = "Qwen/Qwen3-0.6B"
ADAPTER_MODEL = "passagereptile455/qwen3-0.6b-humaneval-job1"
NUM_PROBLEMS = 50  # Use 50 for faster eval, 164 for full

print(f"Base model: {BASE_MODEL}")
print(f"Adapter: {ADAPTER_MODEL}")
print(f"Problems: {NUM_PROBLEMS}")

# Load HumanEval
# The full benchmark has 164 problems; optionally subsample the first
# NUM_PROBLEMS for a faster (but noisier) evaluation.
print("\nLoading HumanEval dataset...")
humaneval = load_dataset("openai/openai_humaneval", split="test")
if NUM_PROBLEMS < 164:
    humaneval = humaneval.select(range(NUM_PROBLEMS))
print(f"Using {len(humaneval)} problems")
42
+
43
+
44
def extract_function(text, entry_point):
    """Extract the ``entry_point`` function (signature + body) from generated text.

    Scans line by line: once the ``def entry_point...`` line is found, keeps
    every following line that is blank or indented deeper than the ``def``
    line, and stops at the first dedented non-blank line (a new top-level
    ``def``/``class`` or statement).

    Args:
        text: Decoded model output (typically prompt + completion).
        entry_point: Name of the function to extract.

    Returns:
        The extracted function source, or "" if the function is not found.
    """
    result = []
    in_func = False
    base_indent = 0

    for line in text.split("\n"):
        stripped = line.lstrip()
        if not in_func:
            # Still searching for the target definition.
            if stripped.startswith(f"def {entry_point}"):
                in_func = True
                base_indent = len(line) - len(stripped)
                result.append(line)
            continue
        # Inside the function: blank lines are kept (normalized to "").
        if not line.strip():
            result.append("")
            continue
        # A non-blank line belongs to the function only while it is indented
        # deeper than the def line itself; the first dedent ends extraction.
        if len(line) - len(stripped) > base_indent:
            result.append(line)
        else:
            break

    return "\n".join(result)
75
+
76
+
77
def run_test(code, test, timeout=5):
    """Execute ``code`` followed by ``test`` in a fresh Python subprocess.

    Writes the combined script to a temp file, runs it with the current
    interpreter, and treats exit status 0 as a pass.

    NOTE(security): this executes model-generated code without sandboxing —
    run only in an isolated/disposable environment.

    Args:
        code: Candidate solution source.
        test: HumanEval test harness source appended after the solution.
        timeout: Seconds before the run is killed and counted as a failure.

    Returns:
        True iff the script exited cleanly within ``timeout`` seconds.
    """
    full_code = code + "\n\n" + test

    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
        f.write(full_code)
        tmp_path = f.name

    try:
        result = subprocess.run(
            [sys.executable, tmp_path], capture_output=True, timeout=timeout, text=True
        )
        return result.returncode == 0
    except Exception:
        # Timeout, launch failure, or any other error counts as a failed case.
        return False
    finally:
        try:
            os.unlink(tmp_path)
        except OSError:
            # Best-effort cleanup; never mask the result over a stale temp file.
            pass
97
+
98
+
99
def evaluate_model(model, tokenizer, problems, model_name):
    """Evaluate a model on HumanEval problems (single-sample pass@1).

    For each problem, generates one sampled completion (temperature 0.2,
    top-p 0.95, up to 512 new tokens), extracts the target function from the
    decoded text, and executes the problem's unit tests via ``run_test``.

    Args:
        model: Causal LM with ``.generate`` and ``.device``.
        tokenizer: Matching tokenizer.
        problems: Sequence of HumanEval records with ``prompt``,
            ``entry_point``, and ``test`` fields.
        model_name: Label used for progress/log output.

    Returns:
        Tuple ``(score_percent, num_passed, num_total)``.
    """
    if len(problems) == 0:
        # Guard: the unguarded division below would raise ZeroDivisionError.
        return 0.0, 0, 0

    results = []

    print(f"\nEvaluating: {model_name}")
    for problem in tqdm(problems, desc=model_name):
        prompt = problem["prompt"]
        entry_point = problem["entry_point"]
        test = problem["test"]

        # Generate a completion on the model's device.
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.2,
                top_p=0.95,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )

        # Decoded output includes the prompt; pull out just the target function.
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        code = extract_function(generated, entry_point)

        # Run the problem's hidden tests against the extracted function.
        results.append(run_test(code, test))

    passed = sum(results)
    score = passed / len(results) * 100
    return score, passed, len(results)
131
+
132
+
133
# Load tokenizer
print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
if tokenizer.pad_token is None:
    # Some checkpoints ship without a pad token; generation needs one, so
    # fall back to EOS (matches pad_token_id=eos_token_id used in generate).
    tokenizer.pad_token = tokenizer.eos_token

# Evaluate BASE model
print("\nLoading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True
)

base_score, base_passed, base_total = evaluate_model(
    base_model, tokenizer, humaneval, "Base Qwen3-0.6B"
)

# Clear memory
# Free the base model before loading the fine-tuned copy so both fit
# sequentially on the same device.
del base_model
torch.cuda.empty_cache()

# Evaluate FINE-TUNED model
# The adapter is applied on top of a fresh copy of the same base weights.
print(f"\nLoading fine-tuned model from {ADAPTER_MODEL}...")
try:
    ft_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True
    )
    ft_model = PeftModel.from_pretrained(ft_model, ADAPTER_MODEL)

    ft_score, ft_passed, ft_total = evaluate_model(
        ft_model, tokenizer, humaneval, "Fine-tuned"
    )
except Exception as e:
    # Best-effort: a missing/incompatible adapter scores 0 rather than
    # aborting the run, so the base-model result is still reported.
    print(f"Error loading adapter: {e}")
    ft_score, ft_passed, ft_total = 0, 0, NUM_PROBLEMS

# Results
print("\n" + "=" * 60)
print("HUMANEVAL RESULTS")
print("=" * 60)
print(f"Base Qwen3-0.6B: {base_score:.1f}% ({base_passed}/{base_total})")
print(f"Fine-tuned: {ft_score:.1f}% ({ft_passed}/{ft_total})")
print(f"Difference: {ft_score - base_score:+.1f}%")
print("=" * 60)

if ft_score > base_score:
    print("SUCCESS! Fine-tuned model beats base model!")
else:
    print("Fine-tuned model did not beat base model.")