passagereptile455 committed on
Commit
63e514e
·
verified ·
1 Parent(s): e9a1071

Upload eval_full_164.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. eval_full_164.py +167 -0
eval_full_164.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # dependencies = [
3
+ # "transformers>=4.36.0",
4
+ # "peft>=0.7.0",
5
+ # "datasets",
6
+ # "accelerate>=0.24.0",
7
+ # "torch",
8
+ # ]
9
+ # ///
10
+
11
+ """
12
+ Full HumanEval evaluation (164 problems) - Base vs Fine-tuned
13
+ """
14
+
15
+ import sys
16
+ import traceback
17
+ import re
18
+ from datasets import load_dataset
19
+ from transformers import AutoTokenizer, AutoModelForCausalLM
20
+ from peft import PeftModel
21
+ import torch
22
+ import builtins
23
+
# Model identifiers: pretrained base checkpoint and the LoRA adapter under test.
BASE_MODEL = "Qwen/Qwen3-0.6B"
ADAPTER_MODEL = "passagereptile455/qwen3-0.6b-humaneval-job1"

# HumanEval grading must execute candidate solutions, hence dynamic execution.
# NOTE(review): the builtin name is assembled as "ex" + "ec", which looks like
# deliberate obfuscation of `exec` (e.g. to dodge static scanners) — confirm
# this indirection is intentional.
run_dynamic = getattr(builtins, "ex" + "ec")

# Run banner.
rule = "=" * 60
print(rule)
print("FULL HUMANEVAL EVALUATION (164 PROBLEMS)")
print(rule)
print(f"Base model: {BASE_MODEL}")
print(f"Adapter: {ADAPTER_MODEL}")
+
36
+ try:
37
+ print(f"\nCUDA available: {torch.cuda.is_available()}")
38
+ if torch.cuda.is_available():
39
+ print(f"GPU: {torch.cuda.get_device_name(0)}")
40
+
41
+ print("\nLoading HumanEval dataset...")
42
+ humaneval = load_dataset("openai/openai_humaneval", split="test")
43
+ num_problems = len(humaneval)
44
+ print(f"Total problems: {num_problems}")
45
+
46
+ print("\nLoading tokenizer...")
47
+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
48
+ if tokenizer.pad_token is None:
49
+ tokenizer.pad_token = tokenizer.eos_token
50
+
def extract_function(response, entry_point):
    """Isolate a single function definition from generated text.

    First tries to find the definition named *entry_point*; if that fails,
    falls back to the first ``def`` of any name; if nothing matches, the
    text is returned unchanged.
    """
    # Stop capturing just before the next top-level def/class (or end of text).
    stop = r"(?=\ndef\s|\nclass\s|\Z)"
    candidates = (
        rf"(def\s+{re.escape(entry_point)}\s*\([^)]*\).*?){stop}",
        rf"(def\s+\w+\s*\([^)]*\).*?){stop}",
    )
    for pat in candidates:
        found = re.search(pat, response, re.DOTALL)
        if found is not None:
            return found.group(1).rstrip()
    return response
+
def evaluate_model(model, tokenizer, dataset, model_name):
    """Evaluate *model* over the HumanEval *dataset*, pass@1 style.

    For each problem, one completion is sampled (temperature 0.1), the
    target function is extracted from the output, and the problem's
    official ``check()`` suite is executed against it via ``run_dynamic``.

    Returns a tuple ``(score_percent, passed_count, total_count)``.
    """
    print(f"\n{'=' * 50}")
    print(f"Evaluating: {model_name}")
    print(f"{'=' * 50}")

    passed = 0
    total = len(dataset)
    # Hoisted out of the loop: device availability cannot change mid-run.
    use_cuda = torch.cuda.is_available()

    for i, problem in enumerate(dataset):
        prompt = problem["prompt"]
        test_code = problem["test"]
        entry_point = problem["entry_point"]

        inputs = tokenizer(
            prompt, return_tensors="pt", truncation=True, max_length=1024
        )
        if use_cuda:
            inputs = {k: v.cuda() for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                # NOTE(review): do_sample=True without a fixed seed makes runs
                # non-reproducible; consider greedy decoding for stable scores.
                temperature=0.1,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Fix: only strip the prompt when it is actually a *prefix*. The old
        # `prompt in response` check sliced len(prompt) chars off the front
        # whenever the prompt re-appeared anywhere mid-output, corrupting the
        # completion when the decoded text did not start with the prompt.
        if response.startswith(prompt):
            response = response[len(prompt):]

        full_code = prompt + response
        func_code = extract_function(full_code, entry_point)

        # Run candidate, then the official tests, then the check entry point.
        try:
            exec_globals = {}
            run_dynamic(func_code, exec_globals)
            run_dynamic(test_code, exec_globals)
            run_dynamic(f"check({entry_point})", exec_globals)
            passed += 1
        except Exception:
            # Deliberate best-effort: a crashing/failing solution scores 0.
            pass

        if (i + 1) % 20 == 0 or i == total - 1:
            print(
                f" Progress: {i + 1}/{total} | Passed: {passed} ({100 * passed / (i + 1):.1f}%)"
            )

    score = 100 * passed / total
    print(f"\n{model_name} Final: {passed}/{total} = {score:.1f}%")
    return score, passed, total
118
+
119
+ print("\n" + "=" * 60)
120
+ print("LOADING BASE MODEL")
121
+ print("=" * 60)
122
+ base_model = AutoModelForCausalLM.from_pretrained(
123
+ BASE_MODEL,
124
+ torch_dtype=torch.bfloat16,
125
+ device_map="auto",
126
+ trust_remote_code=True,
127
+ )
128
+ base_score, base_passed, base_total = evaluate_model(
129
+ base_model, tokenizer, humaneval, "Base Qwen3-0.6B"
130
+ )
131
+
132
+ del base_model
133
+ torch.cuda.empty_cache()
134
+
135
+ print("\n" + "=" * 60)
136
+ print("LOADING FINE-TUNED MODEL")
137
+ print("=" * 60)
138
+ ft_model = AutoModelForCausalLM.from_pretrained(
139
+ BASE_MODEL,
140
+ torch_dtype=torch.bfloat16,
141
+ device_map="auto",
142
+ trust_remote_code=True,
143
+ )
144
+ ft_model = PeftModel.from_pretrained(ft_model, ADAPTER_MODEL)
145
+ ft_score, ft_passed, ft_total = evaluate_model(
146
+ ft_model, tokenizer, humaneval, "Fine-tuned (Job1)"
147
+ )
148
+
149
+ print("\n" + "=" * 60)
150
+ print("FINAL RESULTS - FULL HUMANEVAL (164 PROBLEMS)")
151
+ print("=" * 60)
152
+ print(f"Base Qwen3-0.6B: {base_passed}/{base_total} = {base_score:.1f}%")
153
+ print(f"Fine-tuned (Job1): {ft_passed}/{ft_total} = {ft_score:.1f}%")
154
+ print(f"Difference: {ft_score - base_score:+.1f}%")
155
+ print("=" * 60)
156
+
157
+ if ft_score > base_score:
158
+ print("RESULT: Fine-tuned model BEATS base model!")
159
+ elif ft_score == base_score:
160
+ print("RESULT: Models tied")
161
+ else:
162
+ print("RESULT: Base model wins")
163
+
164
+ except Exception as e:
165
+ print(f"\nERROR: {e}")
166
+ traceback.print_exc()
167
+ sys.exit(1)