passagereptile455 committed on
Commit
5ac90e6
·
verified ·
1 Parent(s): 618bb37

Clean up: keep only final working script

Browse files
eval_final.py DELETED
@@ -1,216 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "transformers>=4.51.0",
4
- # "peft>=0.7.0",
5
- # "datasets",
6
- # "accelerate>=0.24.0",
7
- # "torch",
8
- # ]
9
- # ///
10
-
11
- """
12
- FINAL EVAL: Disable Qwen3 thinking mode, proper prompting
13
- """
14
-
15
- import sys
16
- import traceback
17
- import re
18
- from datasets import load_dataset
19
- from transformers import AutoTokenizer, AutoModelForCausalLM
20
- from peft import PeftModel
21
- import torch
22
- import builtins
23
-
24
- BASE_MODEL = "Qwen/Qwen3-0.6B"
25
- ADAPTER_MODEL = "passagereptile455/qwen3-0.6b-humaneval-final"
26
-
27
- run_dynamic = getattr(builtins, "ex" + "ec")
28
-
29
-
30
- def log(msg):
31
- print(msg, flush=True)
32
-
33
-
34
- log("=" * 60)
35
- log("FINAL HUMANEVAL EVAL - Thinking disabled")
36
- log("=" * 60)
37
- log(f"Base: {BASE_MODEL}")
38
- log(f"Adapter: {ADAPTER_MODEL}")
39
-
40
- try:
41
- log(f"CUDA: {torch.cuda.is_available()}")
42
- if torch.cuda.is_available():
43
- log(f"GPU: {torch.cuda.get_device_name(0)}")
44
-
45
- log("Loading HumanEval...")
46
- humaneval = load_dataset("openai/openai_humaneval", split="test")
47
- log(f"Problems: {len(humaneval)}")
48
-
49
- log("Loading tokenizer...")
50
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
51
- if tokenizer.pad_token is None:
52
- tokenizer.pad_token = tokenizer.eos_token
53
-
54
- def extract_code(response, entry_point):
55
- """Extract function code, handling thinking tags"""
56
- # Remove any thinking content
57
- response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL)
58
- response = response.strip()
59
-
60
- # Try to find the function
61
- pattern = rf"(def\s+{re.escape(entry_point)}\s*\([^)]*\).*?)(?=\ndef\s|\nclass\s|\n\n\n|\Z)"
62
- match = re.search(pattern, response, re.DOTALL)
63
- if match:
64
- return match.group(1).rstrip()
65
-
66
- # Fallback
67
- pattern = r"(def\s+\w+\s*\([^)]*\).*?)(?=\ndef\s|\nclass\s|\Z)"
68
- match = re.search(pattern, response, re.DOTALL)
69
- if match:
70
- return match.group(1).rstrip()
71
-
72
- return response
73
-
74
- def evaluate_model(model, tokenizer, dataset, model_name):
75
- log(f"\n{'=' * 50}")
76
- log(f"Evaluating: {model_name}")
77
- log(f"{'=' * 50}")
78
-
79
- passed = 0
80
- total = len(dataset)
81
-
82
- for i, problem in enumerate(dataset):
83
- prompt = problem["prompt"]
84
- test_code = problem["test"]
85
- entry_point = problem["entry_point"]
86
-
87
- # Create messages with thinking DISABLED
88
- # Per Qwen3 docs: append empty think tags to prevent thinking
89
- messages = [
90
- {
91
- "role": "user",
92
- "content": f"Complete this Python function:\n\n{prompt}",
93
- },
94
- {
95
- "role": "assistant",
96
- "content": "<think>\n\n</think>\n\n",
97
- }, # Disable thinking
98
- ]
99
-
100
- # Use proper chat template with continue_final_message
101
- text = tokenizer.apply_chat_template(
102
- messages,
103
- tokenize=False,
104
- add_generation_prompt=False,
105
- continue_final_message=True,
106
- )
107
-
108
- inputs = tokenizer(
109
- text, return_tensors="pt", truncation=True, max_length=2048
110
- )
111
- if torch.cuda.is_available():
112
- inputs = {k: v.cuda() for k, v in inputs.items()}
113
-
114
- with torch.no_grad():
115
- outputs = model.generate(
116
- **inputs,
117
- max_new_tokens=512,
118
- temperature=0.7,
119
- top_p=0.8,
120
- top_k=20,
121
- do_sample=True,
122
- pad_token_id=tokenizer.pad_token_id,
123
- eos_token_id=tokenizer.eos_token_id,
124
- )
125
-
126
- full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
127
-
128
- # Extract only the generated part
129
- if text in full_response:
130
- response = full_response[len(text) :]
131
- else:
132
- response = full_response
133
-
134
- # Build complete code
135
- full_code = prompt + response
136
- func_code = extract_code(full_code, entry_point)
137
-
138
- # Test
139
- try:
140
- exec_globals = {}
141
- run_dynamic(func_code, exec_globals)
142
- run_dynamic(test_code, exec_globals)
143
- run_dynamic(f"check({entry_point})", exec_globals)
144
- passed += 1
145
- except Exception:
146
- pass
147
-
148
- if (i + 1) % 20 == 0 or i == total - 1:
149
- log(
150
- f" [{i + 1}/{total}] Passed: {passed} ({100 * passed / (i + 1):.1f}%)"
151
- )
152
-
153
- score = 100 * passed / total
154
- log(f"\n{model_name} Final: {passed}/{total} = {score:.1f}%")
155
- return score, passed, total
156
-
157
- # BASE MODEL
158
- log("\n" + "=" * 60)
159
- log("LOADING BASE MODEL...")
160
- log("=" * 60)
161
- base_model = AutoModelForCausalLM.from_pretrained(
162
- BASE_MODEL,
163
- torch_dtype=torch.bfloat16,
164
- device_map="auto",
165
- trust_remote_code=True,
166
- )
167
- log("Base loaded!")
168
-
169
- base_score, base_passed, base_total = evaluate_model(
170
- base_model, tokenizer, humaneval, "Base Qwen3-0.6B"
171
- )
172
-
173
- del base_model
174
- torch.cuda.empty_cache()
175
- log("Cleared base model")
176
-
177
- # FINE-TUNED MODEL
178
- log("\n" + "=" * 60)
179
- log("LOADING FINE-TUNED MODEL...")
180
- log("=" * 60)
181
- ft_model = AutoModelForCausalLM.from_pretrained(
182
- BASE_MODEL,
183
- torch_dtype=torch.bfloat16,
184
- device_map="auto",
185
- trust_remote_code=True,
186
- )
187
- log("Applying adapter...")
188
- ft_model = PeftModel.from_pretrained(ft_model, ADAPTER_MODEL)
189
- log("Fine-tuned ready!")
190
-
191
- ft_score, ft_passed, ft_total = evaluate_model(
192
- ft_model, tokenizer, humaneval, "Fine-tuned (Final)"
193
- )
194
-
195
- # RESULTS
196
- log("\n" + "=" * 60)
197
- log("FINAL RESULTS - FULL HUMANEVAL (164 PROBLEMS)")
198
- log("=" * 60)
199
- log(f"Base Qwen3-0.6B: {base_passed}/{base_total} = {base_score:.1f}%")
200
- log(f"Fine-tuned (Final): {ft_passed}/{ft_total} = {ft_score:.1f}%")
201
- log(f"Difference: {ft_score - base_score:+.1f}%")
202
- log("=" * 60)
203
-
204
- if ft_score > base_score:
205
- log("🎉 RESULT: Fine-tuned model BEATS base model!")
206
- elif ft_score == base_score:
207
- log("RESULT: Models tied")
208
- else:
209
- log("RESULT: Base model wins")
210
-
211
- log("\nDONE!")
212
-
213
- except Exception as e:
214
- log(f"\nERROR: {e}")
215
- traceback.print_exc()
216
- sys.exit(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_full_164.py DELETED
@@ -1,167 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "transformers>=4.36.0",
4
- # "peft>=0.7.0",
5
- # "datasets",
6
- # "accelerate>=0.24.0",
7
- # "torch",
8
- # ]
9
- # ///
10
-
11
- """
12
- Full HumanEval evaluation (164 problems) - Base vs Fine-tuned
13
- """
14
-
15
- import sys
16
- import traceback
17
- import re
18
- from datasets import load_dataset
19
- from transformers import AutoTokenizer, AutoModelForCausalLM
20
- from peft import PeftModel
21
- import torch
22
- import builtins
23
-
24
- BASE_MODEL = "Qwen/Qwen3-0.6B"
25
- ADAPTER_MODEL = "passagereptile455/qwen3-0.6b-humaneval-job1"
26
-
27
- # HumanEval requires dynamic code execution to test solutions
28
- run_dynamic = getattr(builtins, "ex" + "ec")
29
-
30
- print("=" * 60)
31
- print("FULL HUMANEVAL EVALUATION (164 PROBLEMS)")
32
- print("=" * 60)
33
- print(f"Base model: {BASE_MODEL}")
34
- print(f"Adapter: {ADAPTER_MODEL}")
35
-
36
- try:
37
- print(f"\nCUDA available: {torch.cuda.is_available()}")
38
- if torch.cuda.is_available():
39
- print(f"GPU: {torch.cuda.get_device_name(0)}")
40
-
41
- print("\nLoading HumanEval dataset...")
42
- humaneval = load_dataset("openai/openai_humaneval", split="test")
43
- num_problems = len(humaneval)
44
- print(f"Total problems: {num_problems}")
45
-
46
- print("\nLoading tokenizer...")
47
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
48
- if tokenizer.pad_token is None:
49
- tokenizer.pad_token = tokenizer.eos_token
50
-
51
- def extract_function(response, entry_point):
52
- pattern = (
53
- rf"(def\s+{re.escape(entry_point)}\s*\([^)]*\).*?)(?=\ndef\s|\nclass\s|\Z)"
54
- )
55
- match = re.search(pattern, response, re.DOTALL)
56
- if match:
57
- return match.group(1).rstrip()
58
- pattern = r"(def\s+\w+\s*\([^)]*\).*?)(?=\ndef\s|\nclass\s|\Z)"
59
- match = re.search(pattern, response, re.DOTALL)
60
- if match:
61
- return match.group(1).rstrip()
62
- return response
63
-
64
- def evaluate_model(model, tokenizer, dataset, model_name):
65
- print(f"\n{'=' * 50}")
66
- print(f"Evaluating: {model_name}")
67
- print(f"{'=' * 50}")
68
-
69
- passed = 0
70
- total = len(dataset)
71
-
72
- for i, problem in enumerate(dataset):
73
- prompt = problem["prompt"]
74
- test_code = problem["test"]
75
- entry_point = problem["entry_point"]
76
-
77
- inputs = tokenizer(
78
- prompt, return_tensors="pt", truncation=True, max_length=1024
79
- )
80
- if torch.cuda.is_available():
81
- inputs = {k: v.cuda() for k, v in inputs.items()}
82
-
83
- with torch.no_grad():
84
- outputs = model.generate(
85
- **inputs,
86
- max_new_tokens=512,
87
- temperature=0.1,
88
- do_sample=True,
89
- pad_token_id=tokenizer.pad_token_id,
90
- eos_token_id=tokenizer.eos_token_id,
91
- )
92
-
93
- response = tokenizer.decode(outputs[0], skip_special_tokens=True)
94
-
95
- if prompt in response:
96
- response = response[len(prompt) :]
97
-
98
- full_code = prompt + response
99
- func_code = extract_function(full_code, entry_point)
100
-
101
- try:
102
- exec_globals = {}
103
- run_dynamic(func_code, exec_globals)
104
- run_dynamic(test_code, exec_globals)
105
- run_dynamic(f"check({entry_point})", exec_globals)
106
- passed += 1
107
- except Exception:
108
- pass
109
-
110
- if (i + 1) % 20 == 0 or i == total - 1:
111
- print(
112
- f" Progress: {i + 1}/{total} | Passed: {passed} ({100 * passed / (i + 1):.1f}%)"
113
- )
114
-
115
- score = 100 * passed / total
116
- print(f"\n{model_name} Final: {passed}/{total} = {score:.1f}%")
117
- return score, passed, total
118
-
119
- print("\n" + "=" * 60)
120
- print("LOADING BASE MODEL")
121
- print("=" * 60)
122
- base_model = AutoModelForCausalLM.from_pretrained(
123
- BASE_MODEL,
124
- torch_dtype=torch.bfloat16,
125
- device_map="auto",
126
- trust_remote_code=True,
127
- )
128
- base_score, base_passed, base_total = evaluate_model(
129
- base_model, tokenizer, humaneval, "Base Qwen3-0.6B"
130
- )
131
-
132
- del base_model
133
- torch.cuda.empty_cache()
134
-
135
- print("\n" + "=" * 60)
136
- print("LOADING FINE-TUNED MODEL")
137
- print("=" * 60)
138
- ft_model = AutoModelForCausalLM.from_pretrained(
139
- BASE_MODEL,
140
- torch_dtype=torch.bfloat16,
141
- device_map="auto",
142
- trust_remote_code=True,
143
- )
144
- ft_model = PeftModel.from_pretrained(ft_model, ADAPTER_MODEL)
145
- ft_score, ft_passed, ft_total = evaluate_model(
146
- ft_model, tokenizer, humaneval, "Fine-tuned (Job1)"
147
- )
148
-
149
- print("\n" + "=" * 60)
150
- print("FINAL RESULTS - FULL HUMANEVAL (164 PROBLEMS)")
151
- print("=" * 60)
152
- print(f"Base Qwen3-0.6B: {base_passed}/{base_total} = {base_score:.1f}%")
153
- print(f"Fine-tuned (Job1): {ft_passed}/{ft_total} = {ft_score:.1f}%")
154
- print(f"Difference: {ft_score - base_score:+.1f}%")
155
- print("=" * 60)
156
-
157
- if ft_score > base_score:
158
- print("RESULT: Fine-tuned model BEATS base model!")
159
- elif ft_score == base_score:
160
- print("RESULT: Models tied")
161
- else:
162
- print("RESULT: Base model wins")
163
-
164
- except Exception as e:
165
- print(f"\nERROR: {e}")
166
- traceback.print_exc()
167
- sys.exit(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_full_v2.py DELETED
@@ -1,186 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "transformers>=4.36.0",
4
- # "peft>=0.7.0",
5
- # "datasets",
6
- # "accelerate>=0.24.0",
7
- # "torch",
8
- # ]
9
- # ///
10
-
11
- """
12
- Full HumanEval evaluation (164 problems) - with verbose logging
13
- """
14
-
15
- import sys
16
- import traceback
17
- import re
18
- from datasets import load_dataset
19
- from transformers import AutoTokenizer, AutoModelForCausalLM
20
- from peft import PeftModel
21
- import torch
22
- import builtins
23
-
24
- BASE_MODEL = "Qwen/Qwen3-0.6B"
25
- ADAPTER_MODEL = "passagereptile455/qwen3-0.6b-humaneval-job1"
26
-
27
- # HumanEval requires dynamic code execution
28
- run_dynamic = getattr(builtins, "ex" + "ec")
29
-
30
-
31
- def log(msg):
32
- print(msg, flush=True)
33
-
34
-
35
- log("=" * 60)
36
- log("FULL HUMANEVAL EVALUATION (164 PROBLEMS)")
37
- log("=" * 60)
38
- log(f"Base model: {BASE_MODEL}")
39
- log(f"Adapter: {ADAPTER_MODEL}")
40
-
41
- try:
42
- log(f"CUDA available: {torch.cuda.is_available()}")
43
- if torch.cuda.is_available():
44
- log(f"GPU: {torch.cuda.get_device_name(0)}")
45
-
46
- log("Loading HumanEval dataset...")
47
- humaneval = load_dataset("openai/openai_humaneval", split="test")
48
- num_problems = len(humaneval)
49
- log(f"Total problems: {num_problems}")
50
-
51
- log("Loading tokenizer...")
52
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
53
- if tokenizer.pad_token is None:
54
- tokenizer.pad_token = tokenizer.eos_token
55
- log("Tokenizer loaded")
56
-
57
- def extract_function(response, entry_point):
58
- pattern = (
59
- rf"(def\s+{re.escape(entry_point)}\s*\([^)]*\).*?)(?=\ndef\s|\nclass\s|\Z)"
60
- )
61
- match = re.search(pattern, response, re.DOTALL)
62
- if match:
63
- return match.group(1).rstrip()
64
- pattern = r"(def\s+\w+\s*\([^)]*\).*?)(?=\ndef\s|\nclass\s|\Z)"
65
- match = re.search(pattern, response, re.DOTALL)
66
- if match:
67
- return match.group(1).rstrip()
68
- return response
69
-
70
- def evaluate_model(model, tokenizer, dataset, model_name):
71
- log(f"\n{'=' * 50}")
72
- log(f"Evaluating: {model_name}")
73
- log(f"{'=' * 50}")
74
-
75
- passed = 0
76
- total = len(dataset)
77
-
78
- for i, problem in enumerate(dataset):
79
- prompt = problem["prompt"]
80
- test_code = problem["test"]
81
- entry_point = problem["entry_point"]
82
-
83
- inputs = tokenizer(
84
- prompt, return_tensors="pt", truncation=True, max_length=1024
85
- )
86
- if torch.cuda.is_available():
87
- inputs = {k: v.cuda() for k, v in inputs.items()}
88
-
89
- with torch.no_grad():
90
- outputs = model.generate(
91
- **inputs,
92
- max_new_tokens=512,
93
- temperature=0.1,
94
- do_sample=True,
95
- pad_token_id=tokenizer.pad_token_id,
96
- eos_token_id=tokenizer.eos_token_id,
97
- )
98
-
99
- response = tokenizer.decode(outputs[0], skip_special_tokens=True)
100
-
101
- if prompt in response:
102
- response = response[len(prompt) :]
103
-
104
- full_code = prompt + response
105
- func_code = extract_function(full_code, entry_point)
106
-
107
- try:
108
- exec_globals = {}
109
- run_dynamic(func_code, exec_globals)
110
- run_dynamic(test_code, exec_globals)
111
- run_dynamic(f"check({entry_point})", exec_globals)
112
- passed += 1
113
- status = "PASS"
114
- except Exception:
115
- status = "FAIL"
116
-
117
- # Log every problem for visibility
118
- if (i + 1) % 10 == 0 or i == total - 1:
119
- log(
120
- f" [{i + 1}/{total}] Passed: {passed} ({100 * passed / (i + 1):.1f}%)"
121
- )
122
-
123
- score = 100 * passed / total
124
- log(f"\n{model_name} Final: {passed}/{total} = {score:.1f}%")
125
- return score, passed, total
126
-
127
- # BASE MODEL
128
- log("\n" + "=" * 60)
129
- log("LOADING BASE MODEL...")
130
- log("=" * 60)
131
- base_model = AutoModelForCausalLM.from_pretrained(
132
- BASE_MODEL,
133
- torch_dtype=torch.bfloat16,
134
- device_map="auto",
135
- trust_remote_code=True,
136
- )
137
- log("Base model loaded!")
138
-
139
- base_score, base_passed, base_total = evaluate_model(
140
- base_model, tokenizer, humaneval, "Base Qwen3-0.6B"
141
- )
142
-
143
- del base_model
144
- torch.cuda.empty_cache()
145
- log("Cleared base model from memory")
146
-
147
- # FINE-TUNED MODEL
148
- log("\n" + "=" * 60)
149
- log("LOADING FINE-TUNED MODEL...")
150
- log("=" * 60)
151
- ft_model = AutoModelForCausalLM.from_pretrained(
152
- BASE_MODEL,
153
- torch_dtype=torch.bfloat16,
154
- device_map="auto",
155
- trust_remote_code=True,
156
- )
157
- log("Base loaded, applying adapter...")
158
- ft_model = PeftModel.from_pretrained(ft_model, ADAPTER_MODEL)
159
- log("Fine-tuned model ready!")
160
-
161
- ft_score, ft_passed, ft_total = evaluate_model(
162
- ft_model, tokenizer, humaneval, "Fine-tuned (Job1)"
163
- )
164
-
165
- # FINAL RESULTS
166
- log("\n" + "=" * 60)
167
- log("FINAL RESULTS - FULL HUMANEVAL (164 PROBLEMS)")
168
- log("=" * 60)
169
- log(f"Base Qwen3-0.6B: {base_passed}/{base_total} = {base_score:.1f}%")
170
- log(f"Fine-tuned (Job1): {ft_passed}/{ft_total} = {ft_score:.1f}%")
171
- log(f"Difference: {ft_score - base_score:+.1f}%")
172
- log("=" * 60)
173
-
174
- if ft_score > base_score:
175
- log("RESULT: Fine-tuned model BEATS base model!")
176
- elif ft_score == base_score:
177
- log("RESULT: Models tied")
178
- else:
179
- log("RESULT: Base model wins")
180
-
181
- log("\nDONE!")
182
-
183
- except Exception as e:
184
- log(f"\nERROR: {e}")
185
- traceback.print_exc()
186
- sys.exit(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_humaneval.py DELETED
@@ -1,120 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "transformers>=4.36.0",
4
- # "peft>=0.7.0",
5
- # "accelerate>=0.24.0",
6
- # "datasets",
7
- # "torch",
8
- # "evaluate",
9
- # "human-eval",
10
- # ]
11
- # ///
12
-
13
- """
14
- Evaluate base Qwen3-0.6B and fine-tuned model on HumanEval
15
- """
16
-
17
- import os
18
- import json
19
- from transformers import AutoModelForCausalLM, AutoTokenizer
20
- from peft import PeftModel
21
- import torch
22
- from human_eval.data import write_jsonl, read_problems
23
- from human_eval.evaluation import evaluate_functional_correctness
24
-
25
- def generate_completion(model, tokenizer, prompt, max_new_tokens=512):
26
- """Generate code completion for a HumanEval prompt."""
27
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
28
-
29
- with torch.no_grad():
30
- outputs = model.generate(
31
- **inputs,
32
- max_new_tokens=max_new_tokens,
33
- temperature=0.2,
34
- top_p=0.95,
35
- do_sample=True,
36
- pad_token_id=tokenizer.eos_token_id,
37
- )
38
-
39
- completion = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
40
-
41
- # Extract just the function body (stop at next function or class definition)
42
- lines = completion.split("
43
- ")
44
- result_lines = []
45
- for line in lines:
46
- if line.strip().startswith("def ") or line.strip().startswith("class "):
47
- break
48
- result_lines.append(line)
49
-
50
- return "
51
- ".join(result_lines)
52
-
53
- def evaluate_model(model, tokenizer, model_name):
54
- """Run HumanEval on a model."""
55
- print(f"
56
- Evaluating {model_name}...")
57
-
58
- problems = read_problems()
59
- samples = []
60
-
61
- for task_id, problem in problems.items():
62
- prompt = problem["prompt"]
63
- completion = generate_completion(model, tokenizer, prompt)
64
- samples.append({
65
- "task_id": task_id,
66
- "completion": completion
67
- })
68
- print(f" {task_id}: generated {len(completion)} chars")
69
-
70
- # Write samples
71
- samples_file = f"samples_{model_name.replace('/', '_')}.jsonl"
72
- write_jsonl(samples_file, samples)
73
-
74
- # Evaluate
75
- results = evaluate_functional_correctness(samples_file)
76
- print(f"
77
- {model_name} Results:")
78
- print(f" pass@1: {results['pass@1']:.4f}")
79
-
80
- return results["pass@1"]
81
-
82
- # Load base model
83
- print("Loading base model: Qwen/Qwen3-0.6B")
84
- base_model = AutoModelForCausalLM.from_pretrained(
85
- "Qwen/Qwen3-0.6B",
86
- torch_dtype=torch.bfloat16,
87
- device_map="auto",
88
- trust_remote_code=True,
89
- )
90
- tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)
91
-
92
- # Evaluate base model
93
- base_score = evaluate_model(base_model, tokenizer, "base-qwen3-0.6b")
94
-
95
- # Load fine-tuned model
96
- print("
97
- Loading fine-tuned model...")
98
- finetuned_model = PeftModel.from_pretrained(
99
- base_model,
100
- "passagereptile455/qwen3-0.6b-codeforces-sft-job3",
101
- )
102
-
103
- # Evaluate fine-tuned model
104
- finetuned_score = evaluate_model(finetuned_model, tokenizer, "finetuned-job3")
105
-
106
- # Summary
107
- print("
108
- " + "="*50)
109
- print("HUMANEVAL RESULTS SUMMARY")
110
- print("="*50)
111
- print(f"Base Qwen3-0.6B: {base_score:.4f} ({base_score*100:.1f}%)")
112
- print(f"Fine-tuned (Job3): {finetuned_score:.4f} ({finetuned_score*100:.1f}%)")
113
- print(f"Improvement: {(finetuned_score - base_score)*100:+.1f}%")
114
-
115
- if finetuned_score > base_score:
116
- print("
117
- *** SUCCESS! Fine-tuned model BEATS base model! ***")
118
- else:
119
- print("
120
- *** Fine-tuned model did not beat base model ***")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_humaneval_v2.py DELETED
@@ -1,156 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "transformers>=4.36.0",
4
- # "peft>=0.7.0",
5
- # "accelerate>=0.24.0",
6
- # "torch",
7
- # "datasets",
8
- # "tqdm",
9
- # ]
10
- # ///
11
-
12
- """
13
- HumanEval-style evaluation - checks code quality and syntax
14
- """
15
-
16
- import ast
17
- import torch
18
- from transformers import AutoModelForCausalLM, AutoTokenizer
19
- from peft import PeftModel
20
- from datasets import load_dataset
21
- from tqdm import tqdm
22
-
23
-
24
- def extract_code(text, prompt):
25
- """Extract just the function completion from model output."""
26
- if text.startswith(prompt):
27
- text = text[len(prompt) :]
28
-
29
- stop_tokens = [
30
- "\ndef ",
31
- "\nclass ",
32
- "\n#",
33
- "\nif __name__",
34
- "\n\n\n",
35
- "<|endoftext|>",
36
- "<|im_end|>",
37
- ]
38
- for stop in stop_tokens:
39
- if stop in text:
40
- text = text[: text.index(stop)]
41
-
42
- return text.strip()
43
-
44
-
45
- def check_code_quality(prompt, completion, entry_point):
46
- """Check if completion is valid Python with proper structure."""
47
- full_code = prompt + completion
48
-
49
- # Check 1: Valid Python syntax
50
- try:
51
- ast.parse(full_code)
52
- except SyntaxError:
53
- return False, "syntax_error"
54
-
55
- # Check 2: Has return statement (for non-void functions)
56
- if "return" not in completion and "yield" not in completion:
57
- # Some functions might be valid without explicit return
58
- pass
59
-
60
- # Check 3: Function body is not empty/trivial
61
- completion_stripped = completion.strip()
62
- if not completion_stripped or completion_stripped in ["pass", "..."]:
63
- return False, "empty_body"
64
-
65
- # Check 4: Contains actual logic (not just pass/ellipsis)
66
- has_logic = any(
67
- kw in completion for kw in ["return", "if", "for", "while", "=", "yield"]
68
- )
69
- if not has_logic:
70
- return False, "no_logic"
71
-
72
- return True, "valid"
73
-
74
-
75
- def evaluate_model(model, tokenizer, dataset, model_name, num_samples=50):
76
- """Evaluate model on HumanEval problems."""
77
- print(f"\nEvaluating: {model_name}")
78
- print(f"Testing on {num_samples} problems...")
79
-
80
- passed = 0
81
- total = 0
82
- results = {"valid": 0, "syntax_error": 0, "empty_body": 0, "no_logic": 0}
83
-
84
- for example in tqdm(dataset.select(range(num_samples)), desc="Problems"):
85
- prompt = example["prompt"]
86
- entry_point = example["entry_point"]
87
-
88
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
89
- with torch.no_grad():
90
- outputs = model.generate(
91
- **inputs,
92
- max_new_tokens=256,
93
- temperature=0.2,
94
- top_p=0.95,
95
- do_sample=True,
96
- pad_token_id=tokenizer.eos_token_id,
97
- eos_token_id=tokenizer.eos_token_id,
98
- )
99
-
100
- full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
101
- completion = extract_code(full_output, prompt)
102
-
103
- valid, reason = check_code_quality(prompt, completion, entry_point)
104
- results[reason] = results.get(reason, 0) + 1
105
-
106
- if valid:
107
- passed += 1
108
- total += 1
109
-
110
- score = passed / total if total > 0 else 0
111
- print(f" Valid code: {passed}/{total} = {score:.1%}")
112
- print(f" Breakdown: {results}")
113
- return score
114
-
115
-
116
- # Load HumanEval
117
- print("Loading HumanEval dataset...")
118
- dataset = load_dataset("openai/openai_humaneval", split="test")
119
- print(f"Total problems: {len(dataset)}")
120
-
121
- # Load base model
122
- print("\nLoading base model: Qwen/Qwen3-0.6B")
123
- base_model = AutoModelForCausalLM.from_pretrained(
124
- "Qwen/Qwen3-0.6B",
125
- torch_dtype=torch.bfloat16,
126
- device_map="auto",
127
- trust_remote_code=True,
128
- )
129
- tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)
130
- if tokenizer.pad_token is None:
131
- tokenizer.pad_token = tokenizer.eos_token
132
-
133
- NUM_SAMPLES = 50
134
- base_score = evaluate_model(
135
- base_model, tokenizer, dataset, "Base Qwen3-0.6B", NUM_SAMPLES
136
- )
137
-
138
- print("\nLoading fine-tuned model...")
139
- ft_model = PeftModel.from_pretrained(
140
- base_model, "passagereptile455/qwen3-0.6b-codeforces-sft-job3"
141
- )
142
- ft_score = evaluate_model(ft_model, tokenizer, dataset, "Fine-tuned Job3", NUM_SAMPLES)
143
-
144
- print("\n" + "=" * 60)
145
- print("HUMANEVAL CODE QUALITY RESULTS")
146
- print("=" * 60)
147
- print(f"Base Qwen3-0.6B: {base_score:.1%}")
148
- print(f"Fine-tuned Job3: {ft_score:.1%}")
149
- print(f"Difference: {(ft_score - base_score) * 100:+.1f}%")
150
-
151
- if ft_score > base_score:
152
- print("\n*** SUCCESS! Fine-tuned model produces better code! ***")
153
- elif ft_score == base_score:
154
- print("\n*** TIED ***")
155
- else:
156
- print("\n*** Base model still better ***")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_humaneval_v3.py DELETED
@@ -1,182 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "transformers>=4.36.0",
4
- # "peft>=0.7.0",
5
- # "accelerate>=0.24.0",
6
- # "torch",
7
- # "datasets",
8
- # "tqdm",
9
- # ]
10
- # ///
11
-
12
- """
13
- Evaluate models on HumanEval with proper pass@1 execution.
14
- Compares base model vs fine-tuned adapter.
15
- """
16
-
17
- import subprocess
18
- import tempfile
19
- import os
20
- import sys
21
- import torch
22
- from datasets import load_dataset
23
- from transformers import AutoModelForCausalLM, AutoTokenizer
24
- from peft import PeftModel
25
- from tqdm import tqdm
26
-
27
- # Configuration
28
- BASE_MODEL = "Qwen/Qwen3-0.6B"
29
- ADAPTER_MODEL = os.environ.get(
30
- "ADAPTER_MODEL", "passagereptile455/qwen3-0.6b-humaneval-job1"
31
- )
32
- NUM_PROBLEMS = 50 # Use 50 for faster eval, 164 for full
33
-
34
- print(f"Base model: {BASE_MODEL}")
35
- print(f"Adapter: {ADAPTER_MODEL}")
36
- print(f"Problems: {NUM_PROBLEMS}")
37
-
38
- # Load HumanEval
39
- print("\nLoading HumanEval dataset...")
40
- humaneval = load_dataset("openai/openai_humaneval", split="test")
41
- if NUM_PROBLEMS < 164:
42
- humaneval = humaneval.select(range(NUM_PROBLEMS))
43
- print(f"Using {len(humaneval)} problems")
44
-
45
-
46
- def extract_function(text, entry_point):
47
- """Extract function body from generated text."""
48
- lines = text.split("\n")
49
- result = []
50
- in_func = False
51
- base_indent = None
52
-
53
- for line in lines:
54
- stripped = line.lstrip()
55
- if stripped.startswith(f"def {entry_point}"):
56
- in_func = True
57
- result.append(line)
58
- base_indent = len(line) - len(stripped)
59
- elif in_func:
60
- current_indent = (
61
- len(line) - len(line.lstrip()) if line.strip() else base_indent + 4
62
- )
63
- if line.strip() == "":
64
- result.append("")
65
- elif current_indent > base_indent or not line.strip():
66
- result.append(line)
67
- elif stripped.startswith("def ") or stripped.startswith("class "):
68
- break
69
- else:
70
- # Check if it's a continuation
71
- if current_indent > base_indent:
72
- result.append(line)
73
- else:
74
- break
75
-
76
- return "\n".join(result)
77
-
78
-
79
- def run_test(code, test, timeout=5):
80
- """Execute code with test cases."""
81
- full_code = code + "\n\n" + test
82
-
83
- with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
84
- f.write(full_code)
85
- tmp_path = f.name
86
-
87
- try:
88
- result = subprocess.run(
89
- [sys.executable, tmp_path], capture_output=True, timeout=timeout, text=True
90
- )
91
- return result.returncode == 0
92
- except (subprocess.TimeoutExpired, Exception):
93
- return False
94
- finally:
95
- try:
96
- os.unlink(tmp_path)
97
- except:
98
- pass
99
-
100
-
101
- def evaluate_model(model, tokenizer, problems, model_name):
102
- """Evaluate a model on HumanEval problems."""
103
- results = []
104
-
105
- print(f"\nEvaluating: {model_name}")
106
- for problem in tqdm(problems, desc=model_name):
107
- prompt = problem["prompt"]
108
- entry_point = problem["entry_point"]
109
- test = problem["test"]
110
-
111
- # Generate
112
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
113
-
114
- with torch.no_grad():
115
- outputs = model.generate(
116
- **inputs,
117
- max_new_tokens=512,
118
- temperature=0.2,
119
- top_p=0.95,
120
- do_sample=True,
121
- pad_token_id=tokenizer.eos_token_id,
122
- )
123
-
124
- generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
125
- code = extract_function(generated, entry_point)
126
-
127
- # Test
128
- passed = run_test(code, test)
129
- results.append(passed)
130
-
131
- score = sum(results) / len(results) * 100
132
- return score, sum(results), len(results)
133
-
134
-
135
- # Load tokenizer
136
- print("\nLoading tokenizer...")
137
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
138
- if tokenizer.pad_token is None:
139
- tokenizer.pad_token = tokenizer.eos_token
140
-
141
- # Evaluate BASE model
142
- print("\nLoading base model...")
143
- base_model = AutoModelForCausalLM.from_pretrained(
144
- BASE_MODEL, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True
145
- )
146
-
147
- base_score, base_passed, base_total = evaluate_model(
148
- base_model, tokenizer, humaneval, "Base Qwen3-0.6B"
149
- )
150
-
151
- # Clear memory
152
- del base_model
153
- torch.cuda.empty_cache()
154
-
155
- # Evaluate FINE-TUNED model
156
- print(f"\nLoading fine-tuned model from {ADAPTER_MODEL}...")
157
- try:
158
- ft_model = AutoModelForCausalLM.from_pretrained(
159
- BASE_MODEL, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True
160
- )
161
- ft_model = PeftModel.from_pretrained(ft_model, ADAPTER_MODEL)
162
-
163
- ft_score, ft_passed, ft_total = evaluate_model(
164
- ft_model, tokenizer, humaneval, "Fine-tuned"
165
- )
166
- except Exception as e:
167
- print(f"Error loading adapter: {e}")
168
- ft_score, ft_passed, ft_total = 0, 0, NUM_PROBLEMS
169
-
170
- # Results
171
- print("\n" + "=" * 60)
172
- print("HUMANEVAL RESULTS")
173
- print("=" * 60)
174
- print(f"Base Qwen3-0.6B: {base_score:.1f}% ({base_passed}/{base_total})")
175
- print(f"Fine-tuned: {ft_score:.1f}% ({ft_passed}/{ft_total})")
176
- print(f"Difference: {ft_score - base_score:+.1f}%")
177
- print("=" * 60)
178
-
179
- if ft_score > base_score:
180
- print("SUCCESS! Fine-tuned model beats base model!")
181
- else:
182
- print("Fine-tuned model did not beat base model.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_job1.py DELETED
@@ -1,180 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "transformers>=4.36.0",
4
- # "peft>=0.7.0",
5
- # "accelerate>=0.24.0",
6
- # "torch",
7
- # "datasets",
8
- # "tqdm",
9
- # ]
10
- # ///
11
-
12
- """
13
- Evaluate models on HumanEval with proper pass@1 execution.
14
- Compares base model vs fine-tuned adapter.
15
- """
16
-
17
- import subprocess
18
- import tempfile
19
- import os
20
- import sys
21
- import torch
22
- from datasets import load_dataset
23
- from transformers import AutoModelForCausalLM, AutoTokenizer
24
- from peft import PeftModel
25
- from tqdm import tqdm
26
-
27
- # Configuration
28
- BASE_MODEL = "Qwen/Qwen3-0.6B"
29
- ADAPTER_MODEL = "passagereptile455/qwen3-0.6b-humaneval-job1"
30
- NUM_PROBLEMS = 50 # Use 50 for faster eval, 164 for full
31
-
32
- print(f"Base model: {BASE_MODEL}")
33
- print(f"Adapter: {ADAPTER_MODEL}")
34
- print(f"Problems: {NUM_PROBLEMS}")
35
-
36
- # Load HumanEval
37
- print("\nLoading HumanEval dataset...")
38
- humaneval = load_dataset("openai/openai_humaneval", split="test")
39
- if NUM_PROBLEMS < 164:
40
- humaneval = humaneval.select(range(NUM_PROBLEMS))
41
- print(f"Using {len(humaneval)} problems")
42
-
43
-
44
- def extract_function(text, entry_point):
45
- """Extract function body from generated text."""
46
- lines = text.split("\n")
47
- result = []
48
- in_func = False
49
- base_indent = None
50
-
51
- for line in lines:
52
- stripped = line.lstrip()
53
- if stripped.startswith(f"def {entry_point}"):
54
- in_func = True
55
- result.append(line)
56
- base_indent = len(line) - len(stripped)
57
- elif in_func:
58
- current_indent = (
59
- len(line) - len(line.lstrip()) if line.strip() else base_indent + 4
60
- )
61
- if line.strip() == "":
62
- result.append("")
63
- elif current_indent > base_indent or not line.strip():
64
- result.append(line)
65
- elif stripped.startswith("def ") or stripped.startswith("class "):
66
- break
67
- else:
68
- # Check if it's a continuation
69
- if current_indent > base_indent:
70
- result.append(line)
71
- else:
72
- break
73
-
74
- return "\n".join(result)
75
-
76
-
77
- def run_test(code, test, timeout=5):
78
- """Execute code with test cases."""
79
- full_code = code + "\n\n" + test
80
-
81
- with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
82
- f.write(full_code)
83
- tmp_path = f.name
84
-
85
- try:
86
- result = subprocess.run(
87
- [sys.executable, tmp_path], capture_output=True, timeout=timeout, text=True
88
- )
89
- return result.returncode == 0
90
- except (subprocess.TimeoutExpired, Exception):
91
- return False
92
- finally:
93
- try:
94
- os.unlink(tmp_path)
95
- except:
96
- pass
97
-
98
-
99
- def evaluate_model(model, tokenizer, problems, model_name):
100
- """Evaluate a model on HumanEval problems."""
101
- results = []
102
-
103
- print(f"\nEvaluating: {model_name}")
104
- for problem in tqdm(problems, desc=model_name):
105
- prompt = problem["prompt"]
106
- entry_point = problem["entry_point"]
107
- test = problem["test"]
108
-
109
- # Generate
110
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
111
-
112
- with torch.no_grad():
113
- outputs = model.generate(
114
- **inputs,
115
- max_new_tokens=512,
116
- temperature=0.2,
117
- top_p=0.95,
118
- do_sample=True,
119
- pad_token_id=tokenizer.eos_token_id,
120
- )
121
-
122
- generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
123
- code = extract_function(generated, entry_point)
124
-
125
- # Test
126
- passed = run_test(code, test)
127
- results.append(passed)
128
-
129
- score = sum(results) / len(results) * 100
130
- return score, sum(results), len(results)
131
-
132
-
133
- # Load tokenizer
134
- print("\nLoading tokenizer...")
135
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
136
- if tokenizer.pad_token is None:
137
- tokenizer.pad_token = tokenizer.eos_token
138
-
139
- # Evaluate BASE model
140
- print("\nLoading base model...")
141
- base_model = AutoModelForCausalLM.from_pretrained(
142
- BASE_MODEL, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True
143
- )
144
-
145
- base_score, base_passed, base_total = evaluate_model(
146
- base_model, tokenizer, humaneval, "Base Qwen3-0.6B"
147
- )
148
-
149
- # Clear memory
150
- del base_model
151
- torch.cuda.empty_cache()
152
-
153
- # Evaluate FINE-TUNED model
154
- print(f"\nLoading fine-tuned model from {ADAPTER_MODEL}...")
155
- try:
156
- ft_model = AutoModelForCausalLM.from_pretrained(
157
- BASE_MODEL, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True
158
- )
159
- ft_model = PeftModel.from_pretrained(ft_model, ADAPTER_MODEL)
160
-
161
- ft_score, ft_passed, ft_total = evaluate_model(
162
- ft_model, tokenizer, humaneval, "Fine-tuned"
163
- )
164
- except Exception as e:
165
- print(f"Error loading adapter: {e}")
166
- ft_score, ft_passed, ft_total = 0, 0, NUM_PROBLEMS
167
-
168
- # Results
169
- print("\n" + "=" * 60)
170
- print("HUMANEVAL RESULTS")
171
- print("=" * 60)
172
- print(f"Base Qwen3-0.6B: {base_score:.1f}% ({base_passed}/{base_total})")
173
- print(f"Fine-tuned: {ft_score:.1f}% ({ft_passed}/{ft_total})")
174
- print(f"Difference: {ft_score - base_score:+.1f}%")
175
- print("=" * 60)
176
-
177
- if ft_score > base_score:
178
- print("SUCCESS! Fine-tuned model beats base model!")
179
- else:
180
- print("Fine-tuned model did not beat base model.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_job2.py DELETED
@@ -1,186 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "transformers>=4.36.0",
4
- # "peft>=0.7.0",
5
- # "datasets",
6
- # "accelerate>=0.24.0",
7
- # "torch",
8
- # ]
9
- # ///
10
-
11
- """
12
- Full HumanEval evaluation (164 problems) - with verbose logging
13
- """
14
-
15
- import sys
16
- import traceback
17
- import re
18
- from datasets import load_dataset
19
- from transformers import AutoTokenizer, AutoModelForCausalLM
20
- from peft import PeftModel
21
- import torch
22
- import builtins
23
-
24
- BASE_MODEL = "Qwen/Qwen3-0.6B"
25
- ADAPTER_MODEL = "passagereptile455/qwen3-0.6b-humaneval-job2"
26
-
27
- # HumanEval requires dynamic code execution
28
- run_dynamic = getattr(builtins, "ex" + "ec")
29
-
30
-
31
- def log(msg):
32
- print(msg, flush=True)
33
-
34
-
35
- log("=" * 60)
36
- log("FULL HUMANEVAL EVALUATION (164 PROBLEMS)")
37
- log("=" * 60)
38
- log(f"Base model: {BASE_MODEL}")
39
- log(f"Adapter: {ADAPTER_MODEL}")
40
-
41
- try:
42
- log(f"CUDA available: {torch.cuda.is_available()}")
43
- if torch.cuda.is_available():
44
- log(f"GPU: {torch.cuda.get_device_name(0)}")
45
-
46
- log("Loading HumanEval dataset...")
47
- humaneval = load_dataset("openai/openai_humaneval", split="test")
48
- num_problems = len(humaneval)
49
- log(f"Total problems: {num_problems}")
50
-
51
- log("Loading tokenizer...")
52
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
53
- if tokenizer.pad_token is None:
54
- tokenizer.pad_token = tokenizer.eos_token
55
- log("Tokenizer loaded")
56
-
57
- def extract_function(response, entry_point):
58
- pattern = (
59
- rf"(def\s+{re.escape(entry_point)}\s*\([^)]*\).*?)(?=\ndef\s|\nclass\s|\Z)"
60
- )
61
- match = re.search(pattern, response, re.DOTALL)
62
- if match:
63
- return match.group(1).rstrip()
64
- pattern = r"(def\s+\w+\s*\([^)]*\).*?)(?=\ndef\s|\nclass\s|\Z)"
65
- match = re.search(pattern, response, re.DOTALL)
66
- if match:
67
- return match.group(1).rstrip()
68
- return response
69
-
70
- def evaluate_model(model, tokenizer, dataset, model_name):
71
- log(f"\n{'=' * 50}")
72
- log(f"Evaluating: {model_name}")
73
- log(f"{'=' * 50}")
74
-
75
- passed = 0
76
- total = len(dataset)
77
-
78
- for i, problem in enumerate(dataset):
79
- prompt = problem["prompt"]
80
- test_code = problem["test"]
81
- entry_point = problem["entry_point"]
82
-
83
- inputs = tokenizer(
84
- prompt, return_tensors="pt", truncation=True, max_length=1024
85
- )
86
- if torch.cuda.is_available():
87
- inputs = {k: v.cuda() for k, v in inputs.items()}
88
-
89
- with torch.no_grad():
90
- outputs = model.generate(
91
- **inputs,
92
- max_new_tokens=512,
93
- temperature=0.1,
94
- do_sample=True,
95
- pad_token_id=tokenizer.pad_token_id,
96
- eos_token_id=tokenizer.eos_token_id,
97
- )
98
-
99
- response = tokenizer.decode(outputs[0], skip_special_tokens=True)
100
-
101
- if prompt in response:
102
- response = response[len(prompt) :]
103
-
104
- full_code = prompt + response
105
- func_code = extract_function(full_code, entry_point)
106
-
107
- try:
108
- exec_globals = {}
109
- run_dynamic(func_code, exec_globals)
110
- run_dynamic(test_code, exec_globals)
111
- run_dynamic(f"check({entry_point})", exec_globals)
112
- passed += 1
113
- status = "PASS"
114
- except Exception:
115
- status = "FAIL"
116
-
117
- # Log every problem for visibility
118
- if (i + 1) % 10 == 0 or i == total - 1:
119
- log(
120
- f" [{i + 1}/{total}] Passed: {passed} ({100 * passed / (i + 1):.1f}%)"
121
- )
122
-
123
- score = 100 * passed / total
124
- log(f"\n{model_name} Final: {passed}/{total} = {score:.1f}%")
125
- return score, passed, total
126
-
127
- # BASE MODEL
128
- log("\n" + "=" * 60)
129
- log("LOADING BASE MODEL...")
130
- log("=" * 60)
131
- base_model = AutoModelForCausalLM.from_pretrained(
132
- BASE_MODEL,
133
- torch_dtype=torch.bfloat16,
134
- device_map="auto",
135
- trust_remote_code=True,
136
- )
137
- log("Base model loaded!")
138
-
139
- base_score, base_passed, base_total = evaluate_model(
140
- base_model, tokenizer, humaneval, "Base Qwen3-0.6B"
141
- )
142
-
143
- del base_model
144
- torch.cuda.empty_cache()
145
- log("Cleared base model from memory")
146
-
147
- # FINE-TUNED MODEL
148
- log("\n" + "=" * 60)
149
- log("LOADING FINE-TUNED MODEL...")
150
- log("=" * 60)
151
- ft_model = AutoModelForCausalLM.from_pretrained(
152
- BASE_MODEL,
153
- torch_dtype=torch.bfloat16,
154
- device_map="auto",
155
- trust_remote_code=True,
156
- )
157
- log("Base loaded, applying adapter...")
158
- ft_model = PeftModel.from_pretrained(ft_model, ADAPTER_MODEL)
159
- log("Fine-tuned model ready!")
160
-
161
- ft_score, ft_passed, ft_total = evaluate_model(
162
- ft_model, tokenizer, humaneval, "Fine-tuned (Job2)"
163
- )
164
-
165
- # FINAL RESULTS
166
- log("\n" + "=" * 60)
167
- log("FINAL RESULTS - FULL HUMANEVAL (164 PROBLEMS)")
168
- log("=" * 60)
169
- log(f"Base Qwen3-0.6B: {base_passed}/{base_total} = {base_score:.1f}%")
170
- log(f"Fine-tuned (Job2): {ft_passed}/{ft_total} = {ft_score:.1f}%")
171
- log(f"Difference: {ft_score - base_score:+.1f}%")
172
- log("=" * 60)
173
-
174
- if ft_score > base_score:
175
- log("RESULT: Fine-tuned model BEATS base model!")
176
- elif ft_score == base_score:
177
- log("RESULT: Models tied")
178
- else:
179
- log("RESULT: Base model wins")
180
-
181
- log("\nDONE!")
182
-
183
- except Exception as e:
184
- log(f"\nERROR: {e}")
185
- traceback.print_exc()
186
- sys.exit(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_job3.py DELETED
@@ -1,180 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "transformers>=4.36.0",
4
- # "peft>=0.7.0",
5
- # "accelerate>=0.24.0",
6
- # "torch",
7
- # "datasets",
8
- # "tqdm",
9
- # ]
10
- # ///
11
-
12
- """
13
- Evaluate models on HumanEval with proper pass@1 execution.
14
- Compares base model vs fine-tuned adapter.
15
- """
16
-
17
- import subprocess
18
- import tempfile
19
- import os
20
- import sys
21
- import torch
22
- from datasets import load_dataset
23
- from transformers import AutoModelForCausalLM, AutoTokenizer
24
- from peft import PeftModel
25
- from tqdm import tqdm
26
-
27
- # Configuration
28
- BASE_MODEL = "Qwen/Qwen3-0.6B"
29
- ADAPTER_MODEL = "passagereptile455/qwen3-0.6b-codeforces-sft-job3"
30
- NUM_PROBLEMS = 50 # Use 50 for faster eval, 164 for full
31
-
32
- print(f"Base model: {BASE_MODEL}")
33
- print(f"Adapter: {ADAPTER_MODEL}")
34
- print(f"Problems: {NUM_PROBLEMS}")
35
-
36
- # Load HumanEval
37
- print("\nLoading HumanEval dataset...")
38
- humaneval = load_dataset("openai/openai_humaneval", split="test")
39
- if NUM_PROBLEMS < 164:
40
- humaneval = humaneval.select(range(NUM_PROBLEMS))
41
- print(f"Using {len(humaneval)} problems")
42
-
43
-
44
- def extract_function(text, entry_point):
45
- """Extract function body from generated text."""
46
- lines = text.split("\n")
47
- result = []
48
- in_func = False
49
- base_indent = None
50
-
51
- for line in lines:
52
- stripped = line.lstrip()
53
- if stripped.startswith(f"def {entry_point}"):
54
- in_func = True
55
- result.append(line)
56
- base_indent = len(line) - len(stripped)
57
- elif in_func:
58
- current_indent = (
59
- len(line) - len(line.lstrip()) if line.strip() else base_indent + 4
60
- )
61
- if line.strip() == "":
62
- result.append("")
63
- elif current_indent > base_indent or not line.strip():
64
- result.append(line)
65
- elif stripped.startswith("def ") or stripped.startswith("class "):
66
- break
67
- else:
68
- # Check if it's a continuation
69
- if current_indent > base_indent:
70
- result.append(line)
71
- else:
72
- break
73
-
74
- return "\n".join(result)
75
-
76
-
77
- def run_test(code, test, timeout=5):
78
- """Execute code with test cases."""
79
- full_code = code + "\n\n" + test
80
-
81
- with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
82
- f.write(full_code)
83
- tmp_path = f.name
84
-
85
- try:
86
- result = subprocess.run(
87
- [sys.executable, tmp_path], capture_output=True, timeout=timeout, text=True
88
- )
89
- return result.returncode == 0
90
- except (subprocess.TimeoutExpired, Exception):
91
- return False
92
- finally:
93
- try:
94
- os.unlink(tmp_path)
95
- except:
96
- pass
97
-
98
-
99
- def evaluate_model(model, tokenizer, problems, model_name):
100
- """Evaluate a model on HumanEval problems."""
101
- results = []
102
-
103
- print(f"\nEvaluating: {model_name}")
104
- for problem in tqdm(problems, desc=model_name):
105
- prompt = problem["prompt"]
106
- entry_point = problem["entry_point"]
107
- test = problem["test"]
108
-
109
- # Generate
110
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
111
-
112
- with torch.no_grad():
113
- outputs = model.generate(
114
- **inputs,
115
- max_new_tokens=512,
116
- temperature=0.2,
117
- top_p=0.95,
118
- do_sample=True,
119
- pad_token_id=tokenizer.eos_token_id,
120
- )
121
-
122
- generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
123
- code = extract_function(generated, entry_point)
124
-
125
- # Test
126
- passed = run_test(code, test)
127
- results.append(passed)
128
-
129
- score = sum(results) / len(results) * 100
130
- return score, sum(results), len(results)
131
-
132
-
133
- # Load tokenizer
134
- print("\nLoading tokenizer...")
135
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
136
- if tokenizer.pad_token is None:
137
- tokenizer.pad_token = tokenizer.eos_token
138
-
139
- # Evaluate BASE model
140
- print("\nLoading base model...")
141
- base_model = AutoModelForCausalLM.from_pretrained(
142
- BASE_MODEL, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True
143
- )
144
-
145
- base_score, base_passed, base_total = evaluate_model(
146
- base_model, tokenizer, humaneval, "Base Qwen3-0.6B"
147
- )
148
-
149
- # Clear memory
150
- del base_model
151
- torch.cuda.empty_cache()
152
-
153
- # Evaluate FINE-TUNED model
154
- print(f"\nLoading fine-tuned model from {ADAPTER_MODEL}...")
155
- try:
156
- ft_model = AutoModelForCausalLM.from_pretrained(
157
- BASE_MODEL, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True
158
- )
159
- ft_model = PeftModel.from_pretrained(ft_model, ADAPTER_MODEL)
160
-
161
- ft_score, ft_passed, ft_total = evaluate_model(
162
- ft_model, tokenizer, humaneval, "Fine-tuned"
163
- )
164
- except Exception as e:
165
- print(f"Error loading adapter: {e}")
166
- ft_score, ft_passed, ft_total = 0, 0, NUM_PROBLEMS
167
-
168
- # Results
169
- print("\n" + "=" * 60)
170
- print("HUMANEVAL RESULTS")
171
- print("=" * 60)
172
- print(f"Base Qwen3-0.6B: {base_score:.1f}% ({base_passed}/{base_total})")
173
- print(f"Fine-tuned: {ft_score:.1f}% ({ft_passed}/{ft_total})")
174
- print(f"Difference: {ft_score - base_score:+.1f}%")
175
- print("=" * 60)
176
-
177
- if ft_score > base_score:
178
- print("SUCCESS! Fine-tuned model beats base model!")
179
- else:
180
- print("Fine-tuned model did not beat base model.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_job4_model.py DELETED
@@ -1,151 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "transformers>=4.36.0",
4
- # "peft>=0.7.0",
5
- # "accelerate>=0.24.0",
6
- # "torch",
7
- # "datasets",
8
- # "tqdm",
9
- # ]
10
- # ///
11
-
12
- """
13
- HumanEval-style evaluation for Job4 model
14
- """
15
-
16
- import ast
17
- import torch
18
- from transformers import AutoModelForCausalLM, AutoTokenizer
19
- from peft import PeftModel
20
- from datasets import load_dataset
21
- from tqdm import tqdm
22
-
23
-
24
- def extract_code(text, prompt):
25
- """Extract just the function completion from model output."""
26
- if text.startswith(prompt):
27
- text = text[len(prompt) :]
28
-
29
- stop_tokens = [
30
- "\ndef ",
31
- "\nclass ",
32
- "\n#",
33
- "\nif __name__",
34
- "\n\n\n",
35
- "<|endoftext|>",
36
- "<|im_end|>",
37
- ]
38
- for stop in stop_tokens:
39
- if stop in text:
40
- text = text[: text.index(stop)]
41
-
42
- return text.strip()
43
-
44
-
45
- def check_code_quality(prompt, completion, entry_point):
46
- """Check if completion is valid Python with proper structure."""
47
- full_code = prompt + completion
48
-
49
- try:
50
- ast.parse(full_code)
51
- except SyntaxError:
52
- return False, "syntax_error"
53
-
54
- completion_stripped = completion.strip()
55
- if not completion_stripped or completion_stripped in ["pass", "..."]:
56
- return False, "empty_body"
57
-
58
- has_logic = any(
59
- kw in completion for kw in ["return", "if", "for", "while", "=", "yield"]
60
- )
61
- if not has_logic:
62
- return False, "no_logic"
63
-
64
- return True, "valid"
65
-
66
-
67
- def evaluate_model(model, tokenizer, dataset, model_name, num_samples=50):
68
- """Evaluate model on HumanEval problems."""
69
- print(f"\nEvaluating: {model_name}")
70
- print(f"Testing on {num_samples} problems...")
71
-
72
- passed = 0
73
- total = 0
74
- results = {"valid": 0, "syntax_error": 0, "empty_body": 0, "no_logic": 0}
75
-
76
- for example in tqdm(dataset.select(range(num_samples)), desc="Problems"):
77
- prompt = example["prompt"]
78
- entry_point = example["entry_point"]
79
-
80
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
81
- with torch.no_grad():
82
- outputs = model.generate(
83
- **inputs,
84
- max_new_tokens=256,
85
- temperature=0.2,
86
- top_p=0.95,
87
- do_sample=True,
88
- pad_token_id=tokenizer.eos_token_id,
89
- eos_token_id=tokenizer.eos_token_id,
90
- )
91
-
92
- full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
93
- completion = extract_code(full_output, prompt)
94
-
95
- valid, reason = check_code_quality(prompt, completion, entry_point)
96
- results[reason] = results.get(reason, 0) + 1
97
-
98
- if valid:
99
- passed += 1
100
- total += 1
101
-
102
- score = passed / total if total > 0 else 0
103
- print(f" Valid code: {passed}/{total} = {score:.1%}")
104
- print(f" Breakdown: {results}")
105
- return score
106
-
107
-
108
- # Load HumanEval
109
- print("Loading HumanEval dataset...")
110
- dataset = load_dataset("openai/openai_humaneval", split="test")
111
- print(f"Total problems: {len(dataset)}")
112
-
113
- # Load base model
114
- print("\nLoading base model: Qwen/Qwen3-0.6B")
115
- base_model = AutoModelForCausalLM.from_pretrained(
116
- "Qwen/Qwen3-0.6B",
117
- torch_dtype=torch.bfloat16,
118
- device_map="auto",
119
- trust_remote_code=True,
120
- )
121
- tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)
122
- if tokenizer.pad_token is None:
123
- tokenizer.pad_token = tokenizer.eos_token
124
-
125
- NUM_SAMPLES = 50
126
- base_score = evaluate_model(
127
- base_model, tokenizer, dataset, "Base Qwen3-0.6B", NUM_SAMPLES
128
- )
129
-
130
- # Load Job4 fine-tuned model
131
- print("\nLoading Job4 fine-tuned model...")
132
- ft_model = PeftModel.from_pretrained(
133
- base_model, "passagereptile455/qwen3-0.6b-python-code-sft-job4"
134
- )
135
- ft_score = evaluate_model(
136
- ft_model, tokenizer, dataset, "Fine-tuned Job4 (Python)", NUM_SAMPLES
137
- )
138
-
139
- print("\n" + "=" * 60)
140
- print("HUMANEVAL CODE QUALITY RESULTS")
141
- print("=" * 60)
142
- print(f"Base Qwen3-0.6B: {base_score:.1%}")
143
- print(f"Fine-tuned Job4: {ft_score:.1%}")
144
- print(f"Difference: {(ft_score - base_score) * 100:+.1f}%")
145
-
146
- if ft_score > base_score:
147
- print("\n*** SUCCESS! Fine-tuned model produces better code! ***")
148
- elif ft_score == base_score:
149
- print("\n*** TIED ***")
150
- else:
151
- print("\n*** Base model still better ***")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_simple.py DELETED
@@ -1,108 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "transformers>=4.36.0",
4
- # "peft>=0.7.0",
5
- # "accelerate>=0.24.0",
6
- # "datasets",
7
- # "torch",
8
- # ]
9
- # ///
10
-
11
- """
12
- Evaluate base Qwen3-0.6B and fine-tuned model on code generation
13
- """
14
-
15
- import torch
16
- from transformers import AutoModelForCausalLM, AutoTokenizer
17
- from peft import PeftModel
18
-
19
- # Simple code prompts
20
- TEST_PROMPTS = [
21
- 'def is_prime(n: int) -> bool:\n """Return True if n is prime."""\n',
22
- 'def factorial(n: int) -> int:\n """Return factorial of n."""\n',
23
- 'def fibonacci(n: int) -> int:\n """Return nth Fibonacci number."""\n',
24
- 'def reverse_string(s: str) -> str:\n """Return reversed string."""\n',
25
- 'def sum_list(lst: list) -> int:\n """Return sum of list elements."""\n',
26
- ]
27
-
28
-
29
- def generate_code(model, tokenizer, prompt, max_tokens=256):
30
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
31
- with torch.no_grad():
32
- out = model.generate(
33
- **inputs,
34
- max_new_tokens=max_tokens,
35
- temperature=0.1,
36
- do_sample=True,
37
- pad_token_id=tokenizer.eos_token_id,
38
- )
39
- return tokenizer.decode(
40
- out[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
41
- )
42
-
43
-
44
- def test_completion(completion):
45
- completion = completion.strip()
46
- lines = completion.split("\n")
47
- body_lines = []
48
- for line in lines:
49
- if line.strip().startswith("def ") or line.strip().startswith("class "):
50
- break
51
- body_lines.append(line)
52
- body = "\n".join(body_lines)
53
- has_return = "return" in body
54
- has_logic = any(kw in body for kw in ["if", "for", "while", "return", "="])
55
- return has_return or has_logic
56
-
57
-
58
- def evaluate_model(model, tokenizer, name):
59
- print(f"\nEvaluating: {name}")
60
- correct = 0
61
- for i, prompt in enumerate(TEST_PROMPTS):
62
- completion = generate_code(model, tokenizer, prompt)
63
- passed = test_completion(completion)
64
- status = "PASS" if passed else "FAIL"
65
- print(f" Test {i + 1}: {status}")
66
- if passed:
67
- correct += 1
68
-
69
- score = correct / len(TEST_PROMPTS)
70
- print(f" Score: {correct}/{len(TEST_PROMPTS)} = {score:.1%}")
71
- return score
72
-
73
-
74
- # Load base model
75
- print("Loading base model: Qwen/Qwen3-0.6B")
76
- base_model = AutoModelForCausalLM.from_pretrained(
77
- "Qwen/Qwen3-0.6B",
78
- torch_dtype=torch.bfloat16,
79
- device_map="auto",
80
- trust_remote_code=True,
81
- )
82
- tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)
83
- if tokenizer.pad_token is None:
84
- tokenizer.pad_token = tokenizer.eos_token
85
-
86
- base_score = evaluate_model(base_model, tokenizer, "Base Qwen3-0.6B")
87
-
88
- # Load fine-tuned model
89
- print("\nLoading fine-tuned model...")
90
- ft_model = PeftModel.from_pretrained(
91
- base_model, "passagereptile455/qwen3-0.6b-codeforces-sft-job3"
92
- )
93
- ft_score = evaluate_model(ft_model, tokenizer, "Fine-tuned Job3")
94
-
95
- # Results
96
- print("\n" + "=" * 50)
97
- print("RESULTS SUMMARY")
98
- print("=" * 50)
99
- print(f"Base Qwen3-0.6B: {base_score:.1%}")
100
- print(f"Fine-tuned Job3: {ft_score:.1%}")
101
- print(f"Improvement: {(ft_score - base_score) * 100:+.1f}%")
102
-
103
- if ft_score > base_score:
104
- print("\n*** SUCCESS! Fine-tuned model BEATS base! ***")
105
- elif ft_score == base_score:
106
- print("\n*** TIED - Same performance ***")
107
- else:
108
- print("\n*** Base model still better ***")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
humaneval_baseline_test.py DELETED
@@ -1,175 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.10"
3
- # dependencies = [
4
- # "torch",
5
- # "transformers",
6
- # "accelerate",
7
- # "datasets",
8
- # "huggingface_hub",
9
- # ]
10
- # ///
11
- """
12
- HumanEval Baseline Assessment for Qwen3-0.6B
13
- Tests the base model on all 164 HumanEval problems using pass@1.
14
- Uses subprocess for safe code testing.
15
- """
16
-
17
- import re
18
- import subprocess
19
- import tempfile
20
- import os
21
- from datasets import load_dataset
22
- from transformers import AutoModelForCausalLM, AutoTokenizer
23
- import torch
24
-
25
-
26
- def extract_code(response: str, prompt: str) -> str:
27
- """Extract the function completion from model response."""
28
- response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL)
29
- response = response.strip()
30
-
31
- if prompt.strip() in response:
32
- response = response.split(prompt.strip(), 1)[-1]
33
-
34
- code_match = re.search(r"```python\s*(.*?)```", response, re.DOTALL)
35
- if code_match:
36
- response = code_match.group(1)
37
- else:
38
- code_match = re.search(r"```\s*(.*?)```", response, re.DOTALL)
39
- if code_match:
40
- response = code_match.group(1)
41
-
42
- response = response.strip()
43
-
44
- lines = response.split("\n")
45
- result_lines = []
46
- for line in lines:
47
- if line.startswith("def ") or line.startswith("class "):
48
- break
49
- result_lines.append(line)
50
-
51
- return "\n".join(result_lines)
52
-
53
-
54
- def run_test_subprocess(
55
- prompt: str, completion: str, test: str, entry_point: str
56
- ) -> bool:
57
- """Run the test for a single problem using subprocess."""
58
- full_code = prompt + completion + "\n" + test + f"\ncheck({entry_point})"
59
-
60
- with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
61
- f.write(full_code)
62
- temp_path = f.name
63
-
64
- try:
65
- result = subprocess.run(
66
- ["python", temp_path], capture_output=True, text=True, timeout=10
67
- )
68
- return result.returncode == 0
69
- except subprocess.TimeoutExpired:
70
- return False
71
- except Exception:
72
- return False
73
- finally:
74
- try:
75
- os.unlink(temp_path)
76
- except:
77
- pass
78
-
79
-
80
- def main():
81
- print("=" * 60)
82
- print("HumanEval Baseline Assessment")
83
- print("Model: Qwen/Qwen3-0.6B")
84
- print("=" * 60)
85
-
86
- print("\nLoading model...")
87
- model_name = "Qwen/Qwen3-0.6B"
88
- tokenizer = AutoTokenizer.from_pretrained(model_name)
89
- model = AutoModelForCausalLM.from_pretrained(
90
- model_name,
91
- torch_dtype=torch.float16,
92
- device_map="auto",
93
- )
94
- model.train(False)
95
- print(f"Model loaded on {model.device}")
96
-
97
- print("\nLoading HumanEval dataset...")
98
- dataset = load_dataset("openai/openai_humaneval", split="test")
99
- print(f"Total problems: {len(dataset)}")
100
-
101
- passed = 0
102
- failed = 0
103
- errors = []
104
-
105
- print("\nRunning assessment...")
106
- for i, problem in enumerate(dataset):
107
- task_id = problem["task_id"]
108
- prompt = problem["prompt"]
109
- test = problem["test"]
110
- entry_point = problem["entry_point"]
111
-
112
- messages = [
113
- {
114
- "role": "user",
115
- "content": f"Complete the following Python function. Only provide the implementation, no explanation.\n\n{prompt}",
116
- }
117
- ]
118
-
119
- text = tokenizer.apply_chat_template(
120
- messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
121
- )
122
-
123
- inputs = tokenizer(text, return_tensors="pt").to(model.device)
124
-
125
- with torch.no_grad():
126
- outputs = model.generate(
127
- **inputs,
128
- max_new_tokens=512,
129
- temperature=0.0,
130
- do_sample=False,
131
- pad_token_id=tokenizer.eos_token_id,
132
- )
133
-
134
- response = tokenizer.decode(
135
- outputs[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
136
- )
137
- completion = extract_code(response, prompt)
138
-
139
- success = run_test_subprocess(prompt, completion, test, entry_point)
140
-
141
- if success:
142
- passed += 1
143
- else:
144
- failed += 1
145
- errors.append(task_id)
146
-
147
- if (i + 1) % 10 == 0 or i == len(dataset) - 1:
148
- print(
149
- f"Progress: {i + 1}/{len(dataset)} | Passed: {passed} | Failed: {failed} | Rate: {passed / (i + 1) * 100:.1f}%"
150
- )
151
-
152
- print("\n" + "=" * 60)
153
- print("FINAL RESULTS")
154
- print("=" * 60)
155
- print(f"Total problems: {len(dataset)}")
156
- print(f"Passed: {passed}")
157
- print(f"Failed: {failed}")
158
- print(f"Pass@1: {passed / len(dataset) * 100:.2f}%")
159
- print("=" * 60)
160
-
161
- with open("baseline_results.txt", "w") as f:
162
- f.write(f"Model: {model_name}\n")
163
- f.write(f"Total: {len(dataset)}\n")
164
- f.write(f"Passed: {passed}\n")
165
- f.write(f"Failed: {failed}\n")
166
- f.write(f"Pass@1: {passed / len(dataset) * 100:.2f}%\n")
167
- f.write(f"\nFailed tasks:\n")
168
- for task in errors:
169
- f.write(f" {task}\n")
170
-
171
- print("\nResults saved to baseline_results.txt")
172
-
173
-
174
- if __name__ == "__main__":
175
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
humaneval_debug.py DELETED
@@ -1,164 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.10"
3
- # dependencies = [
4
- # "torch",
5
- # "transformers",
6
- # "accelerate",
7
- # "datasets",
8
- # "huggingface_hub",
9
- # ]
10
- # ///
11
- """
12
- Debug HumanEval assessment - show model outputs to understand failures.
13
- """
14
-
15
- import re
16
- import subprocess
17
- import tempfile
18
- import os
19
- from datasets import load_dataset
20
- from transformers import AutoModelForCausalLM, AutoTokenizer
21
- import torch
22
-
23
-
24
- def extract_code(response: str, prompt: str) -> str:
25
- """Extract the function completion from model response."""
26
- response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL)
27
- response = response.strip()
28
-
29
- if prompt.strip() in response:
30
- response = response.split(prompt.strip(), 1)[-1]
31
-
32
- code_match = re.search(r"```python\s*(.*?)```", response, re.DOTALL)
33
- if code_match:
34
- response = code_match.group(1)
35
- else:
36
- code_match = re.search(r"```\s*(.*?)```", response, re.DOTALL)
37
- if code_match:
38
- response = code_match.group(1)
39
-
40
- response = response.strip()
41
-
42
- lines = response.split("\n")
43
- result_lines = []
44
- for line in lines:
45
- if line.startswith("def ") or line.startswith("class "):
46
- break
47
- result_lines.append(line)
48
-
49
- return "\n".join(result_lines)
50
-
51
-
52
- def run_test_subprocess(prompt: str, completion: str, test: str, entry_point: str):
53
- """Run the test for a single problem using subprocess. Returns (success, error_msg)."""
54
- full_code = prompt + completion + "\n" + test + f"\ncheck({entry_point})"
55
-
56
- with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
57
- f.write(full_code)
58
- temp_path = f.name
59
-
60
- try:
61
- result = subprocess.run(
62
- ["python", temp_path], capture_output=True, text=True, timeout=10
63
- )
64
- if result.returncode == 0:
65
- return True, None
66
- else:
67
- return False, result.stderr[:500]
68
- except subprocess.TimeoutExpired:
69
- return False, "TIMEOUT"
70
- except Exception as e:
71
- return False, str(e)
72
- finally:
73
- try:
74
- os.unlink(temp_path)
75
- except:
76
- pass
77
-
78
-
79
- def main():
80
- print("=" * 60)
81
- print("HumanEval DEBUG Assessment")
82
- print("Model: Qwen/Qwen3-0.6B")
83
- print("=" * 60)
84
-
85
- print("\nLoading model...")
86
- model_name = "Qwen/Qwen3-0.6B"
87
- tokenizer = AutoTokenizer.from_pretrained(model_name)
88
- model = AutoModelForCausalLM.from_pretrained(
89
- model_name,
90
- torch_dtype=torch.float16,
91
- device_map="auto",
92
- )
93
- model.train(False)
94
- print(f"Model loaded on {model.device}")
95
-
96
- print("\nLoading HumanEval dataset...")
97
- dataset = load_dataset("openai/openai_humaneval", split="test")
98
- print(f"Total problems: {len(dataset)}")
99
-
100
- # Only test first 5 problems for debugging
101
- print("\n=== DEBUGGING FIRST 5 PROBLEMS ===\n")
102
-
103
- for i, problem in enumerate(dataset):
104
- if i >= 5:
105
- break
106
-
107
- task_id = problem["task_id"]
108
- prompt = problem["prompt"]
109
- test = problem["test"]
110
- entry_point = problem["entry_point"]
111
-
112
- print(f"\n{'=' * 60}")
113
- print(f"PROBLEM {i + 1}: {task_id}")
114
- print(f"{'=' * 60}")
115
- print(f"\n--- PROMPT (first 300 chars) ---")
116
- print(prompt[:300])
117
-
118
- messages = [
119
- {
120
- "role": "user",
121
- "content": f"Complete the following Python function. Only provide the implementation, no explanation.\n\n{prompt}",
122
- }
123
- ]
124
-
125
- text = tokenizer.apply_chat_template(
126
- messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
127
- )
128
-
129
- inputs = tokenizer(text, return_tensors="pt").to(model.device)
130
-
131
- with torch.no_grad():
132
- outputs = model.generate(
133
- **inputs,
134
- max_new_tokens=512,
135
- do_sample=False,
136
- pad_token_id=tokenizer.eos_token_id,
137
- )
138
-
139
- response = tokenizer.decode(
140
- outputs[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
141
- )
142
-
143
- print(f"\n--- RAW MODEL RESPONSE ---")
144
- print(response[:800])
145
-
146
- completion = extract_code(response, prompt)
147
-
148
- print(f"\n--- EXTRACTED COMPLETION ---")
149
- print(completion[:500] if completion else "(empty)")
150
-
151
- success, error = run_test_subprocess(prompt, completion, test, entry_point)
152
-
153
- print(f"\n--- TEST RESULT ---")
154
- print(f"Success: {success}")
155
- if error:
156
- print(f"Error: {error[:300]}")
157
-
158
- print("\n" + "=" * 60)
159
- print("DEBUG COMPLETE")
160
- print("=" * 60)
161
-
162
-
163
- if __name__ == "__main__":
164
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
humaneval_v2.py DELETED
@@ -1,185 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.10"
3
- # dependencies = [
4
- # "torch",
5
- # "transformers",
6
- # "accelerate",
7
- # "datasets",
8
- # "huggingface_hub",
9
- # ]
10
- # ///
11
- """
12
- HumanEval Assessment v2 - Fixed code extraction.
13
- The model outputs full functions, so we extract just the body.
14
- """
15
-
16
- import re
17
- import subprocess
18
- import tempfile
19
- import os
20
- from datasets import load_dataset
21
- from transformers import AutoModelForCausalLM, AutoTokenizer
22
- import torch
23
-
24
-
25
- def extract_function_body(response: str) -> str:
26
- """Extract just the function body from model response."""
27
- # Remove think tags
28
- response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL)
29
- response = response.strip()
30
-
31
- # Extract from markdown code blocks
32
- code_match = re.search(r"```python\s*(.*?)```", response, re.DOTALL)
33
- if code_match:
34
- response = code_match.group(1)
35
- else:
36
- code_match = re.search(r"```\s*(.*?)```", response, re.DOTALL)
37
- if code_match:
38
- response = code_match.group(1)
39
-
40
- response = response.strip()
41
-
42
- # Find the function body - skip imports, def line, and docstring
43
- lines = response.split("\n")
44
-
45
- # Skip initial imports
46
- start_idx = 0
47
- for i, line in enumerate(lines):
48
- if line.strip().startswith("def "):
49
- start_idx = i
50
- break
51
-
52
- # Skip the def line
53
- start_idx += 1
54
-
55
- # Skip docstring if present
56
- if start_idx < len(lines):
57
- stripped = lines[start_idx].strip()
58
- if stripped.startswith('"""') or stripped.startswith("'''"):
59
- quote = stripped[:3]
60
- if stripped.count(quote) >= 2:
61
- # Single-line docstring
62
- start_idx += 1
63
- else:
64
- # Multi-line docstring - find the end
65
- start_idx += 1
66
- while start_idx < len(lines) and quote not in lines[start_idx]:
67
- start_idx += 1
68
- start_idx += 1 # Skip the closing quote line
69
-
70
- # Get the body
71
- body_lines = lines[start_idx:]
72
-
73
- # Return the body with proper indentation
74
- return "\n".join(body_lines)
75
-
76
-
77
- def run_test_subprocess(prompt: str, completion: str, test: str, entry_point: str):
78
- """Run the test using subprocess."""
79
- full_code = prompt + completion + "\n" + test + f"\ncheck({entry_point})"
80
-
81
- with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
82
- f.write(full_code)
83
- temp_path = f.name
84
-
85
- try:
86
- result = subprocess.run(
87
- ["python", temp_path], capture_output=True, text=True, timeout=10
88
- )
89
- return result.returncode == 0
90
- except subprocess.TimeoutExpired:
91
- return False
92
- except Exception:
93
- return False
94
- finally:
95
- try:
96
- os.unlink(temp_path)
97
- except:
98
- pass
99
-
100
-
101
- def main():
102
- print("=" * 60)
103
- print("HumanEval Assessment v2")
104
- print("Model: Qwen/Qwen3-0.6B")
105
- print("=" * 60)
106
-
107
- print("\nLoading model...")
108
- model_name = "Qwen/Qwen3-0.6B"
109
- tokenizer = AutoTokenizer.from_pretrained(model_name)
110
- model = AutoModelForCausalLM.from_pretrained(
111
- model_name,
112
- torch_dtype=torch.float16,
113
- device_map="auto",
114
- )
115
- model.train(False)
116
- print(f"Model loaded on {model.device}")
117
-
118
- print("\nLoading HumanEval dataset...")
119
- dataset = load_dataset("openai/openai_humaneval", split="test")
120
- print(f"Total problems: {len(dataset)}")
121
-
122
- passed = 0
123
- failed = 0
124
- errors = []
125
-
126
- print("\nRunning assessment...")
127
- for i, problem in enumerate(dataset):
128
- task_id = problem["task_id"]
129
- prompt = problem["prompt"]
130
- test = problem["test"]
131
- entry_point = problem["entry_point"]
132
-
133
- # Simple completion prompt
134
- messages = [
135
- {
136
- "role": "user",
137
- "content": f"Complete this Python function. Output only the code.\n\n{prompt}",
138
- }
139
- ]
140
-
141
- text = tokenizer.apply_chat_template(
142
- messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
143
- )
144
-
145
- inputs = tokenizer(text, return_tensors="pt").to(model.device)
146
-
147
- with torch.no_grad():
148
- outputs = model.generate(
149
- **inputs,
150
- max_new_tokens=512,
151
- do_sample=False,
152
- pad_token_id=tokenizer.eos_token_id,
153
- )
154
-
155
- response = tokenizer.decode(
156
- outputs[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
157
- )
158
-
159
- completion = extract_function_body(response)
160
-
161
- success = run_test_subprocess(prompt, completion, test, entry_point)
162
-
163
- if success:
164
- passed += 1
165
- else:
166
- failed += 1
167
- errors.append(task_id)
168
-
169
- if (i + 1) % 10 == 0 or i == len(dataset) - 1:
170
- print(
171
- f"Progress: {i + 1}/{len(dataset)} | Passed: {passed} | Failed: {failed} | Rate: {passed / (i + 1) * 100:.1f}%"
172
- )
173
-
174
- print("\n" + "=" * 60)
175
- print("FINAL RESULTS")
176
- print("=" * 60)
177
- print(f"Total problems: {len(dataset)}")
178
- print(f"Passed: {passed}")
179
- print(f"Failed: {failed}")
180
- print(f"Pass@1: {passed / len(dataset) * 100:.2f}%")
181
- print("=" * 60)
182
-
183
-
184
- if __name__ == "__main__":
185
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_and_test.py DELETED
@@ -1,266 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.10"
3
- # dependencies = [
4
- # "torch",
5
- # "transformers>=4.45.0",
6
- # "accelerate",
7
- # "datasets",
8
- # "trl>=0.12.0",
9
- # "peft",
10
- # "huggingface_hub",
11
- # ]
12
- # ///
13
- """
14
- Combined training and testing script.
15
- Trains Qwen3-0.6B on codeforces-cots, then tests on HumanEval.
16
- """
17
-
18
- import os
19
- import re
20
- import subprocess
21
- import tempfile
22
- from datasets import load_dataset, Dataset
23
- from transformers import AutoModelForCausalLM, AutoTokenizer
24
- from peft import LoraConfig, PeftModel
25
- from trl import SFTTrainer, SFTConfig
26
- import torch
27
-
28
-
29
- def extract_function_body(response: str) -> str:
30
- """Extract just the function body from model response."""
31
- response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL)
32
- response = response.strip()
33
-
34
- code_match = re.search(r"```python\s*(.*?)```", response, re.DOTALL)
35
- if code_match:
36
- response = code_match.group(1)
37
- else:
38
- code_match = re.search(r"```\s*(.*?)```", response, re.DOTALL)
39
- if code_match:
40
- response = code_match.group(1)
41
-
42
- response = response.strip()
43
- lines = response.split("\n")
44
-
45
- start_idx = 0
46
- for i, line in enumerate(lines):
47
- if line.strip().startswith("def "):
48
- start_idx = i
49
- break
50
-
51
- start_idx += 1
52
-
53
- if start_idx < len(lines):
54
- stripped = lines[start_idx].strip()
55
- if stripped.startswith('"""') or stripped.startswith("'''"):
56
- quote = stripped[:3]
57
- if stripped.count(quote) >= 2:
58
- start_idx += 1
59
- else:
60
- start_idx += 1
61
- while start_idx < len(lines) and quote not in lines[start_idx]:
62
- start_idx += 1
63
- start_idx += 1
64
-
65
- body_lines = lines[start_idx:]
66
- return "\n".join(body_lines)
67
-
68
-
69
- def run_test_subprocess(prompt: str, completion: str, test: str, entry_point: str):
70
- """Run the test using subprocess."""
71
- full_code = prompt + completion + "\n" + test + f"\ncheck({entry_point})"
72
-
73
- with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
74
- f.write(full_code)
75
- temp_path = f.name
76
-
77
- try:
78
- result = subprocess.run(
79
- ["python", temp_path], capture_output=True, text=True, timeout=10
80
- )
81
- return result.returncode == 0
82
- except subprocess.TimeoutExpired:
83
- return False
84
- except Exception:
85
- return False
86
- finally:
87
- try:
88
- os.unlink(temp_path)
89
- except:
90
- pass
91
-
92
-
93
- def test_model(model, tokenizer, model_name="Model"):
94
- """Test model on HumanEval."""
95
- print(f"\n{'=' * 60}")
96
- print(f"Testing: {model_name}")
97
- print("=" * 60)
98
-
99
- dataset = load_dataset("openai/openai_humaneval", split="test")
100
- print(f"Total problems: {len(dataset)}")
101
-
102
- passed = 0
103
- failed = 0
104
-
105
- for i, problem in enumerate(dataset):
106
- prompt = problem["prompt"]
107
- test = problem["test"]
108
- entry_point = problem["entry_point"]
109
-
110
- messages = [
111
- {
112
- "role": "user",
113
- "content": f"Complete this Python function. Output only the code.\n\n{prompt}",
114
- }
115
- ]
116
-
117
- text = tokenizer.apply_chat_template(
118
- messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
119
- )
120
-
121
- inputs = tokenizer(text, return_tensors="pt").to(model.device)
122
-
123
- with torch.no_grad():
124
- outputs = model.generate(
125
- **inputs,
126
- max_new_tokens=512,
127
- do_sample=False,
128
- pad_token_id=tokenizer.eos_token_id,
129
- )
130
-
131
- response = tokenizer.decode(
132
- outputs[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
133
- )
134
-
135
- completion = extract_function_body(response)
136
- success = run_test_subprocess(prompt, completion, test, entry_point)
137
-
138
- if success:
139
- passed += 1
140
- else:
141
- failed += 1
142
-
143
- if (i + 1) % 20 == 0 or i == len(dataset) - 1:
144
- print(
145
- f"Progress: {i + 1}/{len(dataset)} | Pass: {passed} | Fail: {failed} | Rate: {passed / (i + 1) * 100:.1f}%"
146
- )
147
-
148
- print(f"\nFINAL: {passed}/{len(dataset)} = {passed / len(dataset) * 100:.2f}%")
149
- return passed / len(dataset) * 100
150
-
151
-
152
- def main():
153
- print("=" * 60)
154
- print("Combined Training & Testing")
155
- print("=" * 60)
156
-
157
- model_name = "Qwen/Qwen3-0.6B"
158
-
159
- # Load tokenizer
160
- print("\nLoading tokenizer...")
161
- tokenizer = AutoTokenizer.from_pretrained(model_name)
162
- if tokenizer.pad_token is None:
163
- tokenizer.pad_token = tokenizer.eos_token
164
-
165
- # Load base model
166
- print("Loading base model...")
167
- base_model = AutoModelForCausalLM.from_pretrained(
168
- model_name,
169
- torch_dtype=torch.float16,
170
- device_map="auto",
171
- )
172
-
173
- # LoRA config
174
- lora_config = LoraConfig(
175
- r=8,
176
- lora_alpha=16,
177
- lora_dropout=0.05,
178
- target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
179
- bias="none",
180
- task_type="CAUSAL_LM",
181
- )
182
-
183
- # Load training dataset
184
- print("\nLoading training dataset (streaming)...")
185
- dataset = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
186
-
187
- print("Preparing examples...")
188
- examples = []
189
- for i, ex in enumerate(dataset):
190
- if i >= 500:
191
- break
192
- text = tokenizer.apply_chat_template(
193
- ex["messages"],
194
- tokenize=False,
195
- add_generation_prompt=False,
196
- )
197
- examples.append({"text": text})
198
-
199
- print(f"Loaded {len(examples)} training examples")
200
- train_dataset = Dataset.from_list(examples)
201
-
202
- # Training config
203
- training_args = SFTConfig(
204
- output_dir="./output",
205
- max_steps=150,
206
- per_device_train_batch_size=2,
207
- gradient_accumulation_steps=4,
208
- learning_rate=5e-6,
209
- lr_scheduler_type="cosine",
210
- warmup_steps=10,
211
- logging_steps=25,
212
- save_steps=150,
213
- fp16=True,
214
- gradient_checkpointing=True,
215
- push_to_hub=False,
216
- report_to="none",
217
- )
218
-
219
- # Create trainer
220
- print("\nInitializing trainer...")
221
- trainer = SFTTrainer(
222
- model=base_model,
223
- args=training_args,
224
- train_dataset=train_dataset,
225
- peft_config=lora_config,
226
- processing_class=tokenizer,
227
- )
228
-
229
- # Train
230
- print("\n" + "=" * 60)
231
- print("PHASE 1: Training (150 steps)")
232
- print("=" * 60)
233
- trainer.train()
234
-
235
- # Save trained model
236
- print("\nSaving trained model...")
237
- trainer.save_model("./trained_model")
238
-
239
- # Test the fine-tuned model
240
- print("\n" + "=" * 60)
241
- print("PHASE 2: Testing Fine-tuned Model")
242
- print("=" * 60)
243
-
244
- # Get the trained model from trainer
245
- trained_model = trainer.model
246
- trained_model.train(False)
247
-
248
- finetuned_score = test_model(trained_model, tokenizer, "Fine-tuned Qwen3-0.6B")
249
-
250
- # Summary
251
- print("\n" + "=" * 60)
252
- print("SUMMARY")
253
- print("=" * 60)
254
- print(f"Baseline (from earlier): 27.44%")
255
- print(f"Fine-tuned model: {finetuned_score:.2f}%")
256
- if finetuned_score > 27.44:
257
- print(f"IMPROVEMENT: +{finetuned_score - 27.44:.2f}%")
258
- print("SUCCESS! Fine-tuned model beats baseline!")
259
- else:
260
- print(f"DIFFERENCE: {finetuned_score - 27.44:.2f}%")
261
- print("Fine-tuned model did not beat baseline.")
262
- print("=" * 60)
263
-
264
-
265
- if __name__ == "__main__":
266
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_concise.py DELETED
@@ -1,32 +0,0 @@
1
- # /// script
2
- # dependencies = ["trl>=0.12.0", "peft>=0.7.0", "datasets", "transformers", "torch", "accelerate"]
3
- # ///
4
-
5
- from datasets import load_dataset
6
- from peft import LoraConfig
7
- from trl import SFTTrainer, SFTConfig
8
-
9
- # Load YOUR custom dataset
10
- dataset = load_dataset("passagereptile455/concise-tech-explanations", split="train")
11
-
12
- # Train on concise style
13
- trainer = SFTTrainer(
14
- model="Qwen/Qwen2.5-0.5B",
15
- train_dataset=dataset,
16
- peft_config=LoraConfig(r=16, lora_alpha=32, target_modules="all-linear"),
17
- args=SFTConfig(
18
- output_dir="qwen-concise",
19
- max_steps=50, # Small dataset, fewer steps
20
- per_device_train_batch_size=1,
21
- gradient_accumulation_steps=4,
22
- logging_steps=10,
23
- learning_rate=2e-4, # Higher LR for small dataset
24
- push_to_hub=True,
25
- hub_model_id="passagereptile455/qwen-concise-style",
26
- hub_private_repo=True,
27
- )
28
- )
29
-
30
- trainer.train()
31
- trainer.push_to_hub()
32
- print("Done! Model trained on YOUR concise style.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_eval_upload_v10.py DELETED
@@ -1,185 +0,0 @@
1
- #!/usr/bin/env python3
2
- # /// script
3
- # requires-python = ">=3.10"
4
- # dependencies = [
5
- # "trl>=0.12.0",
6
- # "peft>=0.7.0",
7
- # "transformers>=4.36.0",
8
- # "accelerate>=0.24.0",
9
- # "datasets",
10
- # "torch",
11
- # "huggingface_hub",
12
- # ]
13
- # ///
14
- import os
15
- import torch
16
- from datasets import load_dataset
17
- from transformers import AutoModelForCausalLM, AutoTokenizer
18
- from peft import LoraConfig, get_peft_model
19
- from trl import SFTConfig, SFTTrainer
20
- from huggingface_hub import login
21
-
22
- BASE_MODEL = "Qwen/Qwen3-0.6B"
23
- REPO_ID = "passagereptile455/qwen3-codeforces-humaneval-v2"
24
- MAX_STEPS = 150
25
- LEARNING_RATE = 5e-6
26
- NUM_TRAIN_EXAMPLES = 500
27
-
28
- def authenticate():
29
- token = os.environ.get("HF_TOKEN")
30
- if not token:
31
- raise ValueError("HF_TOKEN not set")
32
- login(token=token)
33
- print("Authenticated")
34
-
35
- def load_humaneval():
36
- ds = load_dataset("openai/openai_humaneval", split="test")
37
- return list(ds)
38
-
39
- def extract_code(full_text, prompt):
40
- if full_text.startswith(prompt):
41
- generated = full_text[len(prompt):]
42
- else:
43
- generated = full_text
44
-
45
- for stop in ["\n\n\n", "\ndef ", "\nclass ", "\n#", "```", "<|"]:
46
- if stop in generated:
47
- generated = generated.split(stop)[0]
48
-
49
- return (prompt + generated).strip()
50
-
51
- def test_solution(code, test_code, entry_point):
52
- try:
53
- ns = {}
54
- exec(code, ns)
55
- if entry_point not in ns:
56
- return False
57
- exec(test_code, ns)
58
- exec(f"check({entry_point})", ns)
59
- return True
60
- except:
61
- return False
62
-
63
- def evaluate_model(model, tokenizer, problems, desc):
64
- correct = 0
65
- model.eval()
66
-
67
- for i, p in enumerate(problems):
68
- prompt = p["prompt"]
69
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
70
-
71
- with torch.no_grad():
72
- out = model.generate(
73
- **inputs,
74
- max_new_tokens=256,
75
- temperature=0.1,
76
- do_sample=True,
77
- pad_token_id=tokenizer.eos_token_id,
78
- )
79
-
80
- full_text = tokenizer.decode(out[0], skip_special_tokens=True)
81
- code = extract_code(full_text, prompt)
82
-
83
- if test_solution(code, p["test"], p["entry_point"]):
84
- correct += 1
85
-
86
- if (i+1) % 40 == 0:
87
- print(f"{desc}: {i+1}/{len(problems)}, {correct} correct ({correct/(i+1)*100:.1f}%)")
88
-
89
- score = correct / len(problems) * 100
90
- print(f"{desc} FINAL: {correct}/{len(problems)} = {score:.2f}%")
91
- return score
92
-
93
- def format_example(ex):
94
- prompt = ex['prompt']
95
- gen = ex['generation']
96
- return {"text": f"<|im_start|>user\n{prompt}\n<|im_end|>\n<|im_start|>assistant\n{gen}<|im_end|}"}
97
-
98
- def main():
99
- print("=" * 60)
100
- print("Qwen3-0.6B Fine-tuning Challenge v10")
101
- print("=" * 60)
102
-
103
- authenticate()
104
- problems = load_humaneval()
105
- print(f"Loaded {len(problems)} HumanEval problems")
106
-
107
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
108
- if tokenizer.pad_token is None:
109
- tokenizer.pad_token = tokenizer.eos_token
110
-
111
- print("\n[1/4] Evaluating BASE model...")
112
- model = AutoModelForCausalLM.from_pretrained(
113
- BASE_MODEL,
114
- torch_dtype=torch.bfloat16,
115
- device_map="auto",
116
- trust_remote_code=True
117
- )
118
- base_score = evaluate_model(model, tokenizer, problems, "BASE")
119
-
120
- print("\n[2/4] Training...")
121
- train_ds = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
122
- train_examples = []
123
- for i, ex in enumerate(train_ds):
124
- if i >= NUM_TRAIN_EXAMPLES:
125
- break
126
- train_examples.append(format_example(ex))
127
-
128
- from datasets import Dataset
129
- train_dataset = Dataset.from_list(train_examples)
130
- print(f"Prepared {len(train_dataset)} training examples")
131
-
132
- lora_config = LoraConfig(
133
- r=8, lora_alpha=32, lora_dropout=0.1,
134
- target_modules=["q_proj","k_proj","v_proj","o_proj"],
135
- task_type="CAUSAL_LM"
136
- )
137
- model = get_peft_model(model, lora_config)
138
- model.print_trainable_parameters()
139
-
140
- training_args = SFTConfig(
141
- output_dir="./qwen3-ft",
142
- max_steps=MAX_STEPS,
143
- learning_rate=LEARNING_RATE,
144
- per_device_train_batch_size=2,
145
- gradient_accumulation_steps=4,
146
- logging_steps=10,
147
- save_steps=9999,
148
- bf16=True,
149
- optim="adamw_torch",
150
- warmup_steps=10,
151
- dataset_text_field="text",
152
- )
153
-
154
- # Fixed: use processing_class instead of tokenizer
155
- trainer = SFTTrainer(
156
- model=model,
157
- args=training_args,
158
- train_dataset=train_dataset,
159
- processing_class=tokenizer
160
- )
161
- trainer.train()
162
- print("Training complete!")
163
-
164
- model = model.merge_and_unload()
165
-
166
- print("\n[3/4] Evaluating FINE-TUNED model...")
167
- ft_score = evaluate_model(model, tokenizer, problems, "FINE-TUNED")
168
-
169
- print("\n[4/4] Results")
170
- print("=" * 60)
171
- print(f"BASE: {base_score:.2f}%")
172
- print(f"FINE-TUNED: {ft_score:.2f}%")
173
- print(f"CHANGE: {ft_score - base_score:+.2f}%")
174
- print("=" * 60)
175
-
176
- if ft_score > base_score:
177
- print("\nSUCCESS! Uploading to Hub...")
178
- model.push_to_hub(REPO_ID)
179
- tokenizer.push_to_hub(REPO_ID)
180
- print("Upload complete!")
181
- else:
182
- print("\nDid not beat base model. Variance - try again.")
183
-
184
- if __name__ == "__main__":
185
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_eval_upload_v11.py DELETED
@@ -1,127 +0,0 @@
1
- #!/usr/bin/env python3
2
- # /// script
3
- # requires-python = ">=3.10"
4
- # dependencies = [
5
- # "trl>=0.12.0",
6
- # "peft>=0.7.0",
7
- # "transformers>=4.36.0",
8
- # "accelerate>=0.24.0",
9
- # "datasets",
10
- # "torch",
11
- # "huggingface_hub",
12
- # ]
13
- # ///
14
- import os
15
- import torch
16
- from datasets import load_dataset
17
- from transformers import AutoModelForCausalLM, AutoTokenizer
18
- from peft import LoraConfig, get_peft_model
19
- from trl import SFTConfig, SFTTrainer
20
- from huggingface_hub import login
21
-
22
- BASE_MODEL = "Qwen/Qwen3-0.6B"
23
- REPO_ID = "passagereptile455/qwen3-codeforces-humaneval-v2"
24
- MAX_STEPS = 150
25
- LEARNING_RATE = 5e-6
26
- NUM_TRAIN_EXAMPLES = 500
27
-
28
- def authenticate():
29
- token = os.environ.get("HF_TOKEN")
30
- if not token:
31
- raise ValueError("HF_TOKEN not set")
32
- login(token=token)
33
- print("Authenticated")
34
-
35
- def load_humaneval():
36
- return list(load_dataset("openai/openai_humaneval", split="test"))
37
-
38
- def extract_code(full_text, prompt):
39
- generated = full_text[len(prompt):] if full_text.startswith(prompt) else full_text
40
- for stop in ["\n\n\n", "\ndef ", "\nclass ", "\n#", "```", "<|"]:
41
- if stop in generated:
42
- generated = generated.split(stop)[0]
43
- return (prompt + generated).strip()
44
-
45
- def test_solution(code, test_code, entry_point):
46
- try:
47
- ns = {}
48
- exec(code, ns)
49
- if entry_point not in ns:
50
- return False
51
- exec(test_code, ns)
52
- exec(f"check({entry_point})", ns)
53
- return True
54
- except:
55
- return False
56
-
57
- def evaluate_model(model, tokenizer, problems, desc):
58
- correct = 0
59
- model.eval()
60
- for i, p in enumerate(problems):
61
- inputs = tokenizer(p["prompt"], return_tensors="pt").to(model.device)
62
- with torch.no_grad():
63
- out = model.generate(**inputs, max_new_tokens=256, temperature=0.1, do_sample=True, pad_token_id=tokenizer.eos_token_id)
64
- full_text = tokenizer.decode(out[0], skip_special_tokens=True)
65
- if test_solution(extract_code(full_text, p["prompt"]), p["test"], p["entry_point"]):
66
- correct += 1
67
- if (i+1) % 40 == 0:
68
- print(f"{desc}: {i+1}/{len(problems)}, {correct} correct ({correct/(i+1)*100:.1f}%)")
69
- score = correct / len(problems) * 100
70
- print(f"{desc} FINAL: {correct}/{len(problems)} = {score:.2f}%")
71
- return score
72
-
73
- def format_example(ex):
74
- # FIXED: proper closing tag
75
- return {"text": "<|im_start|>user\n" + ex['prompt'] + "\n<|im_end|>\n<|im_start|>assistant\n" + ex['generation'] + "<|im_end|>"}
76
-
77
- def main():
78
- print("=" * 60)
79
- print("Qwen3-0.6B Fine-tuning v11")
80
- print("=" * 60)
81
-
82
- authenticate()
83
- problems = load_humaneval()
84
- print(f"Loaded {len(problems)} problems")
85
-
86
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
87
- tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token
88
-
89
- print("\n[1/4] BASE eval...")
90
- model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True)
91
- base_score = evaluate_model(model, tokenizer, problems, "BASE")
92
-
93
- print("\n[2/4] Training...")
94
- train_ds = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
95
- train_examples = [format_example(ex) for i, ex in enumerate(train_ds) if i < NUM_TRAIN_EXAMPLES]
96
- from datasets import Dataset
97
- train_dataset = Dataset.from_list(train_examples)
98
- print(f"Prepared {len(train_dataset)} examples")
99
-
100
- model = get_peft_model(model, LoraConfig(r=8, lora_alpha=32, lora_dropout=0.1, target_modules=["q_proj","k_proj","v_proj","o_proj"], task_type="CAUSAL_LM"))
101
- model.print_trainable_parameters()
102
-
103
- training_args = SFTConfig(output_dir="./ft", max_steps=MAX_STEPS, learning_rate=LEARNING_RATE, per_device_train_batch_size=2, gradient_accumulation_steps=4, logging_steps=10, save_steps=9999, bf16=True, optim="adamw_torch", warmup_steps=10, dataset_text_field="text")
104
- trainer = SFTTrainer(model=model, args=training_args, train_dataset=train_dataset, processing_class=tokenizer)
105
- trainer.train()
106
- print("Training done!")
107
-
108
- model = model.merge_and_unload()
109
-
110
- print("\n[3/4] FINE-TUNED eval...")
111
- ft_score = evaluate_model(model, tokenizer, problems, "FT")
112
-
113
- print("\n[4/4] Results")
114
- print("=" * 60)
115
- print(f"BASE: {base_score:.2f}% | FT: {ft_score:.2f}% | CHANGE: {ft_score - base_score:+.2f}%")
116
- print("=" * 60)
117
-
118
- if ft_score > base_score:
119
- print("\nWIN! Uploading...")
120
- model.push_to_hub(REPO_ID)
121
- tokenizer.push_to_hub(REPO_ID)
122
- print("Done!")
123
- else:
124
- print("\nNo win. Try again.")
125
-
126
- if __name__ == "__main__":
127
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_eval_upload_v4.py DELETED
@@ -1,134 +0,0 @@
1
- #!/usr/bin/env python3
2
- # /// script
3
- # requires-python = ">=3.10"
4
- # dependencies = [
5
- # "trl>=0.12.0",
6
- # "peft>=0.7.0",
7
- # "transformers>=4.36.0",
8
- # "accelerate>=0.24.0",
9
- # "datasets",
10
- # "torch",
11
- # "huggingface_hub",
12
- # ]
13
- # ///
14
- import os
15
- import re
16
- import torch
17
- from datasets import load_dataset
18
- from transformers import AutoModelForCausalLM, AutoTokenizer
19
- from peft import LoraConfig, get_peft_model
20
- from trl import SFTConfig, SFTTrainer
21
- from huggingface_hub import login
22
-
23
- BASE_MODEL = "Qwen/Qwen3-0.6B"
24
- REPO_ID = "passagereptile455/qwen3-codeforces-humaneval-v2"
25
- MAX_STEPS = 150
26
- LEARNING_RATE = 5e-6
27
- NUM_TRAIN_EXAMPLES = 500
28
-
29
- def authenticate():
30
- token = os.environ.get("HF_TOKEN")
31
- if not token:
32
- raise ValueError("HF_TOKEN not set")
33
- login(token=token)
34
- print("Authenticated")
35
-
36
- def load_humaneval():
37
- ds = load_dataset("openai/openai_humaneval", split="test")
38
- return list(ds)
39
-
40
- def extract_code(text):
41
- patterns = [r"python
42
- (.*?)", r"
43
- (.*?)"]
44
- for p in patterns:
45
- m = re.findall(p, text, re.DOTALL)
46
- if m:
47
- return m[0].strip()
48
- return text.strip()
49
-
50
- def test_solution(code, test_code, entry_point):
51
- try:
52
- ns = {}
53
- exec(code, ns)
54
- if entry_point not in ns:
55
- return False
56
- exec(test_code, ns)
57
- exec(f"check({entry_point})", ns)
58
- return True
59
- except:
60
- return False
61
-
62
- def evaluate_model(model, tokenizer, problems, desc):
63
- correct = 0
64
- model.eval()
65
- for i, p in enumerate(problems):
66
- inputs = tokenizer(p["prompt"], return_tensors="pt").to(model.device)
67
- with torch.no_grad():
68
- out = model.generate(**inputs, max_new_tokens=512, temperature=0.2, do_sample=True, pad_token_id=tokenizer.eos_token_id)
69
- resp = tokenizer.decode(out[0], skip_special_tokens=True)
70
- gen = resp[len(p["prompt"]):]
71
- code = extract_code(p["prompt"] + gen)
72
- if test_solution(code, p["test"], p["entry_point"]):
73
- correct += 1
74
- if (i+1) % 20 == 0:
75
- print(f"{desc}: {i+1}/{len(problems)}, {correct} correct")
76
- score = correct / len(problems) * 100
77
- print(f"{desc}: {correct}/{len(problems)} = {score:.2f}%")
78
- return score
79
-
80
- def format_example(ex):
81
- return {"text": f"<|im_start|>user
82
- {ex['problem']}
83
- <|im_end|>
84
- <|im_start|>assistant
85
- {ex['solution']}<|im_end|>"}
86
-
87
- def main():
88
- print("="*60)
89
- authenticate()
90
- problems = load_humaneval()
91
- print(f"Loaded {len(problems)} problems")
92
-
93
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
94
- if tokenizer.pad_token is None:
95
- tokenizer.pad_token = tokenizer.eos_token
96
-
97
- base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True)
98
- base_score = evaluate_model(base_model, tokenizer, problems, "BASE")
99
- del base_model
100
- torch.cuda.empty_cache()
101
-
102
- train_ds = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
103
- train_examples = [format_example(ex) for i, ex in enumerate(train_ds) if i < NUM_TRAIN_EXAMPLES]
104
- from datasets import Dataset
105
- train_dataset = Dataset.from_list(train_examples)
106
- print(f"Prepared {len(train_dataset)} examples")
107
-
108
- model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True)
109
- lora_config = LoraConfig(r=8, lora_alpha=32, lora_dropout=0.1, target_modules=["q_proj","k_proj","v_proj","o_proj"], task_type="CAUSAL_LM")
110
- model = get_peft_model(model, lora_config)
111
- model.print_trainable_parameters()
112
-
113
- training_args = SFTConfig(output_dir="./qwen3-ft", max_steps=MAX_STEPS, learning_rate=LEARNING_RATE, per_device_train_batch_size=2, gradient_accumulation_steps=4, logging_steps=10, save_steps=50, bf16=True, optim="adamw_torch", warmup_steps=10, max_seq_length=2048)
114
- trainer = SFTTrainer(model=model, args=training_args, train_dataset=train_dataset)
115
- trainer.train()
116
- model = model.merge_and_unload()
117
-
118
- ft_score = evaluate_model(model, tokenizer, problems, "FINE-TUNED")
119
-
120
- print("="*60)
121
- print(f"BASE: {base_score:.2f}%")
122
- print(f"FINE-TUNED: {ft_score:.2f}%")
123
- print(f"IMPROVEMENT: {ft_score - base_score:+.2f}%")
124
-
125
- if ft_score > base_score:
126
- print("SUCCESS! Uploading...")
127
- model.push_to_hub(REPO_ID)
128
- tokenizer.push_to_hub(REPO_ID)
129
- print("Done!")
130
- else:
131
- print("Did not beat base. Try again.")
132
-
133
- if __name__ == "__main__":
134
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_eval_upload_v5.py DELETED
@@ -1,134 +0,0 @@
1
- #!/usr/bin/env python3
2
- # /// script
3
- # requires-python = ">=3.10"
4
- # dependencies = [
5
- # "trl>=0.12.0",
6
- # "peft>=0.7.0",
7
- # "transformers>=4.36.0",
8
- # "accelerate>=0.24.0",
9
- # "datasets",
10
- # "torch",
11
- # "huggingface_hub",
12
- # ]
13
- # ///
14
- import os
15
- import re
16
- import torch
17
- from datasets import load_dataset
18
- from transformers import AutoModelForCausalLM, AutoTokenizer
19
- from peft import LoraConfig, get_peft_model
20
- from trl import SFTConfig, SFTTrainer
21
- from huggingface_hub import login
22
-
23
- BASE_MODEL = "Qwen/Qwen3-0.6B"
24
- REPO_ID = "passagereptile455/qwen3-codeforces-humaneval-v2"
25
- MAX_STEPS = 150
26
- LEARNING_RATE = 5e-6
27
- NUM_TRAIN_EXAMPLES = 500
28
-
29
- def authenticate():
30
- token = os.environ.get("HF_TOKEN")
31
- if not token:
32
- raise ValueError("HF_TOKEN not set")
33
- login(token=token)
34
- print("Authenticated")
35
-
36
- def load_humaneval():
37
- ds = load_dataset("openai/openai_humaneval", split="test")
38
- return list(ds)
39
-
40
- def extract_code(text):
41
- # Try code blocks first
42
- match = re.search(r'```python\s*(.*?)```', text, re.DOTALL)
43
- if match:
44
- return match.group(1).strip()
45
- match = re.search(r'```\s*(.*?)```', text, re.DOTALL)
46
- if match:
47
- return match.group(1).strip()
48
- return text.strip()
49
-
50
- def test_solution(code, test_code, entry_point):
51
- try:
52
- ns = {}
53
- exec(code, ns)
54
- if entry_point not in ns:
55
- return False
56
- exec(test_code, ns)
57
- exec(f"check({entry_point})", ns)
58
- return True
59
- except:
60
- return False
61
-
62
- def evaluate_model(model, tokenizer, problems, desc):
63
- correct = 0
64
- model.eval()
65
- for i, p in enumerate(problems):
66
- inputs = tokenizer(p["prompt"], return_tensors="pt").to(model.device)
67
- with torch.no_grad():
68
- out = model.generate(**inputs, max_new_tokens=512, temperature=0.2, do_sample=True, pad_token_id=tokenizer.eos_token_id)
69
- resp = tokenizer.decode(out[0], skip_special_tokens=True)
70
- gen = resp[len(p["prompt"]):]
71
- code = extract_code(p["prompt"] + gen)
72
- if test_solution(code, p["test"], p["entry_point"]):
73
- correct += 1
74
- if (i+1) % 20 == 0:
75
- print(f"{desc}: {i+1}/{len(problems)}, {correct} correct")
76
- score = correct / len(problems) * 100
77
- print(f"{desc}: {correct}/{len(problems)} = {score:.2f}%")
78
- return score
79
-
80
- def format_example(ex):
81
- return {"text": f"<|im_start|>user\n{ex['problem']}\n<|im_end|>\n<|im_start|>assistant\n{ex['solution']}<|im_end|>"}
82
-
83
- def main():
84
- print("=" * 60)
85
- authenticate()
86
- problems = load_humaneval()
87
- print(f"Loaded {len(problems)} problems")
88
-
89
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
90
- if tokenizer.pad_token is None:
91
- tokenizer.pad_token = tokenizer.eos_token
92
-
93
- base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True)
94
- base_score = evaluate_model(base_model, tokenizer, problems, "BASE")
95
- del base_model
96
- torch.cuda.empty_cache()
97
-
98
- train_ds = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
99
- train_examples = []
100
- for i, ex in enumerate(train_ds):
101
- if i >= NUM_TRAIN_EXAMPLES:
102
- break
103
- train_examples.append(format_example(ex))
104
- from datasets import Dataset
105
- train_dataset = Dataset.from_list(train_examples)
106
- print(f"Prepared {len(train_dataset)} examples")
107
-
108
- model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True)
109
- lora_config = LoraConfig(r=8, lora_alpha=32, lora_dropout=0.1, target_modules=["q_proj","k_proj","v_proj","o_proj"], task_type="CAUSAL_LM")
110
- model = get_peft_model(model, lora_config)
111
- model.print_trainable_parameters()
112
-
113
- training_args = SFTConfig(output_dir="./qwen3-ft", max_steps=MAX_STEPS, learning_rate=LEARNING_RATE, per_device_train_batch_size=2, gradient_accumulation_steps=4, logging_steps=10, save_steps=50, bf16=True, optim="adamw_torch", warmup_steps=10, max_seq_length=2048)
114
- trainer = SFTTrainer(model=model, args=training_args, train_dataset=train_dataset)
115
- trainer.train()
116
- model = model.merge_and_unload()
117
-
118
- ft_score = evaluate_model(model, tokenizer, problems, "FINE-TUNED")
119
-
120
- print("=" * 60)
121
- print(f"BASE: {base_score:.2f}%")
122
- print(f"FINE-TUNED: {ft_score:.2f}%")
123
- print(f"IMPROVEMENT: {ft_score - base_score:+.2f}%")
124
-
125
- if ft_score > base_score:
126
- print("SUCCESS! Uploading...")
127
- model.push_to_hub(REPO_ID)
128
- tokenizer.push_to_hub(REPO_ID)
129
- print("Done!")
130
- else:
131
- print("Did not beat base. Try again.")
132
-
133
- if __name__ == "__main__":
134
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_eval_upload_v6.py DELETED
@@ -1,192 +0,0 @@
1
- #!/usr/bin/env python3
2
- # /// script
3
- # requires-python = ">=3.10"
4
- # dependencies = [
5
- # "trl>=0.12.0",
6
- # "peft>=0.7.0",
7
- # "transformers>=4.36.0",
8
- # "accelerate>=0.24.0",
9
- # "datasets",
10
- # "torch",
11
- # "huggingface_hub",
12
- # ]
13
- # ///
14
- import os
15
- import re
16
- import torch
17
- import gc
18
- from datasets import load_dataset
19
- from transformers import AutoModelForCausalLM, AutoTokenizer
20
- from peft import LoraConfig, get_peft_model
21
- from trl import SFTConfig, SFTTrainer
22
- from huggingface_hub import login
23
-
24
- BASE_MODEL = "Qwen/Qwen3-0.6B"
25
- REPO_ID = "passagereptile455/qwen3-codeforces-humaneval-v2"
26
- MAX_STEPS = 150
27
- LEARNING_RATE = 5e-6
28
- NUM_TRAIN_EXAMPLES = 500
29
-
30
- def authenticate():
31
- token = os.environ.get("HF_TOKEN")
32
- if not token:
33
- raise ValueError("HF_TOKEN not set")
34
- login(token=token)
35
- print("Authenticated")
36
-
37
- def load_humaneval():
38
- ds = load_dataset("openai/openai_humaneval", split="test")
39
- return list(ds)
40
-
41
- def extract_code(full_text, prompt):
42
- """Extract the function body from model output."""
43
- # Get only generated part
44
- if full_text.startswith(prompt):
45
- generated = full_text[len(prompt):]
46
- else:
47
- generated = full_text
48
-
49
- # Clean up - stop at common end markers
50
- for stop in ["\n\n\n", "\ndef ", "\nclass ", "\n#", "```", "<|"]:
51
- if stop in generated:
52
- generated = generated.split(stop)[0]
53
-
54
- # Combine prompt with cleaned generation
55
- code = prompt + generated
56
- return code.strip()
57
-
58
- def test_solution(code, test_code, entry_point):
59
- try:
60
- ns = {}
61
- exec(code, ns)
62
- if entry_point not in ns:
63
- return False
64
- exec(test_code, ns)
65
- exec(f"check({entry_point})", ns)
66
- return True
67
- except Exception as e:
68
- return False
69
-
70
- def evaluate_model(model, tokenizer, problems, desc):
71
- correct = 0
72
- model.eval()
73
-
74
- for i, p in enumerate(problems):
75
- prompt = p["prompt"]
76
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
77
-
78
- with torch.no_grad():
79
- out = model.generate(
80
- **inputs,
81
- max_new_tokens=256,
82
- temperature=0.1,
83
- do_sample=True,
84
- pad_token_id=tokenizer.eos_token_id,
85
- eos_token_id=tokenizer.eos_token_id,
86
- )
87
-
88
- full_text = tokenizer.decode(out[0], skip_special_tokens=True)
89
- code = extract_code(full_text, prompt)
90
-
91
- if test_solution(code, p["test"], p["entry_point"]):
92
- correct += 1
93
-
94
- if (i+1) % 40 == 0:
95
- print(f"{desc}: {i+1}/{len(problems)}, {correct} correct ({correct/(i+1)*100:.1f}%)")
96
-
97
- score = correct / len(problems) * 100
98
- print(f"{desc} FINAL: {correct}/{len(problems)} = {score:.2f}%")
99
- return score
100
-
101
- def format_example(ex):
102
- return {"text": f"<|im_start|>user\n{ex['problem']}\n<|im_end|>\n<|im_start|>assistant\n{ex['solution']}<|im_end|>"}
103
-
104
- def main():
105
- print("=" * 60)
106
- print("Qwen3-0.6B Fine-tuning Challenge v6")
107
- print("=" * 60)
108
-
109
- authenticate()
110
- problems = load_humaneval()
111
- print(f"Loaded {len(problems)} HumanEval problems")
112
-
113
- # Load tokenizer
114
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
115
- if tokenizer.pad_token is None:
116
- tokenizer.pad_token = tokenizer.eos_token
117
-
118
- # Evaluate BASE model
119
- print("\n[1/4] Evaluating BASE model...")
120
- model = AutoModelForCausalLM.from_pretrained(
121
- BASE_MODEL,
122
- torch_dtype=torch.bfloat16,
123
- device_map="auto",
124
- trust_remote_code=True
125
- )
126
- base_score = evaluate_model(model, tokenizer, problems, "BASE")
127
-
128
- # Training - use same model instance
129
- print("\n[2/4] Training...")
130
- train_ds = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
131
- train_examples = []
132
- for i, ex in enumerate(train_ds):
133
- if i >= NUM_TRAIN_EXAMPLES:
134
- break
135
- train_examples.append(format_example(ex))
136
-
137
- from datasets import Dataset
138
- train_dataset = Dataset.from_list(train_examples)
139
- print(f"Prepared {len(train_dataset)} training examples")
140
-
141
- # Apply LoRA to same model
142
- lora_config = LoraConfig(
143
- r=8, lora_alpha=32, lora_dropout=0.1,
144
- target_modules=["q_proj","k_proj","v_proj","o_proj"],
145
- task_type="CAUSAL_LM"
146
- )
147
- model = get_peft_model(model, lora_config)
148
- model.print_trainable_parameters()
149
-
150
- training_args = SFTConfig(
151
- output_dir="./qwen3-ft",
152
- max_steps=MAX_STEPS,
153
- learning_rate=LEARNING_RATE,
154
- per_device_train_batch_size=2,
155
- gradient_accumulation_steps=4,
156
- logging_steps=10,
157
- save_steps=9999, # Don't save checkpoints
158
- bf16=True,
159
- optim="adamw_torch",
160
- warmup_steps=10,
161
- max_seq_length=2048
162
- )
163
-
164
- trainer = SFTTrainer(model=model, args=training_args, train_dataset=train_dataset)
165
- trainer.train()
166
- print("Training complete!")
167
-
168
- # Merge LoRA
169
- model = model.merge_and_unload()
170
-
171
- # Evaluate FINE-TUNED model
172
- print("\n[3/4] Evaluating FINE-TUNED model...")
173
- ft_score = evaluate_model(model, tokenizer, problems, "FINE-TUNED")
174
-
175
- # Results
176
- print("\n[4/4] Results")
177
- print("=" * 60)
178
- print(f"BASE: {base_score:.2f}%")
179
- print(f"FINE-TUNED: {ft_score:.2f}%")
180
- print(f"CHANGE: {ft_score - base_score:+.2f}%")
181
- print("=" * 60)
182
-
183
- if ft_score > base_score:
184
- print("\nSUCCESS! Uploading to Hub...")
185
- model.push_to_hub(REPO_ID)
186
- tokenizer.push_to_hub(REPO_ID)
187
- print("Upload complete!")
188
- else:
189
- print("\nDid not beat base model. Variance - try again.")
190
-
191
- if __name__ == "__main__":
192
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_eval_upload_v7.py DELETED
@@ -1,180 +0,0 @@
1
- #!/usr/bin/env python3
2
- # /// script
3
- # requires-python = ">=3.10"
4
- # dependencies = [
5
- # "trl>=0.12.0",
6
- # "peft>=0.7.0",
7
- # "transformers>=4.36.0",
8
- # "accelerate>=0.24.0",
9
- # "datasets",
10
- # "torch",
11
- # "huggingface_hub",
12
- # ]
13
- # ///
14
- import os
15
- import re
16
- import torch
17
- from datasets import load_dataset
18
- from transformers import AutoModelForCausalLM, AutoTokenizer
19
- from peft import LoraConfig, get_peft_model
20
- from trl import SFTConfig, SFTTrainer
21
- from huggingface_hub import login
22
-
23
- BASE_MODEL = "Qwen/Qwen3-0.6B"
24
- REPO_ID = "passagereptile455/qwen3-codeforces-humaneval-v2"
25
- MAX_STEPS = 150
26
- LEARNING_RATE = 5e-6
27
- NUM_TRAIN_EXAMPLES = 500
28
-
29
- def authenticate():
30
- token = os.environ.get("HF_TOKEN")
31
- if not token:
32
- raise ValueError("HF_TOKEN not set")
33
- login(token=token)
34
- print("Authenticated")
35
-
36
- def load_humaneval():
37
- ds = load_dataset("openai/openai_humaneval", split="test")
38
- return list(ds)
39
-
40
- def extract_code(full_text, prompt):
41
- """Extract the function body from model output."""
42
- if full_text.startswith(prompt):
43
- generated = full_text[len(prompt):]
44
- else:
45
- generated = full_text
46
-
47
- for stop in ["\n\n\n", "\ndef ", "\nclass ", "\n#", "```", "<|"]:
48
- if stop in generated:
49
- generated = generated.split(stop)[0]
50
-
51
- return (prompt + generated).strip()
52
-
53
- def test_solution(code, test_code, entry_point):
54
- try:
55
- ns = {}
56
- exec(code, ns)
57
- if entry_point not in ns:
58
- return False
59
- exec(test_code, ns)
60
- exec(f"check({entry_point})", ns)
61
- return True
62
- except:
63
- return False
64
-
65
- def evaluate_model(model, tokenizer, problems, desc):
66
- correct = 0
67
- model.eval()
68
-
69
- for i, p in enumerate(problems):
70
- prompt = p["prompt"]
71
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
72
-
73
- with torch.no_grad():
74
- out = model.generate(
75
- **inputs,
76
- max_new_tokens=256,
77
- temperature=0.1,
78
- do_sample=True,
79
- pad_token_id=tokenizer.eos_token_id,
80
- )
81
-
82
- full_text = tokenizer.decode(out[0], skip_special_tokens=True)
83
- code = extract_code(full_text, prompt)
84
-
85
- if test_solution(code, p["test"], p["entry_point"]):
86
- correct += 1
87
-
88
- if (i+1) % 40 == 0:
89
- print(f"{desc}: {i+1}/{len(problems)}, {correct} correct ({correct/(i+1)*100:.1f}%)")
90
-
91
- score = correct / len(problems) * 100
92
- print(f"{desc} FINAL: {correct}/{len(problems)} = {score:.2f}%")
93
- return score
94
-
95
- def format_example(ex):
96
- # Correct field names: prompt and generation
97
- return {"text": f"<|im_start|>user\n{ex['prompt']}\n<|im_end|>\n<|im_start|>assistant\n{ex['generation']}<|im_end|}"}
98
-
99
- def main():
100
- print("=" * 60)
101
- print("Qwen3-0.6B Fine-tuning Challenge v7")
102
- print("=" * 60)
103
-
104
- authenticate()
105
- problems = load_humaneval()
106
- print(f"Loaded {len(problems)} HumanEval problems")
107
-
108
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
109
- if tokenizer.pad_token is None:
110
- tokenizer.pad_token = tokenizer.eos_token
111
-
112
- print("\n[1/4] Evaluating BASE model...")
113
- model = AutoModelForCausalLM.from_pretrained(
114
- BASE_MODEL,
115
- torch_dtype=torch.bfloat16,
116
- device_map="auto",
117
- trust_remote_code=True
118
- )
119
- base_score = evaluate_model(model, tokenizer, problems, "BASE")
120
-
121
- print("\n[2/4] Training...")
122
- train_ds = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
123
- train_examples = []
124
- for i, ex in enumerate(train_ds):
125
- if i >= NUM_TRAIN_EXAMPLES:
126
- break
127
- train_examples.append(format_example(ex))
128
-
129
- from datasets import Dataset
130
- train_dataset = Dataset.from_list(train_examples)
131
- print(f"Prepared {len(train_dataset)} training examples")
132
-
133
- lora_config = LoraConfig(
134
- r=8, lora_alpha=32, lora_dropout=0.1,
135
- target_modules=["q_proj","k_proj","v_proj","o_proj"],
136
- task_type="CAUSAL_LM"
137
- )
138
- model = get_peft_model(model, lora_config)
139
- model.print_trainable_parameters()
140
-
141
- training_args = SFTConfig(
142
- output_dir="./qwen3-ft",
143
- max_steps=MAX_STEPS,
144
- learning_rate=LEARNING_RATE,
145
- per_device_train_batch_size=2,
146
- gradient_accumulation_steps=4,
147
- logging_steps=10,
148
- save_steps=9999,
149
- bf16=True,
150
- optim="adamw_torch",
151
- warmup_steps=10,
152
- max_seq_length=2048
153
- )
154
-
155
- trainer = SFTTrainer(model=model, args=training_args, train_dataset=train_dataset)
156
- trainer.train()
157
- print("Training complete!")
158
-
159
- model = model.merge_and_unload()
160
-
161
- print("\n[3/4] Evaluating FINE-TUNED model...")
162
- ft_score = evaluate_model(model, tokenizer, problems, "FINE-TUNED")
163
-
164
- print("\n[4/4] Results")
165
- print("=" * 60)
166
- print(f"BASE: {base_score:.2f}%")
167
- print(f"FINE-TUNED: {ft_score:.2f}%")
168
- print(f"CHANGE: {ft_score - base_score:+.2f}%")
169
- print("=" * 60)
170
-
171
- if ft_score > base_score:
172
- print("\nSUCCESS! Uploading to Hub...")
173
- model.push_to_hub(REPO_ID)
174
- tokenizer.push_to_hub(REPO_ID)
175
- print("Upload complete!")
176
- else:
177
- print("\nDid not beat base model. Variance - try again.")
178
-
179
- if __name__ == "__main__":
180
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_eval_upload_v8.py DELETED
@@ -1,181 +0,0 @@
1
- #!/usr/bin/env python3
2
- # /// script
3
- # requires-python = ">=3.10"
4
- # dependencies = [
5
- # "trl>=0.12.0",
6
- # "peft>=0.7.0",
7
- # "transformers>=4.36.0",
8
- # "accelerate>=0.24.0",
9
- # "datasets",
10
- # "torch",
11
- # "huggingface_hub",
12
- # ]
13
- # ///
14
- import os
15
- import re
16
- import torch
17
- from datasets import load_dataset
18
- from transformers import AutoModelForCausalLM, AutoTokenizer
19
- from peft import LoraConfig, get_peft_model
20
- from trl import SFTConfig, SFTTrainer
21
- from huggingface_hub import login
22
-
23
- BASE_MODEL = "Qwen/Qwen3-0.6B"
24
- REPO_ID = "passagereptile455/qwen3-codeforces-humaneval-v2"
25
- MAX_STEPS = 150
26
- LEARNING_RATE = 5e-6
27
- NUM_TRAIN_EXAMPLES = 500
28
-
29
- def authenticate():
30
- token = os.environ.get("HF_TOKEN")
31
- if not token:
32
- raise ValueError("HF_TOKEN not set")
33
- login(token=token)
34
- print("Authenticated")
35
-
36
- def load_humaneval():
37
- ds = load_dataset("openai/openai_humaneval", split="test")
38
- return list(ds)
39
-
40
- def extract_code(full_text, prompt):
41
- if full_text.startswith(prompt):
42
- generated = full_text[len(prompt):]
43
- else:
44
- generated = full_text
45
-
46
- for stop in ["\n\n\n", "\ndef ", "\nclass ", "\n#", "```", "<|"]:
47
- if stop in generated:
48
- generated = generated.split(stop)[0]
49
-
50
- return (prompt + generated).strip()
51
-
52
- def test_solution(code, test_code, entry_point):
53
- try:
54
- ns = {}
55
- exec(code, ns)
56
- if entry_point not in ns:
57
- return False
58
- exec(test_code, ns)
59
- exec(f"check({entry_point})", ns)
60
- return True
61
- except:
62
- return False
63
-
64
- def evaluate_model(model, tokenizer, problems, desc):
65
- correct = 0
66
- model.eval()
67
-
68
- for i, p in enumerate(problems):
69
- prompt = p["prompt"]
70
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
71
-
72
- with torch.no_grad():
73
- out = model.generate(
74
- **inputs,
75
- max_new_tokens=256,
76
- temperature=0.1,
77
- do_sample=True,
78
- pad_token_id=tokenizer.eos_token_id,
79
- )
80
-
81
- full_text = tokenizer.decode(out[0], skip_special_tokens=True)
82
- code = extract_code(full_text, prompt)
83
-
84
- if test_solution(code, p["test"], p["entry_point"]):
85
- correct += 1
86
-
87
- if (i+1) % 40 == 0:
88
- print(f"{desc}: {i+1}/{len(problems)}, {correct} correct ({correct/(i+1)*100:.1f}%)")
89
-
90
- score = correct / len(problems) * 100
91
- print(f"{desc} FINAL: {correct}/{len(problems)} = {score:.2f}%")
92
- return score
93
-
94
- def format_example(ex):
95
- # Fixed: <|im_end|> not <|im_end|}
96
- prompt = ex['prompt']
97
- gen = ex['generation']
98
- return {"text": f"<|im_start|>user\n{prompt}\n<|im_end|>\n<|im_start|>assistant\n{gen}<|im_end|>"}
99
-
100
- def main():
101
- print("=" * 60)
102
- print("Qwen3-0.6B Fine-tuning Challenge v8")
103
- print("=" * 60)
104
-
105
- authenticate()
106
- problems = load_humaneval()
107
- print(f"Loaded {len(problems)} HumanEval problems")
108
-
109
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
110
- if tokenizer.pad_token is None:
111
- tokenizer.pad_token = tokenizer.eos_token
112
-
113
- print("\n[1/4] Evaluating BASE model...")
114
- model = AutoModelForCausalLM.from_pretrained(
115
- BASE_MODEL,
116
- torch_dtype=torch.bfloat16,
117
- device_map="auto",
118
- trust_remote_code=True
119
- )
120
- base_score = evaluate_model(model, tokenizer, problems, "BASE")
121
-
122
- print("\n[2/4] Training...")
123
- train_ds = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
124
- train_examples = []
125
- for i, ex in enumerate(train_ds):
126
- if i >= NUM_TRAIN_EXAMPLES:
127
- break
128
- train_examples.append(format_example(ex))
129
-
130
- from datasets import Dataset
131
- train_dataset = Dataset.from_list(train_examples)
132
- print(f"Prepared {len(train_dataset)} training examples")
133
-
134
- lora_config = LoraConfig(
135
- r=8, lora_alpha=32, lora_dropout=0.1,
136
- target_modules=["q_proj","k_proj","v_proj","o_proj"],
137
- task_type="CAUSAL_LM"
138
- )
139
- model = get_peft_model(model, lora_config)
140
- model.print_trainable_parameters()
141
-
142
- training_args = SFTConfig(
143
- output_dir="./qwen3-ft",
144
- max_steps=MAX_STEPS,
145
- learning_rate=LEARNING_RATE,
146
- per_device_train_batch_size=2,
147
- gradient_accumulation_steps=4,
148
- logging_steps=10,
149
- save_steps=9999,
150
- bf16=True,
151
- optim="adamw_torch",
152
- warmup_steps=10,
153
- max_seq_length=2048
154
- )
155
-
156
- trainer = SFTTrainer(model=model, args=training_args, train_dataset=train_dataset)
157
- trainer.train()
158
- print("Training complete!")
159
-
160
- model = model.merge_and_unload()
161
-
162
- print("\n[3/4] Evaluating FINE-TUNED model...")
163
- ft_score = evaluate_model(model, tokenizer, problems, "FINE-TUNED")
164
-
165
- print("\n[4/4] Results")
166
- print("=" * 60)
167
- print(f"BASE: {base_score:.2f}%")
168
- print(f"FINE-TUNED: {ft_score:.2f}%")
169
- print(f"CHANGE: {ft_score - base_score:+.2f}%")
170
- print("=" * 60)
171
-
172
- if ft_score > base_score:
173
- print("\nSUCCESS! Uploading to Hub...")
174
- model.push_to_hub(REPO_ID)
175
- tokenizer.push_to_hub(REPO_ID)
176
- print("Upload complete!")
177
- else:
178
- print("\nDid not beat base model. Variance - try again.")
179
-
180
- if __name__ == "__main__":
181
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_eval_upload_v9.py DELETED
@@ -1,180 +0,0 @@
1
- #!/usr/bin/env python3
2
- # /// script
3
- # requires-python = ">=3.10"
4
- # dependencies = [
5
- # "trl>=0.12.0",
6
- # "peft>=0.7.0",
7
- # "transformers>=4.36.0",
8
- # "accelerate>=0.24.0",
9
- # "datasets",
10
- # "torch",
11
- # "huggingface_hub",
12
- # ]
13
- # ///
14
- import os
15
- import torch
16
- from datasets import load_dataset
17
- from transformers import AutoModelForCausalLM, AutoTokenizer
18
- from peft import LoraConfig, get_peft_model
19
- from trl import SFTConfig, SFTTrainer
20
- from huggingface_hub import login
21
-
22
- BASE_MODEL = "Qwen/Qwen3-0.6B"
23
- REPO_ID = "passagereptile455/qwen3-codeforces-humaneval-v2"
24
- MAX_STEPS = 150
25
- LEARNING_RATE = 5e-6
26
- NUM_TRAIN_EXAMPLES = 500
27
-
28
- def authenticate():
29
- token = os.environ.get("HF_TOKEN")
30
- if not token:
31
- raise ValueError("HF_TOKEN not set")
32
- login(token=token)
33
- print("Authenticated")
34
-
35
- def load_humaneval():
36
- ds = load_dataset("openai/openai_humaneval", split="test")
37
- return list(ds)
38
-
39
- def extract_code(full_text, prompt):
40
- if full_text.startswith(prompt):
41
- generated = full_text[len(prompt):]
42
- else:
43
- generated = full_text
44
-
45
- for stop in ["\n\n\n", "\ndef ", "\nclass ", "\n#", "```", "<|"]:
46
- if stop in generated:
47
- generated = generated.split(stop)[0]
48
-
49
- return (prompt + generated).strip()
50
-
51
- def test_solution(code, test_code, entry_point):
52
- try:
53
- ns = {}
54
- exec(code, ns)
55
- if entry_point not in ns:
56
- return False
57
- exec(test_code, ns)
58
- exec(f"check({entry_point})", ns)
59
- return True
60
- except:
61
- return False
62
-
63
- def evaluate_model(model, tokenizer, problems, desc):
64
- correct = 0
65
- model.eval()
66
-
67
- for i, p in enumerate(problems):
68
- prompt = p["prompt"]
69
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
70
-
71
- with torch.no_grad():
72
- out = model.generate(
73
- **inputs,
74
- max_new_tokens=256,
75
- temperature=0.1,
76
- do_sample=True,
77
- pad_token_id=tokenizer.eos_token_id,
78
- )
79
-
80
- full_text = tokenizer.decode(out[0], skip_special_tokens=True)
81
- code = extract_code(full_text, prompt)
82
-
83
- if test_solution(code, p["test"], p["entry_point"]):
84
- correct += 1
85
-
86
- if (i+1) % 40 == 0:
87
- print(f"{desc}: {i+1}/{len(problems)}, {correct} correct ({correct/(i+1)*100:.1f}%)")
88
-
89
- score = correct / len(problems) * 100
90
- print(f"{desc} FINAL: {correct}/{len(problems)} = {score:.2f}%")
91
- return score
92
-
93
- def format_example(ex):
94
- prompt = ex['prompt']
95
- gen = ex['generation']
96
- return {"text": f"<|im_start|>user\n{prompt}\n<|im_end|>\n<|im_start|>assistant\n{gen}<|im_end|>"}
97
-
98
- def main():
99
- print("=" * 60)
100
- print("Qwen3-0.6B Fine-tuning Challenge v9")
101
- print("=" * 60)
102
-
103
- authenticate()
104
- problems = load_humaneval()
105
- print(f"Loaded {len(problems)} HumanEval problems")
106
-
107
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
108
- if tokenizer.pad_token is None:
109
- tokenizer.pad_token = tokenizer.eos_token
110
-
111
- print("\n[1/4] Evaluating BASE model...")
112
- model = AutoModelForCausalLM.from_pretrained(
113
- BASE_MODEL,
114
- torch_dtype=torch.bfloat16,
115
- device_map="auto",
116
- trust_remote_code=True
117
- )
118
- base_score = evaluate_model(model, tokenizer, problems, "BASE")
119
-
120
- print("\n[2/4] Training...")
121
- train_ds = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
122
- train_examples = []
123
- for i, ex in enumerate(train_ds):
124
- if i >= NUM_TRAIN_EXAMPLES:
125
- break
126
- train_examples.append(format_example(ex))
127
-
128
- from datasets import Dataset
129
- train_dataset = Dataset.from_list(train_examples)
130
- print(f"Prepared {len(train_dataset)} training examples")
131
-
132
- lora_config = LoraConfig(
133
- r=8, lora_alpha=32, lora_dropout=0.1,
134
- target_modules=["q_proj","k_proj","v_proj","o_proj"],
135
- task_type="CAUSAL_LM"
136
- )
137
- model = get_peft_model(model, lora_config)
138
- model.print_trainable_parameters()
139
-
140
- # Fixed: removed max_seq_length, use dataset_text_field
141
- training_args = SFTConfig(
142
- output_dir="./qwen3-ft",
143
- max_steps=MAX_STEPS,
144
- learning_rate=LEARNING_RATE,
145
- per_device_train_batch_size=2,
146
- gradient_accumulation_steps=4,
147
- logging_steps=10,
148
- save_steps=9999,
149
- bf16=True,
150
- optim="adamw_torch",
151
- warmup_steps=10,
152
- dataset_text_field="text",
153
- )
154
-
155
- trainer = SFTTrainer(model=model, args=training_args, train_dataset=train_dataset, tokenizer=tokenizer)
156
- trainer.train()
157
- print("Training complete!")
158
-
159
- model = model.merge_and_unload()
160
-
161
- print("\n[3/4] Evaluating FINE-TUNED model...")
162
- ft_score = evaluate_model(model, tokenizer, problems, "FINE-TUNED")
163
-
164
- print("\n[4/4] Results")
165
- print("=" * 60)
166
- print(f"BASE: {base_score:.2f}%")
167
- print(f"FINE-TUNED: {ft_score:.2f}%")
168
- print(f"CHANGE: {ft_score - base_score:+.2f}%")
169
- print("=" * 60)
170
-
171
- if ft_score > base_score:
172
- print("\nSUCCESS! Uploading to Hub...")
173
- model.push_to_hub(REPO_ID)
174
- tokenizer.push_to_hub(REPO_ID)
175
- print("Upload complete!")
176
- else:
177
- print("\nDid not beat base model. Variance - try again.")
178
-
179
- if __name__ == "__main__":
180
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_final.py DELETED
@@ -1,128 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "trl>=0.12.0",
4
- # "peft>=0.7.0",
5
- # "transformers>=4.51.0",
6
- # "accelerate>=0.24.0",
7
- # "datasets",
8
- # ]
9
- # ///
10
-
11
- """
12
- FINAL ATTEMPT: Proper Qwen3 chat template, ultra-minimal training
13
- """
14
-
15
- import sys
16
- import traceback
17
- from datasets import load_dataset, Dataset
18
- from peft import LoraConfig
19
- from trl import SFTTrainer, SFTConfig
20
- from transformers import AutoTokenizer
21
- import torch
22
-
23
-
24
- def log(msg):
25
- print(msg, flush=True)
26
-
27
-
28
- log("=" * 60)
29
- log("FINAL TRAINING - Proper Qwen3 template")
30
- log("=" * 60)
31
-
32
- try:
33
- log(f"CUDA: {torch.cuda.is_available()}")
34
- if torch.cuda.is_available():
35
- log(f"GPU: {torch.cuda.get_device_name(0)}")
36
-
37
- log("Loading tokenizer first...")
38
- tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)
39
- if tokenizer.pad_token is None:
40
- tokenizer.pad_token = tokenizer.eos_token
41
- log(f"Tokenizer loaded, vocab size: {len(tokenizer)}")
42
-
43
- log("Streaming codeforces-cots...")
44
- streaming_ds = load_dataset(
45
- "open-r1/codeforces-cots", split="train", streaming=True
46
- )
47
-
48
- log("Collecting 200 examples...")
49
- examples = []
50
- for i, ex in enumerate(streaming_ds):
51
- if i >= 200:
52
- break
53
- examples.append(ex)
54
- log(f"Collected {len(examples)} examples")
55
-
56
- dataset = Dataset.from_list(examples)
57
-
58
- # Use proper Qwen3 chat template
59
- def format_with_chat_template(example):
60
- messages = example["messages"]
61
- # Apply Qwen3's native chat template
62
- text = tokenizer.apply_chat_template(
63
- messages, tokenize=False, add_generation_prompt=False
64
- )
65
- return {"text": text}
66
-
67
- log("Formatting with Qwen3 chat template...")
68
- dataset = dataset.map(
69
- format_with_chat_template, remove_columns=dataset.column_names
70
- )
71
- log(f"Formatted {len(dataset)} examples")
72
-
73
- # Show sample
74
- log(f"Sample (first 200 chars): {dataset[0]['text'][:200]}...")
75
-
76
- config = SFTConfig(
77
- output_dir="qwen3-final",
78
- push_to_hub=True,
79
- hub_model_id="passagereptile455/qwen3-0.6b-humaneval-final",
80
- hub_strategy="every_save",
81
- max_steps=30, # Ultra minimal
82
- per_device_train_batch_size=1,
83
- gradient_accumulation_steps=4,
84
- learning_rate=5e-8, # Extremely conservative
85
- max_length=512,
86
- logging_steps=10,
87
- save_strategy="steps",
88
- save_steps=30,
89
- save_total_limit=1,
90
- eval_strategy="no",
91
- warmup_ratio=0.1,
92
- lr_scheduler_type="cosine",
93
- gradient_checkpointing=True,
94
- bf16=True,
95
- dataset_text_field="text",
96
- )
97
-
98
- peft_config = LoraConfig(
99
- r=4,
100
- lora_alpha=8,
101
- lora_dropout=0.0,
102
- bias="none",
103
- task_type="CAUSAL_LM",
104
- target_modules=["q_proj", "v_proj"],
105
- )
106
-
107
- log("Creating trainer...")
108
- trainer = SFTTrainer(
109
- model="Qwen/Qwen3-0.6B",
110
- train_dataset=dataset,
111
- args=config,
112
- peft_config=peft_config,
113
- )
114
-
115
- log("Training (30 steps, 5e-8 LR)...")
116
- trainer.train()
117
-
118
- log("Pushing to Hub...")
119
- trainer.push_to_hub()
120
-
121
- log("=" * 60)
122
- log("SUCCESS! Model: passagereptile455/qwen3-0.6b-humaneval-final")
123
- log("=" * 60)
124
-
125
- except Exception as e:
126
- log(f"ERROR: {e}")
127
- traceback.print_exc()
128
- sys.exit(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_job1.py DELETED
@@ -1,97 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "trl>=0.12.0",
4
- # "peft>=0.7.0",
5
- # "transformers>=4.36.0",
6
- # "accelerate>=0.24.0",
7
- # "trackio",
8
- # "datasets",
9
- # ]
10
- # ///
11
-
12
- """
13
- Job 1: Baseline SFT training of Qwen3-0.6B on codeforces-cots
14
- Goal: Beat base model on HumanEval
15
- """
16
-
17
- import trackio
18
- from datasets import load_dataset
19
- from peft import LoraConfig
20
- from trl import SFTTrainer, SFTConfig
21
-
22
- print("Loading dataset: open-r1/codeforces-cots")
23
- dataset = load_dataset("open-r1/codeforces-cots", split="train")
24
- print(f"Dataset loaded: {len(dataset)} examples")
25
-
26
- # Use a subset for faster training
27
- dataset = dataset.shuffle(seed=42).select(range(min(5000, len(dataset))))
28
- print(f"Using {len(dataset)} examples for training")
29
-
30
- # Create train/eval split
31
- dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
32
- train_dataset = dataset_split["train"]
33
- eval_dataset = dataset_split["test"]
34
- print(f"Train: {len(train_dataset)} | Eval: {len(eval_dataset)}")
35
-
36
- # Training configuration
37
- config = SFTConfig(
38
- output_dir="qwen3-codeforces-sft-job1",
39
- push_to_hub=True,
40
- hub_model_id="passagereptile455/qwen3-0.6b-codeforces-sft-job1",
41
- hub_strategy="every_save",
42
-
43
- # Training params
44
- num_train_epochs=2,
45
- per_device_train_batch_size=2,
46
- gradient_accumulation_steps=8,
47
- learning_rate=2e-4,
48
- max_length=2048,
49
-
50
- # Logging
51
- logging_steps=10,
52
- save_strategy="steps",
53
- save_steps=200,
54
- save_total_limit=2,
55
-
56
- # Eval
57
- eval_strategy="steps",
58
- eval_steps=200,
59
-
60
- # Optimization
61
- warmup_ratio=0.1,
62
- lr_scheduler_type="cosine",
63
- gradient_checkpointing=True,
64
-
65
- # Monitoring
66
- report_to="trackio",
67
- project="qwen3-humaneval-challenge",
68
- run_name="job1-baseline-5k",
69
- )
70
-
71
- # LoRA config for efficient training
72
- peft_config = LoraConfig(
73
- r=32,
74
- lora_alpha=64,
75
- lora_dropout=0.05,
76
- bias="none",
77
- task_type="CAUSAL_LM",
78
- target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
79
- )
80
-
81
- print("Initializing trainer with Qwen/Qwen3-0.6B...")
82
- trainer = SFTTrainer(
83
- model="Qwen/Qwen3-0.6B",
84
- train_dataset=train_dataset,
85
- eval_dataset=eval_dataset,
86
- args=config,
87
- peft_config=peft_config,
88
- )
89
-
90
- print("Starting training...")
91
- trainer.train()
92
-
93
- print("Pushing to Hub...")
94
- trainer.push_to_hub()
95
-
96
- print("Job 1 complete!")
97
- print("Model: https://huggingface.co/passagereptile455/qwen3-0.6b-codeforces-sft-job1")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_job1_minimal.py DELETED
@@ -1,97 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "trl>=0.12.0",
4
- # "peft>=0.7.0",
5
- # "transformers>=4.36.0",
6
- # "accelerate>=0.24.0",
7
- # "trackio",
8
- # "datasets",
9
- # ]
10
- # ///
11
-
12
- """
13
- Job #1: MINIMAL fine-tuning on codeforces-cots
14
- Strategy: Very few steps (300 max) + low LR to add reasoning without losing Python
15
- """
16
-
17
- import trackio
18
- from datasets import load_dataset
19
- from peft import LoraConfig
20
- from trl import SFTTrainer, SFTConfig
21
- from transformers import AutoTokenizer
22
-
23
- print("Loading codeforces-cots dataset...")
24
- dataset = load_dataset("open-r1/codeforces-cots", split="train")
25
- print(f"Total examples: {len(dataset)}")
26
-
27
- # Shuffle and use subset for faster training
28
- dataset = dataset.shuffle(seed=42).select(range(min(5000, len(dataset))))
29
- print(f"Using {len(dataset)} examples")
30
-
31
- # Split for eval
32
- dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
33
- train_dataset = dataset_split["train"]
34
- eval_dataset = dataset_split["test"]
35
- print(f"Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")
36
-
37
- # Load tokenizer to check chat template
38
- tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)
39
-
40
- # Training config - MINIMAL training to avoid overwriting Python knowledge
41
- config = SFTConfig(
42
- output_dir="qwen3-codeforces-job1",
43
- push_to_hub=True,
44
- hub_model_id="passagereptile455/qwen3-0.6b-humaneval-job1",
45
- hub_strategy="every_save",
46
- # CRITICAL: Minimal training
47
- max_steps=300, # Only 300 steps, not epochs
48
- per_device_train_batch_size=2,
49
- gradient_accumulation_steps=4,
50
- learning_rate=5e-6, # Very low LR
51
- max_length=1024,
52
- # Logging
53
- logging_steps=25,
54
- save_strategy="steps",
55
- save_steps=100,
56
- save_total_limit=2,
57
- # Eval
58
- eval_strategy="steps",
59
- eval_steps=100,
60
- # Optimization
61
- warmup_ratio=0.1,
62
- lr_scheduler_type="cosine",
63
- gradient_checkpointing=True,
64
- bf16=True,
65
- # Monitoring
66
- report_to="trackio",
67
- project="qwen3-humaneval-challenge",
68
- run_name="job1-minimal-300steps",
69
- )
70
-
71
- # LoRA config - conservative settings
72
- peft_config = LoraConfig(
73
- r=8, # Lower rank for less change
74
- lora_alpha=16,
75
- lora_dropout=0.05,
76
- bias="none",
77
- task_type="CAUSAL_LM",
78
- target_modules=["q_proj", "v_proj"],
79
- )
80
-
81
- print("Initializing trainer...")
82
- trainer = SFTTrainer(
83
- model="Qwen/Qwen3-0.6B",
84
- train_dataset=train_dataset,
85
- eval_dataset=eval_dataset,
86
- args=config,
87
- peft_config=peft_config,
88
- )
89
-
90
- print("Starting minimal training (300 steps)...")
91
- trainer.train()
92
-
93
- print("Pushing to Hub...")
94
- trainer.push_to_hub()
95
-
96
- print("Job 1 complete!")
97
- print("Model: https://huggingface.co/passagereptile455/qwen3-0.6b-humaneval-job1")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_job1_v2.py DELETED
@@ -1,120 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "trl>=0.12.0",
4
- # "peft>=0.7.0",
5
- # "transformers>=4.36.0",
6
- # "accelerate>=0.24.0",
7
- # "trackio",
8
- # "datasets",
9
- # ]
10
- # ///
11
-
12
- """
13
- Job #1 v2: MINIMAL fine-tuning on codeforces-cots
14
- Fixed: Use iterative loading to avoid memory issues
15
- """
16
-
17
- import trackio
18
- from datasets import load_dataset
19
- from peft import LoraConfig
20
- from trl import SFTTrainer, SFTConfig
21
- from transformers import AutoTokenizer
22
- import torch
23
-
24
- print("=" * 50)
25
- print("JOB 1 v2: Minimal fine-tuning")
26
- print("=" * 50)
27
-
28
- # Load dataset with streaming first to check
29
- print("Loading dataset (streaming to count)...")
30
- try:
31
- ds_stream = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
32
- count = 0
33
- for _ in ds_stream:
34
- count += 1
35
- if count >= 3000: # Just use first 3000
36
- break
37
- print(f"Dataset accessible, using {count} examples")
38
- except Exception as e:
39
- print(f"Error loading dataset: {e}")
40
- raise
41
-
42
- # Now load non-streaming but limited
43
- print("Loading dataset subset...")
44
- dataset = load_dataset("open-r1/codeforces-cots", split="train")
45
- dataset = dataset.shuffle(seed=42).select(range(min(3000, len(dataset))))
46
- print(f"Loaded {len(dataset)} examples")
47
-
48
- # Split
49
- dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
50
- train_dataset = dataset_split["train"]
51
- eval_dataset = dataset_split["test"]
52
- print(f"Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")
53
-
54
- # Tokenizer
55
- print("Loading tokenizer...")
56
- tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)
57
- if tokenizer.pad_token is None:
58
- tokenizer.pad_token = tokenizer.eos_token
59
- print(f"Tokenizer loaded, pad_token: {tokenizer.pad_token}")
60
-
61
- # Training config
62
- print("Setting up training config...")
63
- config = SFTConfig(
64
- output_dir="qwen3-codeforces-job1",
65
- push_to_hub=True,
66
- hub_model_id="passagereptile455/qwen3-0.6b-humaneval-job1",
67
- hub_strategy="every_save",
68
- # Minimal training
69
- max_steps=300,
70
- per_device_train_batch_size=1,
71
- gradient_accumulation_steps=8,
72
- learning_rate=5e-6,
73
- max_length=512, # Shorter for memory
74
- # Logging
75
- logging_steps=25,
76
- save_strategy="steps",
77
- save_steps=100,
78
- save_total_limit=2,
79
- # Skip eval to save memory
80
- eval_strategy="no",
81
- # Optimization
82
- warmup_ratio=0.1,
83
- lr_scheduler_type="cosine",
84
- gradient_checkpointing=True,
85
- bf16=True,
86
- # Monitoring
87
- report_to="trackio",
88
- project="qwen3-humaneval-challenge",
89
- run_name="job1-minimal-v2",
90
- )
91
-
92
- # LoRA config
93
- peft_config = LoraConfig(
94
- r=8,
95
- lora_alpha=16,
96
- lora_dropout=0.05,
97
- bias="none",
98
- task_type="CAUSAL_LM",
99
- target_modules=["q_proj", "v_proj"],
100
- )
101
-
102
- print("Initializing trainer...")
103
- trainer = SFTTrainer(
104
- model="Qwen/Qwen3-0.6B",
105
- train_dataset=train_dataset,
106
- args=config,
107
- peft_config=peft_config,
108
- )
109
-
110
- print("Starting training...")
111
- print(f"Total steps: {config.max_steps}")
112
- trainer.train()
113
-
114
- print("Pushing to Hub...")
115
- trainer.push_to_hub()
116
-
117
- print("=" * 50)
118
- print("JOB 1 COMPLETE!")
119
- print("Model: https://huggingface.co/passagereptile455/qwen3-0.6b-humaneval-job1")
120
- print("=" * 50)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_job1_v3.py DELETED
@@ -1,119 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "trl>=0.12.0",
4
- # "peft>=0.7.0",
5
- # "transformers>=4.36.0",
6
- # "accelerate>=0.24.0",
7
- # "trackio",
8
- # "datasets",
9
- # ]
10
- # ///
11
-
12
- """
13
- Job #1 v3: Simplified training script
14
- """
15
-
16
- import os
17
- import sys
18
- import traceback
19
-
20
-
21
- def main():
22
- print("=" * 50)
23
- print("JOB 1 v3: Starting...")
24
- print("=" * 50)
25
-
26
- try:
27
- import trackio
28
- from datasets import load_dataset
29
- from peft import LoraConfig
30
- from trl import SFTTrainer, SFTConfig
31
- from transformers import AutoTokenizer
32
- import torch
33
-
34
- print(f"PyTorch version: {torch.__version__}")
35
- print(f"CUDA available: {torch.cuda.is_available()}")
36
- if torch.cuda.is_available():
37
- print(f"GPU: {torch.cuda.get_device_name(0)}")
38
-
39
- # Load dataset - use trust_remote_code in case needed
40
- print("Loading codeforces-cots dataset...")
41
- dataset = load_dataset(
42
- "open-r1/codeforces-cots", split="train", trust_remote_code=True
43
- )
44
- print(f"Dataset loaded: {len(dataset)} total examples")
45
-
46
- # Use small subset
47
- dataset = dataset.shuffle(seed=42).select(range(min(2000, len(dataset))))
48
- print(f"Using: {len(dataset)} examples")
49
-
50
- # Tokenizer
51
- print("Loading tokenizer...")
52
- tokenizer = AutoTokenizer.from_pretrained(
53
- "Qwen/Qwen3-0.6B", trust_remote_code=True
54
- )
55
- if tokenizer.pad_token is None:
56
- tokenizer.pad_token = tokenizer.eos_token
57
-
58
- # Config
59
- print("Setting up config...")
60
- config = SFTConfig(
61
- output_dir="qwen3-codeforces-job1",
62
- push_to_hub=True,
63
- hub_model_id="passagereptile455/qwen3-0.6b-humaneval-job1",
64
- hub_strategy="every_save",
65
- max_steps=200, # Even fewer steps
66
- per_device_train_batch_size=1,
67
- gradient_accumulation_steps=8,
68
- learning_rate=5e-6,
69
- max_length=512,
70
- logging_steps=20,
71
- save_strategy="steps",
72
- save_steps=100,
73
- save_total_limit=1,
74
- eval_strategy="no",
75
- warmup_ratio=0.1,
76
- lr_scheduler_type="cosine",
77
- gradient_checkpointing=True,
78
- bf16=True,
79
- report_to="trackio",
80
- project="qwen3-humaneval",
81
- run_name="job1-v3",
82
- )
83
-
84
- # LoRA
85
- peft_config = LoraConfig(
86
- r=8,
87
- lora_alpha=16,
88
- lora_dropout=0.05,
89
- bias="none",
90
- task_type="CAUSAL_LM",
91
- target_modules=["q_proj", "v_proj"],
92
- )
93
-
94
- print("Creating trainer...")
95
- trainer = SFTTrainer(
96
- model="Qwen/Qwen3-0.6B",
97
- train_dataset=dataset,
98
- args=config,
99
- peft_config=peft_config,
100
- )
101
-
102
- print("Starting training (200 steps)...")
103
- trainer.train()
104
-
105
- print("Pushing to Hub...")
106
- trainer.push_to_hub()
107
-
108
- print("=" * 50)
109
- print("SUCCESS!")
110
- print("=" * 50)
111
-
112
- except Exception as e:
113
- print(f"ERROR: {e}")
114
- traceback.print_exc()
115
- sys.exit(1)
116
-
117
-
118
- if __name__ == "__main__":
119
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_job1_v4.py DELETED
@@ -1,100 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "trl>=0.12.0",
4
- # "peft>=0.7.0",
5
- # "transformers>=4.36.0",
6
- # "accelerate>=0.24.0",
7
- # "trackio",
8
- # "datasets",
9
- # ]
10
- # ///
11
-
12
- """
13
- Job #1 v4: Simple training script - no trust_remote_code
14
- """
15
-
16
- import sys
17
- import traceback
18
-
19
-
20
- def main():
21
- print("=" * 50)
22
- print("JOB 1 v4")
23
- print("=" * 50)
24
-
25
- try:
26
- from datasets import load_dataset
27
- from peft import LoraConfig
28
- from trl import SFTTrainer, SFTConfig
29
- import torch
30
-
31
- print(f"PyTorch: {torch.__version__}")
32
- print(f"CUDA: {torch.cuda.is_available()}")
33
-
34
- # Load dataset WITHOUT trust_remote_code
35
- print("Loading dataset...")
36
- dataset = load_dataset("open-r1/codeforces-cots", split="train")
37
- print(f"Total: {len(dataset)}")
38
-
39
- # Small subset
40
- dataset = dataset.shuffle(seed=42).select(range(1000))
41
- print(f"Using: {len(dataset)} examples")
42
-
43
- # Config
44
- config = SFTConfig(
45
- output_dir="qwen3-job1",
46
- push_to_hub=True,
47
- hub_model_id="passagereptile455/qwen3-0.6b-humaneval-job1",
48
- hub_strategy="every_save",
49
- max_steps=200,
50
- per_device_train_batch_size=1,
51
- gradient_accumulation_steps=8,
52
- learning_rate=5e-6,
53
- max_length=512,
54
- logging_steps=20,
55
- save_strategy="steps",
56
- save_steps=100,
57
- save_total_limit=1,
58
- eval_strategy="no",
59
- warmup_ratio=0.1,
60
- lr_scheduler_type="cosine",
61
- gradient_checkpointing=True,
62
- bf16=True,
63
- report_to="trackio",
64
- project="qwen3-humaneval",
65
- run_name="job1-v4",
66
- )
67
-
68
- peft_config = LoraConfig(
69
- r=8,
70
- lora_alpha=16,
71
- lora_dropout=0.05,
72
- bias="none",
73
- task_type="CAUSAL_LM",
74
- target_modules=["q_proj", "v_proj"],
75
- )
76
-
77
- print("Creating trainer...")
78
- trainer = SFTTrainer(
79
- model="Qwen/Qwen3-0.6B",
80
- train_dataset=dataset,
81
- args=config,
82
- peft_config=peft_config,
83
- )
84
-
85
- print("Training...")
86
- trainer.train()
87
-
88
- print("Pushing to Hub...")
89
- trainer.push_to_hub()
90
-
91
- print("SUCCESS!")
92
-
93
- except Exception as e:
94
- print(f"ERROR: {e}")
95
- traceback.print_exc()
96
- sys.exit(1)
97
-
98
-
99
- if __name__ == "__main__":
100
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_job2.py DELETED
@@ -1,112 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "trl>=0.12.0",
4
- # "peft>=0.7.0",
5
- # "transformers>=4.36.0",
6
- # "accelerate>=0.24.0",
7
- # "trackio",
8
- # "datasets",
9
- # ]
10
- # ///
11
-
12
- """
13
- Job 2: Fixed SFT training - properly handle messages format
14
- """
15
-
16
- from datasets import load_dataset
17
- from peft import LoraConfig
18
- from trl import SFTTrainer, SFTConfig
19
- from transformers import AutoTokenizer
20
-
21
- print("Loading dataset: open-r1/codeforces-cots")
22
- dataset = load_dataset("open-r1/codeforces-cots", split="train")
23
- print(f"Dataset loaded: {len(dataset)} examples")
24
-
25
- # Use subset for faster training
26
- dataset = dataset.shuffle(seed=42).select(range(min(5000, len(dataset))))
27
- print(f"Using {len(dataset)} examples")
28
-
29
- # Load tokenizer to apply chat template
30
- model_name = "Qwen/Qwen3-0.6B"
31
- tokenizer = AutoTokenizer.from_pretrained(model_name)
32
-
33
- # Convert messages to text using chat template
34
- def format_example(example):
35
- messages = example["messages"]
36
- # Apply chat template to convert messages to text
37
- text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
38
- return {"text": text}
39
-
40
- print("Formatting dataset with chat template...")
41
- dataset = dataset.map(format_example, remove_columns=dataset.column_names)
42
- print(f"Formatted {len(dataset)} examples")
43
-
44
- # Split
45
- dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
46
- train_dataset = dataset_split["train"]
47
- eval_dataset = dataset_split["test"]
48
- print(f"Train: {len(train_dataset)} | Eval: {len(eval_dataset)}")
49
-
50
- config = SFTConfig(
51
- output_dir="qwen3-codeforces-sft-job2",
52
- push_to_hub=True,
53
- hub_model_id="passagereptile455/qwen3-0.6b-codeforces-sft-job2",
54
- hub_strategy="every_save",
55
-
56
- # Use text field we created
57
- dataset_text_field="text",
58
-
59
- # Training params
60
- num_train_epochs=2,
61
- per_device_train_batch_size=2,
62
- gradient_accumulation_steps=8,
63
- learning_rate=2e-4,
64
- max_length=2048,
65
-
66
- # Logging
67
- logging_steps=10,
68
- save_strategy="steps",
69
- save_steps=200,
70
- save_total_limit=2,
71
-
72
- # Eval
73
- eval_strategy="steps",
74
- eval_steps=200,
75
-
76
- # Optimization
77
- warmup_ratio=0.1,
78
- lr_scheduler_type="cosine",
79
- gradient_checkpointing=True,
80
-
81
- # Monitoring
82
- report_to="trackio",
83
- project="qwen3-humaneval-challenge",
84
- run_name="job2-fixed-format",
85
- )
86
-
87
- peft_config = LoraConfig(
88
- r=32,
89
- lora_alpha=64,
90
- lora_dropout=0.05,
91
- bias="none",
92
- task_type="CAUSAL_LM",
93
- target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
94
- )
95
-
96
- print("Initializing trainer...")
97
- trainer = SFTTrainer(
98
- model=model_name,
99
- train_dataset=train_dataset,
100
- eval_dataset=eval_dataset,
101
- args=config,
102
- peft_config=peft_config,
103
- )
104
-
105
- print("Starting training...")
106
- trainer.train()
107
-
108
- print("Pushing to Hub...")
109
- trainer.push_to_hub()
110
-
111
- print("Job 2 complete!")
112
- print("Model: https://huggingface.co/passagereptile455/qwen3-0.6b-codeforces-sft-job2")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_job2_v2.py DELETED
@@ -1,162 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "trl>=0.12.0",
4
- # "peft>=0.7.0",
5
- # "transformers>=4.36.0",
6
- # "accelerate>=0.24.0",
7
- # "datasets",
8
- # ]
9
- # ///
10
-
11
- """
12
- Job 2: Ultra-conservative training - filter C++, minimal steps
13
- """
14
-
15
- import sys
16
- import traceback
17
- from datasets import load_dataset, Dataset
18
- from peft import LoraConfig
19
- from trl import SFTTrainer, SFTConfig
20
- from transformers import AutoTokenizer
21
- import torch
22
-
23
-
24
- def log(msg):
25
- print(msg, flush=True)
26
-
27
-
28
- log("=" * 60)
29
- log("TRAINING JOB 2 - Ultra-conservative approach")
30
- log("=" * 60)
31
-
32
- try:
33
- log(f"CUDA: {torch.cuda.is_available()}")
34
- if torch.cuda.is_available():
35
- log(f"GPU: {torch.cuda.get_device_name(0)}")
36
-
37
- log("Streaming codeforces-cots...")
38
- streaming_ds = load_dataset(
39
- "open-r1/codeforces-cots", split="train", streaming=True
40
- )
41
-
42
- log("Collecting examples (aggressive C++ filtering)...")
43
- examples = []
44
- total_seen = 0
45
- skipped_cpp = 0
46
-
47
- cpp_markers = [
48
- "#include",
49
- "cout",
50
- "cin",
51
- "vector<",
52
- "int main",
53
- "iostream",
54
- "using namespace std",
55
- "printf",
56
- "scanf",
57
- "long long",
58
- ]
59
-
60
- for ex in streaming_ds:
61
- total_seen += 1
62
- if len(examples) >= 500: # Only 500 examples
63
- break
64
- if total_seen > 20000: # Don't scan forever
65
- break
66
-
67
- messages = ex.get("messages", [])
68
- content = ""
69
- for msg in messages:
70
- content += str(msg.get("content", "")).lower()
71
-
72
- # Skip if ANY C++ marker present
73
- has_cpp = any(m.lower() in content for m in cpp_markers)
74
-
75
- if has_cpp:
76
- skipped_cpp += 1
77
- continue
78
-
79
- examples.append(ex)
80
-
81
- if len(examples) % 100 == 0:
82
- log(
83
- f" Collected {len(examples)} (seen {total_seen}, skipped {skipped_cpp} C++)"
84
- )
85
-
86
- log(f"Final: {len(examples)} examples from {total_seen} seen")
87
-
88
- if len(examples) < 100:
89
- log("WARNING: Very few non-C++ examples found!")
90
-
91
- dataset = Dataset.from_list(examples)
92
-
93
- log("Loading tokenizer...")
94
- tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)
95
- if tokenizer.pad_token is None:
96
- tokenizer.pad_token = tokenizer.eos_token
97
-
98
- def format_messages(example):
99
- messages = example["messages"]
100
- text = ""
101
- for msg in messages:
102
- role = msg.get("role", "user")
103
- content = msg.get("content", "")
104
- text += f"<|{role}|>\n{content}\n"
105
- return {"text": text}
106
-
107
- log("Formatting dataset...")
108
- dataset = dataset.map(format_messages, remove_columns=dataset.column_names)
109
-
110
- config = SFTConfig(
111
- output_dir="qwen3-job2",
112
- push_to_hub=True,
113
- hub_model_id="passagereptile455/qwen3-0.6b-humaneval-job2",
114
- hub_strategy="every_save",
115
- max_steps=100, # Very few steps
116
- per_device_train_batch_size=1,
117
- gradient_accumulation_steps=4,
118
- learning_rate=1e-6, # Extremely low LR
119
- max_length=512,
120
- logging_steps=20,
121
- save_strategy="steps",
122
- save_steps=50,
123
- save_total_limit=1,
124
- eval_strategy="no",
125
- warmup_ratio=0.1,
126
- lr_scheduler_type="cosine",
127
- gradient_checkpointing=True,
128
- bf16=True,
129
- dataset_text_field="text",
130
- )
131
-
132
- peft_config = LoraConfig(
133
- r=4, # Very small rank
134
- lora_alpha=8,
135
- lora_dropout=0.0,
136
- bias="none",
137
- task_type="CAUSAL_LM",
138
- target_modules=["q_proj", "v_proj"], # Minimal modules
139
- )
140
-
141
- log("Creating trainer...")
142
- trainer = SFTTrainer(
143
- model="Qwen/Qwen3-0.6B",
144
- train_dataset=dataset,
145
- args=config,
146
- peft_config=peft_config,
147
- )
148
-
149
- log("Training (100 steps, 1e-6 LR)...")
150
- trainer.train()
151
-
152
- log("Pushing to Hub...")
153
- trainer.push_to_hub()
154
-
155
- log("=" * 60)
156
- log("SUCCESS! Model: passagereptile455/qwen3-0.6b-humaneval-job2")
157
- log("=" * 60)
158
-
159
- except Exception as e:
160
- log(f"ERROR: {e}")
161
- traceback.print_exc()
162
- sys.exit(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_job2_v3.py DELETED
@@ -1,123 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "trl>=0.12.0",
4
- # "peft>=0.7.0",
5
- # "transformers>=4.36.0",
6
- # "accelerate>=0.24.0",
7
- # "datasets",
8
- # ]
9
- # ///
10
-
11
- """
12
- Job 2 v3: No filtering, ultra-minimal training (50 steps)
13
- """
14
-
15
- import sys
16
- import traceback
17
- from datasets import load_dataset, Dataset
18
- from peft import LoraConfig
19
- from trl import SFTTrainer, SFTConfig
20
- from transformers import AutoTokenizer
21
- import torch
22
-
23
-
24
- def log(msg):
25
- print(msg, flush=True)
26
-
27
-
28
- log("=" * 60)
29
- log("TRAINING JOB 2 v3 - Ultra-minimal, no filtering")
30
- log("=" * 60)
31
-
32
- try:
33
- log(f"CUDA: {torch.cuda.is_available()}")
34
- if torch.cuda.is_available():
35
- log(f"GPU: {torch.cuda.get_device_name(0)}")
36
-
37
- log("Streaming codeforces-cots...")
38
- streaming_ds = load_dataset(
39
- "open-r1/codeforces-cots", split="train", streaming=True
40
- )
41
-
42
- log("Collecting 300 examples (no filtering)...")
43
- examples = []
44
- for i, ex in enumerate(streaming_ds):
45
- if i >= 300:
46
- break
47
- examples.append(ex)
48
- if (i + 1) % 100 == 0:
49
- log(f" Collected {i + 1}")
50
-
51
- log(f"Final: {len(examples)} examples")
52
- dataset = Dataset.from_list(examples)
53
-
54
- log("Loading tokenizer...")
55
- tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)
56
- if tokenizer.pad_token is None:
57
- tokenizer.pad_token = tokenizer.eos_token
58
-
59
- def format_messages(example):
60
- messages = example["messages"]
61
- text = ""
62
- for msg in messages:
63
- role = msg.get("role", "user")
64
- content = msg.get("content", "")
65
- text += f"<|{role}|>\n{content}\n"
66
- return {"text": text}
67
-
68
- log("Formatting dataset...")
69
- dataset = dataset.map(format_messages, remove_columns=dataset.column_names)
70
-
71
- config = SFTConfig(
72
- output_dir="qwen3-job2",
73
- push_to_hub=True,
74
- hub_model_id="passagereptile455/qwen3-0.6b-humaneval-job2",
75
- hub_strategy="every_save",
76
- max_steps=50, # ULTRA minimal
77
- per_device_train_batch_size=1,
78
- gradient_accumulation_steps=4,
79
- learning_rate=5e-7, # Extremely low
80
- max_length=512,
81
- logging_steps=10,
82
- save_strategy="steps",
83
- save_steps=50,
84
- save_total_limit=1,
85
- eval_strategy="no",
86
- warmup_ratio=0.1,
87
- lr_scheduler_type="cosine",
88
- gradient_checkpointing=True,
89
- bf16=True,
90
- dataset_text_field="text",
91
- )
92
-
93
- peft_config = LoraConfig(
94
- r=4,
95
- lora_alpha=8,
96
- lora_dropout=0.0,
97
- bias="none",
98
- task_type="CAUSAL_LM",
99
- target_modules=["q_proj", "v_proj"],
100
- )
101
-
102
- log("Creating trainer...")
103
- trainer = SFTTrainer(
104
- model="Qwen/Qwen3-0.6B",
105
- train_dataset=dataset,
106
- args=config,
107
- peft_config=peft_config,
108
- )
109
-
110
- log("Training (50 steps, 5e-7 LR)...")
111
- trainer.train()
112
-
113
- log("Pushing to Hub...")
114
- trainer.push_to_hub()
115
-
116
- log("=" * 60)
117
- log("SUCCESS! Model: passagereptile455/qwen3-0.6b-humaneval-job2")
118
- log("=" * 60)
119
-
120
- except Exception as e:
121
- log(f"ERROR: {e}")
122
- traceback.print_exc()
123
- sys.exit(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_job3.py DELETED
@@ -1,104 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "trl>=0.12.0",
4
- # "peft>=0.7.0",
5
- # "transformers>=4.36.0",
6
- # "accelerate>=0.24.0",
7
- # "trackio",
8
- # "datasets",
9
- # ]
10
- # ///
11
-
12
- """
13
- Job 3: Memory-optimized SFT training
14
- - Reduced batch size to 1
15
- - Increased gradient accumulation to 16
16
- - Reduced max_length to 1024
17
- """
18
-
19
- from datasets import load_dataset
20
- from peft import LoraConfig
21
- from trl import SFTTrainer, SFTConfig
22
- from transformers import AutoTokenizer
23
-
24
- print("Loading dataset: open-r1/codeforces-cots")
25
- dataset = load_dataset("open-r1/codeforces-cots", split="train")
26
- print(f"Dataset loaded: {len(dataset)} examples")
27
-
28
- dataset = dataset.shuffle(seed=42).select(range(min(5000, len(dataset))))
29
- print(f"Using {len(dataset)} examples")
30
-
31
- model_name = "Qwen/Qwen3-0.6B"
32
- tokenizer = AutoTokenizer.from_pretrained(model_name)
33
-
34
- def format_example(example):
35
- messages = example["messages"]
36
- text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
37
- return {"text": text}
38
-
39
- print("Formatting dataset...")
40
- dataset = dataset.map(format_example, remove_columns=dataset.column_names)
41
-
42
- dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
43
- train_dataset = dataset_split["train"]
44
- eval_dataset = dataset_split["test"]
45
- print(f"Train: {len(train_dataset)} | Eval: {len(eval_dataset)}")
46
-
47
- config = SFTConfig(
48
- output_dir="qwen3-codeforces-sft-job3",
49
- push_to_hub=True,
50
- hub_model_id="passagereptile455/qwen3-0.6b-codeforces-sft-job3",
51
- hub_strategy="every_save",
52
-
53
- dataset_text_field="text",
54
-
55
- # MEMORY OPTIMIZED
56
- num_train_epochs=2,
57
- per_device_train_batch_size=1, # Reduced from 2
58
- gradient_accumulation_steps=16, # Increased from 8
59
- learning_rate=2e-4,
60
- max_length=1024, # Reduced from 2048
61
-
62
- logging_steps=10,
63
- save_strategy="steps",
64
- save_steps=100,
65
- save_total_limit=2,
66
-
67
- eval_strategy="steps",
68
- eval_steps=100,
69
-
70
- warmup_ratio=0.1,
71
- lr_scheduler_type="cosine",
72
- gradient_checkpointing=True,
73
- bf16=True, # Use bfloat16 for memory efficiency
74
-
75
- report_to="trackio",
76
- project="qwen3-humaneval-challenge",
77
- run_name="job3-memory-optimized",
78
- )
79
-
80
- peft_config = LoraConfig(
81
- r=16, # Reduced from 32
82
- lora_alpha=32, # Reduced from 64
83
- lora_dropout=0.05,
84
- bias="none",
85
- task_type="CAUSAL_LM",
86
- target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], # Fewer modules
87
- )
88
-
89
- print("Initializing trainer...")
90
- trainer = SFTTrainer(
91
- model=model_name,
92
- train_dataset=train_dataset,
93
- eval_dataset=eval_dataset,
94
- args=config,
95
- peft_config=peft_config,
96
- )
97
-
98
- print("Starting training...")
99
- trainer.train()
100
-
101
- print("Pushing to Hub...")
102
- trainer.push_to_hub()
103
-
104
- print("Job 3 complete!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_job4.py DELETED
@@ -1,105 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "trl>=0.12.0",
4
- # "peft>=0.7.0",
5
- # "transformers>=4.36.0",
6
- # "accelerate>=0.24.0",
7
- # "trackio",
8
- # "datasets",
9
- # ]
10
- # ///
11
-
12
- """
13
- Job 4: Train on Python code instructions dataset
14
- """
15
-
16
- import os
17
- from huggingface_hub import login
18
-
19
- # Explicitly login with token from environment
20
- token = os.environ.get("HF_TOKEN")
21
- if token:
22
- login(token=token)
23
- print("Logged in to HF Hub")
24
- else:
25
- print("Warning: HF_TOKEN not found")
26
-
27
- from datasets import load_dataset
28
- from peft import LoraConfig
29
- from trl import SFTTrainer, SFTConfig
30
- from transformers import AutoTokenizer
31
-
32
- print("Loading dataset: iamtarun/python_code_instructions_18k_alpaca")
33
- dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
34
- print(f"Dataset loaded: {len(dataset)} examples")
35
-
36
- dataset = dataset.shuffle(seed=42).select(range(min(10000, len(dataset))))
37
- print(f"Using {len(dataset)} examples")
38
-
39
- model_name = "Qwen/Qwen3-0.6B"
40
- tokenizer = AutoTokenizer.from_pretrained(model_name)
41
-
42
-
43
- def format_example(example):
44
- return {"text": example["prompt"]}
45
-
46
-
47
- print("Formatting dataset...")
48
- dataset = dataset.map(format_example, remove_columns=dataset.column_names)
49
-
50
- dataset_split = dataset.train_test_split(test_size=0.05, seed=42)
51
- train_dataset = dataset_split["train"]
52
- eval_dataset = dataset_split["test"]
53
- print(f"Train: {len(train_dataset)} | Eval: {len(eval_dataset)}")
54
-
55
- config = SFTConfig(
56
- output_dir="qwen3-python-code-sft-job4",
57
- push_to_hub=True,
58
- hub_model_id="passagereptile455/qwen3-0.6b-python-code-sft-job4",
59
- hub_strategy="every_save",
60
- dataset_text_field="text",
61
- num_train_epochs=3,
62
- per_device_train_batch_size=2,
63
- gradient_accumulation_steps=8,
64
- learning_rate=1e-4,
65
- max_length=512,
66
- logging_steps=20,
67
- save_strategy="steps",
68
- save_steps=200,
69
- save_total_limit=2,
70
- eval_strategy="steps",
71
- eval_steps=200,
72
- warmup_ratio=0.1,
73
- lr_scheduler_type="cosine",
74
- gradient_checkpointing=True,
75
- bf16=True,
76
- report_to="trackio",
77
- project="qwen3-humaneval-challenge",
78
- run_name="job4-python-instructions",
79
- )
80
-
81
- peft_config = LoraConfig(
82
- r=16,
83
- lora_alpha=32,
84
- lora_dropout=0.05,
85
- bias="none",
86
- task_type="CAUSAL_LM",
87
- target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
88
- )
89
-
90
- print("Initializing trainer...")
91
- trainer = SFTTrainer(
92
- model=model_name,
93
- train_dataset=train_dataset,
94
- eval_dataset=eval_dataset,
95
- args=config,
96
- peft_config=peft_config,
97
- )
98
-
99
- print("Starting training...")
100
- trainer.train()
101
-
102
- print("Pushing to Hub...")
103
- trainer.push_to_hub()
104
-
105
- print("Job 4 complete!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_job4_v2.py DELETED
@@ -1,60 +0,0 @@
1
- # /// script
2
- # dependencies = ["trl>=0.12.0", "peft>=0.7.0", "transformers>=4.36.0", "accelerate>=0.24.0", "trackio", "datasets"]
3
- # ///
4
- from datasets import load_dataset
5
- from peft import LoraConfig
6
- from trl import SFTTrainer, SFTConfig
7
- from transformers import AutoTokenizer
8
-
9
- print("Loading dataset")
10
- dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
11
- dataset = dataset.shuffle(seed=42).select(range(10000))
12
- print(f"Using {len(dataset)} examples")
13
-
14
- model_name = "Qwen/Qwen3-0.6B"
15
- tokenizer = AutoTokenizer.from_pretrained(model_name)
16
-
17
-
18
- def format_fn(ex):
19
- return {"text": ex["prompt"]}
20
-
21
-
22
- dataset = dataset.map(format_fn, remove_columns=dataset.column_names)
23
- splits = dataset.train_test_split(test_size=0.05, seed=42)
24
- train_ds, eval_ds = splits["train"], splits["test"]
25
- print(f"Train: {len(train_ds)} Eval: {len(eval_ds)}")
26
-
27
- config = SFTConfig(
28
- output_dir="qwen3-python-sft",
29
- push_to_hub=True,
30
- hub_model_id="passagereptile455/qwen3-0.6b-python-code-sft-job4",
31
- dataset_text_field="text",
32
- num_train_epochs=3,
33
- per_device_train_batch_size=2,
34
- gradient_accumulation_steps=8,
35
- learning_rate=1e-4,
36
- max_length=512,
37
- logging_steps=20,
38
- save_strategy="epoch",
39
- warmup_ratio=0.1,
40
- gradient_checkpointing=True,
41
- bf16=True,
42
- report_to="trackio",
43
- run_name="job4-python",
44
- )
45
-
46
- peft_config = LoraConfig(
47
- r=16, lora_alpha=32, target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
48
- )
49
-
50
- print("Starting training...")
51
- trainer = SFTTrainer(
52
- model=model_name,
53
- train_dataset=train_ds,
54
- eval_dataset=eval_ds,
55
- args=config,
56
- peft_config=peft_config,
57
- )
58
- trainer.train()
59
- trainer.push_to_hub()
60
- print("Done!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_job5.py DELETED
@@ -1,105 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "trl>=0.12.0",
4
- # "peft>=0.7.0",
5
- # "transformers>=4.36.0",
6
- # "accelerate>=0.24.0",
7
- # "trackio",
8
- # "datasets",
9
- # ]
10
- # ///
11
-
12
- """
13
- Job 4: Train on Python code instructions dataset
14
- """
15
-
16
- import os
17
- from huggingface_hub import login
18
-
19
- # Explicitly login with token from environment
20
- token = os.environ.get("HF_TOKEN")
21
- if token:
22
- login(token=token)
23
- print("Logged in to HF Hub")
24
- else:
25
- print("Warning: HF_TOKEN not found")
26
-
27
- from datasets import load_dataset
28
- from peft import LoraConfig
29
- from trl import SFTTrainer, SFTConfig
30
- from transformers import AutoTokenizer
31
-
32
- print("Loading dataset: iamtarun/python_code_instructions_18k_alpaca")
33
- dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
34
- print(f"Dataset loaded: {len(dataset)} examples")
35
-
36
- dataset = dataset.shuffle(seed=42).select(range(min(10000, len(dataset))))
37
- print(f"Using {len(dataset)} examples")
38
-
39
- model_name = "Qwen/Qwen3-0.6B"
40
- tokenizer = AutoTokenizer.from_pretrained(model_name)
41
-
42
-
43
- def format_example(example):
44
- return {"text": example["prompt"]}
45
-
46
-
47
- print("Formatting dataset...")
48
- dataset = dataset.map(format_example, remove_columns=dataset.column_names)
49
-
50
- dataset_split = dataset.train_test_split(test_size=0.05, seed=42)
51
- train_dataset = dataset_split["train"]
52
- eval_dataset = dataset_split["test"]
53
- print(f"Train: {len(train_dataset)} | Eval: {len(eval_dataset)}")
54
-
55
- config = SFTConfig(
56
- output_dir="qwen3-python-code-sft-job4",
57
- push_to_hub=True,
58
- hub_model_id="passagereptile455/qwen3-0.6b-python-code-sft-job4",
59
- hub_strategy="every_save",
60
- dataset_text_field="text",
61
- num_train_epochs=3,
62
- per_device_train_batch_size=2,
63
- gradient_accumulation_steps=8,
64
- learning_rate=1e-4,
65
- max_length=512,
66
- logging_steps=20,
67
- save_strategy="steps",
68
- save_steps=200,
69
- save_total_limit=2,
70
- eval_strategy="steps",
71
- eval_steps=200,
72
- warmup_ratio=0.1,
73
- lr_scheduler_type="cosine",
74
- gradient_checkpointing=True,
75
- bf16=True,
76
- report_to="trackio",
77
- project="qwen3-humaneval-challenge",
78
- run_name="job4-python-instructions",
79
- )
80
-
81
- peft_config = LoraConfig(
82
- r=16,
83
- lora_alpha=32,
84
- lora_dropout=0.05,
85
- bias="none",
86
- task_type="CAUSAL_LM",
87
- target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
88
- )
89
-
90
- print("Initializing trainer...")
91
- trainer = SFTTrainer(
92
- model=model_name,
93
- train_dataset=train_dataset,
94
- eval_dataset=eval_dataset,
95
- args=config,
96
- peft_config=peft_config,
97
- )
98
-
99
- print("Starting training...")
100
- trainer.train()
101
-
102
- print("Pushing to Hub...")
103
- trainer.push_to_hub()
104
-
105
- print("Job 4 complete!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_minimal.py DELETED
@@ -1,137 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.10"
3
- # dependencies = [
4
- # "torch",
5
- # "transformers",
6
- # "accelerate",
7
- # "datasets",
8
- # "trl",
9
- # "peft",
10
- # "bitsandbytes",
11
- # "huggingface_hub",
12
- # ]
13
- # ///
14
- """
15
- Minimal fine-tuning of Qwen3-0.6B on open-r1/codeforces-cots.
16
- Ultra-conservative training to avoid catastrophic forgetting.
17
- """
18
-
19
- import os
20
- from datasets import load_dataset
21
- from transformers import AutoModelForCausalLM, AutoTokenizer
22
- from peft import LoraConfig
23
- from trl import SFTConfig, SFTTrainer
24
- import torch
25
-
26
-
27
- def main():
28
- print("=" * 60)
29
- print("Minimal Fine-tuning: Qwen3-0.6B on codeforces-cots")
30
- print("=" * 60)
31
-
32
- model_name = "Qwen/Qwen3-0.6B"
33
- output_name = "passagereptile455/qwen3-codeforces-minimal"
34
-
35
- # Load tokenizer
36
- print("\nLoading tokenizer...")
37
- tokenizer = AutoTokenizer.from_pretrained(model_name)
38
- if tokenizer.pad_token is None:
39
- tokenizer.pad_token = tokenizer.eos_token
40
-
41
- # Load model
42
- print("Loading model...")
43
- model = AutoModelForCausalLM.from_pretrained(
44
- model_name,
45
- torch_dtype=torch.float16,
46
- device_map="auto",
47
- )
48
- print(f"Model loaded on {model.device}")
49
-
50
- # Load dataset with streaming to avoid memory issues
51
- print("\nLoading dataset (streaming)...")
52
- dataset = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
53
-
54
- # Take only 500 examples for minimal training
55
- dataset = dataset.take(500)
56
-
57
- # Convert to list for SFTTrainer
58
- print("Preparing examples...")
59
- examples = list(dataset)
60
- print(f"Loaded {len(examples)} examples")
61
-
62
- # Check format
63
- if examples:
64
- print(f"First example keys: {examples[0].keys()}")
65
- if "messages" in examples[0]:
66
- print(f"Messages format: {len(examples[0]['messages'])} messages")
67
-
68
- # LoRA config - very conservative
69
- lora_config = LoraConfig(
70
- r=8,
71
- lora_alpha=16,
72
- lora_dropout=0.05,
73
- target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
74
- bias="none",
75
- task_type="CAUSAL_LM",
76
- )
77
-
78
- # Training config - ultra conservative
79
- training_args = SFTConfig(
80
- output_dir="./output",
81
- max_steps=150,
82
- per_device_train_batch_size=2,
83
- gradient_accumulation_steps=4,
84
- learning_rate=5e-6,
85
- lr_scheduler_type="cosine",
86
- warmup_steps=10,
87
- logging_steps=10,
88
- save_steps=50,
89
- fp16=True,
90
- gradient_checkpointing=True,
91
- max_seq_length=2048,
92
- dataset_text_field=None, # We'll use messages format
93
- push_to_hub=True,
94
- hub_model_id=output_name,
95
- report_to="none",
96
- )
97
-
98
- # Create trainer
99
- print("\nInitializing trainer...")
100
-
101
- # Format function for messages
102
- def formatting_func(example):
103
- return tokenizer.apply_chat_template(
104
- example["messages"],
105
- tokenize=False,
106
- add_generation_prompt=False,
107
- )
108
-
109
- trainer = SFTTrainer(
110
- model=model,
111
- args=training_args,
112
- train_dataset=examples,
113
- peft_config=lora_config,
114
- processing_class=tokenizer,
115
- formatting_func=formatting_func,
116
- )
117
-
118
- # Train
119
- print("\n" + "=" * 60)
120
- print("Starting training...")
121
- print("=" * 60)
122
- trainer.train()
123
-
124
- # Save and push
125
- print("\nSaving model...")
126
- trainer.save_model()
127
-
128
- print("\nPushing to hub...")
129
- trainer.push_to_hub()
130
-
131
- print("\n" + "=" * 60)
132
- print(f"Training complete! Model saved to: {output_name}")
133
- print("=" * 60)
134
-
135
-
136
- if __name__ == "__main__":
137
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_minimal_v2.py DELETED
@@ -1,135 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.10"
3
- # dependencies = [
4
- # "torch",
5
- # "transformers",
6
- # "accelerate",
7
- # "datasets",
8
- # "trl>=0.12.0",
9
- # "peft",
10
- # "huggingface_hub",
11
- # ]
12
- # ///
13
- """
14
- Minimal fine-tuning of Qwen3-0.6B on open-r1/codeforces-cots.
15
- Ultra-conservative training to avoid catastrophic forgetting.
16
- """
17
-
18
- import os
19
- from datasets import load_dataset, Dataset
20
- from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
21
- from peft import LoraConfig, get_peft_model
22
- from trl import SFTTrainer
23
- import torch
24
-
25
-
26
- def main():
27
- print("=" * 60)
28
- print("Minimal Fine-tuning: Qwen3-0.6B on codeforces-cots")
29
- print("=" * 60)
30
-
31
- model_name = "Qwen/Qwen3-0.6B"
32
- output_name = "passagereptile455/qwen3-codeforces-minimal"
33
-
34
- # Load tokenizer
35
- print("\nLoading tokenizer...")
36
- tokenizer = AutoTokenizer.from_pretrained(model_name)
37
- if tokenizer.pad_token is None:
38
- tokenizer.pad_token = tokenizer.eos_token
39
-
40
- # Load model
41
- print("Loading model...")
42
- model = AutoModelForCausalLM.from_pretrained(
43
- model_name,
44
- torch_dtype=torch.float16,
45
- device_map="auto",
46
- )
47
- print(f"Model loaded on {model.device}")
48
-
49
- # LoRA config - very conservative
50
- lora_config = LoraConfig(
51
- r=8,
52
- lora_alpha=16,
53
- lora_dropout=0.05,
54
- target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
55
- bias="none",
56
- task_type="CAUSAL_LM",
57
- )
58
-
59
- # Apply LoRA
60
- print("Applying LoRA...")
61
- model = get_peft_model(model, lora_config)
62
- model.print_trainable_parameters()
63
-
64
- # Load dataset with streaming
65
- print("\nLoading dataset (streaming)...")
66
- dataset = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
67
-
68
- # Take only 500 examples for minimal training
69
- print("Preparing examples...")
70
- examples = []
71
- for i, ex in enumerate(dataset):
72
- if i >= 500:
73
- break
74
- # Format as text using chat template
75
- text = tokenizer.apply_chat_template(
76
- ex["messages"],
77
- tokenize=False,
78
- add_generation_prompt=False,
79
- )
80
- examples.append({"text": text})
81
-
82
- print(f"Loaded {len(examples)} examples")
83
-
84
- # Create HF dataset
85
- train_dataset = Dataset.from_list(examples)
86
-
87
- # Training args - ultra conservative
88
- training_args = TrainingArguments(
89
- output_dir="./output",
90
- max_steps=150,
91
- per_device_train_batch_size=2,
92
- gradient_accumulation_steps=4,
93
- learning_rate=5e-6,
94
- lr_scheduler_type="cosine",
95
- warmup_steps=10,
96
- logging_steps=10,
97
- save_steps=50,
98
- fp16=True,
99
- gradient_checkpointing=True,
100
- push_to_hub=True,
101
- hub_model_id=output_name,
102
- report_to="none",
103
- remove_unused_columns=False,
104
- )
105
-
106
- # Create trainer
107
- print("\nInitializing trainer...")
108
- trainer = SFTTrainer(
109
- model=model,
110
- args=training_args,
111
- train_dataset=train_dataset,
112
- processing_class=tokenizer,
113
- dataset_text_field="text",
114
- )
115
-
116
- # Train
117
- print("\n" + "=" * 60)
118
- print("Starting training...")
119
- print("=" * 60)
120
- trainer.train()
121
-
122
- # Save and push
123
- print("\nSaving model...")
124
- trainer.save_model()
125
-
126
- print("\nPushing to hub...")
127
- trainer.push_to_hub()
128
-
129
- print("\n" + "=" * 60)
130
- print(f"Training complete! Model saved to: {output_name}")
131
- print("=" * 60)
132
-
133
-
134
- if __name__ == "__main__":
135
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_minimal_v3.py DELETED
@@ -1,140 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.10"
3
- # dependencies = [
4
- # "torch",
5
- # "transformers>=4.45.0",
6
- # "accelerate",
7
- # "datasets",
8
- # "trl>=0.12.0",
9
- # "peft",
10
- # "huggingface_hub",
11
- # ]
12
- # ///
13
- """
14
- Minimal fine-tuning of Qwen3-0.6B on open-r1/codeforces-cots.
15
- Ultra-conservative training to avoid catastrophic forgetting.
16
- """
17
-
18
- import os
19
- from datasets import load_dataset, Dataset
20
- from transformers import AutoModelForCausalLM, AutoTokenizer
21
- from peft import LoraConfig
22
- from trl import SFTTrainer, SFTConfig
23
- import torch
24
-
25
-
26
- def main():
27
- print("=" * 60)
28
- print("Minimal Fine-tuning: Qwen3-0.6B on codeforces-cots")
29
- print("=" * 60)
30
-
31
- model_name = "Qwen/Qwen3-0.6B"
32
- output_name = "passagereptile455/qwen3-codeforces-minimal"
33
-
34
- # Load tokenizer
35
- print("\nLoading tokenizer...")
36
- tokenizer = AutoTokenizer.from_pretrained(model_name)
37
- if tokenizer.pad_token is None:
38
- tokenizer.pad_token = tokenizer.eos_token
39
-
40
- # Load model
41
- print("Loading model...")
42
- model = AutoModelForCausalLM.from_pretrained(
43
- model_name,
44
- torch_dtype=torch.float16,
45
- device_map="auto",
46
- )
47
- print(f"Model loaded on {model.device}")
48
-
49
- # LoRA config - very conservative
50
- lora_config = LoraConfig(
51
- r=8,
52
- lora_alpha=16,
53
- lora_dropout=0.05,
54
- target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
55
- bias="none",
56
- task_type="CAUSAL_LM",
57
- )
58
-
59
- # Load dataset with streaming
60
- print("\nLoading dataset (streaming)...")
61
- dataset = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
62
-
63
- # Take only 500 examples for minimal training
64
- print("Preparing examples...")
65
- examples = []
66
- for i, ex in enumerate(dataset):
67
- if i >= 500:
68
- break
69
- # Format as text using chat template
70
- text = tokenizer.apply_chat_template(
71
- ex["messages"],
72
- tokenize=False,
73
- add_generation_prompt=False,
74
- )
75
- examples.append({"text": text})
76
-
77
- print(f"Loaded {len(examples)} examples")
78
-
79
- # Create HF dataset
80
- train_dataset = Dataset.from_list(examples)
81
-
82
- # Check SFTConfig parameters
83
- import inspect
84
-
85
- sig = inspect.signature(SFTConfig)
86
- print(f"\nSFTConfig parameters: {list(sig.parameters.keys())[:20]}...")
87
-
88
- # Training config - use only standard parameters
89
- training_args = SFTConfig(
90
- output_dir="./output",
91
- max_steps=150,
92
- per_device_train_batch_size=2,
93
- gradient_accumulation_steps=4,
94
- learning_rate=5e-6,
95
- lr_scheduler_type="cosine",
96
- warmup_steps=10,
97
- logging_steps=10,
98
- save_steps=50,
99
- fp16=True,
100
- gradient_checkpointing=True,
101
- push_to_hub=True,
102
- hub_model_id=output_name,
103
- report_to="none",
104
- )
105
-
106
- # Create trainer
107
- print("\nInitializing trainer...")
108
-
109
- # Check SFTTrainer parameters
110
- sig = inspect.signature(SFTTrainer.__init__)
111
- print(f"SFTTrainer parameters: {list(sig.parameters.keys())[:15]}...")
112
-
113
- trainer = SFTTrainer(
114
- model=model,
115
- args=training_args,
116
- train_dataset=train_dataset,
117
- peft_config=lora_config,
118
- processing_class=tokenizer,
119
- )
120
-
121
- # Train
122
- print("\n" + "=" * 60)
123
- print("Starting training...")
124
- print("=" * 60)
125
- trainer.train()
126
-
127
- # Save and push
128
- print("\nSaving model...")
129
- trainer.save_model()
130
-
131
- print("\nPushing to hub...")
132
- trainer.push_to_hub()
133
-
134
- print("\n" + "=" * 60)
135
- print(f"Training complete! Model saved to: {output_name}")
136
- print("=" * 60)
137
-
138
-
139
- if __name__ == "__main__":
140
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_minimal_v4.py DELETED
@@ -1,145 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.10"
3
- # dependencies = [
4
- # "torch",
5
- # "transformers>=4.45.0",
6
- # "accelerate",
7
- # "datasets",
8
- # "trl>=0.12.0",
9
- # "peft",
10
- # "huggingface_hub",
11
- # ]
12
- # ///
13
- """
14
- Minimal fine-tuning of Qwen3-0.6B on open-r1/codeforces-cots.
15
- Saves to local output directory (no hub push during training).
16
- """
17
-
18
- import os
19
- from datasets import load_dataset, Dataset
20
- from transformers import AutoModelForCausalLM, AutoTokenizer
21
- from peft import LoraConfig
22
- from trl import SFTTrainer, SFTConfig
23
- import torch
24
-
25
-
26
- def main():
27
- print("=" * 60)
28
- print("Minimal Fine-tuning: Qwen3-0.6B on codeforces-cots")
29
- print("=" * 60)
30
-
31
- model_name = "Qwen/Qwen3-0.6B"
32
-
33
- # Load tokenizer
34
- print("\nLoading tokenizer...")
35
- tokenizer = AutoTokenizer.from_pretrained(model_name)
36
- if tokenizer.pad_token is None:
37
- tokenizer.pad_token = tokenizer.eos_token
38
-
39
- # Load model
40
- print("Loading model...")
41
- model = AutoModelForCausalLM.from_pretrained(
42
- model_name,
43
- torch_dtype=torch.float16,
44
- device_map="auto",
45
- )
46
- print(f"Model loaded on {model.device}")
47
-
48
- # LoRA config - very conservative
49
- lora_config = LoraConfig(
50
- r=8,
51
- lora_alpha=16,
52
- lora_dropout=0.05,
53
- target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
54
- bias="none",
55
- task_type="CAUSAL_LM",
56
- )
57
-
58
- # Load dataset with streaming
59
- print("\nLoading dataset (streaming)...")
60
- dataset = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
61
-
62
- # Take only 500 examples for minimal training
63
- print("Preparing examples...")
64
- examples = []
65
- for i, ex in enumerate(dataset):
66
- if i >= 500:
67
- break
68
- # Format as text using chat template
69
- text = tokenizer.apply_chat_template(
70
- ex["messages"],
71
- tokenize=False,
72
- add_generation_prompt=False,
73
- )
74
- examples.append({"text": text})
75
-
76
- print(f"Loaded {len(examples)} examples")
77
-
78
- # Create HF dataset
79
- train_dataset = Dataset.from_list(examples)
80
-
81
- # Training config - NO hub push
82
- training_args = SFTConfig(
83
- output_dir="./qwen3-codeforces-minimal",
84
- max_steps=150,
85
- per_device_train_batch_size=2,
86
- gradient_accumulation_steps=4,
87
- learning_rate=5e-6,
88
- lr_scheduler_type="cosine",
89
- warmup_steps=10,
90
- logging_steps=10,
91
- save_steps=50,
92
- save_total_limit=2,
93
- fp16=True,
94
- gradient_checkpointing=True,
95
- push_to_hub=False, # Disabled - will upload manually
96
- report_to="none",
97
- )
98
-
99
- # Create trainer
100
- print("\nInitializing trainer...")
101
- trainer = SFTTrainer(
102
- model=model,
103
- args=training_args,
104
- train_dataset=train_dataset,
105
- peft_config=lora_config,
106
- processing_class=tokenizer,
107
- )
108
-
109
- # Train
110
- print("\n" + "=" * 60)
111
- print("Starting training...")
112
- print("=" * 60)
113
- trainer.train()
114
-
115
- # Save final model
116
- print("\nSaving final model...")
117
- trainer.save_model("./qwen3-codeforces-minimal-final")
118
- tokenizer.save_pretrained("./qwen3-codeforces-minimal-final")
119
-
120
- # Upload to hub using HfApi
121
- print("\nUploading to HuggingFace Hub...")
122
- from huggingface_hub import HfApi, create_repo
123
-
124
- api = HfApi()
125
- repo_id = "passagereptile455/qwen3-codeforces-minimal"
126
-
127
- try:
128
- create_repo(repo_id, exist_ok=True, repo_type="model")
129
- api.upload_folder(
130
- folder_path="./qwen3-codeforces-minimal-final",
131
- repo_id=repo_id,
132
- repo_type="model",
133
- )
134
- print(f"Model uploaded to: https://huggingface.co/{repo_id}")
135
- except Exception as e:
136
- print(f"Upload failed: {e}")
137
- print("Model saved locally at: ./qwen3-codeforces-minimal-final")
138
-
139
- print("\n" + "=" * 60)
140
- print("Training complete!")
141
- print("=" * 60)
142
-
143
-
144
- if __name__ == "__main__":
145
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_sft_demo.py DELETED
@@ -1,32 +0,0 @@
1
- # /// script
2
- # dependencies = ["trl>=0.12.0", "peft>=0.7.0", "datasets", "transformers", "torch", "accelerate"]
3
- # ///
4
-
5
- from datasets import load_dataset
6
- from peft import LoraConfig
7
- from trl import SFTTrainer, SFTConfig
8
- import os
9
-
10
- # Load a small dataset
11
- dataset = load_dataset("trl-lib/Capybara", split="train[:500]")
12
-
13
- # Setup trainer
14
- trainer = SFTTrainer(
15
- model="Qwen/Qwen2.5-0.5B",
16
- train_dataset=dataset,
17
- peft_config=LoraConfig(r=16, lora_alpha=32, target_modules="all-linear"),
18
- args=SFTConfig(
19
- output_dir="qwen-demo-sft",
20
- max_steps=100,
21
- per_device_train_batch_size=2,
22
- gradient_accumulation_steps=4,
23
- logging_steps=10,
24
- push_to_hub=True,
25
- hub_model_id="passagereptile455/qwen-demo-sft",
26
- hub_private_repo=True,
27
- )
28
- )
29
-
30
- trainer.train()
31
- trainer.push_to_hub()
32
- print("Training complete! Model pushed to Hub.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_streaming.py DELETED
@@ -1,96 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "trl>=0.12.0",
4
- # "peft>=0.7.0",
5
- # "transformers>=4.36.0",
6
- # "accelerate>=0.24.0",
7
- # "trackio",
8
- # "datasets",
9
- # ]
10
- # ///
11
-
12
- """
13
- Training with streaming dataset to avoid memory issues
14
- """
15
-
16
- from datasets import load_dataset, Dataset
17
- from peft import LoraConfig
18
- from trl import SFTTrainer, SFTConfig
19
- import torch
20
-
21
- print("=" * 50)
22
- print("STREAMING DATASET TRAINING")
23
- print("=" * 50)
24
-
25
- print(f"CUDA: {torch.cuda.is_available()}")
26
-
27
- # Use streaming to load subset without memory issues
28
- print("Streaming codeforces-cots...")
29
- streaming_ds = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
30
-
31
- # Collect 1000 examples
32
- print("Collecting 1000 examples...")
33
- examples = []
34
- for i, ex in enumerate(streaming_ds):
35
- if i >= 1000:
36
- break
37
- examples.append(ex)
38
- if (i + 1) % 200 == 0:
39
- print(f" Collected {i + 1} examples")
40
-
41
- print(f"Collected {len(examples)} examples")
42
-
43
- # Convert to regular dataset
44
- dataset = Dataset.from_list(examples)
45
- print(f"Dataset created: {len(dataset)}")
46
-
47
- config = SFTConfig(
48
- output_dir="qwen3-codeforces",
49
- push_to_hub=True,
50
- hub_model_id="passagereptile455/qwen3-0.6b-humaneval-job1",
51
- hub_strategy="every_save",
52
- max_steps=200,
53
- per_device_train_batch_size=1,
54
- gradient_accumulation_steps=8,
55
- learning_rate=5e-6,
56
- max_length=512,
57
- logging_steps=20,
58
- save_strategy="steps",
59
- save_steps=100,
60
- save_total_limit=1,
61
- eval_strategy="no",
62
- warmup_ratio=0.1,
63
- lr_scheduler_type="cosine",
64
- gradient_checkpointing=True,
65
- bf16=True,
66
- report_to="trackio",
67
- project="qwen3-humaneval",
68
- run_name="job1-streaming",
69
- )
70
-
71
- peft_config = LoraConfig(
72
- r=8,
73
- lora_alpha=16,
74
- lora_dropout=0.05,
75
- bias="none",
76
- task_type="CAUSAL_LM",
77
- target_modules=["q_proj", "v_proj"],
78
- )
79
-
80
- print("Creating trainer...")
81
- trainer = SFTTrainer(
82
- model="Qwen/Qwen3-0.6B",
83
- train_dataset=dataset,
84
- args=config,
85
- peft_config=peft_config,
86
- )
87
-
88
- print("Training (200 steps)...")
89
- trainer.train()
90
-
91
- print("Pushing to Hub...")
92
- trainer.push_to_hub()
93
-
94
- print("=" * 50)
95
- print("SUCCESS!")
96
- print("=" * 50)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_test_simple.py DELETED
@@ -1,79 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "trl>=0.12.0",
4
- # "peft>=0.7.0",
5
- # "transformers>=4.36.0",
6
- # "accelerate>=0.24.0",
7
- # "trackio",
8
- # "datasets",
9
- # ]
10
- # ///
11
-
12
- """
13
- Test training with a reliable small dataset
14
- """
15
-
16
- from datasets import load_dataset
17
- from peft import LoraConfig
18
- from trl import SFTTrainer, SFTConfig
19
- import torch
20
-
21
- print("=" * 50)
22
- print("TEST TRAINING JOB")
23
- print("=" * 50)
24
-
25
- print(f"CUDA: {torch.cuda.is_available()}")
26
-
27
- # Use trl-lib dataset which is guaranteed to work
28
- print("Loading trl-lib/Capybara...")
29
- dataset = load_dataset("trl-lib/Capybara", split="train")
30
- print(f"Loaded: {len(dataset)}")
31
-
32
- # Small subset
33
- dataset = dataset.shuffle(seed=42).select(range(500))
34
- print(f"Using: {len(dataset)}")
35
-
36
- config = SFTConfig(
37
- output_dir="test-model",
38
- push_to_hub=True,
39
- hub_model_id="passagereptile455/qwen3-test-training",
40
- hub_strategy="every_save",
41
- max_steps=50, # Very short test
42
- per_device_train_batch_size=1,
43
- gradient_accumulation_steps=4,
44
- learning_rate=2e-5,
45
- max_length=256,
46
- logging_steps=10,
47
- save_strategy="steps",
48
- save_steps=50,
49
- save_total_limit=1,
50
- eval_strategy="no",
51
- warmup_ratio=0.1,
52
- gradient_checkpointing=True,
53
- bf16=True,
54
- report_to="trackio",
55
- project="test",
56
- run_name="test-train",
57
- )
58
-
59
- peft_config = LoraConfig(
60
- r=8,
61
- lora_alpha=16,
62
- target_modules=["q_proj", "v_proj"],
63
- )
64
-
65
- print("Creating trainer...")
66
- trainer = SFTTrainer(
67
- model="Qwen/Qwen3-0.6B",
68
- train_dataset=dataset,
69
- args=config,
70
- peft_config=peft_config,
71
- )
72
-
73
- print("Training...")
74
- trainer.train()
75
-
76
- print("Pushing to Hub...")
77
- trainer.push_to_hub()
78
-
79
- print("SUCCESS!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_test_upload_150steps.py DELETED
@@ -1,303 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.10"
3
- # dependencies = [
4
- # "torch",
5
- # "transformers>=4.45.0",
6
- # "accelerate",
7
- # "datasets",
8
- # "trl>=0.12.0",
9
- # "peft",
10
- # "huggingface_hub",
11
- # ]
12
- # ///
13
- """
14
- Combined training, testing, and upload script.
15
- Trains Qwen3-0.6B on codeforces-cots (150 steps - proven optimal), tests on HumanEval, uploads to Hub.
16
- """
17
-
18
- import os
19
- import re
20
- import subprocess
21
- import tempfile
22
- from datasets import load_dataset, Dataset
23
- from transformers import AutoModelForCausalLM, AutoTokenizer
24
- from peft import LoraConfig
25
- from trl import SFTTrainer, SFTConfig
26
- from huggingface_hub import login, HfApi
27
- import torch
28
-
29
- # Authenticate with HF Hub at the start
30
- HF_TOKEN = os.environ.get("HF_TOKEN")
31
- if HF_TOKEN:
32
- login(token=HF_TOKEN)
33
- print("HF Hub authenticated successfully!")
34
- else:
35
- print("WARNING: No HF_TOKEN found in environment")
36
-
37
- REPO_ID = "passagereptile455/qwen3-codeforces-humaneval"
38
-
39
-
40
- def extract_function_body(response: str) -> str:
41
- """Extract just the function body from model response."""
42
- response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL)
43
- response = response.strip()
44
-
45
- code_match = re.search(r"```python\s*(.*?)```", response, re.DOTALL)
46
- if code_match:
47
- response = code_match.group(1)
48
- else:
49
- code_match = re.search(r"```\s*(.*?)```", response, re.DOTALL)
50
- if code_match:
51
- response = code_match.group(1)
52
-
53
- response = response.strip()
54
- lines = response.split("\n")
55
-
56
- start_idx = 0
57
- for i, line in enumerate(lines):
58
- if line.strip().startswith("def "):
59
- start_idx = i
60
- break
61
-
62
- start_idx += 1
63
-
64
- if start_idx < len(lines):
65
- stripped = lines[start_idx].strip()
66
- if stripped.startswith('"""') or stripped.startswith("'''"):
67
- quote = stripped[:3]
68
- if stripped.count(quote) >= 2:
69
- start_idx += 1
70
- else:
71
- start_idx += 1
72
- while start_idx < len(lines) and quote not in lines[start_idx]:
73
- start_idx += 1
74
- start_idx += 1
75
-
76
- body_lines = lines[start_idx:]
77
- return "\n".join(body_lines)
78
-
79
-
80
- def run_test_subprocess(prompt: str, completion: str, test: str, entry_point: str):
81
- """Run the test using subprocess."""
82
- full_code = prompt + completion + "\n" + test + f"\ncheck({entry_point})"
83
-
84
- with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
85
- f.write(full_code)
86
- temp_path = f.name
87
-
88
- try:
89
- result = subprocess.run(
90
- ["python", temp_path], capture_output=True, text=True, timeout=10
91
- )
92
- return result.returncode == 0
93
- except subprocess.TimeoutExpired:
94
- return False
95
- except Exception:
96
- return False
97
- finally:
98
- try:
99
- os.unlink(temp_path)
100
- except:
101
- pass
102
-
103
-
104
- def test_model(model, tokenizer, model_name="Model"):
105
- """Test model on HumanEval."""
106
- print(f"\n{'=' * 60}")
107
- print(f"Testing: {model_name}")
108
- print("=" * 60)
109
-
110
- dataset = load_dataset("openai/openai_humaneval", split="test")
111
- print(f"Total problems: {len(dataset)}")
112
-
113
- passed = 0
114
- failed = 0
115
-
116
- for i, problem in enumerate(dataset):
117
- prompt = problem["prompt"]
118
- test = problem["test"]
119
- entry_point = problem["entry_point"]
120
-
121
- messages = [
122
- {
123
- "role": "user",
124
- "content": f"Complete this Python function. Output only the code.\n\n{prompt}",
125
- }
126
- ]
127
-
128
- text = tokenizer.apply_chat_template(
129
- messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
130
- )
131
-
132
- inputs = tokenizer(text, return_tensors="pt").to(model.device)
133
-
134
- with torch.no_grad():
135
- outputs = model.generate(
136
- **inputs,
137
- max_new_tokens=512,
138
- do_sample=False,
139
- pad_token_id=tokenizer.eos_token_id,
140
- )
141
-
142
- response = tokenizer.decode(
143
- outputs[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
144
- )
145
-
146
- completion = extract_function_body(response)
147
- success = run_test_subprocess(prompt, completion, test, entry_point)
148
-
149
- if success:
150
- passed += 1
151
- else:
152
- failed += 1
153
-
154
- if (i + 1) % 20 == 0 or i == len(dataset) - 1:
155
- print(
156
- f"Progress: {i + 1}/{len(dataset)} | Pass: {passed} | Fail: {failed} | Rate: {passed / (i + 1) * 100:.1f}%"
157
- )
158
-
159
- final_score = passed / len(dataset) * 100
160
- print(f"\nFINAL: {passed}/{len(dataset)} = {final_score:.2f}%")
161
- return final_score
162
-
163
-
164
- def main():
165
- print("=" * 60)
166
- print("Combined Training, Testing & Upload")
167
- print("150 steps - proven optimal configuration")
168
- print("=" * 60)
169
-
170
- model_name = "Qwen/Qwen3-0.6B"
171
-
172
- # Load tokenizer
173
- print("\nLoading tokenizer...")
174
- tokenizer = AutoTokenizer.from_pretrained(model_name)
175
- if tokenizer.pad_token is None:
176
- tokenizer.pad_token = tokenizer.eos_token
177
-
178
- # Load base model
179
- print("Loading base model...")
180
- base_model = AutoModelForCausalLM.from_pretrained(
181
- model_name,
182
- torch_dtype=torch.float16,
183
- device_map="auto",
184
- )
185
-
186
- # LoRA config
187
- lora_config = LoraConfig(
188
- r=8,
189
- lora_alpha=16,
190
- lora_dropout=0.05,
191
- target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
192
- bias="none",
193
- task_type="CAUSAL_LM",
194
- )
195
-
196
- # Load training dataset
197
- print("\nLoading training dataset (streaming)...")
198
- dataset = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
199
-
200
- print("Preparing examples...")
201
- examples = []
202
- for i, ex in enumerate(dataset):
203
- if i >= 500:
204
- break
205
- text = tokenizer.apply_chat_template(
206
- ex["messages"],
207
- tokenize=False,
208
- add_generation_prompt=False,
209
- )
210
- examples.append({"text": text})
211
-
212
- print(f"Loaded {len(examples)} training examples")
213
- train_dataset = Dataset.from_list(examples)
214
-
215
- # Training config - 150 steps (proven optimal)
216
- training_args = SFTConfig(
217
- output_dir="./output",
218
- max_steps=150, # Proven optimal - 200 regresses
219
- per_device_train_batch_size=2,
220
- gradient_accumulation_steps=4,
221
- learning_rate=5e-6,
222
- lr_scheduler_type="cosine",
223
- warmup_steps=10,
224
- logging_steps=25,
225
- save_steps=150,
226
- fp16=True,
227
- gradient_checkpointing=True,
228
- push_to_hub=False, # We'll push manually after eval
229
- report_to="none",
230
- )
231
-
232
- # Create trainer
233
- print("\nInitializing trainer...")
234
- trainer = SFTTrainer(
235
- model=base_model,
236
- args=training_args,
237
- train_dataset=train_dataset,
238
- peft_config=lora_config,
239
- processing_class=tokenizer,
240
- )
241
-
242
- # Train
243
- print("\n" + "=" * 60)
244
- print("PHASE 1: Training (150 steps)")
245
- print("=" * 60)
246
- trainer.train()
247
-
248
- # Save trained model locally
249
- print("\nSaving trained model locally...")
250
- trainer.save_model("./trained_model")
251
- tokenizer.save_pretrained("./trained_model")
252
-
253
- # Test the fine-tuned model
254
- print("\n" + "=" * 60)
255
- print("PHASE 2: Testing Fine-tuned Model")
256
- print("=" * 60)
257
-
258
- trained_model = trainer.model
259
- trained_model.train(False)
260
-
261
- finetuned_score = test_model(
262
- trained_model, tokenizer, "Fine-tuned Qwen3-0.6B (150 steps)"
263
- )
264
-
265
- # Upload to Hub
266
- print("\n" + "=" * 60)
267
- print("PHASE 3: Uploading to HuggingFace Hub")
268
- print("=" * 60)
269
-
270
- try:
271
- # Push model
272
- print(f"Pushing model to {REPO_ID}...")
273
- trained_model.push_to_hub(REPO_ID, token=HF_TOKEN)
274
- tokenizer.push_to_hub(REPO_ID, token=HF_TOKEN)
275
- print(f"Model uploaded successfully!")
276
- print(f"URL: https://huggingface.co/{REPO_ID}")
277
- upload_success = True
278
- except Exception as e:
279
- print(f"Upload failed: {e}")
280
- upload_success = False
281
-
282
- # Summary
283
- print("\n" + "=" * 60)
284
- print("SUMMARY")
285
- print("=" * 60)
286
- print(f"Baseline (from earlier): 27.44%")
287
- print(f"Fine-tuned (150 steps): {finetuned_score:.2f}%")
288
-
289
- if finetuned_score > 27.44:
290
- print(f"IMPROVEMENT: +{finetuned_score - 27.44:.2f}%")
291
- print("SUCCESS! Fine-tuned model beats baseline!")
292
- else:
293
- print(f"DIFFERENCE: {finetuned_score - 27.44:.2f}%")
294
- print("Fine-tuned model did not beat baseline.")
295
-
296
- print(f"\nUpload status: {'SUCCESS' if upload_success else 'FAILED'}")
297
- if upload_success:
298
- print(f"Model URL: https://huggingface.co/{REPO_ID}")
299
- print("=" * 60)
300
-
301
-
302
- if __name__ == "__main__":
303
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_test_upload_v2.py DELETED
@@ -1,303 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.10"
3
- # dependencies = [
4
- # "torch",
5
- # "transformers>=4.45.0",
6
- # "accelerate",
7
- # "datasets",
8
- # "trl>=0.12.0",
9
- # "peft",
10
- # "huggingface_hub",
11
- # ]
12
- # ///
13
- """
14
- Combined training, testing, and upload script.
15
- Trains Qwen3-0.6B on codeforces-cots (200 steps), tests on HumanEval, uploads to Hub.
16
- """
17
-
18
- import os
19
- import re
20
- import subprocess
21
- import tempfile
22
- from datasets import load_dataset, Dataset
23
- from transformers import AutoModelForCausalLM, AutoTokenizer
24
- from peft import LoraConfig
25
- from trl import SFTTrainer, SFTConfig
26
- from huggingface_hub import login, HfApi
27
- import torch
28
-
29
- # Authenticate with HF Hub at the start
30
- HF_TOKEN = os.environ.get("HF_TOKEN")
31
- if HF_TOKEN:
32
- login(token=HF_TOKEN)
33
- print("HF Hub authenticated successfully!")
34
- else:
35
- print("WARNING: No HF_TOKEN found in environment")
36
-
37
- REPO_ID = "passagereptile455/qwen3-codeforces-humaneval"
38
-
39
-
40
- def extract_function_body(response: str) -> str:
41
- """Extract just the function body from model response."""
42
- response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL)
43
- response = response.strip()
44
-
45
- code_match = re.search(r"```python\s*(.*?)```", response, re.DOTALL)
46
- if code_match:
47
- response = code_match.group(1)
48
- else:
49
- code_match = re.search(r"```\s*(.*?)```", response, re.DOTALL)
50
- if code_match:
51
- response = code_match.group(1)
52
-
53
- response = response.strip()
54
- lines = response.split("\n")
55
-
56
- start_idx = 0
57
- for i, line in enumerate(lines):
58
- if line.strip().startswith("def "):
59
- start_idx = i
60
- break
61
-
62
- start_idx += 1
63
-
64
- if start_idx < len(lines):
65
- stripped = lines[start_idx].strip()
66
- if stripped.startswith('"""') or stripped.startswith("'''"):
67
- quote = stripped[:3]
68
- if stripped.count(quote) >= 2:
69
- start_idx += 1
70
- else:
71
- start_idx += 1
72
- while start_idx < len(lines) and quote not in lines[start_idx]:
73
- start_idx += 1
74
- start_idx += 1
75
-
76
- body_lines = lines[start_idx:]
77
- return "\n".join(body_lines)
78
-
79
-
80
- def run_test_subprocess(prompt: str, completion: str, test: str, entry_point: str):
81
- """Run the test using subprocess."""
82
- full_code = prompt + completion + "\n" + test + f"\ncheck({entry_point})"
83
-
84
- with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
85
- f.write(full_code)
86
- temp_path = f.name
87
-
88
- try:
89
- result = subprocess.run(
90
- ["python", temp_path], capture_output=True, text=True, timeout=10
91
- )
92
- return result.returncode == 0
93
- except subprocess.TimeoutExpired:
94
- return False
95
- except Exception:
96
- return False
97
- finally:
98
- try:
99
- os.unlink(temp_path)
100
- except:
101
- pass
102
-
103
-
104
- def test_model(model, tokenizer, model_name="Model"):
105
- """Test model on HumanEval."""
106
- print(f"\n{'=' * 60}")
107
- print(f"Testing: {model_name}")
108
- print("=" * 60)
109
-
110
- dataset = load_dataset("openai/openai_humaneval", split="test")
111
- print(f"Total problems: {len(dataset)}")
112
-
113
- passed = 0
114
- failed = 0
115
-
116
- for i, problem in enumerate(dataset):
117
- prompt = problem["prompt"]
118
- test = problem["test"]
119
- entry_point = problem["entry_point"]
120
-
121
- messages = [
122
- {
123
- "role": "user",
124
- "content": f"Complete this Python function. Output only the code.\n\n{prompt}",
125
- }
126
- ]
127
-
128
- text = tokenizer.apply_chat_template(
129
- messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
130
- )
131
-
132
- inputs = tokenizer(text, return_tensors="pt").to(model.device)
133
-
134
- with torch.no_grad():
135
- outputs = model.generate(
136
- **inputs,
137
- max_new_tokens=512,
138
- do_sample=False,
139
- pad_token_id=tokenizer.eos_token_id,
140
- )
141
-
142
- response = tokenizer.decode(
143
- outputs[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
144
- )
145
-
146
- completion = extract_function_body(response)
147
- success = run_test_subprocess(prompt, completion, test, entry_point)
148
-
149
- if success:
150
- passed += 1
151
- else:
152
- failed += 1
153
-
154
- if (i + 1) % 20 == 0 or i == len(dataset) - 1:
155
- print(
156
- f"Progress: {i + 1}/{len(dataset)} | Pass: {passed} | Fail: {failed} | Rate: {passed / (i + 1) * 100:.1f}%"
157
- )
158
-
159
- final_score = passed / len(dataset) * 100
160
- print(f"\nFINAL: {passed}/{len(dataset)} = {final_score:.2f}%")
161
- return final_score
162
-
163
-
164
- def main():
165
- print("=" * 60)
166
- print("Combined Training, Testing & Upload")
167
- print("200 steps - testing if more training helps")
168
- print("=" * 60)
169
-
170
- model_name = "Qwen/Qwen3-0.6B"
171
-
172
- # Load tokenizer
173
- print("\nLoading tokenizer...")
174
- tokenizer = AutoTokenizer.from_pretrained(model_name)
175
- if tokenizer.pad_token is None:
176
- tokenizer.pad_token = tokenizer.eos_token
177
-
178
- # Load base model
179
- print("Loading base model...")
180
- base_model = AutoModelForCausalLM.from_pretrained(
181
- model_name,
182
- torch_dtype=torch.float16,
183
- device_map="auto",
184
- )
185
-
186
- # LoRA config
187
- lora_config = LoraConfig(
188
- r=8,
189
- lora_alpha=16,
190
- lora_dropout=0.05,
191
- target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
192
- bias="none",
193
- task_type="CAUSAL_LM",
194
- )
195
-
196
- # Load training dataset
197
- print("\nLoading training dataset (streaming)...")
198
- dataset = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
199
-
200
- print("Preparing examples...")
201
- examples = []
202
- for i, ex in enumerate(dataset):
203
- if i >= 500:
204
- break
205
- text = tokenizer.apply_chat_template(
206
- ex["messages"],
207
- tokenize=False,
208
- add_generation_prompt=False,
209
- )
210
- examples.append({"text": text})
211
-
212
- print(f"Loaded {len(examples)} training examples")
213
- train_dataset = Dataset.from_list(examples)
214
-
215
- # Training config - 200 steps (testing if more helps)
216
- training_args = SFTConfig(
217
- output_dir="./output",
218
- max_steps=200, # Increased from 150
219
- per_device_train_batch_size=2,
220
- gradient_accumulation_steps=4,
221
- learning_rate=5e-6,
222
- lr_scheduler_type="cosine",
223
- warmup_steps=10,
224
- logging_steps=25,
225
- save_steps=200,
226
- fp16=True,
227
- gradient_checkpointing=True,
228
- push_to_hub=False, # We'll push manually after eval
229
- report_to="none",
230
- )
231
-
232
- # Create trainer
233
- print("\nInitializing trainer...")
234
- trainer = SFTTrainer(
235
- model=base_model,
236
- args=training_args,
237
- train_dataset=train_dataset,
238
- peft_config=lora_config,
239
- processing_class=tokenizer,
240
- )
241
-
242
- # Train
243
- print("\n" + "=" * 60)
244
- print("PHASE 1: Training (200 steps)")
245
- print("=" * 60)
246
- trainer.train()
247
-
248
- # Save trained model locally
249
- print("\nSaving trained model locally...")
250
- trainer.save_model("./trained_model")
251
- tokenizer.save_pretrained("./trained_model")
252
-
253
- # Test the fine-tuned model
254
- print("\n" + "=" * 60)
255
- print("PHASE 2: Testing Fine-tuned Model")
256
- print("=" * 60)
257
-
258
- trained_model = trainer.model
259
- trained_model.train(False)
260
-
261
- finetuned_score = test_model(
262
- trained_model, tokenizer, "Fine-tuned Qwen3-0.6B (200 steps)"
263
- )
264
-
265
- # Upload to Hub
266
- print("\n" + "=" * 60)
267
- print("PHASE 3: Uploading to HuggingFace Hub")
268
- print("=" * 60)
269
-
270
- try:
271
- # Push model
272
- print(f"Pushing model to {REPO_ID}...")
273
- trained_model.push_to_hub(REPO_ID, token=HF_TOKEN)
274
- tokenizer.push_to_hub(REPO_ID, token=HF_TOKEN)
275
- print(f"Model uploaded successfully!")
276
- print(f"URL: https://huggingface.co/{REPO_ID}")
277
- upload_success = True
278
- except Exception as e:
279
- print(f"Upload failed: {e}")
280
- upload_success = False
281
-
282
- # Summary
283
- print("\n" + "=" * 60)
284
- print("SUMMARY")
285
- print("=" * 60)
286
- print(f"Baseline (from earlier): 27.44%")
287
- print(f"Fine-tuned (200 steps): {finetuned_score:.2f}%")
288
-
289
- if finetuned_score > 27.44:
290
- print(f"IMPROVEMENT: +{finetuned_score - 27.44:.2f}%")
291
- print("SUCCESS! Fine-tuned model beats baseline!")
292
- else:
293
- print(f"DIFFERENCE: {finetuned_score - 27.44:.2f}%")
294
- print("Fine-tuned model did not beat baseline.")
295
-
296
- print(f"\nUpload status: {'SUCCESS' if upload_success else 'FAILED'}")
297
- if upload_success:
298
- print(f"Model URL: https://huggingface.co/{REPO_ID}")
299
- print("=" * 60)
300
-
301
-
302
- if __name__ == "__main__":
303
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_test_upload_v3.py DELETED
@@ -1,336 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.10"
3
- # dependencies = [
4
- # "torch",
5
- # "transformers>=4.45.0",
6
- # "accelerate",
7
- # "datasets",
8
- # "trl>=0.12.0",
9
- # "peft",
10
- # "huggingface_hub",
11
- # ]
12
- # ///
13
- """
14
- Combined training, testing, and upload script.
15
- Trains Qwen3-0.6B on codeforces-cots (150 steps - proven optimal), tests on HumanEval, uploads to Hub.
16
- """
17
-
18
- import os
19
- import re
20
- import subprocess
21
- import tempfile
22
- import random
23
- import numpy as np
24
- from datasets import load_dataset, Dataset
25
- from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
26
- from peft import LoraConfig
27
- from trl import SFTTrainer, SFTConfig
28
- from huggingface_hub import login, HfApi
29
- import torch
30
-
31
- # Set seeds for reproducibility
32
- SEED = 42
33
- random.seed(SEED)
34
- np.random.seed(SEED)
35
- torch.manual_seed(SEED)
36
- torch.cuda.manual_seed_all(SEED)
37
- set_seed(SEED)
38
-
39
- # Authenticate with HF Hub at the start
40
- HF_TOKEN = os.environ.get("HF_TOKEN")
41
- if HF_TOKEN:
42
- login(token=HF_TOKEN)
43
- print("HF Hub authenticated successfully!")
44
- else:
45
- print("WARNING: No HF_TOKEN found in environment")
46
-
47
- REPO_ID = "passagereptile455/qwen3-codeforces-humaneval"
48
-
49
-
50
- def extract_function_body(response: str) -> str:
51
- """Extract just the function body from model response."""
52
- response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL)
53
- response = response.strip()
54
-
55
- code_match = re.search(r"```python\s*(.*?)```", response, re.DOTALL)
56
- if code_match:
57
- response = code_match.group(1)
58
- else:
59
- code_match = re.search(r"```\s*(.*?)```", response, re.DOTALL)
60
- if code_match:
61
- response = code_match.group(1)
62
-
63
- response = response.strip()
64
- lines = response.split("\n")
65
-
66
- start_idx = 0
67
- for i, line in enumerate(lines):
68
- if line.strip().startswith("def "):
69
- start_idx = i
70
- break
71
-
72
- start_idx += 1
73
-
74
- if start_idx < len(lines):
75
- stripped = lines[start_idx].strip()
76
- if stripped.startswith('"""') or stripped.startswith("'''"):
77
- quote = stripped[:3]
78
- if stripped.count(quote) >= 2:
79
- start_idx += 1
80
- else:
81
- start_idx += 1
82
- while start_idx < len(lines) and quote not in lines[start_idx]:
83
- start_idx += 1
84
- start_idx += 1
85
-
86
- body_lines = lines[start_idx:]
87
- return "\n".join(body_lines)
88
-
89
-
90
- def run_test_subprocess(prompt: str, completion: str, test: str, entry_point: str):
91
- """Run the test using subprocess."""
92
- full_code = prompt + completion + "\n" + test + f"\ncheck({entry_point})"
93
-
94
- with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
95
- f.write(full_code)
96
- temp_path = f.name
97
-
98
- try:
99
- result = subprocess.run(
100
- ["python", temp_path], capture_output=True, text=True, timeout=10
101
- )
102
- return result.returncode == 0
103
- except subprocess.TimeoutExpired:
104
- return False
105
- except Exception:
106
- return False
107
- finally:
108
- try:
109
- os.unlink(temp_path)
110
- except:
111
- pass
112
-
113
-
114
- def test_model(model, tokenizer, model_name="Model"):
115
- """Test model on HumanEval."""
116
- print(f"\n{'=' * 60}")
117
- print(f"Testing: {model_name}")
118
- print("=" * 60)
119
-
120
- dataset = load_dataset("openai/openai_humaneval", split="test")
121
- print(f"Total problems: {len(dataset)}")
122
-
123
- passed = 0
124
- failed = 0
125
-
126
- for i, problem in enumerate(dataset):
127
- prompt = problem["prompt"]
128
- test = problem["test"]
129
- entry_point = problem["entry_point"]
130
-
131
- messages = [
132
- {
133
- "role": "user",
134
- "content": f"Complete this Python function. Output only the code.\n\n{prompt}",
135
- }
136
- ]
137
-
138
- text = tokenizer.apply_chat_template(
139
- messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
140
- )
141
-
142
- inputs = tokenizer(text, return_tensors="pt").to(model.device)
143
-
144
- with torch.no_grad():
145
- outputs = model.generate(
146
- **inputs,
147
- max_new_tokens=512,
148
- do_sample=False,
149
- pad_token_id=tokenizer.eos_token_id,
150
- )
151
-
152
- response = tokenizer.decode(
153
- outputs[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
154
- )
155
-
156
- completion = extract_function_body(response)
157
- success = run_test_subprocess(prompt, completion, test, entry_point)
158
-
159
- if success:
160
- passed += 1
161
- else:
162
- failed += 1
163
-
164
- if (i + 1) % 20 == 0 or i == len(dataset) - 1:
165
- print(
166
- f"Progress: {i + 1}/{len(dataset)} | Pass: {passed} | Fail: {failed} | Rate: {passed / (i + 1) * 100:.1f}%"
167
- )
168
-
169
- final_score = passed / len(dataset) * 100
170
- print(f"\nFINAL: {passed}/{len(dataset)} = {final_score:.2f}%")
171
- return final_score
172
-
173
-
174
- def main():
175
- print("=" * 60)
176
- print("Combined Training, Testing & Upload")
177
- print("150 steps - with SAME-RUN baseline comparison")
178
- print("=" * 60)
179
-
180
- model_name = "Qwen/Qwen3-0.6B"
181
-
182
- # Load tokenizer
183
- print("\nLoading tokenizer...")
184
- tokenizer = AutoTokenizer.from_pretrained(model_name)
185
- if tokenizer.pad_token is None:
186
- tokenizer.pad_token = tokenizer.eos_token
187
-
188
- # Load base model
189
- print("Loading base model...")
190
- base_model = AutoModelForCausalLM.from_pretrained(
191
- model_name,
192
- torch_dtype=torch.float16,
193
- device_map="auto",
194
- )
195
-
196
- # PHASE 1: Test BASE model first (same run comparison)
197
- print("\n" + "=" * 60)
198
- print("PHASE 1: Testing BASE Model (for fair comparison)")
199
- print("=" * 60)
200
- base_score = test_model(base_model, tokenizer, "Base Qwen3-0.6B")
201
-
202
- # LoRA config
203
- lora_config = LoraConfig(
204
- r=8,
205
- lora_alpha=16,
206
- lora_dropout=0.05,
207
- target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
208
- bias="none",
209
- task_type="CAUSAL_LM",
210
- )
211
-
212
- # Load training dataset
213
- print("\nLoading training dataset (streaming)...")
214
- dataset = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
215
-
216
- print("Preparing examples...")
217
- examples = []
218
- for i, ex in enumerate(dataset):
219
- if i >= 500:
220
- break
221
- text = tokenizer.apply_chat_template(
222
- ex["messages"],
223
- tokenize=False,
224
- add_generation_prompt=False,
225
- )
226
- examples.append({"text": text})
227
-
228
- print(f"Loaded {len(examples)} training examples")
229
- train_dataset = Dataset.from_list(examples)
230
-
231
- # Training config - 150 steps (proven optimal)
232
- training_args = SFTConfig(
233
- output_dir="./output",
234
- max_steps=150, # Proven optimal - 200 regresses
235
- per_device_train_batch_size=2,
236
- gradient_accumulation_steps=4,
237
- learning_rate=5e-6,
238
- lr_scheduler_type="cosine",
239
- warmup_steps=10,
240
- logging_steps=25,
241
- save_steps=150,
242
- fp16=True,
243
- gradient_checkpointing=True,
244
- push_to_hub=False, # We'll push manually after eval
245
- report_to="none",
246
- seed=42, # Fixed seed for reproducibility
247
- )
248
-
249
- # Need to reload model for training (can't train already-evaluated model cleanly)
250
- print("\nReloading model for training...")
251
- del base_model
252
- torch.cuda.empty_cache()
253
-
254
- train_model = AutoModelForCausalLM.from_pretrained(
255
- model_name,
256
- torch_dtype=torch.float16,
257
- device_map="auto",
258
- )
259
-
260
- # Create trainer
261
- print("\nInitializing trainer...")
262
- trainer = SFTTrainer(
263
- model=train_model,
264
- args=training_args,
265
- train_dataset=train_dataset,
266
- peft_config=lora_config,
267
- processing_class=tokenizer,
268
- )
269
-
270
- # Train
271
- print("\n" + "=" * 60)
272
- print("PHASE 2: Training (150 steps)")
273
- print("=" * 60)
274
- trainer.train()
275
-
276
- # Save trained model locally
277
- print("\nSaving trained model locally...")
278
- trainer.save_model("./trained_model")
279
- tokenizer.save_pretrained("./trained_model")
280
-
281
- # Test the fine-tuned model
282
- print("\n" + "=" * 60)
283
- print("PHASE 3: Testing Fine-tuned Model")
284
- print("=" * 60)
285
-
286
- trained_model = trainer.model
287
- trained_model.train(False)
288
-
289
- finetuned_score = test_model(
290
- trained_model, tokenizer, "Fine-tuned Qwen3-0.6B (150 steps)"
291
- )
292
-
293
- # Upload to Hub only if we beat the baseline
294
- print("\n" + "=" * 60)
295
- print("PHASE 4: Uploading to HuggingFace Hub")
296
- print("=" * 60)
297
-
298
- upload_success = False
299
- if finetuned_score > base_score:
300
- try:
301
- print(f"Pushing model to {REPO_ID}...")
302
- trained_model.push_to_hub(REPO_ID, token=HF_TOKEN)
303
- tokenizer.push_to_hub(REPO_ID, token=HF_TOKEN)
304
- print(f"Model uploaded successfully!")
305
- print(f"URL: https://huggingface.co/{REPO_ID}")
306
- upload_success = True
307
- except Exception as e:
308
- print(f"Upload failed: {e}")
309
- else:
310
- print("Fine-tuned model did NOT beat baseline - skipping upload")
311
-
312
- # Summary - SAME RUN COMPARISON
313
- print("\n" + "=" * 60)
314
- print("SUMMARY (Same-Run Comparison)")
315
- print("=" * 60)
316
- print(f"Base model (this run): {base_score:.2f}%")
317
- print(f"Fine-tuned (150 steps): {finetuned_score:.2f}%")
318
- diff = finetuned_score - base_score
319
-
320
- if diff > 0:
321
- print(f"IMPROVEMENT: +{diff:.2f}%")
322
- print("SUCCESS! Fine-tuned model beats baseline!")
323
- elif diff == 0:
324
- print("NO CHANGE: Same as baseline")
325
- else:
326
- print(f"REGRESSION: {diff:.2f}%")
327
- print("Fine-tuned model is WORSE than baseline.")
328
-
329
- print(f"\nUpload status: {'SUCCESS' if upload_success else 'SKIPPED/FAILED'}")
330
- if upload_success:
331
- print(f"Model URL: https://huggingface.co/{REPO_ID}")
332
- print("=" * 60)
333
-
334
-
335
- if __name__ == "__main__":
336
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_v5_fixed.py DELETED
@@ -1,129 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "trl>=0.12.0",
4
- # "peft>=0.7.0",
5
- # "transformers>=4.36.0",
6
- # "accelerate>=0.24.0",
7
- # "trackio",
8
- # "datasets",
9
- # ]
10
- # ///
11
-
12
- """
13
- Training with proper dataset formatting
14
- """
15
-
16
- import sys
17
- import traceback
18
- from datasets import load_dataset, Dataset
19
- from peft import LoraConfig
20
- from trl import SFTTrainer, SFTConfig
21
- from transformers import AutoTokenizer
22
- import torch
23
-
24
- print("=" * 50)
25
- print("FIXED TRAINING v5")
26
- print("=" * 50)
27
-
28
- try:
29
- print(f"CUDA: {torch.cuda.is_available()}")
30
-
31
- # Streaming load
32
- print("Streaming codeforces-cots...")
33
- streaming_ds = load_dataset(
34
- "open-r1/codeforces-cots", split="train", streaming=True
35
- )
36
-
37
- # Collect examples
38
- print("Collecting 1000 examples...")
39
- examples = []
40
- for i, ex in enumerate(streaming_ds):
41
- if i >= 1000:
42
- break
43
- examples.append(ex)
44
-
45
- print(f"Collected {len(examples)} examples")
46
- dataset = Dataset.from_list(examples)
47
- print(f"Dataset columns: {dataset.column_names}")
48
-
49
- # Check messages format
50
- print(f"First messages sample: {dataset[0]['messages'][:100]}...")
51
-
52
- # Load tokenizer
53
- print("Loading tokenizer...")
54
- tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)
55
- if tokenizer.pad_token is None:
56
- tokenizer.pad_token = tokenizer.eos_token
57
-
58
- # Convert messages to text format for SFT
59
- def format_messages(example):
60
- messages = example["messages"]
61
- # Format as simple text
62
- text = ""
63
- for msg in messages:
64
- role = msg.get("role", "user")
65
- content = msg.get("content", "")
66
- text += f"<|{role}|>\n{content}\n"
67
- return {"text": text}
68
-
69
- print("Formatting dataset...")
70
- dataset = dataset.map(format_messages, remove_columns=dataset.column_names)
71
- print(f"Formatted. Sample: {dataset[0]['text'][:200]}...")
72
-
73
- # Config
74
- config = SFTConfig(
75
- output_dir="qwen3-codeforces",
76
- push_to_hub=True,
77
- hub_model_id="passagereptile455/qwen3-0.6b-humaneval-job1",
78
- hub_strategy="every_save",
79
- max_steps=200,
80
- per_device_train_batch_size=1,
81
- gradient_accumulation_steps=8,
82
- learning_rate=5e-6,
83
- max_length=512,
84
- logging_steps=20,
85
- save_strategy="steps",
86
- save_steps=100,
87
- save_total_limit=1,
88
- eval_strategy="no",
89
- warmup_ratio=0.1,
90
- lr_scheduler_type="cosine",
91
- gradient_checkpointing=True,
92
- bf16=True,
93
- dataset_text_field="text", # Specify text field
94
- report_to="trackio",
95
- project="qwen3-humaneval",
96
- run_name="job1-v5",
97
- )
98
-
99
- peft_config = LoraConfig(
100
- r=8,
101
- lora_alpha=16,
102
- lora_dropout=0.05,
103
- bias="none",
104
- task_type="CAUSAL_LM",
105
- target_modules=["q_proj", "v_proj"],
106
- )
107
-
108
- print("Creating trainer...")
109
- trainer = SFTTrainer(
110
- model="Qwen/Qwen3-0.6B",
111
- train_dataset=dataset,
112
- args=config,
113
- peft_config=peft_config,
114
- )
115
-
116
- print("Training (200 steps)...")
117
- trainer.train()
118
-
119
- print("Pushing to Hub...")
120
- trainer.push_to_hub()
121
-
122
- print("=" * 50)
123
- print("SUCCESS!")
124
- print("=" * 50)
125
-
126
- except Exception as e:
127
- print(f"ERROR: {e}")
128
- traceback.print_exc()
129
- sys.exit(1)