passagereptile455 committed on
Commit
c389ee5
·
verified ·
1 Parent(s): 2336b3d

Add clean training script with solutions_py subset

Browse files
Files changed (1) hide show
  1. train_humaneval_clean.py +299 -0
train_humaneval_clean.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # dependencies = [
3
+ # "trl>=0.15.0",
4
+ # "peft>=0.14.0",
5
+ # "transformers>=4.51.0",
6
+ # "accelerate>=0.30.0",
7
+ # "datasets",
8
+ # "torch",
9
+ # "huggingface_hub",
10
+ # "human_eval",
11
+ # ]
12
+ # ///
13
+ """
14
+ Fine-tune Qwen3-0.6B on codeforces-cots (Python subset) to beat base on HumanEval.
15
+ Reproduction of Ben Burtenshaw's Claude Code vs Codex challenge.
16
+ """
17
+
18
+ import os
19
+ import sys
20
+ import time
21
+ import tempfile
22
+ import json
23
+
24
# === PHASE 0: Authentication ===
# Authenticate against the Hugging Face Hub up front; dataset streaming and
# the final model upload both require a valid token.
banner = "=" * 60
print(banner)
print("PHASE 0: Authentication")
print(banner)

from huggingface_hub import login, HfApi

# Fail fast when no token is available — every later phase needs Hub access.
if not (HF_TOKEN := os.environ.get("HF_TOKEN")):
    raise ValueError("HF_TOKEN environment variable required")

login(token=HF_TOKEN)
api = HfApi()
user_info = api.whoami()
print(f"Authenticated as: {user_info['name']}")

# Run configuration.
MODEL_NAME = "Qwen/Qwen3-0.6B"            # base model to fine-tune
DATASET_NAME = "open-r1/codeforces-cots"  # training corpus
DATASET_SUBSET = "solutions_py"           # Python-solutions subset
OUTPUT_REPO = f"{user_info['name']}/qwen3-humaneval-sft"
NUM_EXAMPLES = 500                        # training examples to stream
MAX_STEPS = 150                           # optimizer steps

print(f"Model: {MODEL_NAME}")
print(f"Dataset: {DATASET_NAME} ({DATASET_SUBSET} subset)")
print(f"Output: {OUTPUT_REPO}")
51
+
52
# === PHASE 1: Load Base Model and Run Benchmark ===
print("\n" + "=" * 60)
print("PHASE 1: Benchmark Base Model on HumanEval")
print("=" * 60)

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Half-precision load; device_map="auto" places the weights on whatever
# accelerator is available.
load_kwargs = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
    "trust_remote_code": True,
}

print("Loading base model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_kwargs)
print(f"Model loaded on {base_model.device}")
69
+
70
+
71
def run_humaneval_benchmark(model, tokenizer, label="model"):
    """Score *model* on the full HumanEval benchmark.

    Greedily generates a completion for every HumanEval problem, extracts
    code from the (possibly markdown-fenced) response, and scores the
    completions with the official ``human_eval`` functional-correctness
    harness.

    Args:
        model: Causal LM with a transformers-style ``generate`` method.
        tokenizer: Matching tokenizer; must support ``apply_chat_template``.
        label: Human-readable tag used in progress and score messages.

    Returns:
        Tuple ``(score, passed, total)``: pass@1 as a percentage, the number
        of problems solved, and the total number of problems evaluated.
    """
    from human_eval.data import read_problems
    from human_eval.evaluation import evaluate_functional_correctness as check_correctness

    problems = read_problems()
    print(f"Testing {label} on {len(problems)} HumanEval problems...")

    samples = []
    model.eval()

    for i, (task_id, problem) in enumerate(problems.items()):
        prompt = problem["prompt"]

        messages = [{"role": "user", "content": f"Complete this Python function:\n\n{prompt}"}]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,  # Qwen3: disable "thinking" mode for plain completions
        )

        inputs = tokenizer(text, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=False,  # greedy decoding for reproducible scores
                pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
            )

        # Keep only the newly generated tokens (drop the echoed prompt).
        response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

        # Prefer an explicit ```python fence, then any fence, else raw text.
        if "```python" in response:
            code = response.split("```python")[1].split("```")[0].strip()
        elif "```" in response:
            code = response.split("```")[1].split("```")[0].strip()
        else:
            code = response.strip()

        completion = prompt + code
        samples.append({"task_id": task_id, "completion": completion})

        if (i + 1) % 20 == 0:
            print(f" Progress: {i + 1}/{len(problems)}")

    with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
        for s in samples:
            f.write(json.dumps(s) + "\n")
        samples_file = f.name

    try:
        results = check_correctness(samples_file, k=[1], timeout=10.0)
    finally:
        # Remove the samples file even if evaluation raises, plus the
        # "<samples>_results.jsonl" file the harness writes alongside it.
        for path in (samples_file, samples_file + "_results.jsonl"):
            try:
                os.unlink(path)
            except FileNotFoundError:
                pass

    score = results["pass@1"] * 100
    # round(), not int(): int() truncates, so float error in pass@1 could
    # under-count the solved-problem tally by one.
    passed = round(results["pass@1"] * len(problems))
    print(f"{label} score: {score:.2f}% ({passed}/{len(problems)} passed)")
    return score, passed, len(problems)
130
+
131
+
132
# Benchmark the untouched base model first so there is a score to beat.
base_score, base_passed, total = run_humaneval_benchmark(base_model, tokenizer, "BASE")

# Drop the base model and release cached GPU memory before Phase 2 loads a
# fresh copy for training.
del base_model
torch.cuda.empty_cache()
print(f"\nBase model score: {base_score:.2f}%")
137
+
138
+
139
# === PHASE 2: Train on codeforces-cots (Python subset) ===
print("\n" + "=" * 60)
print("PHASE 2: Fine-tune on codeforces-cots (solutions_py)")
print("=" * 60)

from datasets import load_dataset, Dataset
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig

# Load a fresh copy of the base model for training (the benchmark copy was
# deleted to free GPU memory).
print("Reloading model for training...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

# Stream the dataset so only the examples actually used are downloaded.
print(f"Loading {DATASET_NAME} ({DATASET_SUBSET} subset)...")
ds = load_dataset(DATASET_NAME, DATASET_SUBSET, split="train", streaming=True)

print(f"Preparing {NUM_EXAMPLES} training examples...")
examples = []
for idx, row in enumerate(ds):
    if idx >= NUM_EXAMPLES:
        break
    # Render each chat transcript into a single training string.
    examples.append({"text": tokenizer.apply_chat_template(row["messages"], tokenize=False)})
    if (idx + 1) % 100 == 0:
        print(f" Prepared {idx + 1}/{NUM_EXAMPLES} examples")

train_dataset = Dataset.from_list(examples)
print(f"Training dataset ready: {len(train_dataset)} examples")

# LoRA over all attention projections; small rank keeps the run cheap.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

sft_config = SFTConfig(
    output_dir="./sft_output",
    max_steps=MAX_STEPS,
    learning_rate=5e-6,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size of 8
    fp16=True,
    gradient_checkpointing=True,
    logging_steps=10,
    save_steps=50,
    max_seq_length=2048,
    dataset_text_field="text",
)

trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_dataset,
    peft_config=lora_config,
    processing_class=tokenizer,
)

print(f"Starting training for {MAX_STEPS} steps...")
start_time = time.time()
trainer.train()
print(f"Training completed in {(time.time() - start_time)/60:.1f} minutes")

# Fold the LoRA deltas back into the base weights so the merged model can be
# benchmarked and pushed as a plain causal LM.
print("Merging LoRA weights...")
model = trainer.model.merge_and_unload()
211
+
212
+
213
# === PHASE 3: Benchmark Fine-tuned Model ===
# Re-run the identical HumanEval harness against the merged LoRA model.
sep = "=" * 60
print("\n" + sep)
print("PHASE 3: Benchmark Fine-tuned Model")
print(sep)

ft_score, ft_passed, _ = run_humaneval_benchmark(model, tokenizer, "FINE-TUNED")
219
+
220
+
221
# === PHASE 4: Compare and Upload ===
print("\n" + "=" * 60)
print("PHASE 4: Results and Upload")
print("=" * 60)

# Score delta in percentage points and in absolute problems solved.
improvement = ft_score - base_score
improved_problems = ft_passed - base_passed

print(f"\n{'='*40}")
print("RESULTS SUMMARY")
print(f"{'='*40}")
print(f"Base model: {base_score:.2f}% ({base_passed}/{total})")
print(f"Fine-tuned model: {ft_score:.2f}% ({ft_passed}/{total})")
print(f"Improvement: {improvement:+.2f}% ({improved_problems:+d} problems)")
print(f"{'='*40}")

# Only publish when the fine-tuned model strictly beats the base score.
if ft_score > base_score:
    print("\n*** SUCCESS: Fine-tuned beats base! ***")
    print(f"Uploading to {OUTPUT_REPO}...")

    # Model card: YAML front matter (tags, base model, dataset) followed by
    # the benchmark results table and usage snippet.
    model_card = f"""---
tags:
- fine-tuned
- qwen3
- humaneval
- codeforces
- lora
base_model: {MODEL_NAME}
datasets:
- {DATASET_NAME}
---

# Qwen3-0.6B Fine-tuned on Codeforces-CoTS (Python)

Fine-tuned using SFT on the **solutions_py** subset of `{DATASET_NAME}`.

## Results on HumanEval

| Model | Score | Problems Passed |
|-------|-------|-----------------|
| Base (Qwen3-0.6B) | {base_score:.2f}% | {base_passed}/{total} |
| **Fine-tuned** | **{ft_score:.2f}%** | **{ft_passed}/{total}** |
| **Improvement** | **{improvement:+.2f}%** | **{improved_problems:+d} problems** |

## Training Details

- **Dataset**: {DATASET_NAME} ({DATASET_SUBSET} subset) - {NUM_EXAMPLES} examples
- **Method**: LoRA (r=8, alpha=16)
- **Steps**: {MAX_STEPS}
- **Learning Rate**: 5e-6

## Usage

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("{OUTPUT_REPO}")
tokenizer = AutoTokenizer.from_pretrained("{OUTPUT_REPO}")
```
"""

    # Push merged weights and tokenizer, then attach the card as README.md.
    model.push_to_hub(OUTPUT_REPO, commit_message="Fine-tuned model beating base on HumanEval")
    tokenizer.push_to_hub(OUTPUT_REPO, commit_message="Add tokenizer")

    api.upload_file(
        path_or_fileobj=model_card.encode(),
        path_in_repo="README.md",
        repo_id=OUTPUT_REPO,
        commit_message="Add model card with results",
    )

    print(f"\n*** Model uploaded to: https://huggingface.co/{OUTPUT_REPO} ***")
else:
    print(f"\nFine-tuned ({ft_score:.2f}%) did not beat base ({base_score:.2f}%)")
    print("Consider running another job with different random state.")

print(f"\n{'='*60}")
print("JOB COMPLETE")
print(f"{'='*60}")