passagereptile455 committed on
Commit
618bb37
·
verified ·
1 Parent(s): 0e87ee9

Fix push_to_hub to pass token explicitly

Browse files
Files changed (1) hide show
  1. train_humaneval_clean.py +299 -299
train_humaneval_clean.py CHANGED
@@ -1,299 +1,299 @@
1
- # /// script
2
- # dependencies = [
3
- # "trl>=0.15.0",
4
- # "peft>=0.14.0",
5
- # "transformers>=4.51.0",
6
- # "accelerate>=0.30.0",
7
- # "datasets",
8
- # "torch",
9
- # "huggingface_hub",
10
- # "human_eval",
11
- # ]
12
- # ///
13
- """
14
- Fine-tune Qwen3-0.6B on codeforces-cots (Python subset) to beat base on HumanEval.
15
- Reproduction of Ben Burtenshaw's Claude Code vs Codex challenge.
16
- """
17
-
18
- import os
19
- import sys
20
- import time
21
- import tempfile
22
- import json
23
-
24
- # === PHASE 0: Authentication ===
25
- print("=" * 60)
26
- print("PHASE 0: Authentication")
27
- print("=" * 60)
28
-
29
- from huggingface_hub import HfApi
30
-
31
- HF_TOKEN = os.environ.get("HF_TOKEN")
32
- if not HF_TOKEN:
33
- raise ValueError("HF_TOKEN environment variable required")
34
-
35
- # Removed login() - using HfApi(token=) instead
36
- api = HfApi(token=HF_TOKEN)
37
- user_info = api.whoami()
38
- print(f"Authenticated as: {user_info['name']}")
39
-
40
- MODEL_NAME = "Qwen/Qwen3-0.6B"
41
- DATASET_NAME = "open-r1/codeforces-cots"
42
- DATASET_SUBSET = "solutions_py"
43
- OUTPUT_REPO = f"{user_info['name']}/qwen3-humaneval-sft"
44
- NUM_EXAMPLES = 500
45
- MAX_STEPS = 150
46
-
47
- print(f"Model: {MODEL_NAME}")
48
- print(f"Dataset: {DATASET_NAME} ({DATASET_SUBSET} subset)")
49
- print(f"Output: {OUTPUT_REPO}")
50
-
51
-
52
- # === PHASE 1: Load Base Model and Run Benchmark ===
53
- print("\n" + "=" * 60)
54
- print("PHASE 1: Benchmark Base Model on HumanEval")
55
- print("=" * 60)
56
-
57
- import torch
58
- from transformers import AutoModelForCausalLM, AutoTokenizer
59
-
60
- print("Loading base model...")
61
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
62
- base_model = AutoModelForCausalLM.from_pretrained(
63
- MODEL_NAME,
64
- torch_dtype=torch.float16,
65
- device_map="auto",
66
- trust_remote_code=True,
67
- )
68
- print(f"Model loaded on {base_model.device}")
69
-
70
-
71
- def run_humaneval_benchmark(model, tokenizer, label="model"):
72
- """Run HumanEval benchmark on model."""
73
- from human_eval.data import read_problems
74
- from human_eval.evaluation import evaluate_functional_correctness as check_correctness
75
-
76
- problems = read_problems()
77
- print(f"Testing {label} on {len(problems)} HumanEval problems...")
78
-
79
- samples = []
80
- model.eval()
81
-
82
- for i, (task_id, problem) in enumerate(problems.items()):
83
- prompt = problem["prompt"]
84
-
85
- messages = [{"role": "user", "content": f"Complete this Python function:\n\n{prompt}"}]
86
- text = tokenizer.apply_chat_template(
87
- messages,
88
- tokenize=False,
89
- add_generation_prompt=True,
90
- enable_thinking=False,
91
- )
92
-
93
- inputs = tokenizer(text, return_tensors="pt").to(model.device)
94
-
95
- with torch.no_grad():
96
- outputs = model.generate(
97
- **inputs,
98
- max_new_tokens=512,
99
- do_sample=False,
100
- pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
101
- )
102
-
103
- response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
104
-
105
- if "```python" in response:
106
- code = response.split("```python")[1].split("```")[0].strip()
107
- elif "```" in response:
108
- code = response.split("```")[1].split("```")[0].strip()
109
- else:
110
- code = response.strip()
111
-
112
- completion = prompt + code
113
- samples.append({"task_id": task_id, "completion": completion})
114
-
115
- if (i + 1) % 20 == 0:
116
- print(f" Progress: {i + 1}/{len(problems)}")
117
-
118
- with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
119
- for s in samples:
120
- f.write(json.dumps(s) + "\n")
121
- samples_file = f.name
122
-
123
- results = check_correctness(samples_file, k=[1], timeout=10.0)
124
- os.unlink(samples_file)
125
-
126
- score = results["pass@1"] * 100
127
- passed = int(score * len(problems) / 100)
128
- print(f"{label} score: {score:.2f}% ({passed}/{len(problems)} passed)")
129
- return score, passed, len(problems)
130
-
131
-
132
- base_score, base_passed, total = run_humaneval_benchmark(base_model, tokenizer, "BASE")
133
-
134
- del base_model
135
- torch.cuda.empty_cache()
136
- print(f"\nBase model score: {base_score:.2f}%")
137
-
138
-
139
- # === PHASE 2: Train on codeforces-cots (Python subset) ===
140
- print("\n" + "=" * 60)
141
- print("PHASE 2: Fine-tune on codeforces-cots (solutions_py)")
142
- print("=" * 60)
143
-
144
- from datasets import load_dataset, Dataset
145
- from peft import LoraConfig
146
- from trl import SFTTrainer, SFTConfig
147
-
148
- print("Reloading model for training...")
149
- model = AutoModelForCausalLM.from_pretrained(
150
- MODEL_NAME,
151
- torch_dtype=torch.float16,
152
- device_map="auto",
153
- trust_remote_code=True,
154
- )
155
-
156
- print(f"Loading {DATASET_NAME} ({DATASET_SUBSET} subset)...")
157
- ds = load_dataset(DATASET_NAME, DATASET_SUBSET, split="train", streaming=True)
158
-
159
- examples = []
160
- print(f"Preparing {NUM_EXAMPLES} training examples...")
161
- for i, ex in enumerate(ds):
162
- if i >= NUM_EXAMPLES:
163
- break
164
- text = tokenizer.apply_chat_template(ex["messages"], tokenize=False)
165
- examples.append({"text": text})
166
- if (i + 1) % 100 == 0:
167
- print(f" Prepared {i + 1}/{NUM_EXAMPLES} examples")
168
-
169
- train_dataset = Dataset.from_list(examples)
170
- print(f"Training dataset ready: {len(train_dataset)} examples")
171
-
172
- lora_config = LoraConfig(
173
- r=8,
174
- lora_alpha=16,
175
- lora_dropout=0.05,
176
- target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
177
- bias="none",
178
- task_type="CAUSAL_LM",
179
- )
180
-
181
- sft_config = SFTConfig(
182
- output_dir="./sft_output",
183
- max_steps=MAX_STEPS,
184
- learning_rate=5e-6,
185
- per_device_train_batch_size=2,
186
- gradient_accumulation_steps=4,
187
- fp16=True,
188
- gradient_checkpointing=True,
189
- logging_steps=10,
190
- save_steps=50,
191
- max_length=2048,
192
- dataset_text_field="text",
193
- )
194
-
195
- trainer = SFTTrainer(
196
- model=model,
197
- args=sft_config,
198
- train_dataset=train_dataset,
199
- peft_config=lora_config,
200
- processing_class=tokenizer,
201
- )
202
-
203
- print(f"Starting training for {MAX_STEPS} steps...")
204
- start_time = time.time()
205
- trainer.train()
206
- train_time = time.time() - start_time
207
- print(f"Training completed in {train_time/60:.1f} minutes")
208
-
209
- print("Merging LoRA weights...")
210
- model = trainer.model.merge_and_unload()
211
-
212
-
213
- # === PHASE 3: Benchmark Fine-tuned Model ===
214
- print("\n" + "=" * 60)
215
- print("PHASE 3: Benchmark Fine-tuned Model")
216
- print("=" * 60)
217
-
218
- ft_score, ft_passed, _ = run_humaneval_benchmark(model, tokenizer, "FINE-TUNED")
219
-
220
-
221
- # === PHASE 4: Compare and Upload ===
222
- print("\n" + "=" * 60)
223
- print("PHASE 4: Results and Upload")
224
- print("=" * 60)
225
-
226
- improvement = ft_score - base_score
227
- improved_problems = ft_passed - base_passed
228
-
229
- print(f"\n{'='*40}")
230
- print("RESULTS SUMMARY")
231
- print(f"{'='*40}")
232
- print(f"Base model: {base_score:.2f}% ({base_passed}/{total})")
233
- print(f"Fine-tuned model: {ft_score:.2f}% ({ft_passed}/{total})")
234
- print(f"Improvement: {improvement:+.2f}% ({improved_problems:+d} problems)")
235
- print(f"{'='*40}")
236
-
237
- if ft_score > base_score:
238
- print("\n*** SUCCESS: Fine-tuned beats base! ***")
239
- print(f"Uploading to {OUTPUT_REPO}...")
240
-
241
- model_card = f"""---
242
- tags:
243
- - fine-tuned
244
- - qwen3
245
- - humaneval
246
- - codeforces
247
- - lora
248
- base_model: {MODEL_NAME}
249
- datasets:
250
- - {DATASET_NAME}
251
- ---
252
-
253
- # Qwen3-0.6B Fine-tuned on Codeforces-CoTS (Python)
254
-
255
- Fine-tuned using SFT on the **solutions_py** subset of `{DATASET_NAME}`.
256
-
257
- ## Results on HumanEval
258
-
259
- | Model | Score | Problems Passed |
260
- |-------|-------|-----------------|
261
- | Base (Qwen3-0.6B) | {base_score:.2f}% | {base_passed}/{total} |
262
- | **Fine-tuned** | **{ft_score:.2f}%** | **{ft_passed}/{total}** |
263
- | **Improvement** | **{improvement:+.2f}%** | **{improved_problems:+d} problems** |
264
-
265
- ## Training Details
266
-
267
- - **Dataset**: {DATASET_NAME} ({DATASET_SUBSET} subset) - {NUM_EXAMPLES} examples
268
- - **Method**: LoRA (r=8, alpha=16)
269
- - **Steps**: {MAX_STEPS}
270
- - **Learning Rate**: 5e-6
271
-
272
- ## Usage
273
-
274
- ```python
275
- from transformers import AutoModelForCausalLM, AutoTokenizer
276
-
277
- model = AutoModelForCausalLM.from_pretrained("{OUTPUT_REPO}")
278
- tokenizer = AutoTokenizer.from_pretrained("{OUTPUT_REPO}")
279
- ```
280
- """
281
-
282
- model.push_to_hub(OUTPUT_REPO, commit_message="Fine-tuned model beating base on HumanEval")
283
- tokenizer.push_to_hub(OUTPUT_REPO, commit_message="Add tokenizer")
284
-
285
- api.upload_file(
286
- path_or_fileobj=model_card.encode(),
287
- path_in_repo="README.md",
288
- repo_id=OUTPUT_REPO,
289
- commit_message="Add model card with results",
290
- )
291
-
292
- print(f"\n*** Model uploaded to: https://huggingface.co/{OUTPUT_REPO} ***")
293
- else:
294
- print(f"\nFine-tuned ({ft_score:.2f}%) did not beat base ({base_score:.2f}%)")
295
- print("Consider running another job with different random state.")
296
-
297
- print(f"\n{'='*60}")
298
- print("JOB COMPLETE")
299
- print(f"{'='*60}")
 
1
+ # /// script
2
+ # dependencies = [
3
+ # "trl>=0.15.0",
4
+ # "peft>=0.14.0",
5
+ # "transformers>=4.51.0",
6
+ # "accelerate>=0.30.0",
7
+ # "datasets",
8
+ # "torch",
9
+ # "huggingface_hub",
10
+ # "human_eval",
11
+ # ]
12
+ # ///
13
+ """
14
+ Fine-tune Qwen3-0.6B on codeforces-cots (Python subset) to beat base on HumanEval.
15
+ Reproduction of Ben Burtenshaw's Claude Code vs Codex challenge.
16
+ """
17
+
18
+ import os
19
+ import sys
20
+ import time
21
+ import tempfile
22
+ import json
23
+
24
+ # === PHASE 0: Authentication ===
25
+ print("=" * 60)
26
+ print("PHASE 0: Authentication")
27
+ print("=" * 60)
28
+
29
+ from huggingface_hub import HfApi
30
+
31
+ HF_TOKEN = os.environ.get("HF_TOKEN")
32
+ if not HF_TOKEN:
33
+ raise ValueError("HF_TOKEN environment variable required")
34
+
35
+ # Removed login() - using HfApi(token=) instead
36
+ api = HfApi(token=HF_TOKEN)
37
+ user_info = api.whoami()
38
+ print(f"Authenticated as: {user_info['name']}")
39
+
40
+ MODEL_NAME = "Qwen/Qwen3-0.6B"
41
+ DATASET_NAME = "open-r1/codeforces-cots"
42
+ DATASET_SUBSET = "solutions_py"
43
+ OUTPUT_REPO = f"{user_info['name']}/qwen3-humaneval-sft"
44
+ NUM_EXAMPLES = 500
45
+ MAX_STEPS = 150
46
+
47
+ print(f"Model: {MODEL_NAME}")
48
+ print(f"Dataset: {DATASET_NAME} ({DATASET_SUBSET} subset)")
49
+ print(f"Output: {OUTPUT_REPO}")
50
+
51
+
52
+ # === PHASE 1: Load Base Model and Run Benchmark ===
53
+ print("\n" + "=" * 60)
54
+ print("PHASE 1: Benchmark Base Model on HumanEval")
55
+ print("=" * 60)
56
+
57
+ import torch
58
+ from transformers import AutoModelForCausalLM, AutoTokenizer
59
+
60
+ print("Loading base model...")
61
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
62
+ base_model = AutoModelForCausalLM.from_pretrained(
63
+ MODEL_NAME,
64
+ torch_dtype=torch.float16,
65
+ device_map="auto",
66
+ trust_remote_code=True,
67
+ )
68
+ print(f"Model loaded on {base_model.device}")
69
+
70
+
71
def run_humaneval_benchmark(model, tokenizer, label="model"):
    """Generate a completion for every HumanEval task and score the model.

    Args:
        model: causal LM used for greedy generation (must support .generate).
        tokenizer: matching tokenizer; must provide a chat template.
        label: name used in progress and score log lines.

    Returns:
        Tuple of (score_pct, passed_count, total_problems), where score_pct
        is pass@1 expressed as a percentage.
    """
    from human_eval.data import read_problems
    from human_eval.evaluation import evaluate_functional_correctness as check_correctness

    problems = read_problems()
    print(f"Testing {label} on {len(problems)} HumanEval problems...")

    samples = []
    model.eval()

    for i, (task_id, problem) in enumerate(problems.items()):
        prompt = problem["prompt"]

        messages = [{"role": "user", "content": f"Complete this Python function:\n\n{prompt}"}]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,  # disable Qwen3 "thinking" mode: we want plain code output
        )

        inputs = tokenizer(text, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=False,  # greedy decoding for reproducible scores
                pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
            )

        # Drop the prompt tokens; decode only the newly generated tail.
        response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

        # Extract the code body from a fenced block if the model emitted one.
        if "```python" in response:
            code = response.split("```python")[1].split("```")[0].strip()
        elif "```" in response:
            code = response.split("```")[1].split("```")[0].strip()
        else:
            code = response.strip()

        completion = prompt + code
        samples.append({"task_id": task_id, "completion": completion})

        if (i + 1) % 20 == 0:
            print(f" Progress: {i + 1}/{len(problems)}")

    # human_eval expects a JSONL file of samples on disk.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
        for s in samples:
            f.write(json.dumps(s) + "\n")
        samples_file = f.name

    try:
        results = check_correctness(samples_file, k=[1], timeout=10.0)
    finally:
        # Remove the temp file even if evaluation raises (was leaked on error before).
        os.unlink(samples_file)

    score = results["pass@1"] * 100
    # round() instead of int(score * n / 100): pass@1 * n is an exact count
    # distorted only by float arithmetic, and int() truncation could
    # under-report the passed count by one (e.g. 22.9999... -> 22).
    passed = round(results["pass@1"] * len(problems))
    print(f"{label} score: {score:.2f}% ({passed}/{len(problems)} passed)")
    return score, passed, len(problems)
130
+
131
+
132
# Score the untouched base model first so there is a baseline to beat.
base_score, base_passed, total = run_humaneval_benchmark(base_model, tokenizer, "BASE")

# Release the base model's GPU memory before loading a fresh copy for training.
del base_model
torch.cuda.empty_cache()
print(f"\nBase model score: {base_score:.2f}%")


# === PHASE 2: Train on codeforces-cots (Python subset) ===
print("\n" + "=" * 60)
print("PHASE 2: Fine-tune on codeforces-cots (solutions_py)")
print("=" * 60)

from datasets import load_dataset, Dataset
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig

print("Reloading model for training...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

print(f"Loading {DATASET_NAME} ({DATASET_SUBSET} subset)...")
# Streaming avoids downloading the full dataset; we only need the first
# NUM_EXAMPLES rows.
stream = load_dataset(DATASET_NAME, DATASET_SUBSET, split="train", streaming=True)

records = []
print(f"Preparing {NUM_EXAMPLES} training examples...")
for idx, row in enumerate(stream):
    if idx >= NUM_EXAMPLES:
        break
    # Render each conversation with the chat template into a flat text field.
    records.append({"text": tokenizer.apply_chat_template(row["messages"], tokenize=False)})
    if (idx + 1) % 100 == 0:
        print(f" Prepared {idx + 1}/{NUM_EXAMPLES} examples")

train_dataset = Dataset.from_list(records)
print(f"Training dataset ready: {len(train_dataset)} examples")

# LoRA adapter over all attention projections.
peft_cfg = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

train_args = SFTConfig(
    output_dir="./sft_output",
    max_steps=MAX_STEPS,
    learning_rate=5e-6,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size of 8
    fp16=True,
    gradient_checkpointing=True,
    logging_steps=10,
    save_steps=50,
    max_length=2048,
    dataset_text_field="text",
)

trainer = SFTTrainer(
    model=model,
    args=train_args,
    train_dataset=train_dataset,
    peft_config=peft_cfg,
    processing_class=tokenizer,
)

print(f"Starting training for {MAX_STEPS} steps...")
t0 = time.time()
trainer.train()
elapsed = time.time() - t0
print(f"Training completed in {elapsed/60:.1f} minutes")

print("Merging LoRA weights...")
# Fold the adapter into the base weights so the merged model can be
# benchmarked and pushed as a plain checkpoint.
model = trainer.model.merge_and_unload()
211
+
212
+
213
# === PHASE 3: Benchmark Fine-tuned Model ===
print("\n" + "=" * 60)
print("PHASE 3: Benchmark Fine-tuned Model")
print("=" * 60)

ft_score, ft_passed, _ = run_humaneval_benchmark(model, tokenizer, "FINE-TUNED")


# === PHASE 4: Compare and Upload ===
print("\n" + "=" * 60)
print("PHASE 4: Results and Upload")
print("=" * 60)

improvement = ft_score - base_score
improved_problems = ft_passed - base_passed

bar = "=" * 40
print("\n" + bar)
print("RESULTS SUMMARY")
print(bar)
print(f"Base model: {base_score:.2f}% ({base_passed}/{total})")
print(f"Fine-tuned model: {ft_score:.2f}% ({ft_passed}/{total})")
print(f"Improvement: {improvement:+.2f}% ({improved_problems:+d} problems)")
print(bar)

if ft_score > base_score:
    print("\n*** SUCCESS: Fine-tuned beats base! ***")
    print(f"Uploading to {OUTPUT_REPO}...")

    model_card = f"""---
tags:
- fine-tuned
- qwen3
- humaneval
- codeforces
- lora
base_model: {MODEL_NAME}
datasets:
- {DATASET_NAME}
---

# Qwen3-0.6B Fine-tuned on Codeforces-CoTS (Python)

Fine-tuned using SFT on the **solutions_py** subset of `{DATASET_NAME}`.

## Results on HumanEval

| Model | Score | Problems Passed |
|-------|-------|-----------------|
| Base (Qwen3-0.6B) | {base_score:.2f}% | {base_passed}/{total} |
| **Fine-tuned** | **{ft_score:.2f}%** | **{ft_passed}/{total}** |
| **Improvement** | **{improvement:+.2f}%** | **{improved_problems:+d} problems** |

## Training Details

- **Dataset**: {DATASET_NAME} ({DATASET_SUBSET} subset) - {NUM_EXAMPLES} examples
- **Method**: LoRA (r=8, alpha=16)
- **Steps**: {MAX_STEPS}
- **Learning Rate**: 5e-6

## Usage

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("{OUTPUT_REPO}")
tokenizer = AutoTokenizer.from_pretrained("{OUTPUT_REPO}")
```
"""

    # Pass the token explicitly so the upload works without a global login.
    model.push_to_hub(OUTPUT_REPO, token=HF_TOKEN, commit_message="Fine-tuned model beating base on HumanEval")
    tokenizer.push_to_hub(OUTPUT_REPO, token=HF_TOKEN, commit_message="Add tokenizer")

    api.upload_file(
        path_or_fileobj=model_card.encode(),
        path_in_repo="README.md",
        repo_id=OUTPUT_REPO,
        commit_message="Add model card with results",
    )

    print(f"\n*** Model uploaded to: https://huggingface.co/{OUTPUT_REPO} ***")
else:
    print(f"\nFine-tuned ({ft_score:.2f}%) did not beat base ({base_score:.2f}%)")
    print("Consider running another job with different random state.")

print("\n" + "=" * 60)
print("JOB COMPLETE")
print("=" * 60)