|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
Fine-tune Qwen3-0.6B on codeforces-cots (Python subset) to beat base on HumanEval.
|
|
|
Reproduction of Ben Burtenshaw's Claude Code vs Codex challenge.
|
|
|
"""
|
|
|
|
|
|
import os
|
|
|
import sys
|
|
|
import time
|
|
|
import tempfile
|
|
|
import json
|
|
|
|
|
|
|
|
|
print("=" * 60)
|
|
|
print("PHASE 0: Authentication")
|
|
|
print("=" * 60)
|
|
|
|
|
|
from huggingface_hub import HfApi
|
|
|
|
|
|
HF_TOKEN = os.environ.get("HF_TOKEN")
|
|
|
if not HF_TOKEN:
|
|
|
raise ValueError("HF_TOKEN environment variable required")
|
|
|
|
|
|
|
|
|
api = HfApi(token=HF_TOKEN)
|
|
|
user_info = api.whoami()
|
|
|
print(f"Authenticated as: {user_info['name']}")
|
|
|
|
|
|
MODEL_NAME = "Qwen/Qwen3-0.6B"
|
|
|
DATASET_NAME = "open-r1/codeforces-cots"
|
|
|
DATASET_SUBSET = "solutions_py"
|
|
|
OUTPUT_REPO = f"{user_info['name']}/qwen3-humaneval-sft"
|
|
|
NUM_EXAMPLES = 500
|
|
|
MAX_STEPS = 150
|
|
|
|
|
|
print(f"Model: {MODEL_NAME}")
|
|
|
print(f"Dataset: {DATASET_NAME} ({DATASET_SUBSET} subset)")
|
|
|
print(f"Output: {OUTPUT_REPO}")
|
|
|
|
|
|
|
|
|
|
|
|
print("\n" + "=" * 60)
|
|
|
print("PHASE 1: Benchmark Base Model on HumanEval")
|
|
|
print("=" * 60)
|
|
|
|
|
|
import torch
|
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
|
|
|
|
print("Loading base model...")
|
|
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
|
|
|
base_model = AutoModelForCausalLM.from_pretrained(
|
|
|
MODEL_NAME,
|
|
|
torch_dtype=torch.float16,
|
|
|
device_map="auto",
|
|
|
trust_remote_code=True,
|
|
|
)
|
|
|
print(f"Model loaded on {base_model.device}")
|
|
|
|
|
|
|
|
|
def run_humaneval_benchmark(model, tokenizer, label="model"):
|
|
|
"""Run HumanEval benchmark on model."""
|
|
|
from human_eval.data import read_problems
|
|
|
from human_eval.evaluation import evaluate_functional_correctness as check_correctness
|
|
|
|
|
|
problems = read_problems()
|
|
|
print(f"Testing {label} on {len(problems)} HumanEval problems...")
|
|
|
|
|
|
samples = []
|
|
|
model.eval()
|
|
|
|
|
|
for i, (task_id, problem) in enumerate(problems.items()):
|
|
|
prompt = problem["prompt"]
|
|
|
|
|
|
messages = [{"role": "user", "content": f"Complete this Python function:\n\n{prompt}"}]
|
|
|
text = tokenizer.apply_chat_template(
|
|
|
messages,
|
|
|
tokenize=False,
|
|
|
add_generation_prompt=True,
|
|
|
enable_thinking=False,
|
|
|
)
|
|
|
|
|
|
inputs = tokenizer(text, return_tensors="pt").to(model.device)
|
|
|
|
|
|
with torch.no_grad():
|
|
|
outputs = model.generate(
|
|
|
**inputs,
|
|
|
max_new_tokens=512,
|
|
|
do_sample=False,
|
|
|
pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
|
|
|
)
|
|
|
|
|
|
response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
|
|
|
|
|
|
if "```python" in response:
|
|
|
code = response.split("```python")[1].split("```")[0].strip()
|
|
|
elif "```" in response:
|
|
|
code = response.split("```")[1].split("```")[0].strip()
|
|
|
else:
|
|
|
code = response.strip()
|
|
|
|
|
|
completion = prompt + code
|
|
|
samples.append({"task_id": task_id, "completion": completion})
|
|
|
|
|
|
if (i + 1) % 20 == 0:
|
|
|
print(f" Progress: {i + 1}/{len(problems)}")
|
|
|
|
|
|
with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
|
|
|
for s in samples:
|
|
|
f.write(json.dumps(s) + "\n")
|
|
|
samples_file = f.name
|
|
|
|
|
|
results = check_correctness(samples_file, k=[1], timeout=10.0)
|
|
|
os.unlink(samples_file)
|
|
|
|
|
|
score = results["pass@1"] * 100
|
|
|
passed = int(score * len(problems) / 100)
|
|
|
print(f"{label} score: {score:.2f}% ({passed}/{len(problems)} passed)")
|
|
|
return score, passed, len(problems)
|
|
|
|
|
|
|
|
|
# Baseline pass@1 before any fine-tuning; `total` is the HumanEval problem count.
base_score, base_passed, total = run_humaneval_benchmark(base_model, tokenizer, "BASE")

# Free GPU memory before reloading a fresh copy of the model for training.
del base_model
torch.cuda.empty_cache()
print(f"\nBase model score: {base_score:.2f}%")
|
|
|
|
|
|
|
|
|
|
|
|
print("\n" + "=" * 60)
|
|
|
print("PHASE 2: Fine-tune on codeforces-cots (solutions_py)")
|
|
|
print("=" * 60)
|
|
|
|
|
|
from datasets import load_dataset, Dataset
|
|
|
from peft import LoraConfig
|
|
|
from trl import SFTTrainer, SFTConfig
|
|
|
|
|
|
print("Reloading model for training...")
|
|
|
model = AutoModelForCausalLM.from_pretrained(
|
|
|
MODEL_NAME,
|
|
|
torch_dtype=torch.float16,
|
|
|
device_map="auto",
|
|
|
trust_remote_code=True,
|
|
|
)
|
|
|
|
|
|
print(f"Loading {DATASET_NAME} ({DATASET_SUBSET} subset)...")
|
|
|
ds = load_dataset(DATASET_NAME, DATASET_SUBSET, split="train", streaming=True)
|
|
|
|
|
|
# Materialize the first NUM_EXAMPLES chat transcripts as plain-text rows.
print(f"Preparing {NUM_EXAMPLES} training examples...")
rows = []
# zip with range() caps the streaming iterator at NUM_EXAMPLES items.
for idx, record in zip(range(NUM_EXAMPLES), ds):
    rendered = tokenizer.apply_chat_template(record["messages"], tokenize=False)
    rows.append({"text": rendered})
    done = idx + 1
    if done % 100 == 0:
        print(f"  Prepared {done}/{NUM_EXAMPLES} examples")

train_dataset = Dataset.from_list(rows)
print(f"Training dataset ready: {len(train_dataset)} examples")
|
|
|
|
|
|
# LoRA adapter: small rank keeps the trainable-parameter count tiny for a
# 0.6B model while still adapting all attention projections.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,   # effective scaling = alpha / r = 2.0
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],  # attention only
    bias="none",
    task_type="CAUSAL_LM",
)

sft_config = SFTConfig(
    output_dir="./sft_output",
    max_steps=MAX_STEPS,
    learning_rate=5e-6,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size of 8
    fp16=True,
    gradient_checkpointing=True,    # trade compute for activation memory
    logging_steps=10,
    save_steps=50,
    max_length=2048,                # truncate long chat transcripts
    dataset_text_field="text",      # column produced during data prep above
)
|
|
|
|
|
|
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_dataset,
    peft_config=lora_config,   # SFTTrainer applies the LoRA wrapping itself
    processing_class=tokenizer,
)

print(f"Starting training for {MAX_STEPS} steps...")
start_time = time.time()
trainer.train()
train_time = time.time() - start_time
print(f"Training completed in {train_time/60:.1f} minutes")

# Fold the LoRA deltas into the base weights so the result is a plain model
# (no adapter files needed at inference time, and benchmarkable as-is).
print("Merging LoRA weights...")
model = trainer.model.merge_and_unload()
|
|
|
|
|
|
|
|
|
|
|
|
print("\n" + "=" * 60)
|
|
|
print("PHASE 3: Benchmark Fine-tuned Model")
|
|
|
print("=" * 60)
|
|
|
|
|
|
ft_score, ft_passed, _ = run_humaneval_benchmark(model, tokenizer, "FINE-TUNED")
|
|
|
|
|
|
|
|
|
|
|
|
print("\n" + "=" * 60)
|
|
|
print("PHASE 4: Results and Upload")
|
|
|
print("=" * 60)
|
|
|
|
|
|
improvement = ft_score - base_score
|
|
|
improved_problems = ft_passed - base_passed
|
|
|
|
|
|
print(f"\n{'='*40}")
|
|
|
print("RESULTS SUMMARY")
|
|
|
print(f"{'='*40}")
|
|
|
print(f"Base model: {base_score:.2f}% ({base_passed}/{total})")
|
|
|
print(f"Fine-tuned model: {ft_score:.2f}% ({ft_passed}/{total})")
|
|
|
print(f"Improvement: {improvement:+.2f}% ({improved_problems:+d} problems)")
|
|
|
print(f"{'='*40}")
|
|
|
|
|
|
# Upload only on success, so the Hub repo never hosts a regressed model.
if ft_score > base_score:
    print("\n*** SUCCESS: Fine-tuned beats base! ***")
    print(f"Uploading to {OUTPUT_REPO}...")

    # README with YAML front matter so the Hub renders tags/base model/dataset.
    model_card = f"""---
tags:
- fine-tuned
- qwen3
- humaneval
- codeforces
- lora
base_model: {MODEL_NAME}
datasets:
- {DATASET_NAME}
---

# Qwen3-0.6B Fine-tuned on Codeforces-CoTS (Python)

Fine-tuned using SFT on the **solutions_py** subset of `{DATASET_NAME}`.

## Results on HumanEval

| Model | Score | Problems Passed |
|-------|-------|-----------------|
| Base (Qwen3-0.6B) | {base_score:.2f}% | {base_passed}/{total} |
| **Fine-tuned** | **{ft_score:.2f}%** | **{ft_passed}/{total}** |
| **Improvement** | **{improvement:+.2f}%** | **{improved_problems:+d} problems** |

## Training Details

- **Dataset**: {DATASET_NAME} ({DATASET_SUBSET} subset) - {NUM_EXAMPLES} examples
- **Method**: LoRA (r=8, alpha=16)
- **Steps**: {MAX_STEPS}
- **Learning Rate**: 5e-6

## Usage

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("{OUTPUT_REPO}")
tokenizer = AutoTokenizer.from_pretrained("{OUTPUT_REPO}")
```
"""

    # Push the merged weights and tokenizer, then attach the README.
    model.push_to_hub(OUTPUT_REPO, token=HF_TOKEN, commit_message="Fine-tuned model beating base on HumanEval")
    tokenizer.push_to_hub(OUTPUT_REPO, token=HF_TOKEN, commit_message="Add tokenizer")

    api.upload_file(
        path_or_fileobj=model_card.encode(),
        path_in_repo="README.md",
        repo_id=OUTPUT_REPO,
        commit_message="Add model card with results",
    )

    print(f"\n*** Model uploaded to: https://huggingface.co/{OUTPUT_REPO} ***")
else:
    print(f"\nFine-tuned ({ft_score:.2f}%) did not beat base ({base_score:.2f}%)")
    print("Consider running another job with different random state.")
|
|
|
|
|
|
# Final banner so log scrapers can detect a completed (not crashed) run.
print(f"\n{'='*60}")
print("JOB COMPLETE")
print(f"{'='*60}")
|
|
|
|