# File size: 8,793 Bytes

# 618bb37

# /// script
# dependencies = [
#     "trl>=0.15.0",
#     "peft>=0.14.0",
#     "transformers>=4.51.0",
#     "accelerate>=0.30.0",
#     "datasets",
#     "torch",
#     "huggingface_hub",
#     "human_eval",
# ]
# ///
"""
Fine-tune Qwen3-0.6B on codeforces-cots (Python subset) to beat the base model on HumanEval.

Reproduction of Ben Burtenshaw's Claude Code vs Codex challenge.
"""

import os
import sys
import time
import tempfile
import json

# === PHASE 0: Authentication ===
print("=" * 60, "PHASE 0: Authentication", "=" * 60, sep="\n")

from huggingface_hub import HfApi

# The Hub token is read from the environment; an unset or empty value aborts the run.
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN is None or HF_TOKEN == "":
    raise ValueError("HF_TOKEN environment variable required")

# A token-bound client is used for every Hub call instead of a global login().
api = HfApi(token=HF_TOKEN)
user_info = api.whoami()
print(f'Authenticated as: {user_info["name"]}')

# Run configuration. The output repository is created under the authenticated account.
MODEL_NAME = "Qwen/Qwen3-0.6B"
DATASET_NAME = "open-r1/codeforces-cots"
DATASET_SUBSET = "solutions_py"
NUM_EXAMPLES = 500
MAX_STEPS = 150
OUTPUT_REPO = f'{user_info["name"]}/qwen3-humaneval-sft'

print(
    f"Model: {MODEL_NAME}",
    f"Dataset: {DATASET_NAME} ({DATASET_SUBSET} subset)",
    f"Output: {OUTPUT_REPO}",
    sep="\n",
)


# === PHASE 1: Load Base Model and Run Benchmark ===
print("\n" + "=" * 60, "PHASE 1: Benchmark Base Model on HumanEval", "=" * 60, sep="\n")

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

print("Loading base model...")

# The tokenizer loaded here is shared by both benchmark runs and by training.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Half-precision weights, with device placement delegated to device_map="auto".
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16,
)
print(f"Model loaded on {base_model.device}")


def _extract_code(response):
    """Return the code portion of a model response, stripped of outer whitespace.

    The first ```python fenced block wins; otherwise the first generic fenced
    block; otherwise the whole response is treated as code.
    """
    for fence in ("```python", "```"):
        if fence in response:
            return response.split(fence)[1].split("```")[0].strip()
    return response.strip()


def run_humaneval_benchmark(model, tokenizer, label="model"):
    """Run the HumanEval benchmark on ``model`` and report pass@1.

    Each problem prompt is wrapped in a single user chat message, rendered with
    the tokenizer's chat template (``enable_thinking=False``) and completed with
    sampling disabled and at most 512 new tokens. The extracted code is written
    to a temporary JSONL file and scored by ``human_eval``.

    Args:
        model: Causal LM exposing ``eval()``, ``generate()`` and ``device``.
        tokenizer: Tokenizer matching ``model``; must provide a chat template.
        label: Name used in progress and result messages.

    Returns:
        Tuple ``(score, passed, total)``: pass@1 as a percentage, the number of
        problems passed, and the number of problems evaluated.
    """
    from human_eval.data import read_problems
    from human_eval.evaluation import evaluate_functional_correctness as check_correctness

    problems = read_problems()
    num_problems = len(problems)
    print(f"Testing {label} on {num_problems} HumanEval problems...")

    # Fall back to EOS only when no pad token is configured; a pad id of 0 is
    # valid and must not be discarded by a truthiness test.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    samples = []
    model.eval()

    for i, (task_id, problem) in enumerate(problems.items()):
        prompt = problem["prompt"]

        messages = [{"role": "user", "content": f"Complete this Python function:\n\n{prompt}"}]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )

        inputs = tokenizer(text, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=False,
                pad_token_id=pad_token_id,
            )

        # Decode only the newly generated tokens, not the echoed prompt.
        prompt_length = inputs["input_ids"].shape[1]
        response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        # NOTE(review): the stored completion is the prompt followed by the
        # extracted code; confirm this matches how the evaluator assembles
        # the program it executes.
        completion = prompt + _extract_code(response)
        samples.append({"task_id": task_id, "completion": completion})

        if (i + 1) % 20 == 0:
            print(f"  Progress: {i + 1}/{num_problems}")

    # delete=False because the evaluator reopens the file by path; cleanup is
    # therefore done explicitly below, even when evaluation fails.
    sample_handle = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False)
    samples_file = sample_handle.name
    try:
        with sample_handle:
            sample_handle.writelines(json.dumps(s) + "\n" for s in samples)
        results = check_correctness(samples_file, k=[1], timeout=10.0)
    finally:
        # The evaluator is expected to write "<samples_file>_results.jsonl"
        # next to its input; remove both files if they exist.
        for leftover in (samples_file, samples_file + "_results.jsonl"):
            try:
                os.unlink(leftover)
            except FileNotFoundError:
                pass

    pass_rate = float(results["pass@1"])
    score = pass_rate * 100
    # Round instead of truncating: a float such as 28.999999999999996 would
    # otherwise be reported as one problem fewer than actually passed.
    passed = round(pass_rate * num_problems)
    print(f"{label} score: {score:.2f}% ({passed}/{num_problems} passed)")
    return score, passed, num_problems


# Score the untouched checkpoint first so there is a baseline to compare against.
base_result = run_humaneval_benchmark(base_model, tokenizer, "BASE")
base_score, base_passed, total = base_result

# Drop the module-level reference to the base model and release cached GPU
# memory before the model is loaded again for training.
del base_model
torch.cuda.empty_cache()
print(f"\nBase model score: {base_score:.2f}%")


# === PHASE 2: Train on codeforces-cots (Python subset) ===
print("\n" + "=" * 60, "PHASE 2: Fine-tune on codeforces-cots (solutions_py)", "=" * 60, sep="\n")

from datasets import load_dataset, Dataset
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig

# A fresh copy of the checkpoint is loaded for training.
print("Reloading model for training...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16,
)

# The split is streamed and only the first NUM_EXAMPLES records are consumed.
print(f"Loading {DATASET_NAME} ({DATASET_SUBSET} subset)...")
ds = load_dataset(DATASET_NAME, DATASET_SUBSET, split="train", streaming=True)

examples = []
print(f"Preparing {NUM_EXAMPLES} training examples...")
for seen, record in enumerate(ds, start=1):
    if seen > NUM_EXAMPLES:
        break
    # Render each conversation to a single string with the model's chat template.
    rendered = tokenizer.apply_chat_template(record["messages"], tokenize=False)
    examples.append({"text": rendered})
    if seen % 100 == 0:
        print(f"  Prepared {seen}/{NUM_EXAMPLES} examples")

train_dataset = Dataset.from_list(examples)
print(f"Training dataset ready: {len(train_dataset)} examples")

# LoRA adapters on the attention projections only.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Effective batch size per device is 2 * 4 = 8 sequences per optimizer step.
sft_config = SFTConfig(
    output_dir="./sft_output",
    dataset_text_field="text",
    max_length=2048,
    max_steps=MAX_STEPS,
    learning_rate=5e-6,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True,
    logging_steps=10,
    save_steps=50,
)

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    peft_config=lora_config,
    args=sft_config,
)

print(f"Starting training for {MAX_STEPS} steps...")
start_time = time.time()
trainer.train()
train_time = time.time() - start_time
print(f"Training completed in {train_time/60:.1f} minutes")

# Fold the adapter weights into the base weights so a standalone model is left.
print("Merging LoRA weights...")
model = trainer.model.merge_and_unload()


# === PHASE 3: Benchmark Fine-tuned Model ===
print("\n" + "=" * 60, "PHASE 3: Benchmark Fine-tuned Model", "=" * 60, sep="\n")

# Same harness as the baseline run; the returned problem count is not needed
# here, so only the first two elements are kept.
ft_score, ft_passed = run_humaneval_benchmark(model, tokenizer, "FINE-TUNED")[:2]


# === PHASE 4: Compare and Upload ===
print("\n" + "=" * 60)
print("PHASE 4: Results and Upload")
print("=" * 60)

# Differences are fine-tuned minus base: percentage points and problem count.
improvement = ft_score - base_score
improved_problems = ft_passed - base_passed

print(f"\n{'='*40}")
print("RESULTS SUMMARY")
print(f"{'='*40}")
print(f"Base model:       {base_score:.2f}% ({base_passed}/{total})")
print(f"Fine-tuned model: {ft_score:.2f}% ({ft_passed}/{total})")
print(f"Improvement:      {improvement:+.2f}% ({improved_problems:+d} problems)")
print(f"{'='*40}")

# Only a strictly better fine-tuned score is published; ties are not uploaded.
if ft_score > base_score:
    print("\n*** SUCCESS: Fine-tuned beats base! ***")
    print(f"Uploading to {OUTPUT_REPO}...")

    # Model card: YAML front matter followed by Markdown. The table header,
    # delimiter row and data rows must sit on consecutive lines, because a
    # blank line between them stops the table from rendering.
    model_card = f"""---
tags:
- fine-tuned
- qwen3
- humaneval
- codeforces
- lora
base_model: {MODEL_NAME}
datasets:
- {DATASET_NAME}
---

# Qwen3-0.6B Fine-tuned on Codeforces-CoTS (Python)

Fine-tuned using SFT on the **{DATASET_SUBSET}** subset of `{DATASET_NAME}`.

## Results on HumanEval

| Model | Score | Problems Passed |
|-------|-------|-----------------|
| Base (Qwen3-0.6B) | {base_score:.2f}% | {base_passed}/{total} |
| **Fine-tuned** | **{ft_score:.2f}%** | **{ft_passed}/{total}** |
| **Improvement** | **{improvement:+.2f}%** | **{improved_problems:+d} problems** |

## Training Details

- **Dataset**: {DATASET_NAME} ({DATASET_SUBSET} subset) - {NUM_EXAMPLES} examples
- **Method**: LoRA (r=8, alpha=16)
- **Steps**: {MAX_STEPS}
- **Learning Rate**: 5e-6

## Usage

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("{OUTPUT_REPO}")
tokenizer = AutoTokenizer.from_pretrained("{OUTPUT_REPO}")
```
"""

    model.push_to_hub(OUTPUT_REPO, token=HF_TOKEN, commit_message="Fine-tuned model beating base on HumanEval")
    tokenizer.push_to_hub(OUTPUT_REPO, token=HF_TOKEN, commit_message="Add tokenizer")

    # The README is uploaded last, after the model and tokenizer pushes.
    api.upload_file(
        path_or_fileobj=model_card.encode(),
        path_in_repo="README.md",
        repo_id=OUTPUT_REPO,
        commit_message="Add model card with results",
    )

    print(f"\n*** Model uploaded to: https://huggingface.co/{OUTPUT_REPO} ***")
else:
    print(f"\nFine-tuned ({ft_score:.2f}%) did not beat base ({base_score:.2f}%)")
    print("Consider running another job with different random state.")

print(f"\n{'='*60}")
print("JOB COMPLETE")
print(f"{'='*60}")