# /// script
# dependencies = [
#     "trl>=0.15.0",
#     "peft>=0.14.0",
#     "transformers>=4.51.0",
#     "accelerate>=0.30.0",
#     "datasets",
#     "torch",
#     "huggingface_hub",
#     "human_eval",
# ]
# ///
"""
Fine-tune Qwen3-0.6B on codeforces-cots (Python subset) to beat base on HumanEval.

Reproduction of Ben Burtenshaw's Claude Code vs Codex challenge.

The script runs in phases: authenticate against the Hugging Face Hub,
benchmark the base model on HumanEval, fine-tune with LoRA/SFT, benchmark
the fine-tuned model, and upload it if it beats the base score.
"""

import os
import sys
import time
import tempfile
import json

# === PHASE 0: Authentication ===
print("=" * 60)
print("PHASE 0: Authentication")
print("=" * 60)

from huggingface_hub import HfApi

HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("HF_TOKEN environment variable required")

# Removed login() - using HfApi(token=) instead
api = HfApi(token=HF_TOKEN)
user_info = api.whoami()
print(f"Authenticated as: {user_info['name']}")

MODEL_NAME = "Qwen/Qwen3-0.6B"
DATASET_NAME = "open-r1/codeforces-cots"
DATASET_SUBSET = "solutions_py"
OUTPUT_REPO = f"{user_info['name']}/qwen3-humaneval-sft"
NUM_EXAMPLES = 500
MAX_STEPS = 150

print(f"Model: {MODEL_NAME}")
print(f"Dataset: {DATASET_NAME} ({DATASET_SUBSET} subset)")
print(f"Output: {OUTPUT_REPO}")

# === PHASE 1: Load Base Model and Run Benchmark ===
print("\n" + "=" * 60)
print("PHASE 1: Benchmark Base Model on HumanEval")
print("=" * 60)

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

print("Loading base model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
print(f"Model loaded on {base_model.device}")


def _extract_code(response):
    """Return the code portion of a chat response.

    Prefers the first ```python fenced block, then the first generic fenced
    block, and otherwise falls back to the whole response. The result is
    stripped of surrounding whitespace.
    """
    if "```python" in response:
        return response.split("```python")[1].split("```")[0].strip()
    if "```" in response:
        return response.split("```")[1].strip()
    return response.strip()


def run_humaneval_benchmark(model, tokenizer, label="model"):
    """Run the HumanEval benchmark on a model with greedy decoding.

    Args:
        model: Causal LM exposing ``eval()``, ``generate()`` and ``device``.
        tokenizer: Tokenizer matching ``model``; must support
            ``apply_chat_template`` with ``enable_thinking``.
        label: Name used in progress and result messages.

    Returns:
        A ``(score, passed, total)`` tuple: pass@1 as a percentage, the
        number of problems passed, and the number of problems evaluated.
    """
    from human_eval.data import read_problems
    from human_eval.evaluation import evaluate_functional_correctness as check_correctness

    problems = read_problems()
    print(f"Testing {label} on {len(problems)} HumanEval problems...")

    # Resolve the pad id once. An explicit None check is required because a
    # pad id of 0 is valid and would be discarded by `or`.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    samples = []
    model.eval()
    for i, (task_id, problem) in enumerate(problems.items()):
        prompt = problem["prompt"]
        messages = [{"role": "user", "content": f"Complete this Python function:\n\n{prompt}"}]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )
        inputs = tokenizer(text, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=False,
                pad_token_id=pad_token_id,
            )

        # Decode only the newly generated tokens, not the echoed prompt.
        prompt_length = inputs["input_ids"].shape[1]
        response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        # NOTE(review): the completion deliberately repeats the prompt before
        # the extracted code; this presumably relies on the model returning a
        # full function definition - confirm against the harness.
        completion = prompt + _extract_code(response)
        samples.append({"task_id": task_id, "completion": completion})

        if (i + 1) % 20 == 0:
            print(f" Progress: {i + 1}/{len(problems)}")

    tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False)
    samples_file = tmp.name
    try:
        with tmp:
            tmp.writelines(json.dumps(s) + "\n" for s in samples)
        results = check_correctness(samples_file, k=[1], timeout=10.0)
    finally:
        # Remove the samples file even if evaluation fails. The harness is
        # expected to write a "<samples>_results.jsonl" sidecar; remove it
        # too when present so repeated runs do not litter the temp dir.
        for path in (samples_file, samples_file + "_results.jsonl"):
            if os.path.exists(path):
                os.unlink(path)

    pass_rate = float(results["pass@1"])
    score = pass_rate * 100
    # Round instead of truncating: int(score * n / 100) can lose a problem
    # to floating-point error (e.g. 36.999... -> 36).
    passed = round(pass_rate * len(problems))
    print(f"{label} score: {score:.2f}% ({passed}/{len(problems)} passed)")
    return score, passed, len(problems)


base_score, base_passed, total = run_humaneval_benchmark(base_model, tokenizer, "BASE")

# Free the base model's GPU memory before loading a fresh copy for training.
del base_model
torch.cuda.empty_cache()

print(f"\nBase model score: {base_score:.2f}%")

# === PHASE 2: Train on codeforces-cots (Python subset) ===
print("\n" + "=" * 60)
print("PHASE 2: Fine-tune on codeforces-cots (solutions_py)")
print("=" * 60)

from datasets import load_dataset, Dataset
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig

print("Reloading model for training...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
# Stream the dataset so only the first NUM_EXAMPLES rows are materialised.
print(f"Loading {DATASET_NAME} ({DATASET_SUBSET} subset)...")
ds = load_dataset(DATASET_NAME, DATASET_SUBSET, split="train", streaming=True)

print(f"Preparing {NUM_EXAMPLES} training examples...")
examples = []
for count, ex in enumerate(ds, start=1):
    if count > NUM_EXAMPLES:
        break
    # Render each conversation to a single training string via the chat template.
    rendered = tokenizer.apply_chat_template(ex["messages"], tokenize=False)
    examples.append({"text": rendered})
    if count % 100 == 0:
        print(f" Prepared {count}/{NUM_EXAMPLES} examples")

train_dataset = Dataset.from_list(examples)
print(f"Training dataset ready: {len(train_dataset)} examples")

# LoRA adapters on the attention projections only.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

# Effective batch size is 2 * 4 = 8 sequences per optimizer step.
sft_config = SFTConfig(
    output_dir="./sft_output",
    max_steps=MAX_STEPS,
    learning_rate=5e-6,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    fp16=True,
    gradient_checkpointing=True,
    logging_steps=10,
    save_steps=50,
    max_length=2048,
    dataset_text_field="text",
)

trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_dataset,
    peft_config=lora_config,
    processing_class=tokenizer,
)

print(f"Starting training for {MAX_STEPS} steps...")
t0 = time.time()
trainer.train()
train_time = time.time() - t0
elapsed_minutes = train_time / 60
print(f"Training completed in {elapsed_minutes:.1f} minutes")

# Fold the adapter weights into the base weights so the result is a plain model.
print("Merging LoRA weights...")
model = trainer.model.merge_and_unload()

# === PHASE 3: Benchmark Fine-tuned Model ===
print(f"\n{'=' * 60}")
print("PHASE 3: Benchmark Fine-tuned Model")
print("=" * 60)

ft_score, ft_passed, _ = run_humaneval_benchmark(model, tokenizer, "FINE-TUNED")

# === PHASE 4: Compare and Upload ===
print(f"\n{'=' * 60}")
print("PHASE 4: Results and Upload")
print("=" * 60)

# Deltas relative to the base model, in percentage points and problem count.
improvement = ft_score - base_score
improved_problems = ft_passed - base_passed

summary_rule = "=" * 40
print(f"\n{summary_rule}")
print("RESULTS SUMMARY")
print(summary_rule)
print(f"Base model: {base_score:.2f}% ({base_passed}/{total})")
print(f"Fine-tuned model: {ft_score:.2f}% ({ft_passed}/{total})")
print(f"Improvement: {improvement:+.2f}% ({improved_problems:+d} problems)")
print(f"{'='*40}")

# Upload only when the fine-tuned model strictly beats the base score.
if ft_score > base_score:
    print("\n*** SUCCESS: Fine-tuned beats base! ***")
    print(f"Uploading to {OUTPUT_REPO}...")

    # The model card must keep its line structure: the YAML front matter,
    # the results table and the fenced code block are all line-oriented and
    # do not parse or render if collapsed onto one line.
    model_card = f"""---
tags:
- fine-tuned
- qwen3
- humaneval
- codeforces
- lora
base_model: {MODEL_NAME}
datasets:
- {DATASET_NAME}
---

# Qwen3-0.6B Fine-tuned on Codeforces-CoTS (Python)

Fine-tuned using SFT on the **solutions_py** subset of `{DATASET_NAME}`.

## Results on HumanEval

| Model | Score | Problems Passed |
|-------|-------|-----------------|
| Base (Qwen3-0.6B) | {base_score:.2f}% | {base_passed}/{total} |
| **Fine-tuned** | **{ft_score:.2f}%** | **{ft_passed}/{total}** |
| **Improvement** | **{improvement:+.2f}%** | **{improved_problems:+d} problems** |

## Training Details

- **Dataset**: {DATASET_NAME} ({DATASET_SUBSET} subset) - {NUM_EXAMPLES} examples
- **Method**: LoRA (r=8, alpha=16)
- **Steps**: {MAX_STEPS}
- **Learning Rate**: 5e-6

## Usage

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("{OUTPUT_REPO}")
tokenizer = AutoTokenizer.from_pretrained("{OUTPUT_REPO}")
```
"""

    # Push the merged weights and tokenizer first, then the README last.
    model.push_to_hub(
        OUTPUT_REPO,
        token=HF_TOKEN,
        commit_message="Fine-tuned model beating base on HumanEval",
    )
    tokenizer.push_to_hub(OUTPUT_REPO, token=HF_TOKEN, commit_message="Add tokenizer")
    api.upload_file(
        path_or_fileobj=model_card.encode(),
        path_in_repo="README.md",
        repo_id=OUTPUT_REPO,
        commit_message="Add model card with results",
    )
    print(f"\n*** Model uploaded to: https://huggingface.co/{OUTPUT_REPO} ***")
else:
    print(f"\nFine-tuned ({ft_score:.2f}%) did not beat base ({base_score:.2f}%)")
    print("Consider running another job with different random state.")

print(f"\n{'='*60}")
print("JOB COMPLETE")
print(f"{'='*60}")