# training-scripts / train_humaneval_clean.py
# Author: passagereptile455 — commit 618bb37 (verified):
# "Fix push_to_hub to pass token explicitly"
# /// script
# dependencies = [
# "trl>=0.15.0",
# "peft>=0.14.0",
# "transformers>=4.51.0",
# "accelerate>=0.30.0",
# "datasets",
# "torch",
# "huggingface_hub",
# "human_eval",
# ]
# ///
"""
Fine-tune Qwen3-0.6B on codeforces-cots (Python subset) to beat base on HumanEval.
Reproduction of Ben Burtenshaw's Claude Code vs Codex challenge.
"""
import os
import sys
import time
import tempfile
import json
# === PHASE 0: Authentication ===
print("=" * 60)
print("PHASE 0: Authentication")
print("=" * 60)
from huggingface_hub import HfApi

# Fail fast here, before hours of training, if no token is available.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("HF_TOKEN environment variable required")
# Removed login() - using HfApi(token=) instead
api = HfApi(token=HF_TOKEN)
user_info = api.whoami()  # also validates that the token works
print(f"Authenticated as: {user_info['name']}")

# --- Run configuration ---
MODEL_NAME = "Qwen/Qwen3-0.6B"            # base model to fine-tune
DATASET_NAME = "open-r1/codeforces-cots"  # SFT training dataset
DATASET_SUBSET = "solutions_py"           # Python-solutions subset
OUTPUT_REPO = f"{user_info['name']}/qwen3-humaneval-sft"  # upload target repo
NUM_EXAMPLES = 500  # number of rows taken from the (streaming) dataset
MAX_STEPS = 150     # optimizer steps for SFT
print(f"Model: {MODEL_NAME}")
print(f"Dataset: {DATASET_NAME} ({DATASET_SUBSET} subset)")
print(f"Output: {OUTPUT_REPO}")
# === PHASE 1: Load Base Model and Run Benchmark ===
print("\n" + "=" * 60)
print("PHASE 1: Benchmark Base Model on HumanEval")
print("=" * 60)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

print("Loading base model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,  # half precision keeps the 0.6B model small on GPU
    device_map="auto",          # let accelerate choose device placement
    trust_remote_code=True,
)
print(f"Model loaded on {base_model.device}")
def _extract_code(response):
    """Pull the code portion out of a model response string.

    Prefers a ```python fenced block, then any ``` fenced block, and
    finally falls back to the whole (stripped) response.
    """
    if "```python" in response:
        return response.split("```python")[1].split("```")[0].strip()
    if "```" in response:
        return response.split("```")[1].split("```")[0].strip()
    return response.strip()


def run_humaneval_benchmark(model, tokenizer, label="model"):
    """Run the HumanEval pass@1 benchmark on `model`.

    Generates one greedy completion per problem, scores them with the
    human_eval harness, and returns (score_percent, num_passed, num_problems).
    """
    from human_eval.data import read_problems
    from human_eval.evaluation import evaluate_functional_correctness as check_correctness

    problems = read_problems()
    print(f"Testing {label} on {len(problems)} HumanEval problems...")
    samples = []
    model.eval()
    for i, (task_id, problem) in enumerate(problems.items()):
        prompt = problem["prompt"]
        messages = [{"role": "user", "content": f"Complete this Python function:\n\n{prompt}"}]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,  # Qwen3: suppress <think> traces in output
        )
        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=False,  # greedy decoding -> deterministic scores
                pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
            )
        # Decode only the newly generated tokens, not the echoed prompt.
        response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
        # HumanEval scores prompt + generated body as one completion.
        completion = prompt + _extract_code(response)
        samples.append({"task_id": task_id, "completion": completion})
        if (i + 1) % 20 == 0:
            print(f" Progress: {i + 1}/{len(problems)}")

    # Write samples as JSONL, the format the harness expects.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
        for s in samples:
            f.write(json.dumps(s) + "\n")
        samples_file = f.name
    try:
        results = check_correctness(samples_file, k=[1], timeout=10.0)
    finally:
        os.unlink(samples_file)  # don't leak the temp file if evaluation raises

    score = results["pass@1"] * 100
    # round() instead of int(): int() truncates, so float error in
    # pass@1 * n (e.g. 48.9999...) would under-count passed problems by one.
    passed = round(results["pass@1"] * len(problems))
    print(f"{label} score: {score:.2f}% ({passed}/{len(problems)} passed)")
    return score, passed, len(problems)
# Benchmark the untouched base model first to establish the score to beat.
base_score, base_passed, total = run_humaneval_benchmark(base_model, tokenizer, "BASE")
# Release the base model before reloading a fresh copy for training.
del base_model
torch.cuda.empty_cache()
print(f"\nBase model score: {base_score:.2f}%")
# === PHASE 2: Train on codeforces-cots (Python subset) ===
print("\n" + "=" * 60)
print("PHASE 2: Fine-tune on codeforces-cots (solutions_py)")
print("=" * 60)
from datasets import load_dataset, Dataset
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig

print("Reloading model for training...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

print(f"Loading {DATASET_NAME} ({DATASET_SUBSET} subset)...")
# Streaming avoids downloading the full dataset; only the first
# NUM_EXAMPLES rows are consumed below.
ds = load_dataset(DATASET_NAME, DATASET_SUBSET, split="train", streaming=True)
examples = []
print(f"Preparing {NUM_EXAMPLES} training examples...")
for i, ex in enumerate(ds):
    if i >= NUM_EXAMPLES:
        break
    # Render the chat messages into one training string with the model's
    # own chat template, so train and inference formats match.
    text = tokenizer.apply_chat_template(ex["messages"], tokenize=False)
    examples.append({"text": text})
    if (i + 1) % 100 == 0:
        print(f" Prepared {i + 1}/{NUM_EXAMPLES} examples")
train_dataset = Dataset.from_list(examples)
print(f"Training dataset ready: {len(train_dataset)} examples")
# LoRA adapter: low-rank updates on the attention projections only.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)
sft_config = SFTConfig(
    output_dir="./sft_output",
    max_steps=MAX_STEPS,
    learning_rate=5e-6,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size of 8
    fp16=True,
    gradient_checkpointing=True,    # trade recompute for activation memory
    logging_steps=10,
    save_steps=50,
    max_length=2048,
    dataset_text_field="text",      # the column built in the prep loop above
)
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_dataset,
    peft_config=lora_config,
    processing_class=tokenizer,
)
print(f"Starting training for {MAX_STEPS} steps...")
start_time = time.time()
trainer.train()
train_time = time.time() - start_time
print(f"Training completed in {train_time/60:.1f} minutes")
print("Merging LoRA weights...")
# Fold the LoRA deltas back into the base weights so the result is a
# plain standalone model — simpler to benchmark and to upload.
model = trainer.model.merge_and_unload()
# === PHASE 3: Benchmark Fine-tuned Model ===
print("\n" + "=" * 60)
print("PHASE 3: Benchmark Fine-tuned Model")
print("=" * 60)
# Same harness and decoding settings as the base run, so scores are comparable.
ft_score, ft_passed, _ = run_humaneval_benchmark(model, tokenizer, "FINE-TUNED")
# === PHASE 4: Compare and Upload ===
print("\n" + "=" * 60)
print("PHASE 4: Results and Upload")
print("=" * 60)
# improvement is in percentage points; improved_problems in problem counts.
improvement = ft_score - base_score
improved_problems = ft_passed - base_passed
print(f"\n{'='*40}")
print("RESULTS SUMMARY")
print(f"{'='*40}")
print(f"Base model: {base_score:.2f}% ({base_passed}/{total})")
print(f"Fine-tuned model: {ft_score:.2f}% ({ft_passed}/{total})")
print(f"Improvement: {improvement:+.2f}% ({improved_problems:+d} problems)")
print(f"{'='*40}")
# Upload only when the fine-tuned model strictly beats the base score.
if ft_score > base_score:
    print("\n*** SUCCESS: Fine-tuned beats base! ***")
    print(f"Uploading to {OUTPUT_REPO}...")
    # Model-card README (YAML front matter + results table). Kept at
    # column 0 inside the f-string so the rendered markdown is unindented.
    model_card = f"""---
tags:
- fine-tuned
- qwen3
- humaneval
- codeforces
- lora
base_model: {MODEL_NAME}
datasets:
- {DATASET_NAME}
---
# Qwen3-0.6B Fine-tuned on Codeforces-CoTS (Python)
Fine-tuned using SFT on the **solutions_py** subset of `{DATASET_NAME}`.
## Results on HumanEval
| Model | Score | Problems Passed |
|-------|-------|-----------------|
| Base (Qwen3-0.6B) | {base_score:.2f}% | {base_passed}/{total} |
| **Fine-tuned** | **{ft_score:.2f}%** | **{ft_passed}/{total}** |
| **Improvement** | **{improvement:+.2f}%** | **{improved_problems:+d} problems** |
## Training Details
- **Dataset**: {DATASET_NAME} ({DATASET_SUBSET} subset) - {NUM_EXAMPLES} examples
- **Method**: LoRA (r=8, alpha=16)
- **Steps**: {MAX_STEPS}
- **Learning Rate**: 5e-6
## Usage
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("{OUTPUT_REPO}")
tokenizer = AutoTokenizer.from_pretrained("{OUTPUT_REPO}")
```
"""
    # Token passed explicitly: this job does not rely on a cached login.
    model.push_to_hub(OUTPUT_REPO, token=HF_TOKEN, commit_message="Fine-tuned model beating base on HumanEval")
    tokenizer.push_to_hub(OUTPUT_REPO, token=HF_TOKEN, commit_message="Add tokenizer")
    api.upload_file(
        path_or_fileobj=model_card.encode(),
        path_in_repo="README.md",
        repo_id=OUTPUT_REPO,
        commit_message="Add model card with results",
    )
    print(f"\n*** Model uploaded to: https://huggingface.co/{OUTPUT_REPO} ***")
else:
    print(f"\nFine-tuned ({ft_score:.2f}%) did not beat base ({base_score:.2f}%)")
    print("Consider running another job with different random state.")

print(f"\n{'='*60}")
print("JOB COMPLETE")
print(f"{'='*60}")