"""
Fine-tune Qwen2.5-0.5B to solve competitive programming problems
with chain-of-thought reasoning using the codeforces-cots dataset.
"""
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)

# Configuration
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
DATASET_NAME = "open-r1/codeforces-cots"
OUTPUT_DIR = "./qwen-codeforces-coder"
HF_REPO = "mgbam/qwen-codeforces-coder"
print(f"πŸš€ Starting fine-tuning: {MODEL_NAME}")
print(f"πŸ“Š Dataset: {DATASET_NAME}")
print(f"πŸ’Ύ Output: {HF_REPO}")
print()
# Load tokenizer and model
print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
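# Note: torch_dtype=torch.bfloat16 (and bf16=True in the training arguments
# below) assumes hardware with native bfloat16 support, e.g. an Ampere-or-newer
# NVIDIA GPU; on older GPUs, switch both settings to fp16.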
# Add padding token if not present
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id
# Load and prepare dataset
print(f"Loading dataset: {DATASET_NAME}...")
dataset = load_dataset(DATASET_NAME, split="train")
# Take a subset for faster training (you can increase this)
dataset = dataset.select(range(min(1000, len(dataset))))
print(f"Training on {len(dataset)} examples")
# Split into train/eval
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]
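# Report the split sizes so runs are easy to compare.
print(f"Train examples: {len(train_dataset)}, eval examples: {len(eval_dataset)}")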
def format_prompt(example):
    """Format a dataset row into Qwen's ChatML instruction format."""
    # Field names vary across dataset configs: try 'problem'/'solution' first,
    # then 'prompt'/'generation' (columns that appear in codeforces-cots
    # configs), then 'text'/'output' as a last resort.
    problem = example.get("problem") or example.get("prompt") or example.get("text", "")
    solution = example.get("solution") or example.get("generation") or example.get("output", "")
    # Build the ChatML-style prompt Qwen2.5-Instruct was trained on; the string
    # body stays unindented so no stray whitespace leaks into the prompt.
    prompt = f"""<|im_start|>system
You are a competitive programming expert. Solve problems with clear chain-of-thought reasoning.<|im_end|>
<|im_start|>user
{problem}<|im_end|>
<|im_start|>assistant
{solution}<|im_end|>"""
    return {"text": prompt}

# Format datasets
print("Formatting dataset...")
train_dataset = train_dataset.map(format_prompt, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(format_prompt, remove_columns=eval_dataset.column_names)
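# Quick sanity check (optional): eyeball one formatted example to confirm the
# chat template rendered as expected before tokenizing.
print(train_dataset[0]["text"][:300])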
# Tokenize
def tokenize_function(examples):
    # Pad everything to a fixed 2048 tokens; solutions longer than this are
    # truncated, which can cut off long chain-of-thought traces.
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=2048,
        padding="max_length",
    )

print("Tokenizing...")
train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
eval_dataset = eval_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
# Set format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
# Training arguments
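# Effective batch size: 4 (per device) x 4 (gradient accumulation) = 16
# sequences per optimizer step.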
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    warmup_steps=100,
    logging_steps=10,
    eval_steps=50,
    save_steps=100,
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=False,
    bf16=True,
    push_to_hub=True,
    hub_model_id=HF_REPO,
    hub_strategy="every_save",
    report_to=["tensorboard"],
    logging_first_step=True,
)
# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
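# With mlm=False the collator copies input_ids into labels and replaces
# pad_token_id positions with -100 so padding is ignored by the loss. Since
# pad_token may have been set to eos_token above, genuine EOS tokens can be
# masked from the loss too, a known trade-off of reusing EOS as padding.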
# Initialize trainer
print("Initializing trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)
# Train!
print("\n" + "="*50)
print("πŸ”₯ Starting training!")
print("="*50 + "\n")
trainer.train()
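# Optional smoke test: a minimal sketch that greedily decodes one toy problem
# with the fine-tuned weights. The prompt below is illustrative, not taken from
# the dataset; model.device assumes the model ended up on a single device.
sample_prompt = (
    "<|im_start|>system\n"
    "You are a competitive programming expert. Solve problems with clear "
    "chain-of-thought reasoning.<|im_end|>\n"
    "<|im_start|>user\n"
    "Given an integer n, print the sum 1 + 2 + ... + n.<|im_end|>\n"
    "<|im_start|>assistant\n"
)
sample_inputs = tokenizer(sample_prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    sample_ids = model.generate(**sample_inputs, max_new_tokens=256, do_sample=False)
# Decode only the newly generated tokens, not the prompt.
print(tokenizer.decode(sample_ids[0][sample_inputs["input_ids"].shape[1]:], skip_special_tokens=True))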
# Save final model
print("\n" + "="*50)
print("πŸ’Ύ Saving final model...")
print("="*50 + "\n")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
# Push to hub
print(f"πŸ“€ Pushing to Hub: {HF_REPO}")
trainer.push_to_hub()
print("\n" + "="*50)
print("βœ… Training complete!")
print(f"🎯 Model available at: https://huggingface.co/{HF_REPO}")
print("="*50)