"""
Fine-tune Qwen2.5-0.5B to solve competitive programming problems
with chain-of-thought reasoning using the codeforces-cots dataset.
"""

import os

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
import torch

# Configuration
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
DATASET_NAME = "open-r1/codeforces-cots"
OUTPUT_DIR = "./qwen-codeforces-coder"
HF_REPO = "mgbam/qwen-codeforces-coder"

# Upper bound on the number of raw examples used (train + eval together).
MAX_EXAMPLES = 1000
# Token budget per example; longer sequences are truncated by the tokenizer.
MAX_SEQ_LENGTH = 2048

SYSTEM_PROMPT = (
    "You are a competitive programming expert. "
    "Solve problems with clear chain-of-thought reasoning."
)

# Column names probed, in order, for the problem statement and the answer.
# NOTE(review): the original script only looked at 'problem'/'text' and
# 'solution'/'output'. 'prompt' and 'generation' were added as further
# fallbacks because the dataset may expose its text under those names --
# confirm against the dataset card before relying on them.
PROBLEM_FIELDS = ("problem", "text", "prompt")
SOLUTION_FIELDS = ("solution", "output", "generation")

print(f"🚀 Starting fine-tuning: {MODEL_NAME}")
print(f"📊 Dataset: {DATASET_NAME}")
print(f"💾 Output: {HF_REPO}")
print()

# Load tokenizer and model
print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

# Add padding token if not present
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id

# Load and prepare dataset
# NOTE(review): if the dataset defines several configurations, load_dataset
# may require an explicit config name here -- confirm against the dataset card.
print(f"Loading dataset: {DATASET_NAME}...")
dataset = load_dataset(DATASET_NAME, split="train")

# Take a subset for faster training (you can increase this)
dataset = dataset.select(range(min(MAX_EXAMPLES, len(dataset))))


def _first_text(example, keys):
    """Return the first non-empty string stored under any of *keys*.

    Missing keys, ``None`` values, non-string values and whitespace-only
    strings are all skipped. Returns ``''`` when nothing usable is found.
    """
    for key in keys:
        value = example.get(key)
        if isinstance(value, str) and value.strip():
            return value
    return ""


def _has_problem_and_solution(example):
    """Tell whether *example* carries both a problem and a solution text."""
    return bool(
        _first_text(example, PROBLEM_FIELDS)
        and _first_text(example, SOLUTION_FIELDS)
    )


# Drop rows that would otherwise become empty conversations. Without this,
# a column-name mismatch silently trains (and publishes) a model on blanks.
usable = dataset.filter(_has_problem_and_solution)
dropped = len(dataset) - len(usable)
if dropped:
    print(f"Skipped {dropped} examples with no problem or solution text")
if len(usable) == 0:
    raise ValueError(
        f"No usable examples in {DATASET_NAME}: none of the columns "
        f"{PROBLEM_FIELDS} / {SOLUTION_FIELDS} hold text. "
        f"Available columns: {dataset.column_names}"
    )
dataset = usable
print(f"Training on {len(dataset)} examples")

# Split into train/eval
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]


def format_prompt(example):
    """Format one dataset row as a ChatML conversation.

    Args:
        example: Mapping of column name to value for a single row.

    Returns:
        A dict with a single ``"text"`` key holding the system, user and
        assistant turns joined into one string. Fields that cannot be found
        are rendered as empty strings.
    """
    problem = _first_text(example, PROBLEM_FIELDS)
    solution = _first_text(example, SOLUTION_FIELDS)

    # Each turn is "<|im_start|>role\ncontent<|im_end|>", one turn per line.
    prompt = (
        f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
        f"<|im_start|>user\n{problem}<|im_end|>\n"
        f"<|im_start|>assistant\n{solution}<|im_end|>"
    )
    return {"text": prompt}


# Format datasets
print("Formatting dataset...")
train_dataset = train_dataset.map(
    format_prompt, remove_columns=train_dataset.column_names
)
eval_dataset = eval_dataset.map(
    format_prompt, remove_columns=eval_dataset.column_names
)


def tokenize_function(examples):
    """Tokenize a batch of formatted conversations.

    Args:
        examples: Batch dict whose ``"text"`` entry is a list of strings.

    Returns:
        The tokenizer output for the batch, truncated to ``MAX_SEQ_LENGTH``
        tokens per example and left unpadded.
    """
    # No padding here: the data collator pads each batch to its own longest
    # sequence, so short examples no longer cost MAX_SEQ_LENGTH tokens each.
    # NOTE(review): truncation cuts from the end of the text, so any example
    # longer than MAX_SEQ_LENGTH loses its closing <|im_end|> marker.
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
    )


print("Tokenizing...")
train_dataset = train_dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
eval_dataset = eval_dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)

# Set format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Training arguments
# NOTE(review): with at most 900 training rows, a per-device batch of 4 and
# 4 accumulation steps, a single-device run has roughly 170 optimizer steps,
# so warmup_steps=100 covers more than half of training -- confirm intended.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    warmup_steps=100,
    logging_steps=10,
    eval_steps=50,
    save_steps=100,
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=False,
    bf16=True,
    push_to_hub=True,
    hub_model_id=HF_REPO,
    hub_strategy="every_save",
    report_to=["tensorboard"],
    logging_first_step=True,
)

# Data collator: causal-LM mode (mlm=False); pads each batch dynamically.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Initialize trainer
print("Initializing trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Train!
# Horizontal rule reused to frame each stage announcement on the console.
rule = "=" * 50

# Stage 1: run the training loop.
print(f"\n{rule}")
print("🔥 Starting training!")
print(f"{rule}\n")
trainer.train()

# Stage 2: write the model and the tokenizer into OUTPUT_DIR.
print(f"\n{rule}")
print("💾 Saving final model...")
print(f"{rule}\n")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Stage 3: upload to the Hub repository named by HF_REPO.
print(f"📤 Pushing to Hub: {HF_REPO}")
trainer.push_to_hub()

# Closing summary with the model's Hub URL.
print(f"\n{rule}")
print("✅ Training complete!")
print(f"🎯 Model available at: https://huggingface.co/{HF_REPO}")
print(rule)