import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model


def main():
    # Configuration
    model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # Using 0.5B; Qwen2.5 has no 0.6B variant
    output_dir = "./qwen-codeforces-cots"
    max_seq_length = 1024  # Reduced from 2048 to save memory

    # Detect device - prefer CUDA for GPU training
    if torch.cuda.is_available():
        device = "cuda"
        use_fp16 = True
        print(f"Using device: CUDA ({torch.cuda.get_device_name(0)})")
    else:
        device = "cpu"
        use_fp16 = False
        print("Using device: CPU (training will be slow)")

    print("Loading dataset...")
    dataset = load_dataset("open-r1/codeforces-cots", split="train")

    # Split into train and eval
    dataset = dataset.train_test_split(test_size=0.05, seed=42)
    train_dataset = dataset["train"]
    eval_dataset = dataset["test"]
    print(f"Train samples: {len(train_dataset)}")
    print(f"Eval samples: {len(eval_dataset)}")

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
    )
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    print("Loading model...")
    # Use appropriate dtype and device_map based on hardware
    if torch.cuda.is_available():
        from transformers import BitsAndBytesConfig

        # Use 4-bit quantization for efficient GPU training
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True,
        )
        from peft import prepare_model_for_kbit_training

        # Casts norm layers to fp32 and enables input grads so gradient
        # checkpointing works with the frozen quantized base model
        model = prepare_model_for_kbit_training(model)
        # Enable gradient checkpointing for memory efficiency
        model.gradient_checkpointing_enable()
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            trust_remote_code=True,
        )
        model.gradient_checkpointing_enable()
        # With all base weights frozen under LoRA, inputs must require
        # grads for gradient checkpointing to backpropagate
        model.enable_input_require_grads()

    # LoRA config - reduced rank for memory efficiency
    lora_config = LoraConfig(
        r=8,  # Reduced from 16 to save memory
        lora_alpha=16,  # Reduced proportionally
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Apply LoRA
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # Format and tokenize dataset
    def format_and_tokenize(example):
        # Format the chat messages
        text = tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
            add_generation_prompt=False,
        )
        # Tokenize. Labels are NOT added here: DataCollatorForLanguageModeling
        # (mlm=False) builds them from the padded input_ids, and tokenizer.pad
        # cannot pad a ragged "labels" column, so pre-adding labels would
        # crash the collator on variable-length batches.
        tokenized = tokenizer(
            text,
            truncation=True,
            max_length=max_seq_length,
            padding=False,
            return_tensors=None,
        )
        return tokenized

    print("Formatting and tokenizing dataset...")
    train_dataset = train_dataset.map(
        format_and_tokenize,
        remove_columns=train_dataset.column_names,
        desc="Formatting train dataset",
    )
    eval_dataset = eval_dataset.map(
        format_and_tokenize,
        remove_columns=eval_dataset.column_names,
        desc="Formatting eval dataset",
    )

    # Data collator: pads each batch and, with mlm=False, copies input_ids
    # to labels while masking padding positions with -100
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,  # We're doing causal LM, not masked LM
    )

    # Training arguments - optimized for T4 GPU
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=1,  # Keep at 1 for memory safety
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=8,  # Reduced from 16 to lower memory pressure
        num_train_epochs=1,
        max_steps=1000,  # Limit steps for testing; overrides num_train_epochs
        learning_rate=2e-4,
        fp16=use_fp16,
        gradient_checkpointing=True,  # Enable gradient checkpointing to save memory
        save_strategy="steps",
        save_steps=200,  # Save more frequently
        eval_strategy="steps",
        eval_steps=200,
        logging_steps=10,
        warmup_steps=50,
        lr_scheduler_type="cosine",
        optim="paged_adamw_8bit" if torch.cuda.is_available() else "adamw_torch",  # 8-bit optimizer on GPU
        report_to="none",
        max_grad_norm=0.3,
        save_total_limit=2,
        load_best_model_at_end=False,  # Disable to avoid loading issues
        dataloader_num_workers=0,  # No multiprocessing for stability
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )

    print("Starting training...")
    trainer.train()

    print("Saving model...")
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    print("Training complete!")
    print(f"Model saved to: {output_dir}")


if __name__ == "__main__":
    main()
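

# --- Optional inference check (a minimal sketch, not part of the training
# run above). Assumes peft's AutoPeftModelForCausalLM, which reads the
# adapter_config.json saved by trainer.save_model() and loads the base
# model plus the LoRA adapter in one call; the prompt below is a made-up
# example. Nothing here executes unless you call the function yourself.
def run_inference_example(adapter_dir="./qwen-codeforces-cots"):
    from peft import AutoPeftModelForCausalLM

    model = AutoPeftModelForCausalLM.from_pretrained(adapter_dir)
    tokenizer = AutoTokenizer.from_pretrained(adapter_dir)

    # Hypothetical prompt in the same chat format used during training
    messages = [
        {"role": "user", "content": "Given two integers a and b, write a Python program that prints a + b."}
    ]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=256)

    # Decode only the newly generated tokens, skipping the prompt
    new_tokens = output[0][inputs["input_ids"].shape[1]:]
    print(tokenizer.decode(new_tokens, skip_special_tokens=True))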