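"""Fine-tune Qwen2.5-0.5B-Instruct on open-r1/codeforces-cots with LoRA.

On CUDA the base model is loaded in 4-bit (NF4) via bitsandbytes and trained
in fp16 with a paged 8-bit optimizer; on CPU it falls back to full fp32.
Only the LoRA adapter weights are updated.
"""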
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model
def main():
    # Configuration
    model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # Using 0.5B as 0.6B doesn't exist
    output_dir = "./qwen-codeforces-cots"
    max_seq_length = 1024  # Reduced from 2048 to save memory

    # Detect device - prefer CUDA for GPU training
    if torch.cuda.is_available():
        device = "cuda"
        use_fp16 = True
        print(f"Using device: CUDA ({torch.cuda.get_device_name(0)})")
    else:
        device = "cpu"
        use_fp16 = False
        print("Using device: CPU (training will be slow)")
| print("Loading dataset...") | |
| dataset = load_dataset("open-r1/codeforces-cots", split="train") | |
| # Split into train and eval | |
| dataset = dataset.train_test_split(test_size=0.05, seed=42) | |
| train_dataset = dataset["train"] | |
| eval_dataset = dataset["test"] | |
| print(f"Train samples: {len(train_dataset)}") | |
| print(f"Eval samples: {len(eval_dataset)}") | |
| print("Loading tokenizer...") | |
| tokenizer = AutoTokenizer.from_pretrained( | |
| model_name, | |
| trust_remote_code=True, | |
| ) | |
| tokenizer.pad_token = tokenizer.eos_token | |
| tokenizer.padding_side = "right" | |
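    # Right padding is the safe choice for training batches; left padding
    # only matters when batching prompts for generation.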
| print("Loading model...") | |
| # Use appropriate dtype and device_map based on hardware | |
| if torch.cuda.is_available(): | |
| from transformers import BitsAndBytesConfig | |
| # Use 4-bit quantization for efficient GPU training | |
| bnb_config = BitsAndBytesConfig( | |
| load_in_4bit=True, | |
| bnb_4bit_quant_type="nf4", | |
| bnb_4bit_compute_dtype=torch.float16, | |
| bnb_4bit_use_double_quant=True, | |
| ) | |
| model = AutoModelForCausalLM.from_pretrained( | |
| model_name, | |
| quantization_config=bnb_config, | |
| device_map="auto", | |
| trust_remote_code=True, | |
| ) | |
| from peft import prepare_model_for_kbit_training | |
| model = prepare_model_for_kbit_training(model) | |
| # Enable gradient checkpointing for memory efficiency | |
| model.gradient_checkpointing_enable() | |
| else: | |
| model = AutoModelForCausalLM.from_pretrained( | |
| model_name, | |
| torch_dtype=torch.float32, | |
| trust_remote_code=True, | |
| ) | |
| model.gradient_checkpointing_enable() | |
    # LoRA config - reduced rank for memory efficiency
    lora_config = LoraConfig(
        r=8,  # Reduced from 16 to save memory
        lora_alpha=16,  # Reduced proportionally
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Apply LoRA
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
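    # Only the LoRA adapter matrices train; the (possibly quantized) base
    # model stays frozen, which is what keeps memory use low.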
    # Format and tokenize dataset
    def format_and_tokenize(example):
        # Render the chat messages with the model's chat template
        text = tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
            add_generation_prompt=False,
        )
        # Tokenize; leave padding (and label creation) to the data collator
        return tokenizer(
            text,
            truncation=True,
            max_length=max_seq_length,
            padding=False,
            return_tensors=None,
        )
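    # Each example's "messages" column is assumed to hold a list of chat
    # turns like [{"role": "user", "content": ...}, {"role": "assistant",
    # "content": ...}], the schema apply_chat_template expects.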
| print("Formatting and tokenizing dataset...") | |
| train_dataset = train_dataset.map( | |
| format_and_tokenize, | |
| remove_columns=train_dataset.column_names, | |
| desc="Formatting train dataset" | |
| ) | |
| eval_dataset = eval_dataset.map( | |
| format_and_tokenize, | |
| remove_columns=eval_dataset.column_names, | |
| desc="Formatting eval dataset" | |
| ) | |
    # Data collator: pads each batch and builds labels itself (input_ids
    # with pad positions masked to -100). Pre-computing labels in
    # format_and_tokenize would be overwritten here and would break
    # tensorization for batch sizes > 1, so it is left to the collator.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,  # We're doing causal LM, not masked LM
    )
    # Training arguments - optimized for T4 GPU
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=1,  # Keep at 1 for memory safety
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=8,  # Reduced from 16 to lower memory pressure
        num_train_epochs=1,
        max_steps=1000,  # Limit steps for testing; overrides num_train_epochs
        learning_rate=2e-4,
        fp16=use_fp16,
        gradient_checkpointing=True,  # Enable gradient checkpointing to save memory
        save_strategy="steps",
        save_steps=200,  # Save more frequently
        eval_strategy="steps",
        eval_steps=200,
        logging_steps=10,
        warmup_steps=50,
        lr_scheduler_type="cosine",
        optim="paged_adamw_8bit" if torch.cuda.is_available() else "adamw_torch",  # 8-bit optimizer on GPU
        report_to="none",
        max_grad_norm=0.3,
        save_total_limit=2,
        load_best_model_at_end=False,  # Disable to avoid loading issues
        dataloader_num_workers=0,  # No multiprocessing for stability
    )
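    # Effective batch size: per_device_train_batch_size (1) x
    # gradient_accumulation_steps (8) = 8 sequences per optimizer step.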
    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )

    print("Starting training...")
    trainer.train()

    print("Saving model...")
    trainer.save_model(output_dir)  # Saves the LoRA adapter, not the full base model
    tokenizer.save_pretrained(output_dir)

    print("Training complete!")
    print(f"Model saved to: {output_dir}")


if __name__ == "__main__":
    main()
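# To try the fine-tuned adapter afterwards, a minimal sketch (assumes peft's
# AutoPeftModelForCausalLM; the path matches output_dir above):
#
#   from peft import AutoPeftModelForCausalLM
#   from transformers import AutoTokenizer
#   model = AutoPeftModelForCausalLM.from_pretrained("./qwen-codeforces-cots")
#   tokenizer = AutoTokenizer.from_pretrained("./qwen-codeforces-cots")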