"""finetune.py: LoRA fine-tuning of Qwen2.5-0.5B-Instruct on open-r1/codeforces-cots,
with memory usage optimized for training on a single T4 GPU."""
import torch
from datasets import load_dataset
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model
def main():
# Configuration
    model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # Qwen2.5 has no 0.6B variant; 0.5B is the smallest instruct model
output_dir = "./qwen-codeforces-cots"
max_seq_length = 1024 # Reduced from 2048 to save memory
# Detect device - prefer CUDA for GPU training
if torch.cuda.is_available():
device = "cuda"
use_fp16 = True
print(f"Using device: CUDA ({torch.cuda.get_device_name(0)})")
else:
device = "cpu"
use_fp16 = False
print(f"Using device: CPU (training will be slow)")
print("Loading dataset...")
dataset = load_dataset("open-r1/codeforces-cots", split="train")
# Split into train and eval
dataset = dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]
print(f"Train samples: {len(train_dataset)}")
print(f"Eval samples: {len(eval_dataset)}")
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
model_name,
trust_remote_code=True,
)
    # Only fall back to EOS as pad if the tokenizer lacks one (pad==eos would mask EOS labels in the loss)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
print("Loading model...")
# Use appropriate dtype and device_map based on hardware
if torch.cuda.is_available():
from transformers import BitsAndBytesConfig
# Use 4-bit quantization for efficient GPU training
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
)
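        # Base weights are stored in 4-bit NF4 (double-quantized) and dequantized to fp16 for compute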
model = AutoModelForCausalLM.from_pretrained(
model_name,
quantization_config=bnb_config,
device_map="auto",
trust_remote_code=True,
)
from peft import prepare_model_for_kbit_training
model = prepare_model_for_kbit_training(model)
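        # Per PEFT's standard k-bit setup: freezes the base weights, upcasts the remaining
        # non-quantized params to fp32, and makes embedding outputs require grad so
        # gradients can reach the LoRA adapters through the quantized layers.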
# Enable gradient checkpointing for memory efficiency
model.gradient_checkpointing_enable()
else:
model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.float32,
trust_remote_code=True,
)
        model.gradient_checkpointing_enable()
        model.enable_input_require_grads()  # keep a grad path into checkpointed blocks when only LoRA params train
# LoRA config - reduced rank for memory efficiency
lora_config = LoraConfig(
r=8, # Reduced from 16 to save memory
lora_alpha=16, # Reduced proportionally
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM",
)
# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
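    # Only the LoRA adapter matrices are trainable; the base model weights stay frozen.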
# Format and tokenize dataset
def format_and_tokenize(example):
# Format the chat messages
text = tokenizer.apply_chat_template(
example["messages"],
tokenize=False,
add_generation_prompt=False
)
# Tokenize
tokenized = tokenizer(
text,
truncation=True,
max_length=max_seq_length,
padding=False,
return_tensors=None,
)
        # Labels are created by the data collator (mlm=False copies input_ids),
        # so no manual "labels" column is needed here.
return tokenized
print("Formatting and tokenizing dataset...")
train_dataset = train_dataset.map(
format_and_tokenize,
remove_columns=train_dataset.column_names,
desc="Formatting train dataset"
)
eval_dataset = eval_dataset.map(
format_and_tokenize,
remove_columns=eval_dataset.column_names,
desc="Formatting eval dataset"
)
# Data collator for padding
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=False, # We're doing causal LM, not masked LM
)
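    # With mlm=False the collator pads each batch to its longest sequence and builds labels
    # by copying input_ids, setting padded positions to -100 so they are ignored by the loss.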
# Training arguments - optimized for T4 GPU
training_args = TrainingArguments(
output_dir=output_dir,
per_device_train_batch_size=1, # Keep at 1 for memory safety
per_device_eval_batch_size=1,
gradient_accumulation_steps=8, # Reduced from 16 to lower memory pressure
num_train_epochs=1,
        max_steps=1000,  # Overrides num_train_epochs; caps the run for a quick test
learning_rate=2e-4,
fp16=use_fp16,
gradient_checkpointing=True, # Enable gradient checkpointing to save memory
save_strategy="steps",
save_steps=200, # Save more frequently
eval_strategy="steps",
eval_steps=200,
logging_steps=10,
warmup_steps=50,
lr_scheduler_type="cosine",
optim="paged_adamw_8bit" if torch.cuda.is_available() else "adamw_torch", # Use 8-bit optimizer on GPU
report_to="none",
max_grad_norm=0.3,
save_total_limit=2,
load_best_model_at_end=False, # Disable to avoid loading issues
dataloader_num_workers=0, # No multiprocessing for stability
)
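    # Effective batch size: 1 per device * 8 accumulation steps = 8 sequences per optimizer update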
# Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
data_collator=data_collator,
)
print("Starting training...")
trainer.train()
print("Saving model...")
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print("Training complete!")
print(f"Model saved to: {output_dir}")
if __name__ == "__main__":
main()
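
# Example usage (a sketch; the package list and load-path below are assumptions, not part of this repo):
#   pip install torch transformers datasets peft bitsandbytes
#   python finetune.py
# The LoRA adapter is saved to ./qwen-codeforces-cots and can be loaded for inference with e.g.
#   from peft import AutoPeftModelForCausalLM
#   model = AutoPeftModelForCausalLM.from_pretrained("./qwen-codeforces-cots")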