"""finetune.py: LoRA fine-tuning of Qwen2.5-0.5B-Instruct on open-r1/codeforces-cots,
with memory usage optimized for training on a single T4 GPU."""
import torch
from datasets import load_dataset
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model
def main():
# Configuration
    model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # Qwen2.5 has no 0.6B variant; 0.5B is the smallest instruct model
output_dir = "./qwen-codeforces-cots"
max_seq_length = 1024 # Reduced from 2048 to save memory
# Detect device - prefer CUDA for GPU training
if torch.cuda.is_available():
device = "cuda"
use_fp16 = True
print(f"Using device: CUDA ({torch.cuda.get_device_name(0)})")
else:
device = "cpu"
use_fp16 = False
print(f"Using device: CPU (training will be slow)")
print("Loading dataset...")
dataset = load_dataset("open-r1/codeforces-cots", split="train")
# Split into train and eval
dataset = dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]
print(f"Train samples: {len(train_dataset)}")
print(f"Eval samples: {len(eval_dataset)}")
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
model_name,
trust_remote_code=True,
)
    # Only fall back to EOS as pad if the tokenizer lacks one (pad==eos would mask EOS labels in the loss)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
print("Loading model...")
# Use appropriate dtype and device_map based on hardware
if torch.cuda.is_available():
from transformers import BitsAndBytesConfig
# Use 4-bit quantization for efficient GPU training
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
)
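        # Base weights are stored in 4-bit NF4 (double-quantized) and dequantized to fp16 for compute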
model = AutoModelForCausalLM.from_pretrained(
model_name,
quantization_config=bnb_config,
device_map="auto",
trust_remote_code=True,
)
from peft import prepare_model_for_kbit_training
model = prepare_model_for_kbit_training(model)
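        # Per PEFT's standard k-bit setup: freezes the base weights, upcasts the remaining
        # non-quantized params to fp32, and makes embedding outputs require grad so
        # gradients can reach the LoRA adapters through the quantized layers.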
# Enable gradient checkpointing for memory efficiency
model.gradient_checkpointing_enable()
else:
model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.float32,
trust_remote_code=True,
)
        model.gradient_checkpointing_enable()
        model.enable_input_require_grads()  # keep a grad path into checkpointed blocks when only LoRA params train
# LoRA config - reduced rank for memory efficiency
lora_config = LoraConfig(
r=8, # Reduced from 16 to save memory
lora_alpha=16, # Reduced proportionally
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM",
)
# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
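    # Only the LoRA adapter matrices are trainable; the base model weights stay frozen.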
# Format and tokenize dataset
def format_and_tokenize(example):
# Format the chat messages
text = tokenizer.apply_chat_template(
example["messages"],
tokenize=False,
add_generation_prompt=False
)
# Tokenize
tokenized = tokenizer(
text,
truncation=True,
max_length=max_seq_length,
padding=False,
return_tensors=None,
)
        # Labels are created by the data collator (mlm=False copies input_ids),
        # so no manual "labels" column is needed here.
return tokenized
print("Formatting and tokenizing dataset...")
train_dataset = train_dataset.map(
format_and_tokenize,
remove_columns=train_dataset.column_names,
desc="Formatting train dataset"
)
eval_dataset = eval_dataset.map(
format_and_tokenize,
remove_columns=eval_dataset.column_names,
desc="Formatting eval dataset"
)
# Data collator for padding
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=False, # We're doing causal LM, not masked LM
)
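    # With mlm=False the collator pads each batch to its longest sequence and builds labels
    # by copying input_ids, setting padded positions to -100 so they are ignored by the loss.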
# Training arguments - optimized for T4 GPU
training_args = TrainingArguments(
output_dir=output_dir,
per_device_train_batch_size=1, # Keep at 1 for memory safety
per_device_eval_batch_size=1,
gradient_accumulation_steps=8, # Reduced from 16 to lower memory pressure
num_train_epochs=1,
        max_steps=1000,  # Overrides num_train_epochs; caps the run for a quick test
learning_rate=2e-4,
fp16=use_fp16,
gradient_checkpointing=True, # Enable gradient checkpointing to save memory
save_strategy="steps",
save_steps=200, # Save more frequently
eval_strategy="steps",
eval_steps=200,
logging_steps=10,
warmup_steps=50,
lr_scheduler_type="cosine",
optim="paged_adamw_8bit" if torch.cuda.is_available() else "adamw_torch", # Use 8-bit optimizer on GPU
report_to="none",
max_grad_norm=0.3,
save_total_limit=2,
load_best_model_at_end=False, # Disable to avoid loading issues
dataloader_num_workers=0, # No multiprocessing for stability
)
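    # Effective batch size: 1 per device * 8 accumulation steps = 8 sequences per optimizer update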
# Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
data_collator=data_collator,
)
print("Starting training...")
trainer.train()
print("Saving model...")
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print("Training complete!")
print(f"Model saved to: {output_dir}")
if __name__ == "__main__":
main()
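
# Example usage (a sketch; the package list and load-path below are assumptions, not part of this repo):
#   pip install torch transformers datasets peft bitsandbytes
#   python finetune.py
# The LoRA adapter is saved to ./qwen-codeforces-cots and can be loaded for inference with e.g.
#   from peft import AutoPeftModelForCausalLM
#   model = AutoPeftModelForCausalLM.from_pretrained("./qwen-codeforces-cots")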