kghamilton89 committed on
Commit
0c63404
·
1 Parent(s): bb4b68b

Optimize memory usage for T4 GPU training

Browse files

- Reduce max_seq_length from 2048 to 1024
- Reduce LoRA rank from 16 to 8 (fewer trainable params)
- Enable gradient checkpointing on model
- Add gradient_checkpointing=True in TrainingArguments
- Reduce gradient_accumulation_steps from 16 to 8
- Use paged_adamw_8bit optimizer for GPU memory efficiency

These changes should prevent OOM crashes on T4 (16GB VRAM)

Files changed (1) hide show
  1. finetune.py +12 -8
finetune.py CHANGED
@@ -13,7 +13,7 @@ def main():
13
  # Configuration
14
  model_name = "Qwen/Qwen2.5-0.5B-Instruct" # Using 0.5B as 0.6B doesn't exist
15
  output_dir = "./qwen-codeforces-cots"
16
- max_seq_length = 2048
17
 
18
  # Detect device - prefer CUDA for GPU training
19
  if torch.cuda.is_available():
@@ -63,17 +63,20 @@ def main():
63
  )
64
  from peft import prepare_model_for_kbit_training
65
  model = prepare_model_for_kbit_training(model)
 
 
66
  else:
67
  model = AutoModelForCausalLM.from_pretrained(
68
  model_name,
69
  torch_dtype=torch.float32,
70
  trust_remote_code=True,
71
  )
 
72
 
73
- # LoRA config
74
  lora_config = LoraConfig(
75
- r=16,
76
- lora_alpha=32,
77
  target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
78
  lora_dropout=0.05,
79
  bias="none",
@@ -122,16 +125,17 @@ def main():
122
  mlm=False, # We're doing causal LM, not masked LM
123
  )
124
 
125
- # Training arguments - reduced for CPU training
126
  training_args = TrainingArguments(
127
  output_dir=output_dir,
128
- per_device_train_batch_size=1, # Reduced for CPU
129
  per_device_eval_batch_size=1,
130
- gradient_accumulation_steps=16, # Maintain effective batch size
131
  num_train_epochs=1,
132
  max_steps=1000, # Limit steps for testing
133
  learning_rate=2e-4,
134
  fp16=use_fp16,
 
135
  save_strategy="steps",
136
  save_steps=200, # Save more frequently
137
  eval_strategy="steps",
@@ -139,7 +143,7 @@ def main():
139
  logging_steps=10,
140
  warmup_steps=50,
141
  lr_scheduler_type="cosine",
142
- optim="adamw_torch",
143
  report_to="none",
144
  max_grad_norm=0.3,
145
  save_total_limit=2,
 
13
  # Configuration
14
  model_name = "Qwen/Qwen2.5-0.5B-Instruct" # Using 0.5B as 0.6B doesn't exist
15
  output_dir = "./qwen-codeforces-cots"
16
+ max_seq_length = 1024 # Reduced from 2048 to save memory
17
 
18
  # Detect device - prefer CUDA for GPU training
19
  if torch.cuda.is_available():
 
63
  )
64
  from peft import prepare_model_for_kbit_training
65
  model = prepare_model_for_kbit_training(model)
66
+ # Enable gradient checkpointing for memory efficiency
67
+ model.gradient_checkpointing_enable()
68
  else:
69
  model = AutoModelForCausalLM.from_pretrained(
70
  model_name,
71
  torch_dtype=torch.float32,
72
  trust_remote_code=True,
73
  )
74
+ model.gradient_checkpointing_enable()
75
 
76
+ # LoRA config - reduced rank for memory efficiency
77
  lora_config = LoraConfig(
78
+ r=8, # Reduced from 16 to save memory
79
+ lora_alpha=16, # Reduced proportionally
80
  target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
81
  lora_dropout=0.05,
82
  bias="none",
 
125
  mlm=False, # We're doing causal LM, not masked LM
126
  )
127
 
128
+ # Training arguments - optimized for T4 GPU
129
  training_args = TrainingArguments(
130
  output_dir=output_dir,
131
+ per_device_train_batch_size=1, # Keep at 1 for memory safety
132
  per_device_eval_batch_size=1,
133
+ gradient_accumulation_steps=8, # Reduced from 16 to lower memory pressure
134
  num_train_epochs=1,
135
  max_steps=1000, # Limit steps for testing
136
  learning_rate=2e-4,
137
  fp16=use_fp16,
138
+ gradient_checkpointing=True, # Enable gradient checkpointing to save memory
139
  save_strategy="steps",
140
  save_steps=200, # Save more frequently
141
  eval_strategy="steps",
 
143
  logging_steps=10,
144
  warmup_steps=50,
145
  lr_scheduler_type="cosine",
146
+ optim="paged_adamw_8bit" if torch.cuda.is_available() else "adamw_torch", # Use 8-bit optimizer on GPU
147
  report_to="none",
148
  max_grad_norm=0.3,
149
  save_total_limit=2,