kghamilton89 committed · Commit 0c63404 · 1 Parent(s): bb4b68b
Optimize memory usage for T4 GPU training

- Reduce max_seq_length from 2048 to 1024
- Reduce LoRA rank from 16 to 8 (fewer trainable params)
- Enable gradient checkpointing on the model
- Add gradient_checkpointing=True in TrainingArguments
- Reduce gradient_accumulation_steps from 16 to 8
- Use paged_adamw_8bit optimizer for GPU memory efficiency

These changes should prevent OOM crashes on a T4 (16 GB VRAM).
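One consequence the message doesn't spell out: with per_device_train_batch_size held at 1, halving gradient_accumulation_steps also halves the effective batch size per optimizer step. A quick back-of-the-envelope check (plain Python, values taken from this commit):

    # Effective batch size = per-device batch * accumulation steps (* num GPUs, here 1).
    per_device_train_batch_size = 1
    gradient_accumulation_steps = 8   # was 16 before this commit
    effective_batch = per_device_train_batch_size * gradient_accumulation_steps
    print(effective_batch)  # 8 now vs. 16 before; noisier gradients, may warrant retuning learning_rate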
finetune.py  +12 -8
@@ -13,7 +13,7 @@ def main():
     # Configuration
     model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # Using 0.5B as 0.6B doesn't exist
     output_dir = "./qwen-codeforces-cots"
-    max_seq_length = 2048
+    max_seq_length = 1024  # Reduced from 2048 to save memory
 
     # Detect device - prefer CUDA for GPU training
     if torch.cuda.is_available():
@@ -63,17 +63,20 @@ def main():
         )
         from peft import prepare_model_for_kbit_training
         model = prepare_model_for_kbit_training(model)
+        # Enable gradient checkpointing for memory efficiency
+        model.gradient_checkpointing_enable()
     else:
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
             torch_dtype=torch.float32,
             trust_remote_code=True,
         )
+        model.gradient_checkpointing_enable()
 
-    # LoRA config
+    # LoRA config - reduced rank for memory efficiency
     lora_config = LoraConfig(
-        r=16,
-        lora_alpha=32,
+        r=8,  # Reduced from 16 to save memory
+        lora_alpha=16,  # Reduced proportionally
         target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
         lora_dropout=0.05,
         bias="none",
@@ -122,16 +125,17 @@ def main():
         mlm=False,  # We're doing causal LM, not masked LM
     )
 
-    # Training arguments -
+    # Training arguments - optimized for T4 GPU
     training_args = TrainingArguments(
         output_dir=output_dir,
-        per_device_train_batch_size=1,  #
+        per_device_train_batch_size=1,  # Keep at 1 for memory safety
         per_device_eval_batch_size=1,
-        gradient_accumulation_steps=16,
+        gradient_accumulation_steps=8,  # Reduced from 16 to lower memory pressure
         num_train_epochs=1,
         max_steps=1000,  # Limit steps for testing
         learning_rate=2e-4,
         fp16=use_fp16,
+        gradient_checkpointing=True,  # Enable gradient checkpointing to save memory
         save_strategy="steps",
         save_steps=200,  # Save more frequently
         eval_strategy="steps",
@@ -139,7 +143,7 @@ def main():
         logging_steps=10,
         warmup_steps=50,
         lr_scheduler_type="cosine",
-        optim="adamw_torch",
+        optim="paged_adamw_8bit" if torch.cuda.is_available() else "adamw_torch",  # Use 8-bit optimizer on GPU
         report_to="none",
         max_grad_norm=0.3,
         save_total_limit=2,
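Assuming finetune.py builds a standard Trainer from these training_args (the Trainer construction sits outside this diff), a minimal sketch for verifying the memory claim is to log peak CUDA memory around the training call; the trainer name here is hypothetical:

    import torch

    torch.cuda.reset_peak_memory_stats()  # clear the peak counter before training

    trainer.train()  # hypothetical: whatever Trainer instance finetune.py constructs

    # Peak memory allocated to tensors over the run; should stay well under the T4's 16 GiB.
    peak_gib = torch.cuda.max_memory_allocated() / 1024**3
    print(f"Peak GPU memory: {peak_gib:.2f} GiB")

Note that max_memory_allocated() only tracks tensor allocations through PyTorch's caching allocator, so the memory actually reserved from the driver (torch.cuda.max_memory_reserved()) will read somewhat higher.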