Faaz committed on
Commit ·
6e8f91c
1
Parent(s): 691fc84
Fix OOM: disable torch.compile, reduce batch sizes, double grad_accumulation
Browse files
configs/training_config.yaml
CHANGED
|
@@ -8,7 +8,7 @@ model:
|
|
| 8 |
name: "Qwen/Qwen2.5-Coder-7B-Instruct"
|
| 9 |
hidden_size: 3584
|
| 10 |
dtype: "bf16" # bf16 required for MI300X stability (NOT fp16)
|
| 11 |
-
use_compile: true
|
| 12 |
gradient_checkpointing: true # Save VRAM even with 192GB
|
| 13 |
|
| 14 |
# ── LoRA ───────────────────────────────────────────────────────
|
|
@@ -40,7 +40,7 @@ training:
|
|
| 40 |
phase1:
|
| 41 |
steps: 5000
|
| 42 |
lr: 2.0e-4
|
| 43 |
-
batch_size: 16
|
| 44 |
warmup_steps: 100
|
| 45 |
data_filter: "code_only"
|
| 46 |
|
|
@@ -48,7 +48,7 @@ training:
|
|
| 48 |
phase2:
|
| 49 |
steps: 2500
|
| 50 |
lr: 1.0e-5
|
| 51 |
-
batch_size: 8
|
| 52 |
warmup_steps: 50
|
| 53 |
data_filter: "websight_only"
|
| 54 |
|
|
@@ -56,12 +56,12 @@ training:
|
|
| 56 |
phase3:
|
| 57 |
steps: 2500
|
| 58 |
lr: 5.0e-5
|
| 59 |
-
batch_size: 12
|
| 60 |
warmup_steps: 50
|
| 61 |
data_filter: "all"
|
| 62 |
|
| 63 |
# Shared training settings
|
| 64 |
-
grad_accumulation: 4
|
| 65 |
max_grad_norm: 1.0
|
| 66 |
eval_every: 250
|
| 67 |
save_every: 500
|
|
|
|
| 8 |
name: "Qwen/Qwen2.5-Coder-7B-Instruct"
|
| 9 |
hidden_size: 3584
|
| 10 |
dtype: "bf16" # bf16 required for MI300X stability (NOT fp16)
|
| 11 |
+
use_compile: false # Disabled — inductor eats ~130GB VRAM on ROCm
|
| 12 |
gradient_checkpointing: true # Save VRAM even with 192GB
|
| 13 |
|
| 14 |
# ── LoRA ───────────────────────────────────────────────────────
|
|
|
|
| 40 |
phase1:
|
| 41 |
steps: 5000
|
| 42 |
lr: 2.0e-4
|
| 43 |
+
batch_size: 8 # Reduced from 16 (OOM with compile+logits)
|
| 44 |
warmup_steps: 100
|
| 45 |
data_filter: "code_only"
|
| 46 |
|
|
|
|
| 48 |
phase2:
|
| 49 |
steps: 2500
|
| 50 |
lr: 1.0e-5
|
| 51 |
+
batch_size: 4 # Reduced from 8 (vision needs more VRAM)
|
| 52 |
warmup_steps: 50
|
| 53 |
data_filter: "websight_only"
|
| 54 |
|
|
|
|
| 56 |
phase3:
|
| 57 |
steps: 2500
|
| 58 |
lr: 5.0e-5
|
| 59 |
+
batch_size: 6 # Reduced from 12
|
| 60 |
warmup_steps: 50
|
| 61 |
data_filter: "all"
|
| 62 |
|
| 63 |
# Shared training settings
|
| 64 |
+
grad_accumulation: 8 # Doubled from 4 to keep the effective batch size unchanged
|
| 65 |
max_grad_norm: 1.0
|
| 66 |
eval_every: 250
|
| 67 |
save_every: 500
|