Faaz committed on
Commit
6e8f91c
·
1 Parent(s): 691fc84

Fix OOM: disable torch.compile, reduce batch sizes, double grad_accumulation

Browse files
Files changed (1) hide show
  1. configs/training_config.yaml +5 -5
configs/training_config.yaml CHANGED
@@ -8,7 +8,7 @@ model:
8
  name: "Qwen/Qwen2.5-Coder-7B-Instruct"
9
  hidden_size: 3584
10
  dtype: "bf16" # bf16 required for MI300X stability (NOT fp16)
11
- use_compile: true # torch.compile() works on ROCm
12
  gradient_checkpointing: true # Save VRAM even with 192GB
13
 
14
  # ── LoRA ───────────────────────────────────────────────────────
@@ -40,7 +40,7 @@ training:
40
  phase1:
41
  steps: 5000
42
  lr: 2.0e-4
43
- batch_size: 16 # MI300X can handle large batches
44
  warmup_steps: 100
45
  data_filter: "code_only"
46
 
@@ -48,7 +48,7 @@ training:
48
  phase2:
49
  steps: 2500
50
  lr: 1.0e-5
51
- batch_size: 8 # Smaller batch for vision bridge
52
  warmup_steps: 50
53
  data_filter: "websight_only"
54
 
@@ -56,12 +56,12 @@ training:
56
  phase3:
57
  steps: 2500
58
  lr: 5.0e-5
59
- batch_size: 12
60
  warmup_steps: 50
61
  data_filter: "all"
62
 
63
  # Shared training settings
64
- grad_accumulation: 4
65
  max_grad_norm: 1.0
66
  eval_every: 250
67
  save_every: 500
 
8
  name: "Qwen/Qwen2.5-Coder-7B-Instruct"
9
  hidden_size: 3584
10
  dtype: "bf16" # bf16 required for MI300X stability (NOT fp16)
11
+ use_compile: false # Disabled — inductor eats ~130GB VRAM on ROCm
12
  gradient_checkpointing: true # Save VRAM even with 192GB
13
 
14
  # ── LoRA ───────────────────────────────────────────────────────
 
40
  phase1:
41
  steps: 5000
42
  lr: 2.0e-4
43
+ batch_size: 8 # Reduced from 16 (OOM with compile+logits)
44
  warmup_steps: 100
45
  data_filter: "code_only"
46
 
 
48
  phase2:
49
  steps: 2500
50
  lr: 1.0e-5
51
+ batch_size: 4 # Reduced from 8 (vision needs more VRAM)
52
  warmup_steps: 50
53
  data_filter: "websight_only"
54
 
 
56
  phase3:
57
  steps: 2500
58
  lr: 5.0e-5
59
+ batch_size: 6 # Reduced from 12
60
  warmup_steps: 50
61
  data_filter: "all"
62
 
63
  # Shared training settings
64
+ grad_accumulation: 8 # Doubled from 4 to keep effective batch size
65
  max_grad_norm: 1.0
66
  eval_every: 250
67
  save_every: 500