Mindigenous committed on
Commit 144ae24 · verified · 1 Parent(s): 2957021

Upload configs

Files changed (1)
  1. configs/training_config.yaml +66 -98
configs/training_config.yaml CHANGED
@@ -1,113 +1,81 @@
-# ==========================================
-# MINDI 1.5 Vision-Coder — Training Configuration
-# Optimized for AMD MI300X 192GB VRAM
-# ==========================================
-
-# ── Model ──────────────────────────────────────────────────────
-model:
-  name: "Qwen/Qwen2.5-Coder-7B-Instruct"
-  hidden_size: 3584
-  dtype: "bf16" # bf16 required for MI300X stability (NOT fp16)
-  use_compile: false # Disabled — inductor eats ~130GB VRAM on ROCm
-  gradient_checkpointing: true # Save VRAM even with 192GB
-
-# ── LoRA ───────────────────────────────────────────────────────
+data:
+  max_length: 2048
+  num_workers: 2
+  pin_memory: true
+  prefetch_factor: 2
+  shuffle_buffer: 10000
+  train_file: data/processed/train.jsonl
+  val_file: data/processed/val.jsonl
+  vision_train_file: data/websight/train.jsonl
+  vision_val_file: data/websight/val.jsonl
+local_overrides:
+  batch_size: 1
+  gradient_accumulation_steps: 16
+  gradient_checkpointing: true
+  max_length: 2048
+  num_workers: 0
+  use_compile: false
+logging:
+  log_dir: logs/training
+  log_every: 10
+  sample_every: 500
+  tags:
+  - mindi-1.5
+  - lora
+  - vision-coder
+  - mi300x
+  wandb_entity: mindigenous
+  wandb_project: mindi-1.5-vision-coder
 lora:
-  r: 64
   alpha: 128
+  bias: none
   dropout: 0.05
-  bias: "none"
-  task_type: "CAUSAL_LM"
+  r: 64
   target_modules:
-    - q_proj
-    - k_proj
-    - v_proj
-    - o_proj
-    - gate_proj
-    - up_proj
-    - down_proj
-
-# ── Vision ─────────────────────────────────────────────────────
-vision:
-  clip_model: "openai/clip-vit-large-patch14"
-  visual_tokens: 256 # 16×16 patches from ViT-L/14
-  projection_size: 3584 # Must match model.hidden_size
-  freeze_clip: true # Freeze CLIP backbone
-
-# ── Training Phases ────────────────────────────────────────────
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+  - gate_proj
+  - up_proj
+  - down_proj
+  task_type: CAUSAL_LM
+model:
+  dtype: bf16
+  gradient_checkpointing: true
+  hidden_size: 3584
+  name: Qwen/Qwen2.5-Coder-7B-Instruct
+  use_compile: false
+output:
+  best_model: checkpoints/best
+  checkpoint_dir: checkpoints/training
+  hf_repo: Mindigenous/MINDI-1.5-Vision-Coder
+  push_every_phase: true
 training:
-  # Phase 1: LoRA only — teach coding patterns
+  eval_every: 250
+  grad_accumulation: 24
+  max_grad_norm: 1.0
   phase1:
+    batch_size: 2
+    data_filter: code_only
+    lr: 0.0002
     steps: 5000
-    lr: 2.0e-4
-    batch_size: 8 # Reduced from 16 (OOM with compile+logits)
     warmup_steps: 100
-    data_filter: "code_only"
-
-  # Phase 2: Vision bridge only — align visual tokens
   phase2:
+    batch_size: 2
+    data_filter: websight_only
+    lr: 1.0e-05
     steps: 2500
-    lr: 1.0e-5
-    batch_size: 4 # Reduced from 8 (vision needs more VRAM)
     warmup_steps: 50
-    data_filter: "websight_only"
-
-  # Phase 3: All trainable — joint fine-tuning
   phase3:
+    batch_size: 2
+    data_filter: all
+    lr: 5.0e-05
     steps: 2500
-    lr: 5.0e-5
-    batch_size: 6 # Reduced from 12
     warmup_steps: 50
-    data_filter: "all"
-
-  # Shared training settings
-  grad_accumulation: 8 # Doubled from 4 to keep effective batch size
-  max_grad_norm: 1.0
-  eval_every: 250
   save_every: 500
-
-# ── Data ───────────────────────────────────────────────────────
-data:
-  # Text-only code data (Phase 1 + Phase 3)
-  train_file: "data/processed/train.jsonl" # 4.18GB, 1,304,486 examples
-  val_file: "data/processed/val.jsonl" # 0.23GB, 72,471 examples
-
-  # Vision+code data — WebSight UI screenshots (Phase 2 + Phase 3)
-  vision_train_file: "data/websight/train.jsonl"
-  vision_val_file: "data/websight/val.jsonl"
-
-  max_length: 4096
-  shuffle_buffer: 10000 # Streaming shuffle buffer size
-  num_workers: 4 # DataLoader workers
-  pin_memory: true
-  prefetch_factor: 2
-
-# ── Logging ────────────────────────────────────────────────────
-logging:
-  wandb_project: "mindi-1.5-vision-coder"
-  wandb_entity: "mindigenous"
-  log_every: 10 # Log metrics every N steps
-  log_dir: "logs/training"
-  sample_every: 500 # Generate sample outputs every N steps
-  tags:
-    - "mindi-1.5"
-    - "lora"
-    - "vision-coder"
-    - "mi300x"
-
-# ── Output ─────────────────────────────────────────────────────
-output:
-  checkpoint_dir: "checkpoints/training"
-  best_model: "checkpoints/best"
-  hf_repo: "Mindigenous/MINDI-1.5-Vision-Coder"
-  push_every_phase: true
-
-# ── Local Dev Overrides (RTX 4060 8GB) ────────────────────────
-# Apply these when testing locally with --dry_run
-local_overrides:
-  batch_size: 1
-  gradient_accumulation_steps: 16
-  max_length: 2048
-  gradient_checkpointing: true
-  use_compile: false
-  num_workers: 0
+vision:
+  clip_model: openai/clip-vit-large-patch14
+  freeze_clip: true
+  projection_size: 3584
+  visual_tokens: 256
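
The rewrite replaces the hand-commented file with a machine-serialized one: keys are alphabetized at every nesting level, all comments and quotes are gone, list items lose their extra indent, and floats are re-rendered (2.0e-4 becomes 0.0002, 1.0e-5 becomes 1.0e-05). That shape is consistent with the config having been loaded and re-dumped with PyYAML's defaults, as in the minimal sketch below (the actual tooling is not shown in the commit, so this is an assumption); note the commit is not a pure reformat, since several values were also edited (batch sizes, grad_accumulation, data.num_workers, data.max_length).

import yaml

# Minimal sketch (assumed workflow, not shown in this commit): PyYAML's
# defaults reproduce the new file's formatting. safe_load() discards all
# comments, and dump() alphabetizes keys at every level (sort_keys=True
# by default), drops unneeded quotes, indents list items at the key's
# level, and re-renders floats (2.0e-4 -> 0.0002, 1.0e-5 -> 1.0e-05).
with open("configs/training_config.yaml") as f:
    cfg = yaml.safe_load(f)

print(yaml.dump(cfg, default_flow_style=False))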
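
On the substantive side: with the shared grad_accumulation raised from 8 to 24 and every phase's batch_size cut to 2, the effective batch per optimizer step is now 48 sequences in all three phases, versus 64/32/48 under the old per-phase sizes of 8/4/6. A quick check, assuming the usual effective-batch formula (this helper is hypothetical, not part of the repo):

import yaml

# Hypothetical check: effective batch per optimizer step, assuming
# effective = per-device batch_size * grad_accumulation.
with open("configs/training_config.yaml") as f:
    cfg = yaml.safe_load(f)

accum = cfg["training"]["grad_accumulation"]  # 24 in the new config
for phase in ("phase1", "phase2", "phase3"):
    bs = cfg["training"][phase]["batch_size"]  # 2 in every phase
    print(f"{phase}: {bs} * {accum} = {bs * accum} sequences per step")  # 48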