Faaz committed on
Commit 691fc84 · 1 Parent(s): 07de2d7

Fix hidden_size: 4096 -> 3584 to match Qwen2.5-Coder-7B-Instruct

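For context: Qwen2.5-Coder-7B-Instruct reports a hidden size of 3584, so the earlier value of 4096 sized the visual projection against the wrong embedding width. A minimal sanity-check sketch (not part of this commit), assuming transformers is installed and the Hub is reachable:

from transformers import AutoConfig

# The config published with Qwen/Qwen2.5-Coder-7B-Instruct carries the true
# embedding width; the YAML value should agree with it.
cfg = AutoConfig.from_pretrained("Qwen/Qwen2.5-Coder-7B-Instruct")
print(cfg.hidden_size)  # expected: 3584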
configs/training_config.yaml CHANGED
@@ -6,7 +6,7 @@
 # ── Model ──────────────────────────────────────────────────────
 model:
   name: "Qwen/Qwen2.5-Coder-7B-Instruct"
-  hidden_size: 4096
+  hidden_size: 3584
   dtype: "bf16"                  # bf16 required for MI300X stability (NOT fp16)
   use_compile: true              # torch.compile() works on ROCm
   gradient_checkpointing: true   # Save VRAM even with 192GB
@@ -31,7 +31,7 @@ lora:
 vision:
   clip_model: "openai/clip-vit-large-patch14"
   visual_tokens: 256             # 16×16 patches from ViT-L/14
-  projection_size: 4096          # Must match model.hidden_size
+  projection_size: 3584          # Must match model.hidden_size
   freeze_clip: true              # Freeze CLIP backbone

 # ── Training Phases ────────────────────────────────────────────
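Because projection_size must stay equal to model.hidden_size, a small consistency check over the YAML can catch this kind of drift early. A minimal sketch (not part of the commit), assuming PyYAML and the file layout above:

import yaml

with open("configs/training_config.yaml") as f:
    cfg = yaml.safe_load(f)

# The vision projection feeds embeddings straight into the LLM,
# so the two widths have to agree.
assert cfg["vision"]["projection_size"] == cfg["model"]["hidden_size"], \
    "projection_size must match model.hidden_size"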
scripts/train.py CHANGED
@@ -255,7 +255,7 @@ def main() -> None:
     model = MINDI15(
         model_name=model_cfg.get("name", "Qwen/Qwen2.5-Coder-7B-Instruct"),
         clip_model=vision_cfg.get("clip_model", "openai/clip-vit-large-patch14"),
-        hidden_size=model_cfg.get("hidden_size", 4096),
+        hidden_size=model_cfg.get("hidden_size", 3584),
         num_visual_tokens=vision_cfg.get("visual_tokens", 256),
         torch_dtype=config.torch_dtype,
     )
src/model/fusion_layer.py CHANGED
@@ -1,7 +1,7 @@
 """
 MINDI 1.5 Vision-Coder — Vision-Language Fusion Layer

-Prepends projected visual tokens (256 × 4096) to text token embeddings
+Prepends projected visual tokens (256 × 3584) to text token embeddings
 and extends the attention mask accordingly. Uses Linear + LayerNorm
 for the visual projection gate.
 """
@@ -19,8 +19,8 @@ class VisionLanguageFusion(nn.Module):
     Fuses visual and text embeddings by prepending visual tokens.

     Pipeline:
-    1. visual_tokens (batch, 256, 4096) → Linear → LayerNorm
-    2. Prepend to text_embeds (batch, seq_len, 4096)
+    1. visual_tokens (batch, 256, 3584) → Linear → LayerNorm
+    2. Prepend to text_embeds (batch, seq_len, 3584)
     3. Extend attention_mask to cover the extra 256 visual positions

     All trainable parameters live in the gate projection + LayerNorm.
@@ -28,7 +28,7 @@ class VisionLanguageFusion(nn.Module):

     def __init__(
         self,
-        hidden_size: int = 4096,
+        hidden_size: int = 3584,
         num_visual_tokens: int = 256,
     ) -> None:
         """
src/model/mindi_model.py CHANGED
@@ -47,7 +47,7 @@ class MINDI15(nn.Module):

     Components:
     - architecture: Qwen2.5-Coder-7B-Instruct + LoRA
-    - vision_encoder: CLIP ViT-L/14 (frozen) → 256 tokens × 4096
+    - vision_encoder: CLIP ViT-L/14 (frozen) → 256 tokens × 3584
     - fusion: Linear + LayerNorm prepend fusion
     - tokenizer: MINDI custom tokenizer with 22 special tokens
     """
@@ -56,7 +56,7 @@ class MINDI15(nn.Module):
         self,
         model_name: str = "Qwen/Qwen2.5-Coder-7B-Instruct",
         clip_model: str = "openai/clip-vit-large-patch14",
-        hidden_size: int = 4096,
+        hidden_size: int = 3584,
         num_visual_tokens: int = 256,
         tokenizer_path: Optional[Path] = None,
         device: Optional[str] = None,
src/model/vision_encoder.py CHANGED
@@ -2,8 +2,8 @@
 MINDI 1.5 Vision-Coder — Vision Encoder

 Uses CLIP ViT-L/14 (frozen) to encode UI screenshots into 256 visual
-tokens projected from 1024 → 4096 to match the Qwen hidden dimension.
-Output shape: (batch, 256, 4096).
+tokens projected from 1024 → 3584 to match the Qwen hidden dimension.
+Output shape: (batch, 256, 3584).
 """

 from __future__ import annotations
@@ -22,7 +22,7 @@ class VisionEncoder(nn.Module):
     CLIP ViT-L/14 vision encoder for MINDI 1.5.

     Extracts ALL 256 patch tokens (excludes CLS) from CLIP and
-    projects them from 1024 → 4096 to match Qwen2.5 hidden_size.
+    projects them from 1024 → 3584 to match Qwen2.5 hidden_size.
     The CLIP backbone is frozen; only the projection layer trains.
     """

@@ -31,7 +31,7 @@
     def __init__(
         self,
         model_name: str = "openai/clip-vit-large-patch14",
-        llm_hidden_size: int = 4096,
+        llm_hidden_size: int = 3584,
         device: Optional[str] = None,
         cache_dir: Optional[Path] = None,
         torch_dtype: torch.dtype = torch.float32,
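The projection mentioned above maps CLIP ViT-L/14's 1024-dim patch features to the 3584-dim LLM space. A minimal sketch of that step (assuming transformers' CLIPVisionModel; the real VisionEncoder may differ in detail):

import torch
import torch.nn as nn
from transformers import CLIPVisionModel

clip = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14")
clip.requires_grad_(False)                       # frozen backbone
proj = nn.Linear(clip.config.hidden_size, 3584)  # 1024 -> 3584, the only trainable part

pixel_values = torch.randn(1, 3, 224, 224)       # dummy preprocessed screenshot
with torch.no_grad():
    patches = clip(pixel_values).last_hidden_state[:, 1:, :]  # drop CLS -> (1, 256, 1024)
visual_tokens = proj(patches)                    # (1, 256, 3584)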