Faaz committed on
Commit · 691fc84
1 Parent(s): 07de2d7
Fix hidden_size: 4096 -> 3584 to match Qwen2.5-Coder-7B-Instruct
- configs/training_config.yaml +2 -2
- scripts/train.py +1 -1
- src/model/fusion_layer.py +4 -4
- src/model/mindi_model.py +2 -2
- src/model/vision_encoder.py +4 -4
configs/training_config.yaml
CHANGED
@@ -6,7 +6,7 @@
 # ── Model ──────────────────────────────────────────────────────
 model:
   name: "Qwen/Qwen2.5-Coder-7B-Instruct"
-  hidden_size: 4096
+  hidden_size: 3584
   dtype: "bf16"                  # bf16 required for MI300X stability (NOT fp16)
   use_compile: true              # torch.compile() works on ROCm
   gradient_checkpointing: true   # Save VRAM even with 192GB
@@ -31,7 +31,7 @@ lora:
 vision:
   clip_model: "openai/clip-vit-large-patch14"
   visual_tokens: 256             # 16×16 patches from ViT-L/14
-  projection_size: 4096
+  projection_size: 3584          # Must match model.hidden_size
   freeze_clip: true              # Freeze CLIP backbone
 
 # ── Training Phases ────────────────────────────────────────────
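For reference, the 3584 figure can be verified directly against the published checkpoint configuration. A minimal sketch (assuming PyYAML and the transformers AutoConfig API; the script name is illustrative and not part of this commit) that checks both YAML fields against the model:

# check_hidden_size.py -- illustrative sketch, not part of this commit
import yaml
from transformers import AutoConfig

with open("configs/training_config.yaml") as f:
    cfg = yaml.safe_load(f)

# hidden_size reported by the Qwen/Qwen2.5-Coder-7B-Instruct checkpoint (3584)
hf_cfg = AutoConfig.from_pretrained(cfg["model"]["name"])

assert cfg["model"]["hidden_size"] == hf_cfg.hidden_size, "model.hidden_size must match the checkpoint"
assert cfg["vision"]["projection_size"] == hf_cfg.hidden_size, "projection_size must match model.hidden_size"
print(f"hidden_size OK: {hf_cfg.hidden_size}")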
scripts/train.py
CHANGED
@@ -255,7 +255,7 @@ def main() -> None:
     model = MINDI15(
         model_name=model_cfg.get("name", "Qwen/Qwen2.5-Coder-7B-Instruct"),
         clip_model=vision_cfg.get("clip_model", "openai/clip-vit-large-patch14"),
-        hidden_size=model_cfg.get("hidden_size", 4096),
+        hidden_size=model_cfg.get("hidden_size", 3584),
         num_visual_tokens=vision_cfg.get("visual_tokens", 256),
         torch_dtype=config.torch_dtype,
     )
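Because model_cfg.get() only falls back to this default when the YAML omits the key, a stale config file could still reintroduce the mismatch. A hedged sketch of a guard that could sit just before the MINDI15 call; the check is not part of this commit, and the variable names simply follow the snippet above:

# Hypothetical sanity check -- not part of scripts/train.py in this commit.
from transformers import AutoConfig

expected = AutoConfig.from_pretrained(model_cfg.get("name", "Qwen/Qwen2.5-Coder-7B-Instruct")).hidden_size
configured = model_cfg.get("hidden_size", 3584)
if configured != expected:
    raise ValueError(f"hidden_size {configured} does not match checkpoint hidden_size {expected}")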
src/model/fusion_layer.py
CHANGED
@@ -1,7 +1,7 @@
 """
 MINDI 1.5 Vision-Coder – Vision-Language Fusion Layer
 
-Prepends projected visual tokens (256 × 4096) to text token embeddings
+Prepends projected visual tokens (256 × 3584) to text token embeddings
 and extends the attention mask accordingly. Uses Linear + LayerNorm
 for the visual projection gate.
 """
@@ -19,8 +19,8 @@ class VisionLanguageFusion(nn.Module):
     Fuses visual and text embeddings by prepending visual tokens.
 
     Pipeline:
-        1. visual_tokens (batch, 256, 4096) → Linear → LayerNorm
-        2. Prepend to text_embeds (batch, seq_len, 4096)
+        1. visual_tokens (batch, 256, 3584) → Linear → LayerNorm
+        2. Prepend to text_embeds (batch, seq_len, 3584)
         3. Extend attention_mask to cover the extra 256 visual positions
 
     All trainable parameters live in the gate projection + LayerNorm.
@@ -28,7 +28,7 @@ class VisionLanguageFusion(nn.Module):
 
     def __init__(
         self,
-        hidden_size: int = 4096,
+        hidden_size: int = 3584,
         num_visual_tokens: int = 256,
     ) -> None:
         """
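The class body is outside this diff, but the docstring pins down the contract. A minimal sketch of a Linear + LayerNorm prepend fusion with those shapes, assuming plain PyTorch; class and attribute names are illustrative rather than the project's actual implementation:

import torch
import torch.nn as nn

class PrependFusionSketch(nn.Module):
    """Illustrative only: prepend 256 projected visual tokens to text embeddings."""

    def __init__(self, hidden_size: int = 3584, num_visual_tokens: int = 256) -> None:
        super().__init__()
        self.num_visual_tokens = num_visual_tokens
        self.gate = nn.Linear(hidden_size, hidden_size)   # trainable gate projection
        self.norm = nn.LayerNorm(hidden_size)             # trainable LayerNorm

    def forward(self, visual_tokens, text_embeds, attention_mask):
        # (batch, 256, 3584) -> Linear -> LayerNorm
        visual = self.norm(self.gate(visual_tokens))
        # Prepend to text embeddings: (batch, 256 + seq_len, 3584)
        fused = torch.cat([visual, text_embeds], dim=1)
        # Extend the attention mask with ones for the 256 visual positions
        visual_mask = torch.ones(
            attention_mask.size(0), self.num_visual_tokens,
            dtype=attention_mask.dtype, device=attention_mask.device,
        )
        fused_mask = torch.cat([visual_mask, attention_mask], dim=1)
        return fused, fused_mask

As in the docstring, the only trainable parameters here are the gate Linear and the LayerNorm.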
src/model/mindi_model.py
CHANGED
@@ -47,7 +47,7 @@ class MINDI15(nn.Module):
 
     Components:
         - architecture: Qwen2.5-Coder-7B-Instruct + LoRA
-        - vision_encoder: CLIP ViT-L/14 (frozen) → 256 tokens × 4096
+        - vision_encoder: CLIP ViT-L/14 (frozen) → 256 tokens × 3584
        - fusion: Linear + LayerNorm prepend fusion
        - tokenizer: MINDI custom tokenizer with 22 special tokens
    """
@@ -56,7 +56,7 @@ class MINDI15(nn.Module):
         self,
         model_name: str = "Qwen/Qwen2.5-Coder-7B-Instruct",
         clip_model: str = "openai/clip-vit-large-patch14",
-        hidden_size: int = 4096,
+        hidden_size: int = 3584,
         num_visual_tokens: int = 256,
         tokenizer_path: Optional[Path] = None,
         device: Optional[str] = None,
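The old 4096 default is what this commit corrects: Qwen2.5-Coder-7B-Instruct embeds text at 3584 dimensions, so visual tokens projected to 4096 cannot be concatenated with the text embeddings during prepend fusion. A shapes-only illustration (no model weights involved):

import torch

text_embeds  = torch.randn(1, 128, 3584)   # Qwen2.5-Coder-7B text embeddings (hidden_size 3584)
visual_ok    = torch.randn(1, 256, 3584)   # projection sized with the corrected default
visual_wrong = torch.randn(1, 256, 4096)   # projection sized with the old 4096 default

print(torch.cat([visual_ok, text_embeds], dim=1).shape)   # torch.Size([1, 384, 3584])
try:
    torch.cat([visual_wrong, text_embeds], dim=1)
except RuntimeError as e:
    print("mismatch:", e)                  # sizes must match except in dim 1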
src/model/vision_encoder.py
CHANGED
@@ -2,8 +2,8 @@
 MINDI 1.5 Vision-Coder – Vision Encoder
 
 Uses CLIP ViT-L/14 (frozen) to encode UI screenshots into 256 visual
-tokens projected from 1024 → 4096 to match the Qwen hidden dimension.
-Output shape: (batch, 256, 4096).
+tokens projected from 1024 → 3584 to match the Qwen hidden dimension.
+Output shape: (batch, 256, 3584).
 """
 
 from __future__ import annotations
@@ -22,7 +22,7 @@ class VisionEncoder(nn.Module):
     CLIP ViT-L/14 vision encoder for MINDI 1.5.
 
     Extracts ALL 256 patch tokens (excludes CLS) from CLIP and
-    projects them from 1024 → 4096 to match Qwen2.5 hidden_size.
+    projects them from 1024 → 3584 to match Qwen2.5 hidden_size.
     The CLIP backbone is frozen; only the projection layer trains.
     """
 
@@ -31,7 +31,7 @@
     def __init__(
         self,
         model_name: str = "openai/clip-vit-large-patch14",
-        llm_hidden_size: int = 4096,
+        llm_hidden_size: int = 3584,
         device: Optional[str] = None,
         cache_dir: Optional[Path] = None,
         torch_dtype: torch.dtype = torch.float32,
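For context, a minimal sketch of the encoder contract the docstring describes: a frozen CLIP ViT-L/14 backbone (1024-dimensional patch embeddings; a 224×224 input gives 1 CLS token plus 256 patch tokens) followed by a trainable 1024 → 3584 projection. Class and method names are illustrative, and the real class also takes device, cache_dir, and torch_dtype, as the signature above shows:

import torch
import torch.nn as nn
from transformers import CLIPVisionModel

class VisionEncoderSketch(nn.Module):
    """Illustrative only: 256 CLIP patch tokens projected to the LLM hidden size."""

    def __init__(self, model_name: str = "openai/clip-vit-large-patch14",
                 llm_hidden_size: int = 3584) -> None:
        super().__init__()
        self.clip = CLIPVisionModel.from_pretrained(model_name)
        for p in self.clip.parameters():           # freeze the CLIP backbone
            p.requires_grad = False
        # ViT-L/14 hidden size is 1024; project to the Qwen hidden size (3584)
        self.proj = nn.Linear(self.clip.config.hidden_size, llm_hidden_size)

    @torch.no_grad()
    def encode_patches(self, pixel_values: torch.Tensor) -> torch.Tensor:
        # last_hidden_state: (batch, 257, 1024) -> drop the CLS token at index 0
        return self.clip(pixel_values=pixel_values).last_hidden_state[:, 1:, :]

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        patches = self.encode_patches(pixel_values)    # (batch, 256, 1024)
        return self.proj(patches)                      # (batch, 256, 3584)

Because the projection is applied outside the no_grad block, only the 1024 → 3584 Linear layer accumulates gradients, which matches "The CLIP backbone is frozen; only the projection layer trains."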