Faaz committed on
Commit · 691fc84
1 Parent(s): 07de2d7
Fix hidden_size: 4096 -> 3584 to match Qwen2.5-Coder-7B-Instruct
- configs/training_config.yaml +2 -2
- scripts/train.py +1 -1
- src/model/fusion_layer.py +4 -4
- src/model/mindi_model.py +2 -2
- src/model/vision_encoder.py +4 -4
configs/training_config.yaml
CHANGED
@@ -6,7 +6,7 @@
 # ── Model ──────────────────────────────────────────────────────
 model:
   name: "Qwen/Qwen2.5-Coder-7B-Instruct"
-  hidden_size: 4096
+  hidden_size: 3584
   dtype: "bf16"                  # bf16 required for MI300X stability (NOT fp16)
   use_compile: true              # torch.compile() works on ROCm
   gradient_checkpointing: true   # Save VRAM even with 192GB
@@ -31,7 +31,7 @@ lora:
 vision:
   clip_model: "openai/clip-vit-large-patch14"
   visual_tokens: 256             # 16×16 patches from ViT-L/14
-  projection_size: 4096
+  projection_size: 3584          # Must match model.hidden_size
   freeze_clip: true              # Freeze CLIP backbone
 
 # ── Training Phases ────────────────────────────────────────────
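For reference, the 3584 figure can be verified directly against the published checkpoint configuration. A minimal sketch (assuming PyYAML and the transformers AutoConfig API; the script name is illustrative and not part of this commit) that checks both YAML fields against the model:

# check_hidden_size.py -- illustrative sketch, not part of this commit
import yaml
from transformers import AutoConfig

with open("configs/training_config.yaml") as f:
    cfg = yaml.safe_load(f)

# hidden_size reported by the Qwen/Qwen2.5-Coder-7B-Instruct checkpoint (3584)
hf_cfg = AutoConfig.from_pretrained(cfg["model"]["name"])

assert cfg["model"]["hidden_size"] == hf_cfg.hidden_size, "model.hidden_size must match the checkpoint"
assert cfg["vision"]["projection_size"] == hf_cfg.hidden_size, "projection_size must match model.hidden_size"
print(f"hidden_size OK: {hf_cfg.hidden_size}")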
scripts/train.py
CHANGED
@@ -255,7 +255,7 @@ def main() -> None:
     model = MINDI15(
         model_name=model_cfg.get("name", "Qwen/Qwen2.5-Coder-7B-Instruct"),
         clip_model=vision_cfg.get("clip_model", "openai/clip-vit-large-patch14"),
-        hidden_size=model_cfg.get("hidden_size", 4096),
+        hidden_size=model_cfg.get("hidden_size", 3584),
         num_visual_tokens=vision_cfg.get("visual_tokens", 256),
         torch_dtype=config.torch_dtype,
     )
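Because model_cfg.get() only falls back to this default when the YAML omits the key, a stale config file could still reintroduce the mismatch. A hedged sketch of a guard that could sit just before the MINDI15 call; the check is not part of this commit, and the variable names simply follow the snippet above:

# Hypothetical sanity check -- not part of scripts/train.py in this commit.
from transformers import AutoConfig

expected = AutoConfig.from_pretrained(model_cfg.get("name", "Qwen/Qwen2.5-Coder-7B-Instruct")).hidden_size
configured = model_cfg.get("hidden_size", 3584)
if configured != expected:
    raise ValueError(f"hidden_size {configured} does not match checkpoint hidden_size {expected}")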
src/model/fusion_layer.py
CHANGED
@@ -1,7 +1,7 @@
 """
 MINDI 1.5 Vision-Coder – Vision-Language Fusion Layer
 
-Prepends projected visual tokens (256 × 4096) to text token embeddings
+Prepends projected visual tokens (256 × 3584) to text token embeddings
 and extends the attention mask accordingly. Uses Linear + LayerNorm
 for the visual projection gate.
 """
@@ -19,8 +19,8 @@ class VisionLanguageFusion(nn.Module):
     Fuses visual and text embeddings by prepending visual tokens.
 
     Pipeline:
-        1. visual_tokens (batch, 256, 4096) → Linear → LayerNorm
-        2. Prepend to text_embeds (batch, seq_len, 4096)
+        1. visual_tokens (batch, 256, 3584) → Linear → LayerNorm
+        2. Prepend to text_embeds (batch, seq_len, 3584)
         3. Extend attention_mask to cover the extra 256 visual positions
 
     All trainable parameters live in the gate projection + LayerNorm.
@@ -28,7 +28,7 @@ class VisionLanguageFusion(nn.Module):
 
     def __init__(
         self,
-        hidden_size: int = 4096,
+        hidden_size: int = 3584,
         num_visual_tokens: int = 256,
     ) -> None:
         """
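The class body is outside this diff, but the docstring pins down the contract. A minimal sketch of a Linear + LayerNorm prepend fusion with those shapes, assuming plain PyTorch; class and attribute names are illustrative rather than the project's actual implementation:

import torch
import torch.nn as nn

class PrependFusionSketch(nn.Module):
    """Illustrative only: prepend 256 projected visual tokens to text embeddings."""

    def __init__(self, hidden_size: int = 3584, num_visual_tokens: int = 256) -> None:
        super().__init__()
        self.num_visual_tokens = num_visual_tokens
        self.gate = nn.Linear(hidden_size, hidden_size)   # trainable gate projection
        self.norm = nn.LayerNorm(hidden_size)             # trainable LayerNorm

    def forward(self, visual_tokens, text_embeds, attention_mask):
        # (batch, 256, 3584) -> Linear -> LayerNorm
        visual = self.norm(self.gate(visual_tokens))
        # Prepend to text embeddings: (batch, 256 + seq_len, 3584)
        fused = torch.cat([visual, text_embeds], dim=1)
        # Extend the attention mask with ones for the 256 visual positions
        visual_mask = torch.ones(
            attention_mask.size(0), self.num_visual_tokens,
            dtype=attention_mask.dtype, device=attention_mask.device,
        )
        fused_mask = torch.cat([visual_mask, attention_mask], dim=1)
        return fused, fused_mask

As in the docstring, the only trainable parameters here are the gate Linear and the LayerNorm.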
src/model/mindi_model.py
CHANGED
@@ -47,7 +47,7 @@ class MINDI15(nn.Module):
 
     Components:
         - architecture: Qwen2.5-Coder-7B-Instruct + LoRA
-        - vision_encoder: CLIP ViT-L/14 (frozen) → 256 tokens × 4096
+        - vision_encoder: CLIP ViT-L/14 (frozen) → 256 tokens × 3584
        - fusion: Linear + LayerNorm prepend fusion
        - tokenizer: MINDI custom tokenizer with 22 special tokens
    """
@@ -56,7 +56,7 @@ class MINDI15(nn.Module):
         self,
         model_name: str = "Qwen/Qwen2.5-Coder-7B-Instruct",
         clip_model: str = "openai/clip-vit-large-patch14",
-        hidden_size: int = 4096,
+        hidden_size: int = 3584,
         num_visual_tokens: int = 256,
         tokenizer_path: Optional[Path] = None,
         device: Optional[str] = None,
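The old 4096 default is what this commit corrects: Qwen2.5-Coder-7B-Instruct embeds text at 3584 dimensions, so visual tokens projected to 4096 cannot be concatenated with the text embeddings during prepend fusion. A shapes-only illustration (no model weights involved):

import torch

text_embeds  = torch.randn(1, 128, 3584)   # Qwen2.5-Coder-7B text embeddings (hidden_size 3584)
visual_ok    = torch.randn(1, 256, 3584)   # projection sized with the corrected default
visual_wrong = torch.randn(1, 256, 4096)   # projection sized with the old 4096 default

print(torch.cat([visual_ok, text_embeds], dim=1).shape)   # torch.Size([1, 384, 3584])
try:
    torch.cat([visual_wrong, text_embeds], dim=1)
except RuntimeError as e:
    print("mismatch:", e)                  # sizes must match except in dim 1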
src/model/vision_encoder.py
CHANGED
@@ -2,8 +2,8 @@
 MINDI 1.5 Vision-Coder – Vision Encoder
 
 Uses CLIP ViT-L/14 (frozen) to encode UI screenshots into 256 visual
-tokens projected from 1024 → 4096 to match the Qwen hidden dimension.
-Output shape: (batch, 256, 4096).
+tokens projected from 1024 → 3584 to match the Qwen hidden dimension.
+Output shape: (batch, 256, 3584).
 """
 
 from __future__ import annotations
@@ -22,7 +22,7 @@ class VisionEncoder(nn.Module):
     CLIP ViT-L/14 vision encoder for MINDI 1.5.
 
     Extracts ALL 256 patch tokens (excludes CLS) from CLIP and
-    projects them from 1024 → 4096 to match Qwen2.5 hidden_size.
+    projects them from 1024 → 3584 to match Qwen2.5 hidden_size.
     The CLIP backbone is frozen; only the projection layer trains.
     """
 
@@ -31,7 +31,7 @@
     def __init__(
         self,
         model_name: str = "openai/clip-vit-large-patch14",
-        llm_hidden_size: int = 4096,
+        llm_hidden_size: int = 3584,
         device: Optional[str] = None,
         cache_dir: Optional[Path] = None,
         torch_dtype: torch.dtype = torch.float32,
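For context, a minimal sketch of the encoder contract the docstring describes: a frozen CLIP ViT-L/14 backbone (1024-dimensional patch embeddings; a 224×224 input gives 1 CLS token plus 256 patch tokens) followed by a trainable 1024 → 3584 projection. Class and method names are illustrative, and the real class also takes device, cache_dir, and torch_dtype, as the signature above shows:

import torch
import torch.nn as nn
from transformers import CLIPVisionModel

class VisionEncoderSketch(nn.Module):
    """Illustrative only: 256 CLIP patch tokens projected to the LLM hidden size."""

    def __init__(self, model_name: str = "openai/clip-vit-large-patch14",
                 llm_hidden_size: int = 3584) -> None:
        super().__init__()
        self.clip = CLIPVisionModel.from_pretrained(model_name)
        for p in self.clip.parameters():           # freeze the CLIP backbone
            p.requires_grad = False
        # ViT-L/14 hidden size is 1024; project to the Qwen hidden size (3584)
        self.proj = nn.Linear(self.clip.config.hidden_size, llm_hidden_size)

    @torch.no_grad()
    def encode_patches(self, pixel_values: torch.Tensor) -> torch.Tensor:
        # last_hidden_state: (batch, 257, 1024) -> drop the CLS token at index 0
        return self.clip(pixel_values=pixel_values).last_hidden_state[:, 1:, :]

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        patches = self.encode_patches(pixel_values)    # (batch, 256, 1024)
        return self.proj(patches)                      # (batch, 256, 3584)

Because the projection is applied outside the no_grad block, only the 1024 → 3584 Linear layer accumulates gradients, which matches "The CLIP backbone is frozen; only the projection layer trains."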