Add WebSight vision data pipeline: download script, image-aware data loader, phase data routing

Files changed (4) hide show

configs/training_config.yaml +6 -0
scripts/download_websight.py +141 -0
scripts/train.py +7 -4
src/training/mindi_trainer.py +72 -3

configs/training_config.yaml CHANGED Viewed

@@ -68,8 +68,14 @@ training:
 # ── Data ───────────────────────────────────────────────────────
 data:
   train_file: "data/processed/train.jsonl"     # 4.18GB, 1,304,486 examples
   val_file: "data/processed/val.jsonl"         # 0.23GB, 72,471 examples
   max_length: 4096
   shuffle_buffer: 10000           # Streaming shuffle buffer size
   num_workers: 4                  # DataLoader workers

 # ── Data ───────────────────────────────────────────────────────
 data:
+  # Text-only code data (Phase 1 + Phase 3)
   train_file: "data/processed/train.jsonl"     # 4.18GB, 1,304,486 examples
   val_file: "data/processed/val.jsonl"         # 0.23GB, 72,471 examples
+  # Vision+code data — WebSight UI screenshots (Phase 2 + Phase 3)
+  vision_train_file: "data/websight/train.jsonl"
+  vision_val_file: "data/websight/val.jsonl"
   max_length: 4096
   shuffle_buffer: 10000           # Streaming shuffle buffer size
   num_workers: 4                  # DataLoader workers

scripts/download_websight.py ADDED Viewed

	@@ -0,0 +1,141 @@

+#!/usr/bin/env python3
+"""
+MINDI 1.5 Vision-Coder — Download WebSight v0.2 Subset
+Downloads UI screenshot + HTML/CSS code pairs from HuggingFaceM4/WebSight.
+Saves images to data/websight/images/ and creates data/websight/train.jsonl
+and data/websight/val.jsonl with the MINDI training format.
+Usage:
+    python3 scripts/download_websight.py --num_train 50000 --num_val 2500
+"""
+from __future__ import annotations
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+# Add project root to path
+PROJECT_ROOT = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(PROJECT_ROOT))
+def main():
+    """Download a WebSight subset and write MINDI-format train/val JSONL files.
+    Streams ``HuggingFaceM4/WebSight``, saves every kept screenshot as a JPEG
+    under ``<output_dir>/images/`` and writes one JSON object per line to
+    ``<output_dir>/train.jsonl`` (the first ``--num_train`` kept examples) and
+    ``<output_dir>/val.jsonl`` (the following ``--num_val`` kept examples).
+    Examples with no image or empty code are skipped and do not count towards
+    the requested totals.
+    """
+    # Local import: only needed for the final disk-usage report.
+    import subprocess
+    parser = argparse.ArgumentParser(description="Download WebSight dataset subset")
+    parser.add_argument("--num_train", type=int, default=50000,
+                        help="Number of training examples (default: 50000)")
+    parser.add_argument("--num_val", type=int, default=2500,
+                        help="Number of validation examples (default: 2500)")
+    parser.add_argument("--output_dir", type=str, default="data/websight",
+                        help="Output directory")
+    parser.add_argument("--version", type=str, default="v0.2",
+                        help="WebSight version (v0.1 or v0.2)")
+    args = parser.parse_args()
+    total = args.num_train + args.num_val
+    output_dir = Path(args.output_dir)
+    images_dir = output_dir / "images"
+    images_dir.mkdir(parents=True, exist_ok=True)
+    print("=" * 60)
+    print("  MINDI 1.5 — WebSight Dataset Download")
+    print("=" * 60)
+    print(f"  Version:  {args.version}")
+    print(f"  Train:    {args.num_train:,}")
+    print(f"  Val:      {args.num_val:,}")
+    print(f"  Output:   {output_dir}")
+    print()
+    # Load dataset with streaming to avoid downloading everything
+    print("[1/3] Loading WebSight dataset (streaming) ...")
+    from datasets import load_dataset
+    ds = load_dataset(
+        "HuggingFaceM4/WebSight",
+        args.version,
+        split="train",
+        streaming=True,
+        token=os.environ.get("HF_TOKEN"),
+    )
+    # Process examples
+    print(f"[2/3] Downloading {total:,} examples ...")
+    train_path = output_dir / "train.jsonl"
+    val_path = output_dir / "val.jsonl"
+    count = 0
+    # The context manager closes both files even if the stream raises mid-download,
+    # so whatever was already written is flushed to disk.
+    with open(train_path, "w", encoding="utf-8") as train_f, \
+            open(val_path, "w", encoding="utf-8") as val_f:
+        for i, example in enumerate(ds):
+            # Stop on the number of examples actually written, not on the stream
+            # index: skipped examples must not shrink the requested dataset size.
+            if count >= total:
+                break
+            # Extract image and code ("text" may be present but None)
+            image = example.get("image")
+            code = example.get("text") or ""
+            if image is None or not code.strip():
+                continue
+            # Save image; the stream index keeps file names and ids stable across runs
+            img_filename = f"ws_{i:07d}.jpg"
+            img_path = images_dir / img_filename
+            # JPEG cannot store alpha/palette modes, so normalise to RGB first.
+            if image.mode != "RGB":
+                image = image.convert("RGB")
+            image.save(str(img_path), "JPEG", quality=85)
+            # Create MINDI-format training example
+            entry = {
+                "id": f"websight_{i:07d}",
+                "type": "vision_code",
+                # Follow --version / --output_dir instead of hard-coding the defaults;
+                # with default arguments these values are unchanged.
+                "source": f"websight_{args.version}",
+                "image_path": img_path.as_posix(),
+                "messages": [
+                    {
+                        "role": "system",
+                        "content": "You are MINDI 1.5 Vision-Coder, a specialized AI for understanding UI screenshots and generating accurate HTML/CSS code."
+                    },
+                    {
+                        "role": "user",
+                        "content": "<|vision_start|><|vision_end|>\nGenerate the HTML/CSS code for this UI screenshot."
+                    },
+                    {
+                        "role": "assistant",
+                        "content": f"<|think_start|>I'll analyze the UI layout and generate the corresponding code.<|think_end|>\n<|code_start|>\n{code.strip()}\n<|code_end|>"
+                    }
+                ],
+                "metadata": {
+                    "dataset": "websight",
+                    "version": args.version,
+                }
+            }
+            # Split: first num_train → train, rest → val
+            target_f = train_f if count < args.num_train else val_f
+            target_f.write(json.dumps(entry, ensure_ascii=False) + "\n")
+            count += 1
+            if count % 1000 == 0:
+                print(f"  {count:,}/{total:,} downloaded ...")
+    # Stats
+    train_count = min(count, args.num_train)
+    val_count = max(0, count - args.num_train)
+    print("\n[3/3] Done!")
+    print(f"  Train: {train_count:,} examples → {train_path}")
+    print(f"  Val:   {val_count:,} examples → {val_path}")
+    print(f"  Images: {images_dir}")
+    # Flush so the label is emitted before du writes to the same stdout.
+    print("  Disk:  ", end="", flush=True)
+    try:
+        # Argument list (no shell) so a path with spaces or metacharacters is safe.
+        subprocess.run(["du", "-sh", str(output_dir)], check=False)
+    except FileNotFoundError:
+        # du is not available on this platform; the report is informational only.
+        print("n/a")
+if __name__ == "__main__":
+    main()

scripts/train.py CHANGED Viewed

@@ -84,12 +84,12 @@ def build_training_config(raw: dict, dry_run: bool = False):
     # Build phase configs from YAML
     phases = []
     phase_defs = [
-        ("phase1", "phase1_lora", True, False, False),
-        ("phase2", "phase2_vision_bridge", False, True, True),
-        ("phase3", "phase3_all", True, True, True),
     ]
     cumulative_step = 0
-    for key, name, lora, vision, fusion in phase_defs:
         pcfg = training.get(key, {})
         steps = pcfg.get("steps", 2500)
         if dry_run:
@@ -106,12 +106,15 @@ def build_training_config(raw: dict, dry_run: bool = False):
             lora=lora,
             vision_projection=vision,
             fusion=fusion,
         ))
         cumulative_step = end
     config = TrainingConfig(
         train_file=PROJECT_ROOT / data.get("train_file", "data/processed/train.jsonl"),
         val_file=PROJECT_ROOT / data.get("val_file", "data/processed/val.jsonl"),
         output_dir=PROJECT_ROOT / output.get("checkpoint_dir", "checkpoints/training"),
         log_dir=PROJECT_ROOT / logging_cfg.get("log_dir", "logs/training"),
         max_seq_length=data.get("max_length", 4096),

     # Build phase configs from YAML
     phases = []
     phase_defs = [
+        ("phase1", "phase1_lora", True, False, False, "text"),
+        ("phase2", "phase2_vision_bridge", False, True, True, "vision"),
+        ("phase3", "phase3_all", True, True, True, "mixed"),
     ]
     cumulative_step = 0
+    for key, name, lora, vision, fusion, data_type in phase_defs:
         pcfg = training.get(key, {})
         steps = pcfg.get("steps", 2500)
         if dry_run:
             lora=lora,
             vision_projection=vision,
             fusion=fusion,
+            data_type=data_type,
         ))
         cumulative_step = end
     config = TrainingConfig(
         train_file=PROJECT_ROOT / data.get("train_file", "data/processed/train.jsonl"),
         val_file=PROJECT_ROOT / data.get("val_file", "data/processed/val.jsonl"),
+        vision_train_file=PROJECT_ROOT / data.get("vision_train_file", "data/websight/train.jsonl"),
+        vision_val_file=PROJECT_ROOT / data.get("vision_val_file", "data/websight/val.jsonl"),
         output_dir=PROJECT_ROOT / output.get("checkpoint_dir", "checkpoints/training"),
         log_dir=PROJECT_ROOT / logging_cfg.get("log_dir", "logs/training"),
         max_seq_length=data.get("max_length", 4096),

src/training/mindi_trainer.py CHANGED Viewed

@@ -28,6 +28,7 @@ from typing import Any, Iterator, Optional
 import torch
 import torch.nn as nn
 from torch.optim import AdamW
 from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR
 from torch.utils.data import DataLoader, IterableDataset
@@ -50,6 +51,8 @@ class PhaseConfig:
     lora: bool = False
     vision_projection: bool = False
     fusion: bool = False
 @dataclass
@@ -59,6 +62,8 @@ class TrainingConfig:
     # Data paths
     train_file: Path = field(default_factory=lambda: PROJECT_ROOT / "data" / "processed" / "train.jsonl")
     val_file: Path = field(default_factory=lambda: PROJECT_ROOT / "data" / "processed" / "val.jsonl")
     # Output
     output_dir: Path = field(default_factory=lambda: PROJECT_ROOT / "checkpoints" / "training")
@@ -93,18 +98,21 @@ class TrainingConfig:
             start_step=0, end_step=5000,
             learning_rate=2e-4, batch_size=16,
             lora=True, vision_projection=False, fusion=False,
         ),
         PhaseConfig(
             name="phase2_vision_bridge",
             start_step=5000, end_step=7500,
             learning_rate=1e-5, batch_size=8,
             lora=False, vision_projection=True, fusion=True,
         ),
         PhaseConfig(
             name="phase3_all",
             start_step=7500, end_step=10000,
             learning_rate=5e-5, batch_size=12,
             lora=True, vision_projection=True, fusion=True,
         ),
     ])
@@ -123,9 +131,11 @@ class StreamingJSONLDataset(IterableDataset):
     """
     Streams JSONL training data from disk line by line.
     Tokenizes on-the-fly to avoid loading 4+ GB into RAM.
     Expected JSONL format:
         {"id": "...", "type": "...", "source": "...",
          "messages": [{"role": "system", "content": "..."},
                       {"role": "user", "content": "..."},
                       {"role": "assistant", "content": "..."}],
@@ -139,12 +149,14 @@ class StreamingJSONLDataset(IterableDataset):
         max_length: int = 8192,
         shuffle_buffer: int = 10000,
         seed: int = 42,
     ) -> None:
         self.file_path = Path(file_path)
         self.tokenizer = tokenizer
         self.max_length = max_length
         self.shuffle_buffer = shuffle_buffer
         self.seed = seed
         if not self.file_path.exists():
             raise FileNotFoundError(f"Training data not found: {self.file_path}")
@@ -212,7 +224,18 @@ class StreamingJSONLDataset(IterableDataset):
             rng.shuffle(buffer)
             yield from buffer
-    def __iter__(self) -> Iterator[dict[str, torch.Tensor]]:
         for example in self._shuffled_iterator():
             messages = example.get("messages", [])
             if not messages:
@@ -220,6 +243,12 @@ class StreamingJSONLDataset(IterableDataset):
             text = self._format_messages(messages)
             tokenized = self._tokenize(text)
             if tokenized is not None:
                 yield tokenized
     def count_lines(self) -> int:
@@ -342,6 +371,17 @@ class MINDITrainer:
             shuffle_buffer=shuffle_buffer,
             seed=self.config.seed,
         )
         return DataLoader(
             dataset,
             batch_size=batch_size,
@@ -349,6 +389,7 @@ class MINDITrainer:
             pin_memory=self.config.pin_memory,
             prefetch_factor=self.config.prefetch_factor if self.config.num_workers > 0 else None,
             drop_last=True,
         )
     def _log_metrics(self, metrics: dict) -> None:
@@ -380,12 +421,20 @@ class MINDITrainer:
             input_ids = batch["input_ids"].to(self.device)
             attention_mask = batch["attention_mask"].to(self.device)
             labels = batch["labels"].to(self.device)
             with torch.autocast(device_type="cuda", dtype=self.amp_dtype, enabled=self.use_amp):
                 result = self.model(
                     input_ids=input_ids,
                     attention_mask=attention_mask,
                     labels=labels,
                 )
             if result["loss"] is not None:
@@ -433,6 +482,7 @@ class MINDITrainer:
         print(f"  LR: {phase.learning_rate}  |  Batch: {phase.batch_size}")
         print(f"  Components: LoRA={phase.lora}, Vision={phase.vision_projection}, "
               f"Fusion={phase.fusion}")
         print("=" * 60)
         # Set trainable components
@@ -446,12 +496,21 @@ class MINDITrainer:
         optimizer = self._build_optimizer(phase)
         scheduler = self._build_scheduler(optimizer, phase)
         # Build data loaders
         train_loader = self._build_dataloader(
-            self.config.train_file, phase.batch_size
         )
         val_loader = self._build_dataloader(
-            self.config.val_file, batch_size=max(phase.batch_size // 2, 1),
             shuffle_buffer=1000,
         )
@@ -475,6 +534,15 @@ class MINDITrainer:
             input_ids = batch["input_ids"].to(self.device)
             attention_mask = batch["attention_mask"].to(self.device)
             labels = batch["labels"].to(self.device)
             # Forward pass with mixed precision
             with torch.autocast(device_type="cuda", dtype=self.amp_dtype, enabled=self.use_amp):
@@ -482,6 +550,7 @@ class MINDITrainer:
                     input_ids=input_ids,
                     attention_mask=attention_mask,
                     labels=labels,
                 )
                 loss = result["loss"]

 import torch
 import torch.nn as nn
+from PIL import Image
 from torch.optim import AdamW
 from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR
 from torch.utils.data import DataLoader, IterableDataset
     lora: bool = False
     vision_projection: bool = False
     fusion: bool = False
+    # Data type: "text" for code-only, "vision" for image+code, "mixed" for both
+    data_type: str = "text"
 @dataclass
     # Data paths
     train_file: Path = field(default_factory=lambda: PROJECT_ROOT / "data" / "processed" / "train.jsonl")
     val_file: Path = field(default_factory=lambda: PROJECT_ROOT / "data" / "processed" / "val.jsonl")
+    vision_train_file: Path = field(default_factory=lambda: PROJECT_ROOT / "data" / "websight" / "train.jsonl")
+    vision_val_file: Path = field(default_factory=lambda: PROJECT_ROOT / "data" / "websight" / "val.jsonl")
     # Output
     output_dir: Path = field(default_factory=lambda: PROJECT_ROOT / "checkpoints" / "training")
             start_step=0, end_step=5000,
             learning_rate=2e-4, batch_size=16,
             lora=True, vision_projection=False, fusion=False,
+            data_type="text",
         ),
         PhaseConfig(
             name="phase2_vision_bridge",
             start_step=5000, end_step=7500,
             learning_rate=1e-5, batch_size=8,
             lora=False, vision_projection=True, fusion=True,
+            data_type="vision",
         ),
         PhaseConfig(
             name="phase3_all",
             start_step=7500, end_step=10000,
             learning_rate=5e-5, batch_size=12,
             lora=True, vision_projection=True, fusion=True,
+            data_type="mixed",
         ),
     ])
     """
     Streams JSONL training data from disk line by line.
     Tokenizes on-the-fly to avoid loading 4+ GB into RAM.
+    Supports optional image loading for vision-code pairs.
     Expected JSONL format:
         {"id": "...", "type": "...", "source": "...",
+         "image_path": "data/websight/images/ws_0000001.jpg",   (optional)
          "messages": [{"role": "system", "content": "..."},
                       {"role": "user", "content": "..."},
                       {"role": "assistant", "content": "..."}],
         max_length: int = 8192,
         shuffle_buffer: int = 10000,
         seed: int = 42,
+        project_root: Optional[Path] = None,
     ) -> None:
         self.file_path = Path(file_path)
         self.tokenizer = tokenizer
         self.max_length = max_length
         self.shuffle_buffer = shuffle_buffer
         self.seed = seed
+        self.project_root = Path(project_root) if project_root else PROJECT_ROOT
         if not self.file_path.exists():
             raise FileNotFoundError(f"Training data not found: {self.file_path}")
             rng.shuffle(buffer)
             yield from buffer
+    def _load_image(self, image_path: str) -> Optional[Image.Image]:
+        """Load an example's image as RGB, best-effort.
+        Args:
+            image_path: Value of the example's ``image_path`` field; it is
+                joined onto ``self.project_root``.
+        Returns:
+            The decoded image converted to RGB, or ``None`` when the file is
+            missing or cannot be opened/decoded. Failures are logged as
+            warnings rather than raised so one bad file does not stop the
+            data stream.
+        """
+        import logging
+        try:
+            full_path = self.project_root / image_path
+            # The context manager releases the file handle once the pixels are
+            # decoded; convert() returns an independent in-memory image.
+            with Image.open(full_path) as img:
+                return img.convert("RGB")
+        except FileNotFoundError:
+            reason = "file not found"
+        except Exception as exc:  # deliberate best-effort: any decode error skips the image
+            reason = f"{type(exc).__name__}: {exc}"
+        logging.getLogger(__name__).warning(
+            "Skipping image %r: %s", image_path, reason
+        )
+        return None
+    def __iter__(self) -> Iterator[dict[str, Any]]:
         for example in self._shuffled_iterator():
             messages = example.get("messages", [])
             if not messages:
             text = self._format_messages(messages)
             tokenized = self._tokenize(text)
             if tokenized is not None:
+                # Load image if path present
+                image_path = example.get("image_path")
+                if image_path:
+                    tokenized["image"] = self._load_image(image_path)
+                else:
+                    tokenized["image"] = None
                 yield tokenized
     def count_lines(self) -> int:
             shuffle_buffer=shuffle_buffer,
             seed=self.config.seed,
         )
+        def _collate_fn(batch):
+            """Merge per-example dicts into one batch dict.
+            ``input_ids``, ``attention_mask`` and ``labels`` are stacked along a
+            new leading dimension; each example's optional ``image`` entry is
+            gathered, unstacked, into the ``images`` list (``None`` when absent).
+            """
+            tensor_keys = ("input_ids", "attention_mask", "labels")
+            merged = {
+                key: torch.stack([sample[key] for sample in batch])
+                for key in tensor_keys
+            }
+            # Images stay a plain Python list: entries may be None.
+            merged["images"] = [sample.get("image") for sample in batch]
+            return merged
         return DataLoader(
             dataset,
             batch_size=batch_size,
             pin_memory=self.config.pin_memory,
             prefetch_factor=self.config.prefetch_factor if self.config.num_workers > 0 else None,
             drop_last=True,
+            collate_fn=_collate_fn,
         )
     def _log_metrics(self, metrics: dict) -> None:
             input_ids = batch["input_ids"].to(self.device)
             attention_mask = batch["attention_mask"].to(self.device)
             labels = batch["labels"].to(self.device)
+            images = batch.get("images")
+            image = None
+            if images:
+                for img in images:
+                    if img is not None:
+                        image = img
+                        break
             with torch.autocast(device_type="cuda", dtype=self.amp_dtype, enabled=self.use_amp):
                 result = self.model(
                     input_ids=input_ids,
                     attention_mask=attention_mask,
                     labels=labels,
+                    image=image,
                 )
             if result["loss"] is not None:
         print(f"  LR: {phase.learning_rate}  |  Batch: {phase.batch_size}")
         print(f"  Components: LoRA={phase.lora}, Vision={phase.vision_projection}, "
               f"Fusion={phase.fusion}")
+        print(f"  Data: {phase.data_type}")
         print("=" * 60)
         # Set trainable components
         optimizer = self._build_optimizer(phase)
         scheduler = self._build_scheduler(optimizer, phase)
+        # Select data files based on phase data_type
+        if phase.data_type == "vision":
+            train_file = self.config.vision_train_file
+            val_file = self.config.vision_val_file
+        else:
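+            # "text" or "mixed" — both read the main data files.
+            # NOTE(review): configs/training_config.yaml describes train_file as
+            # text-only and lists vision_train_file for Phase 2 + Phase 3, so "mixed"
+            # sees images only if vision examples are merged into train_file — confirm.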
+            train_file = self.config.train_file
+            val_file = self.config.val_file
         # Build data loaders
         train_loader = self._build_dataloader(
+            train_file, phase.batch_size
         )
         val_loader = self._build_dataloader(
+            val_file, batch_size=max(phase.batch_size // 2, 1),
             shuffle_buffer=1000,
         )
             input_ids = batch["input_ids"].to(self.device)
             attention_mask = batch["attention_mask"].to(self.device)
             labels = batch["labels"].to(self.device)
+            images = batch.get("images")  # list of PIL Images or Nones
+            # Pick first non-None image in batch (model processes one image at a time)
+            image = None
+            if images:
+                for img in images:
+                    if img is not None:
+                        image = img
+                        break
             # Forward pass with mixed precision
             with torch.autocast(device_type="cuda", dtype=self.amp_dtype, enabled=self.use_amp):
                     input_ids=input_ids,
                     attention_mask=attention_mask,
                     labels=labels,
+                    image=image,
                 )
                 loss = result["loss"]