MINDI-1.5-Vision-Coder / scripts /download_websight.py
Faaz
Add WebSight vision data pipeline: download script, image-aware data loader, phase data routing
672896a
#!/usr/bin/env python3
"""
MINDI 1.5 Vision-Coder — Download WebSight v0.2 Subset
Downloads UI screenshot + HTML/CSS code pairs from HuggingFaceM4/WebSight.
Saves images to data/websight/images/ and creates data/websight/train.jsonl
and data/websight/val.jsonl with the MINDI training format.
Usage:
python3 scripts/download_websight.py --num_train 50000 --num_val 2500
"""
from __future__ import annotations
import argparse
import json
import os
import sys
from pathlib import Path
# Add project root to path
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
def main():
parser = argparse.ArgumentParser(description="Download WebSight dataset subset")
parser.add_argument("--num_train", type=int, default=50000,
help="Number of training examples (default: 50000)")
parser.add_argument("--num_val", type=int, default=2500,
help="Number of validation examples (default: 2500)")
parser.add_argument("--output_dir", type=str, default="data/websight",
help="Output directory")
parser.add_argument("--version", type=str, default="v0.2",
help="WebSight version (v0.1 or v0.2)")
args = parser.parse_args()
total = args.num_train + args.num_val
output_dir = Path(args.output_dir)
images_dir = output_dir / "images"
images_dir.mkdir(parents=True, exist_ok=True)
print("=" * 60)
print(" MINDI 1.5 — WebSight Dataset Download")
print("=" * 60)
print(f" Version: {args.version}")
print(f" Train: {args.num_train:,}")
print(f" Val: {args.num_val:,}")
print(f" Output: {output_dir}")
print()
# Load dataset with streaming to avoid downloading everything
print("[1/3] Loading WebSight dataset (streaming) ...")
from datasets import load_dataset
ds = load_dataset(
"HuggingFaceM4/WebSight",
args.version,
split="train",
streaming=True,
token=os.environ.get("HF_TOKEN"),
)
# Process examples
print(f"[2/3] Downloading {total:,} examples ...")
train_path = output_dir / "train.jsonl"
val_path = output_dir / "val.jsonl"
train_f = open(train_path, "w", encoding="utf-8")
val_f = open(val_path, "w", encoding="utf-8")
count = 0
for i, example in enumerate(ds):
if i >= total:
break
# Extract image and code
image = example.get("image")
code = example.get("text", "")
if image is None or not code.strip():
continue
# Save image
img_filename = f"ws_{i:07d}.jpg"
img_path = images_dir / img_filename
image.save(str(img_path), "JPEG", quality=85)
# Create MINDI-format training example
entry = {
"id": f"websight_{i:07d}",
"type": "vision_code",
"source": "websight_v0.2",
"image_path": f"data/websight/images/{img_filename}",
"messages": [
{
"role": "system",
"content": "You are MINDI 1.5 Vision-Coder, a specialized AI for understanding UI screenshots and generating accurate HTML/CSS code."
},
{
"role": "user",
"content": "<|vision_start|><|vision_end|>\nGenerate the HTML/CSS code for this UI screenshot."
},
{
"role": "assistant",
"content": f"<|think_start|>I'll analyze the UI layout and generate the corresponding code.<|think_end|>\n<|code_start|>\n{code.strip()}\n<|code_end|>"
}
],
"metadata": {
"dataset": "websight",
"version": args.version,
}
}
# Split: first num_train → train, rest → val
if count < args.num_train:
train_f.write(json.dumps(entry, ensure_ascii=False) + "\n")
else:
val_f.write(json.dumps(entry, ensure_ascii=False) + "\n")
count += 1
if count % 1000 == 0:
print(f" {count:,}/{total:,} downloaded ...")
train_f.close()
val_f.close()
# Stats
train_count = min(count, args.num_train)
val_count = max(0, count - args.num_train)
print(f"\n[3/3] Done!")
print(f" Train: {train_count:,} examples → {train_path}")
print(f" Val: {val_count:,} examples → {val_path}")
print(f" Images: {images_dir}")
print(f" Disk: ", end="")
os.system(f"du -sh {output_dir}")
if __name__ == "__main__":
main()