#!/usr/bin/env python3
"""
MINDI 1.5 Vision-Coder — Download WebSight v0.2 Subset
Downloads UI screenshot + HTML/CSS code pairs from HuggingFaceM4/WebSight.
Saves images to data/websight/images/ and creates data/websight/train.jsonl
and data/websight/val.jsonl with the MINDI training format.
Usage:
python3 scripts/download_websight.py --num_train 50000 --num_val 2500
"""
from __future__ import annotations
import argparse
import json
import os
import sys
from pathlib import Path
# Make the directory two levels above this file importable by putting it
# at the front of the module search path.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path[:0] = [os.fspath(PROJECT_ROOT)]
def _format_size(num_bytes: int) -> str:
    """Format a byte count as a short human-readable string such as ``1.5M``."""
    size = float(num_bytes)
    units = ("B", "K", "M", "G", "T")
    for unit in units[:-1]:
        if size < 1024:
            return f"{size:.1f}{unit}"
        size /= 1024
    return f"{size:.1f}{units[-1]}"


def _directory_size(root: Path) -> int:
    """Return the summed size in bytes of all regular files below *root*."""
    return sum(p.stat().st_size for p in root.rglob("*") if p.is_file())


def _build_entry(index: int, image_path: str, code: str, version: str) -> dict:
    """Build one MINDI-format training record for a screenshot/code pair.

    Args:
        index: Position of the example in the source stream; used for the id.
        image_path: Path of the saved screenshot, as stored in the record.
        code: The HTML/CSS source for the screenshot, already stripped.
        version: WebSight version string recorded in the record.
    """
    return {
        "id": f"websight_{index:07d}",
        "type": "vision_code",
        # Derived from the requested version so it cannot disagree with
        # metadata.version (identical to the old literal for the default).
        "source": f"websight_{version}",
        "image_path": image_path,
        "messages": [
            {
                "role": "system",
                "content": "You are MINDI 1.5 Vision-Coder, a specialized AI for understanding UI screenshots and generating accurate HTML/CSS code."
            },
            {
                "role": "user",
                "content": "<|vision_start|><|vision_end|>\nGenerate the HTML/CSS code for this UI screenshot."
            },
            {
                "role": "assistant",
                "content": f"<|think_start|>I'll analyze the UI layout and generate the corresponding code.<|think_end|>\n<|code_start|>\n{code}\n<|code_end|>"
            },
        ],
        "metadata": {
            "dataset": "websight",
            "version": version,
        },
    }


def main() -> None:
    """Download a WebSight subset and write it out in MINDI training format.

    Streams ``HuggingFaceM4/WebSight``, saves each screenshot as a JPEG under
    ``<output_dir>/images`` and appends one JSON record per example to
    ``<output_dir>/train.jsonl`` (the first ``--num_train`` usable examples)
    or ``<output_dir>/val.jsonl`` (the following ``--num_val``).

    Examples without an image or with empty code are skipped and do not count
    towards the requested totals. The ``HF_TOKEN`` environment variable, if
    set, is passed to ``load_dataset`` as the access token.
    """
    parser = argparse.ArgumentParser(description="Download WebSight dataset subset")
    parser.add_argument("--num_train", type=int, default=50000,
                        help="Number of training examples (default: 50000)")
    parser.add_argument("--num_val", type=int, default=2500,
                        help="Number of validation examples (default: 2500)")
    parser.add_argument("--output_dir", type=str, default="data/websight",
                        help="Output directory")
    parser.add_argument("--version", type=str, default="v0.2",
                        help="WebSight version (v0.1 or v0.2)")
    args = parser.parse_args()

    if args.num_train < 0 or args.num_val < 0:
        parser.error("--num_train and --num_val must be non-negative")

    total = args.num_train + args.num_val
    output_dir = Path(args.output_dir)
    images_dir = output_dir / "images"
    images_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 60)
    print(" MINDI 1.5 — WebSight Dataset Download")
    print("=" * 60)
    print(f" Version: {args.version}")
    print(f" Train: {args.num_train:,}")
    print(f" Val: {args.num_val:,}")
    print(f" Output: {output_dir}")
    print()

    # Load dataset with streaming to avoid downloading everything.
    print("[1/3] Loading WebSight dataset (streaming) ...")
    # Imported here, after argument parsing, so that --help and argument
    # errors do not require the third-party package to be importable.
    from datasets import load_dataset

    ds = load_dataset(
        "HuggingFaceM4/WebSight",
        args.version,
        split="train",
        streaming=True,
        token=os.environ.get("HF_TOKEN"),
    )

    print(f"[2/3] Downloading {total:,} examples ...")
    train_path = output_dir / "train.jsonl"
    val_path = output_dir / "val.jsonl"

    count = 0
    # Context managers guarantee both files are flushed and closed even if
    # the stream or an image save raises part-way through.
    with train_path.open("w", encoding="utf-8") as train_f, \
            val_path.open("w", encoding="utf-8") as val_f:
        for i, example in enumerate(ds):
            # Stop on the number of examples actually written, not on the
            # stream position, so skipped examples do not shrink the output.
            if count >= total:
                break

            image = example.get("image")
            # "text" may be missing or explicitly None; treat both as empty.
            code = (example.get("text") or "").strip()
            if image is None or not code:
                continue

            # Files and ids are numbered by stream position.
            img_filename = f"ws_{i:07d}.jpg"
            img_path = images_dir / img_filename
            # Presumably a PIL image (it is saved via .save(..., "JPEG")).
            # JPEG cannot store alpha/palette modes, so normalise those.
            if getattr(image, "mode", "RGB") not in ("RGB", "L", "CMYK"):
                image = image.convert("RGB")
            image.save(str(img_path), "JPEG", quality=85)

            entry = _build_entry(
                i,
                # Follows --output_dir; equals the former hard-coded
                # "data/websight/images/..." for the default directory.
                img_path.as_posix(),
                code,
                args.version,
            )
            line = json.dumps(entry, ensure_ascii=False) + "\n"

            # Split: first num_train -> train, rest -> val.
            if count < args.num_train:
                train_f.write(line)
            else:
                val_f.write(line)

            count += 1
            if count % 1000 == 0:
                print(f" {count:,}/{total:,} downloaded ...")

    train_count = min(count, args.num_train)
    val_count = max(0, count - args.num_train)
    print("\n[3/3] Done!")
    if count < total:
        print(f" Warning: stream ended after {count:,} of {total:,} requested examples")
    print(f" Train: {train_count:,} examples → {train_path}")
    print(f" Val: {val_count:,} examples → {val_path}")
    print(f" Images: {images_dir}")
    # Size is computed in-process rather than by shelling out to `du`, which
    # interpolated the user-supplied path into a shell command line.
    print(f" Disk: {_format_size(_directory_size(output_dir))}\t{output_dir}")
# Run the download only when this file is executed as a script.
if __name__ == "__main__":
    main()