File size: 4,649 Bytes
672896a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/env python3
"""
MINDI 1.5 Vision-Coder — Download WebSight v0.2 Subset

Downloads UI screenshot + HTML/CSS code pairs from HuggingFaceM4/WebSight.
Saves images to data/websight/images/ and creates data/websight/train.jsonl
and data/websight/val.jsonl with the MINDI training format.

Usage:
    python3 scripts/download_websight.py --num_train 50000 --num_val 2500
"""

from __future__ import annotations

import argparse
import json
import os
import sys
from pathlib import Path

# Resolve the directory two levels above this file (the parent of the folder
# that contains this script) and put it at the front of sys.path.
# NOTE(review): presumably this makes project-local packages importable when
# the script is run directly; nothing in this file imports one — confirm it
# is still needed.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT))


def _build_entry(index, image_path, code, version):
    """Assemble one MINDI-format training record for a screenshot/code pair.

    Args:
        index: Position of the example in the source stream; used for the id.
        image_path: Path (as a string) of the saved screenshot.
        code: HTML/CSS source for the screenshot, already stripped.
        version: WebSight configuration name the example came from.

    Returns:
        A JSON-serialisable dict in the MINDI chat training format.
    """
    return {
        "id": f"websight_{index:07d}",
        "type": "vision_code",
        "source": f"websight_{version}",
        "image_path": image_path,
        "messages": [
            {
                "role": "system",
                "content": "You are MINDI 1.5 Vision-Coder, a specialized AI for understanding UI screenshots and generating accurate HTML/CSS code."
            },
            {
                "role": "user",
                "content": "<|vision_start|><|vision_end|>\nGenerate the HTML/CSS code for this UI screenshot."
            },
            {
                "role": "assistant",
                "content": f"<|think_start|>I'll analyze the UI layout and generate the corresponding code.<|think_end|>\n<|code_start|>\n{code}\n<|code_end|>"
            },
        ],
        "metadata": {
            "dataset": "websight",
            "version": version,
        },
    }


def main():
    """Download a WebSight subset and write MINDI-format train/val JSONL files.

    Parses ``--num_train``, ``--num_val``, ``--output_dir`` and ``--version``
    from the command line, streams ``HuggingFaceM4/WebSight``, saves each
    usable screenshot as a JPEG under ``<output_dir>/images`` and writes one
    JSON record per line to ``<output_dir>/train.jsonl`` (the first
    ``num_train`` usable examples) and ``<output_dir>/val.jsonl`` (the next
    ``num_val``). Examples with no image or empty code are skipped and do not
    count towards the requested totals.

    Reads the ``HF_TOKEN`` environment variable (if set) as the Hub token.
    """
    parser = argparse.ArgumentParser(description="Download WebSight dataset subset")
    parser.add_argument("--num_train", type=int, default=50000,
                        help="Number of training examples (default: 50000)")
    parser.add_argument("--num_val", type=int, default=2500,
                        help="Number of validation examples (default: 2500)")
    parser.add_argument("--output_dir", type=str, default="data/websight",
                        help="Output directory")
    parser.add_argument("--version", type=str, default="v0.2",
                        help="WebSight version (v0.1 or v0.2)")
    args = parser.parse_args()

    if args.num_train < 0 or args.num_val < 0:
        parser.error("--num_train and --num_val must be non-negative")

    total = args.num_train + args.num_val
    output_dir = Path(args.output_dir)
    images_dir = output_dir / "images"
    images_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 60)
    print("  MINDI 1.5 — WebSight Dataset Download")
    print("=" * 60)
    print(f"  Version:  {args.version}")
    print(f"  Train:    {args.num_train:,}")
    print(f"  Val:      {args.num_val:,}")
    print(f"  Output:   {output_dir}")
    print()

    # Load dataset with streaming to avoid downloading everything
    print("[1/3] Loading WebSight dataset (streaming) ...")
    from datasets import load_dataset

    ds = load_dataset(
        "HuggingFaceM4/WebSight",
        args.version,
        split="train",
        streaming=True,
        token=os.environ.get("HF_TOKEN"),
    )

    # Process examples
    print(f"[2/3] Downloading {total:,} examples ...")
    train_path = output_dir / "train.jsonl"
    val_path = output_dir / "val.jsonl"

    count = 0
    # Context managers guarantee both files are flushed and closed even if the
    # stream or an image save raises part-way through.
    with open(train_path, "w", encoding="utf-8") as train_f, \
            open(val_path, "w", encoding="utf-8") as val_f:
        # Do not pull anything from the stream when nothing was requested.
        stream = iter(ds) if total > 0 else iter(())
        for i, example in enumerate(stream):
            image = example.get("image")
            # "text" may be present but null; treat that like an empty string.
            code = (example.get("text") or "").strip()
            if image is None or not code:
                continue

            # JPEG cannot store alpha/palette modes; normalise those to RGB.
            if image.mode not in ("RGB", "L", "CMYK"):
                image = image.convert("RGB")

            # File names keep the stream index so ids map back to the dataset.
            img_filename = f"ws_{i:07d}.jpg"
            img_path = images_dir / img_filename
            image.save(str(img_path), "JPEG", quality=85)

            entry = _build_entry(i, img_path.as_posix(), code, args.version)

            # Split: first num_train → train, rest → val
            sink = train_f if count < args.num_train else val_f
            sink.write(json.dumps(entry, ensure_ascii=False) + "\n")

            count += 1
            if count % 1000 == 0:
                print(f"  {count:,}/{total:,} downloaded ...")
            # Stop on the number of examples written, not the stream index,
            # so skipped rows do not shrink the requested dataset size.
            if count >= total:
                break

    # Stats
    train_count = min(count, args.num_train)
    val_count = max(0, count - args.num_train)

    print("\n[3/3] Done!")
    print(f"  Train: {train_count:,} examples → {train_path}")
    print(f"  Val:   {val_count:,} examples → {val_path}")
    print(f"  Images: {images_dir}")
    # Flush before handing stdout to the child so the label precedes its output.
    print("  Disk:  ", end="", flush=True)
    import subprocess

    try:
        # Argument list (no shell) so paths with spaces or metacharacters are safe.
        subprocess.run(["du", "-sh", str(output_dir)], check=False)
    except OSError:
        # `du` is not available on every platform; the summary is optional.
        print("(unavailable)")


# Run the download only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()