File size: 5,098 Bytes
9508d8c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
"""Create minimal sample data for quick demo without full COCO/Flickr30k download."""

import argparse
import sys
from pathlib import Path

# Directory two levels above this script's resolved location (presumably the
# project root — confirm against the repository layout). main() uses it as the
# base for the default output directory and for relative --output paths.
ROOT = Path(__file__).resolve().parent.parent


def download_hf_sample(output_dir: Path, max_images: int = 50):
    """Download a small captioned image sample and record it in a CSV file.

    A short curated list of image URLs is used first; the remainder (up to
    ``max_images`` in total) is taken from the streamed
    ``ChristophSchuhmann/MS_COCO_2017_URL_TEXT`` dataset. Every image is
    fetched in parallel, converted to RGB, shrunk to fit within 300x300 and
    written as ``sample_<index>.jpg`` inside ``output_dir``. A header-less
    ``captions.csv`` (file name, caption) is written to ``output_dir.parent``.

    Downloads are best-effort: a URL that fails is reported and skipped, so
    fewer than ``max_images`` items may be returned.

    Args:
        output_dir: Directory that receives the JPEG files (created if needed).
        max_images: Upper bound on the number of URLs to attempt, curated
            entries included. Values below zero are treated as zero.

    Returns:
        A list of ``(id, path, caption)`` tuples for the images that were
        saved, ordered by their task index.

    Raises:
        SystemExit: If ``datasets``, ``requests`` or ``Pillow`` is missing.
    """
    try:
        from datasets import load_dataset
        import requests
        from PIL import Image
        from io import BytesIO
        import concurrent.futures
        import csv
    except ImportError:
        print("Install: pip install datasets requests Pillow")
        sys.exit(1)

    output_dir.mkdir(parents=True, exist_ok=True)

    # Curated images
    curated = [
        (
            "https://images.unsplash.com/photo-1548199973-03cce0bbc87b?w=600&q=80",
            "Two dogs playing in the snow, running and jumping.",
        ),
        (
            "https://images.unsplash.com/photo-1554580665-9831c2063eef?w=600&q=80",
            "A man selling ice cream or desserts from a small cart outdoors.",
        ),
        (
            "https://images.unsplash.com/photo-1514888286974-6c03e2ca1dba?w=600&q=80",
            "A cat sitting on a windowsill looking out.",
        ),
    ]

    # Respect max_images even when it is smaller than the curated list.
    limit = max(max_images, 0)
    tasks = list(curated[:limit])
    seen_urls = {url for url, _ in tasks}

    # Only open the dataset stream when the curated entries do not fill the quota.
    if len(tasks) < limit:
        print("Loading COCO dataset stream...")
        ds = load_dataset(
            "ChristophSchuhmann/MS_COCO_2017_URL_TEXT", split="train", streaming=True
        )
        for ex in ds:
            if len(tasks) >= limit:
                break
            url = ex.get("URL")
            cap = ex.get("TEXT")
            if url and cap and url not in seen_urls:
                tasks.append((url, cap))
                seen_urls.add(url)

    print(f"Collected {len(tasks)} target URLs. Starting parallel download...")

    def download_image(index, url, caption):
        """Fetch one URL and save it; return (index, id, path, caption) or None."""
        try:
            r = requests.get(url, timeout=5)
            if r.status_code != 200:
                print(f"Skipping {url}: HTTP {r.status_code}")
                return None
            img = Image.open(BytesIO(r.content)).convert("RGB")
            out_path = output_dir / f"sample_{index}.jpg"
            # Resize image slightly to save disk space for large datasets
            img.thumbnail((300, 300))
            img.save(out_path, format="JPEG", quality=85)
            return index, f"img_{index}", out_path, caption
        except Exception as exc:
            # Deliberately broad: one bad URL or corrupt image must not abort
            # the whole sample, but the failure is reported instead of hidden.
            print(f"Skipping {url}: {exc}")
        return None

    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        futures = [
            executor.submit(download_image, i, url, cap)
            for i, (url, cap) in enumerate(tasks)
        ]
        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            if result:
                results.append(result)
                if len(results) % 100 == 0:
                    print(f"Downloaded {len(results)}/{len(tasks)} images")

    # Futures complete in arbitrary order; sort by task index so the CSV and
    # the returned list are reproducible between runs.
    results.sort(key=lambda item: item[0])
    items = [(uid, path, cap) for _, uid, path, cap in results]

    # Write captions to CSV in the parent directory
    csv_path = output_dir.parent / "captions.csv"
    with open(csv_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        for _, path, cap in items:
            writer.writerow([path.name, cap])

    print(f"Saved {len(items)} captions to {csv_path}")
    return items


def create_placeholder_images(output_dir: Path, count: int = 20):
    """Create solid-colour placeholder JPEGs for testing without downloads.

    Writes ``count`` 224x224 RGB images named ``demo_<i>.jpg`` into
    ``output_dir``, cycling through a fixed five-colour palette. The colour
    name doubles as the caption of each image.

    Args:
        output_dir: Directory that receives the images (created if needed).
        count: Number of images to generate.

    Returns:
        A list of ``(id, path, label)`` tuples, one per generated image, where
        ``id`` is ``img_<i>`` and ``label`` is the colour name.

    Raises:
        SystemExit: If Pillow is not installed.
    """
    try:
        from PIL import Image
    except ImportError:
        print("PIL required")
        sys.exit(1)

    output_dir.mkdir(parents=True, exist_ok=True)

    # Label and RGB value are kept together so they cannot drift out of sync.
    palette = [
        ("red", (255, 0, 0)),
        ("green", (0, 255, 0)),
        ("blue", (0, 0, 255)),
        ("yellow", (255, 255, 0)),
        ("purple", (128, 0, 128)),
    ]

    items = []
    for i in range(count):
        label, rgb = palette[i % len(palette)]
        path = output_dir / f"demo_{i}.jpg"
        Image.new("RGB", (224, 224), color=rgb).save(path)
        items.append((f"img_{i}", path, label))
    return items


def main(argv=None):
    """Parse command-line options and generate the sample image set.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads the arguments from ``sys.argv``.

    Returns:
        The list of ``(id, path, caption)`` tuples produced by the selected
        generation method.

    Raises:
        SystemExit: On invalid arguments (raised by argparse), including a
            negative ``--count``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--output",
        type=Path,
        default=ROOT / "data" / "images",
        help="directory for the generated images (relative paths are resolved against ROOT)",
    )
    parser.add_argument(
        "--method",
        choices=["hf", "placeholder"],
        default="placeholder",
        help="'placeholder' draws coloured squares; 'hf' downloads real captioned images",
    )
    parser.add_argument("--count", type=int, default=50, help="number of images to create")
    args = parser.parse_args(argv)

    # A negative count has no sensible meaning for either method.
    if args.count < 0:
        parser.error("--count must be non-negative")

    out = args.output
    if not out.is_absolute():
        # Relative paths are anchored at ROOT, not the current working directory.
        out = ROOT / out
    out.mkdir(parents=True, exist_ok=True)

    if args.method == "placeholder":
        items = create_placeholder_images(out, count=args.count)
    else:
        items = download_hf_sample(out, max_images=args.count)

    print(f"Created {len(items)} images in {out}")
    return items


# Run the generator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()