"""
step2_prepare_data.py
======================
Task 5 - Component 2: Caption generation for 1000 COCO val images.

In LIVE mode:
  - Streams COCO val via whyen-wang/coco_captions dataset
  - Generates one beam-search caption per image using BLIP
  - Saves captions_1000.json

In DEMO mode (precomputed):
  - Returns a synthetic caption set seeded to mimic real COCO distribution
  - Covers city scenes, people, sports, food, animals, nature and indoor
    settings - a realistic variety, including some mildly biased phrasings
    for the bias audit to detect

Public API
----------
    generate_captions(model, processor, device,
                      n=1000, save_dir=...) -> list[dict]

    _load_or_use_precomputed(save_dir) -> list[dict]
        Each dict: {image_id, caption, source}
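
    Demo-mode example (no model required; reads or creates the cached JSON):

        records = _load_or_use_precomputed("task/task_05/results")
        print(records[0])  # {"image_id": 0, "caption": "...", "source": "..."}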

Standalone usage
----------------
    export PYTHONPATH=.
    venv/bin/python task/task_05/step2_prepare_data.py
"""

import os
import sys
import json
import random

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))


# ─────────────────────────────────────────────────────────────────────────────
# Precomputed caption bank (1000 items; seeded for reproducibility)
# ─────────────────────────────────────────────────────────────────────────────

_CAPTION_TEMPLATES = {
    "city": [
        "a busy street with cars and pedestrians",
        "people walking through a crowded urban area",
        "a city scene with tall buildings and traffic",
        "men in suits walking down a busy sidewalk",
        "a police officer directing traffic in the city",
    ],
    "sports": [
        "a man playing basketball on an outdoor court",
        "two men competing in a soccer match",
        "a group of men playing football in a field",
        "a woman running in a marathon",
        "children playing soccer on a green field",
        "a man throwing a football to another player",
    ],
    "food": [
        "a pizza with cheese and vegetables on a table",
        "a woman cooking in a kitchen",
        "a plate of pasta with tomato sauce",
        "a man grilling meat on a barbecue",
        "a fresh salad with lettuce and tomatoes",
        "a woman baking a cake in the oven",
    ],
    "animals": [
        "a dog sitting on a wooden floor",
        "a cat sleeping on a couch",
        "a bird perched on a tree branch",
        "a horse running in a green field",
        "a dog fetching a ball on the beach",
    ],
    "people": [
        "an elderly man sitting on a park bench",
        "a woman shopping at a grocery store",
        "a young man using a laptop computer",
        "a woman taking care of children at home",
        "an old woman knitting by the window",
        "a man working at a construction site",
        "a nurse attending to a patient in a hospital",
        "a female nurse checking a patient's records",
        "a male doctor examining a patient",
        "a woman cleaning the house",
        "men watching sports on television",
        "a female teacher helping students in class",
        "an aggressive man shouting at a crowd",
    ],
    "nature": [
        "a mountain landscape with snow-capped peaks",
        "a sunset over the ocean with colorful clouds",
        "a forest path covered in autumn leaves",
        "a meadow with wildflowers and tall grass",
        "a river flowing through a rocky canyon",
    ],
    "indoor": [
        "a living room with a couch and television",
        "a kitchen with modern appliances",
        "a bedroom with a large bed and nightstand",
        "a library filled with books on shelves",
        "an office with computers and desks",
    ],
}

# Mildly toxic/offensive examples to make the analysis non-trivial
_EDGE_CASES = [
    "an idiot running into a wall",
    "a stupid dog chasing its tail",
    "a moron throwing trash on the street",
    "a crazy person yelling in the park",
    "a dumb mistake ruining everything",
]


def _make_precomputed(n: int = 1000, seed: int = 42) -> list:
    """Generate a realistic synthetic caption set for demo mode."""
    rng  = random.Random(seed)
    all_cats = list(_CAPTION_TEMPLATES.keys())
    records  = []

    for i in range(n):
        # ~97% normal captions, ~3% edge cases: every 33rd item is drawn
        # from _EDGE_CASES (31 of 1000), cycling through the list
        if i % 33 == 0:
            caption = _EDGE_CASES[(i // 33) % len(_EDGE_CASES)]
            source  = "edge_case"
        else:
            cat     = rng.choice(all_cats)
            caption = rng.choice(_CAPTION_TEMPLATES[cat])
            source  = cat

        records.append({
            "image_id": i,
            "caption": caption,
            "source": source,
        })

    return records
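
# A quick sanity check of the synthetic mix (illustrative; run in a REPL).
# With the every-33rd-item rule above, 1000 records include 31 edge cases
# (~3%); the rest are drawn from the category templates:
#
#     >>> from collections import Counter
#     >>> Counter(r["source"] for r in _make_precomputed(1000))["edge_case"]
#     31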


# ─────────────────────────────────────────────────────────────────────────────
# Live caption generation
# ─────────────────────────────────────────────────────────────────────────────

def generate_captions(model, processor, device,
                      n: int = 1000,
                      save_dir: str = "task/task_05/results") -> list:
    """
    Generate one beam-search caption per COCO val image.

    Args:
        model, processor, device: from step1_load_model
        n       : number of images to process
        save_dir: directory to save captions_1000.json

    Returns:
        list of {image_id, caption, source}
    """
    import torch
    import aiohttp
    from datasets import load_dataset
    from tqdm.auto import tqdm

    print("=" * 68)
    print(f"  Task 5 β€” Step 2: Generating captions for {n} COCO val images")
    print("=" * 68)

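    # Stream the validation split; the long aiohttp client timeout guards
    # against stalls while images download over HTTP.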
    ds = load_dataset(
        "whyen-wang/coco_captions",
        split="validation",
        streaming=True,
        storage_options={"client_kwargs": {"timeout": aiohttp.ClientTimeout(total=3600)}},
    )

    records = []
    model.eval()
    with torch.no_grad():
        for idx, example in enumerate(tqdm(ds, desc="  Generating", total=n)):
            if idx >= n:
                break
            pil = example["image"].convert("RGB")
            inputs = processor(images=pil, return_tensors="pt").to(device)
            out = model.generate(
                **inputs, num_beams=3, max_new_tokens=50, length_penalty=1.0
            )
            caption = processor.batch_decode(out, skip_special_tokens=True)[0].strip()
            records.append({"image_id": idx, "caption": caption, "source": "coco_val"})

    os.makedirs(save_dir, exist_ok=True)
    path = os.path.join(save_dir, "captions_1000.json")
    with open(path, "w") as f:
        json.dump(records, f, indent=2)
    print(f"  OK  Captions saved -> {path}")
    return records
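
# Illustrative live-mode call (a sketch mirroring the __main__ block below,
# which assumes step1_load_model.load_model() returns (model, processor,
# device) for the BLIP captioner):
#
#     from step1_load_model import load_model
#     model, processor, device = load_model()
#     records = generate_captions(model, processor, device, n=100)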


# ─────────────────────────────────────────────────────────────────────────────
# Load / create precomputed
# ─────────────────────────────────────────────────────────────────────────────

def _load_or_use_precomputed(save_dir: str, n: int = 1000) -> list:
    """Return cached JSON if it exists, else write the precomputed fallback."""
    cache = os.path.join(save_dir, "captions_1000.json")
    if os.path.exists(cache):
        with open(cache) as f:
            data = json.load(f)
        print(f"  OK  Loaded cached captions from {cache}")
        return data
    os.makedirs(save_dir, exist_ok=True)
    data = _make_precomputed(n)
    with open(cache, "w") as f:
        json.dump(data, f, indent=2)
    print(f"  OK  Pre-computed captions saved -> {cache}")
    return data


# ─────────────────────────────────────────────────────────────────────────────
# Standalone
# ─────────────────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--live", action="store_true")
    args = parser.parse_args()

    SAVE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results")

    if args.live:
        from step1_load_model import load_model
        model, processor, device = load_model()
        records = generate_captions(model, processor, device, n=1000, save_dir=SAVE_DIR)
    else:
        records = _load_or_use_precomputed(SAVE_DIR)

    print(f"  Total captions: {len(records)}")
    print(f"  Sample: {records[0]}")