"""
step2_prepare_data.py
======================
Task 5 – Component 2: Caption generation for 1000 COCO val images.
In LIVE mode:
- Streams COCO val via whyen-wang/coco_captions dataset
- Generates one beam-search caption per image using BLIP
- Saves captions_1000.json
In DEMO mode (precomputed):
- Returns a synthetic caption set seeded to mimic the real COCO distribution
- Covers city scenes, people, sports, food, animals – a realistic variety
  including some mildly biased phrasings for the bias audit to detect
Public API
----------
generate_captions(model, processor, device,
n=1000, save_dir=...) -> list[dict]
_load_or_use_precomputed(save_dir) -> list[dict]
Each dict: {image_id, caption, source}
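(source is the template category or "edge_case" in DEMO mode, "coco_val" in LIVE mode)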
Standalone usage
----------------
export PYTHONPATH=.
venv/bin/python task/task_05/step2_prepare_data.py
"""
import os
import sys
import json
import random
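# Put the repo root on sys.path so sibling task modules import when run directly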
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
# ─────────────────────────────────────────────────────────────────────────────
# Precomputed caption bank (1000 items; seeded for reproducibility)
# ─────────────────────────────────────────────────────────────────────────────
_CAPTION_TEMPLATES = {
"city": [
"a busy street with cars and pedestrians",
"people walking through a crowded urban area",
"a city scene with tall buildings and traffic",
"men in suits walking down a busy sidewalk",
"a police officer directing traffic in the city",
],
"sports": [
"a man playing basketball on an outdoor court",
"two men competing in a soccer match",
"a group of men playing football in a field",
"a woman running in a marathon",
"children playing soccer on a green field",
"a man throwing a football to another player",
],
"food": [
"a pizza with cheese and vegetables on a table",
"a woman cooking in a kitchen",
"a plate of pasta with tomato sauce",
"a man grilling meat on a barbecue",
"a fresh salad with lettuce and tomatoes",
"a woman baking a cake in the oven",
],
"animals": [
"a dog sitting on a wooden floor",
"a cat sleeping on a couch",
"a bird perched on a tree branch",
"a horse running in a green field",
"a dog fetching a ball on the beach",
],
"people": [
"an elderly man sitting on a park bench",
"a woman shopping at a grocery store",
"a young man using a laptop computer",
"a woman taking care of children at home",
"an old woman knitting by the window",
"a man working at a construction site",
"a nurse attending to a patient in a hospital",
"a female nurse checking a patient's records",
"a male doctor examining a patient",
"a woman cleaning the house",
"men watching sports on television",
"a female teacher helping students in class",
"an aggressive man shouting at a crowd",
],
"nature": [
"a mountain landscape with snow-capped peaks",
"a sunset over the ocean with colorful clouds",
"a forest path covered in autumn leaves",
"a meadow with wildflowers and tall grass",
"a river flowing through a rocky canyon",
],
"indoor": [
"a living room with a couch and television",
"a kitchen with modern appliances",
"a bedroom with a large bed and nightstand",
"a library filled with books on shelves",
"an office with computers and desks",
],
}
# Mildly toxic/offensive examples to make the analysis non-trivial
_EDGE_CASES = [
"an idiot running into a wall",
"a stupid dog chasing its tail",
"a moron throwing trash on the street",
"a crazy person yelling in the park",
"a dumb mistake ruining everything",
]
def _make_precomputed(n: int = 1000, seed: int = 42) -> list:
"""Generate a realistic synthetic caption set for demo mode."""
rng = random.Random(seed)
all_cats = list(_CAPTION_TEMPLATES.keys())
records = []
for i in range(n):
        # ~97% normal captions, ~3% edge cases: every 33rd image draws from
        # the edge-case bank (the original i < len(_EDGE_CASES) guard fired
        # only for i == 0, yielding a single edge case instead of ~3%)
        if i % 33 == 0:
            caption = _EDGE_CASES[(i // 33) % len(_EDGE_CASES)]
source = "edge_case"
else:
cat = rng.choice(all_cats)
caption = rng.choice(_CAPTION_TEMPLATES[cat])
source = cat
records.append({
"image_id": i,
"caption": caption,
"source": source,
})
return records
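# Illustrative check (not part of the pipeline): the fixed seed makes demo
# output deterministic, e.g. _make_precomputed(5)[0] is always
# {"image_id": 0, "caption": "an idiot running into a wall", "source": "edge_case"}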
# ─────────────────────────────────────────────────────────────────────────────
# Live caption generation
# ─────────────────────────────────────────────────────────────────────────────
def generate_captions(model, processor, device,
n: int = 1000,
save_dir: str = "task/task_05/results") -> list:
"""
Generate one beam-search caption per COCO val image.
Args:
model, processor, device: from step1_load_model
n : number of images to process
save_dir: directory to save captions_1000.json
Returns:
list of {image_id, caption, source}
"""
import torch
import aiohttp
from datasets import load_dataset
from tqdm.auto import tqdm
print("=" * 68)
print(f" Task 5 β Step 2: Generating captions for {n} COCO val images")
print("=" * 68)
ds = load_dataset(
"whyen-wang/coco_captions",
split="validation",
streaming=True,
storage_options={"client_kwargs": {"timeout": aiohttp.ClientTimeout(total=3600)}},
)
records = []
model.eval()
with torch.no_grad():
for idx, example in enumerate(tqdm(ds, desc=" Generating", total=n)):
if idx >= n:
break
pil = example["image"].convert("RGB")
inputs = processor(images=pil, return_tensors="pt").to(device)
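            # Beam search (3 beams, <=50 new tokens) trades a little latency
            # for more fluent captions than greedy decoding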
out = model.generate(
**inputs, num_beams=3, max_new_tokens=50, length_penalty=1.0
)
caption = processor.batch_decode(out, skip_special_tokens=True)[0].strip()
records.append({"image_id": idx, "caption": caption, "source": "coco_val"})
os.makedirs(save_dir, exist_ok=True)
path = os.path.join(save_dir, "captions_1000.json")
with open(path, "w") as f:
json.dump(records, f, indent=2)
print(f" OK Captions saved -> {path}")
return records
# ─────────────────────────────────────────────────────────────────────────────
# Load / create precomputed
# ─────────────────────────────────────────────────────────────────────────────
def _load_or_use_precomputed(save_dir: str, n: int = 1000) -> list:
"""Return cached JSON if it exists, else write the precomputed fallback."""
cache = os.path.join(save_dir, "captions_1000.json")
if os.path.exists(cache):
with open(cache) as f:
data = json.load(f)
print(f" OK Loaded cached captions from {cache}")
return data
os.makedirs(save_dir, exist_ok=True)
data = _make_precomputed(n)
with open(cache, "w") as f:
json.dump(data, f, indent=2)
print(f" OK Pre-computed captions saved -> {cache}")
return data
# ─────────────────────────────────────────────────────────────────────────────
# Standalone
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--live", action="store_true")
args = parser.parse_args()
SAVE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results")
if args.live:
from step1_load_model import load_model
model, processor, device = load_model()
records = generate_captions(model, processor, device, n=1000, save_dir=SAVE_DIR)
else:
records = _load_or_use_precomputed(SAVE_DIR)
print(f" Total captions: {len(records)}")
print(f" Sample: {records[0]}")