# annotation-qa-env / data / generate_dataset.py
# Author: k3tikvats — initial commit (8b4d6a8)
"""
Synthetic dataset generator for the Annotation QA Environment.
Generates scene descriptions + gold annotations without requiring any external
dataset (COCO, VOC, etc.). Everything is self-contained and deterministic.
WHY NOT USE COCO IMAGES?
========================
The COCO dataset would NOT work within the hackathon's resource constraints:
1. STORAGE: COCO train2017 is ~18GB of images alone. The Docker container must
run on HF Spaces free tier (16GB RAM, 2 vCPU). Just loading the images into
the container would exceed the storage budget.
2. MEMORY: Serving base64-encoded images in observations would consume ~1-5MB
per step. With concurrent WebSocket sessions, memory would spike past 8GB
instantly.
3. DOCKER BUILD: The Dockerfile must build within the 600s timeout in the
pre-validation script. Downloading 18GB of COCO images during Docker build
would timeout.
4. LLM COMPATIBILITY: The inference script uses text-only OpenAI API clients
(e.g., Qwen2.5-72B-Instruct). Passing raw images would require a VLM
(vision-language model), which is NOT guaranteed in the evaluation pipeline.
The hackathon's evaluation uses "standard Open LLM agent (e.g. Nemotron 3
Super)" which is text-only.
5. REPRODUCIBILITY: COCO images introduce non-determinism via JPEG compression
artifacts and OCR variations. Our synthetic scenes are 100% deterministic.
OUR APPROACH:
- Generate synthetic scenes as structured JSON + natural language descriptions
- Objects have known classes and precise bounding boxes
- The agent reasons about spatial relationships purely through text
- Total dataset is <1MB — fits easily in the Docker image
"""
import json
import os
import random
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
# Object classes and their typical size ranges (normalized)
# Maps each class label to the (min, max) width and height an instance may
# occupy, expressed as fractions of the image dimensions (0.0-1.0).
# generate_scene() samples uniformly within these ranges via rng.uniform().
OBJECT_CLASSES: Dict[str, Dict[str, Any]] = {
"car": {"w_range": (0.10, 0.25), "h_range": (0.08, 0.15)},
"truck": {"w_range": (0.15, 0.30), "h_range": (0.10, 0.18)},
"person": {"w_range": (0.04, 0.08), "h_range": (0.10, 0.25)},
"bicycle": {"w_range": (0.06, 0.12), "h_range": (0.06, 0.12)},
"dog": {"w_range": (0.05, 0.10), "h_range": (0.04, 0.08)},
"cat": {"w_range": (0.04, 0.08), "h_range": (0.04, 0.07)},
"tree": {"w_range": (0.08, 0.15), "h_range": (0.15, 0.35)},
"building": {"w_range": (0.15, 0.35), "h_range": (0.20, 0.45)},
"traffic_light": {"w_range": (0.02, 0.04), "h_range": (0.06, 0.10)},
"bench": {"w_range": (0.08, 0.15), "h_range": (0.05, 0.08)},
}
# Scene archetypes sampled by generate_scene(). Per template:
#   name            — scene_type stored in each generated sample
#   description     — prefix of the natural-language scene text
#   typical_objects — classes drawn from (keys into OBJECT_CLASSES)
#   min/max_objects — inclusive bounds for the object count (rng.randint)
SCENE_TEMPLATES: List[Dict[str, Any]] = [
{
"name": "urban_street",
"description": "A busy urban street scene with vehicles, pedestrians, and city infrastructure.",
"typical_objects": ["car", "truck", "person", "bicycle", "traffic_light", "building", "tree", "bench"],
"min_objects": 5,
"max_objects": 10,
},
{
"name": "park",
"description": "A peaceful park setting with trees, benches, and people walking their pets.",
"typical_objects": ["person", "dog", "cat", "tree", "bench", "bicycle"],
"min_objects": 4,
"max_objects": 8,
},
{
"name": "parking_lot",
"description": "A parking lot with various vehicles and some pedestrians.",
"typical_objects": ["car", "truck", "person", "bicycle", "building"],
"min_objects": 5,
"max_objects": 12,
},
{
"name": "residential_area",
"description": "A quiet residential neighborhood with houses, trees, and occasional pedestrians.",
"typical_objects": ["building", "tree", "person", "car", "dog", "cat", "bench"],
"min_objects": 4,
"max_objects": 9,
},
{
"name": "intersection",
"description": "A road intersection with traffic lights, vehicles, and crossing pedestrians.",
"typical_objects": ["car", "truck", "person", "traffic_light", "bicycle", "building"],
"min_objects": 6,
"max_objects": 11,
},
]
# Nine coarse placement names forming a 3x3 grid over the image; each maps
# to a center-coordinate region via _position_to_region().
SPATIAL_POSITIONS: List[str] = [
"top-left", "top-center", "top-right",
"middle-left", "center", "middle-right",
"bottom-left", "bottom-center", "bottom-right",
]
def _position_to_region(position: str) -> tuple:
"""Map spatial position name to approximate (x_center, y_center) range."""
mapping = {
"top-left": (0.1, 0.3, 0.1, 0.3),
"top-center": (0.35, 0.65, 0.1, 0.3),
"top-right": (0.7, 0.9, 0.1, 0.3),
"middle-left": (0.1, 0.3, 0.35, 0.65),
"center": (0.35, 0.65, 0.35, 0.65),
"middle-right": (0.7, 0.9, 0.35, 0.65),
"bottom-left": (0.1, 0.3, 0.7, 0.9),
"bottom-center": (0.35, 0.65, 0.7, 0.9),
"bottom-right": (0.7, 0.9, 0.7, 0.9),
}
return mapping.get(position, (0.3, 0.7, 0.3, 0.7))
def generate_scene(
    rng: random.Random, scene_id: str, n_objects: Optional[int] = None
) -> Dict[str, Any]:
    """Generate a single synthetic scene with objects and gold annotations.

    Args:
        rng: Seeded RNG; all randomness flows through it, so the same seed
            always reproduces the same scene.
        scene_id: Identifier stored verbatim in the returned dict.
        n_objects: Optional object-count override; when None, a count is
            drawn from the chosen template's [min_objects, max_objects].

    Returns:
        Dict with scene_id, scene_type, scene_description (natural-language
        text embedding every bbox), objects, and gold_annotations.
    """
    template = rng.choice(SCENE_TEMPLATES)
    if n_objects is None:
        n_objects = rng.randint(template["min_objects"], template["max_objects"])
    objects = []
    annotations = []
    for i in range(n_objects):
        cls = rng.choice(template["typical_objects"])
        size_spec = OBJECT_CLASSES[cls]
        # Draw a coarse 3x3-grid region first, then a precise center within
        # it.  Objects may overlap; no overlap avoidance is attempted.
        position = rng.choice(SPATIAL_POSITIONS)
        x_lo, x_hi, y_lo, y_hi = _position_to_region(position)
        w = rng.uniform(*size_spec["w_range"])
        h = rng.uniform(*size_spec["h_range"])
        cx = rng.uniform(x_lo, x_hi)
        cy = rng.uniform(y_lo, y_hi)
        # Convert the sampled center to a top-left corner, then clamp so
        # the whole box stays inside the normalized [0, 1] x [0, 1] image.
        x = max(0.0, cx - w / 2)
        y = max(0.0, cy - h / 2)
        x = min(x, 1.0 - w)
        y = min(y, 1.0 - h)
        bbox = [round(x, 4), round(y, 4), round(w, 4), round(h, 4)]
        objects.append({
            "id": i,
            "class_label": cls,
            "position": position,
            "bbox": bbox,
        })
        # Gold annotations mirror the objects exactly; downstream tasks
        # compare agent output against these.
        annotations.append({
            "id": i,
            "bbox": bbox,
            "class_label": cls,
        })
    # Build natural language description embedding every object's class,
    # grid position, and rounded bbox — the agent's only "view" of the scene.
    obj_descriptions = []
    for obj in objects:
        obj_descriptions.append(
            f"a {obj['class_label']} at {obj['position']} "
            f"(bbox: x={obj['bbox'][0]:.2f}, y={obj['bbox'][1]:.2f}, "
            f"w={obj['bbox'][2]:.2f}, h={obj['bbox'][3]:.2f})"
        )
    scene_text = (
        f"{template['description']} "
        f"The scene contains {len(objects)} objects: "
        + "; ".join(obj_descriptions)
        + "."
    )
    return {
        "scene_id": scene_id,
        "scene_type": template["name"],
        "scene_description": scene_text,
        "objects": objects,
        "gold_annotations": annotations,
    }
def generate_task_data(
    task_id: str,
    difficulty: str,
    n_samples: int,
    base_seed: int = 42,
) -> List[Dict[str, Any]]:
    """Generate all samples for a given task.

    Each sample gets a fresh RNG seeded with ``base_seed + index``, so any
    individual sample can be reproduced without regenerating the others.
    """

    def _build_sample(index: int) -> Dict[str, Any]:
        # One RNG per sample keeps samples independently reproducible.
        seed = base_seed + index
        sample = generate_scene(random.Random(seed), f"{task_id}_sample_{index:03d}")
        sample["task_id"] = task_id
        sample["difficulty"] = difficulty
        sample["seed"] = seed
        return sample

    return [_build_sample(i) for i in range(n_samples)]
def _write_samples(data: List[Any], task_dir: Path) -> None:
    """Create task_dir (and parents) and dump data to task_dir/samples.json."""
    task_dir.mkdir(parents=True, exist_ok=True)
    with open(task_dir / "samples.json", "w") as f:
        json.dump(data, f, indent=2)


def generate_all_tasks(output_dir: str) -> None:
    """Generate dataset for all 3 tasks and save to disk.

    Layout under *output_dir*:
        task1_fix_bboxes/samples.json   — 50 easy scenes
        task2_fix_classes/samples.json  — 30 medium scenes
        task3_batch_audit/samples.json  — 10 batches of 5 scenes each

    Seed bases (1000 / 2000 / 3000) are disjoint so tasks never share
    scene content.
    """
    output_path = Path(output_dir)

    # Task 1: Fix Bounding Boxes (Easy) — 50 samples
    task1_data = generate_task_data(
        task_id="fix_bboxes",
        difficulty="easy",
        n_samples=50,
        base_seed=1000,
    )
    task1_dir = output_path / "task1_fix_bboxes"
    _write_samples(task1_data, task1_dir)
    print(f" Task 1 (fix_bboxes): {len(task1_data)} samples → {task1_dir}")

    # Task 2: Fix Classes + Bboxes (Medium) — 30 samples
    task2_data = generate_task_data(
        task_id="fix_classes",
        difficulty="medium",
        n_samples=30,
        base_seed=2000,
    )
    task2_dir = output_path / "task2_fix_classes"
    _write_samples(task2_data, task2_dir)
    print(f" Task 2 (fix_classes): {len(task2_data)} samples → {task2_dir}")

    # Task 3: Batch Consistency Audit (Hard) — 10 batches of 5 scenes.
    # A single RNG per batch is shared across its 5 scenes.
    task3_data = []
    for batch_idx in range(10):
        batch_rng = random.Random(3000 + batch_idx * 100)
        batch_scenes = []
        for scene_idx in range(5):
            scene = generate_scene(
                batch_rng,
                f"batch_audit_batch{batch_idx:02d}_scene{scene_idx:02d}",
            )
            scene["batch_id"] = batch_idx
            scene["task_id"] = "batch_audit"
            scene["difficulty"] = "hard"
            # NOTE(review): the batch RNG is seeded once per batch, so this
            # per-scene "seed" does not individually reproduce scenes with
            # scene_idx > 0 — confirm whether any consumer relies on it.
            scene["seed"] = 3000 + batch_idx * 100 + scene_idx
            batch_scenes.append(scene)
        task3_data.append({
            "batch_id": batch_idx,
            "scenes": batch_scenes,
        })
    task3_dir = output_path / "task3_batch_audit"
    _write_samples(task3_data, task3_dir)
    print(f" Task 3 (batch_audit): {len(task3_data)} batches × 5 scenes → {task3_dir}")
if __name__ == "__main__":
    # Write the generated dataset next to this script, under ./tasks/.
    script_dir = Path(__file__).parent
    tasks_dir = script_dir / "tasks"
    print("Generating Annotation QA dataset...")
    generate_all_tasks(str(tasks_dir))
    print("Done!")