"""
Synthetic dataset generator for the Annotation QA Environment.

Generates scene descriptions + gold annotations without requiring any external
dataset (COCO, VOC, etc.). Everything is self-contained and deterministic.

WHY NOT USE COCO IMAGES?
========================
The COCO dataset would NOT work within the hackathon's resource constraints:

1. STORAGE: COCO train2017 is ~18GB of images alone. The Docker container must
   run on HF Spaces free tier (16GB RAM, 2 vCPU). Just loading the images into
   the container would exceed the storage budget.

2. MEMORY: Serving base64-encoded images in observations would consume ~1-5MB
   per step. With concurrent WebSocket sessions, memory would spike past 8GB
   instantly.

3. DOCKER BUILD: The Dockerfile must build within the 600s timeout in the
   pre-validation script. Downloading 18GB of COCO images during Docker build
   would time out.

4. LLM COMPATIBILITY: The inference script uses text-only OpenAI API clients
   (e.g., Qwen2.5-72B-Instruct). Passing raw images would require a VLM
   (vision-language model), which is NOT guaranteed in the evaluation pipeline.
   The hackathon's evaluation uses "standard Open LLM agent (e.g. Nemotron 3
   Super)" which is text-only.

5. REPRODUCIBILITY: COCO images introduce non-determinism via JPEG compression
   artifacts and OCR variations. Our synthetic scenes are 100% deterministic.

OUR APPROACH:
- Generate synthetic scenes as structured JSON + natural language descriptions
- Objects have known classes and precise bounding boxes
- The agent reasons about spatial relationships purely through text
- Total dataset is <1MB — fits easily in the Docker image
"""

import json
import os
import random
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Object classes and their typical size ranges (normalized to the [0, 1] image
# frame). "w_range" / "h_range" are (min, max) bounds for uniform sampling.
OBJECT_CLASSES = {
    "car": {"w_range": (0.10, 0.25), "h_range": (0.08, 0.15)},
    "truck": {"w_range": (0.15, 0.30), "h_range": (0.10, 0.18)},
    "person": {"w_range": (0.04, 0.08), "h_range": (0.10, 0.25)},
    "bicycle": {"w_range": (0.06, 0.12), "h_range": (0.06, 0.12)},
    "dog": {"w_range": (0.05, 0.10), "h_range": (0.04, 0.08)},
    "cat": {"w_range": (0.04, 0.08), "h_range": (0.04, 0.07)},
    "tree": {"w_range": (0.08, 0.15), "h_range": (0.15, 0.35)},
    "building": {"w_range": (0.15, 0.35), "h_range": (0.20, 0.45)},
    "traffic_light": {"w_range": (0.02, 0.04), "h_range": (0.06, 0.10)},
    "bench": {"w_range": (0.08, 0.15), "h_range": (0.05, 0.08)},
}

# Scene archetypes: which classes may appear and how many objects to place.
SCENE_TEMPLATES = [
    {
        "name": "urban_street",
        "description": "A busy urban street scene with vehicles, pedestrians, and city infrastructure.",
        "typical_objects": ["car", "truck", "person", "bicycle", "traffic_light", "building", "tree", "bench"],
        "min_objects": 5,
        "max_objects": 10,
    },
    {
        "name": "park",
        "description": "A peaceful park setting with trees, benches, and people walking their pets.",
        "typical_objects": ["person", "dog", "cat", "tree", "bench", "bicycle"],
        "min_objects": 4,
        "max_objects": 8,
    },
    {
        "name": "parking_lot",
        "description": "A parking lot with various vehicles and some pedestrians.",
        "typical_objects": ["car", "truck", "person", "bicycle", "building"],
        "min_objects": 5,
        "max_objects": 12,
    },
    {
        "name": "residential_area",
        "description": "A quiet residential neighborhood with houses, trees, and occasional pedestrians.",
        "typical_objects": ["building", "tree", "person", "car", "dog", "cat", "bench"],
        "min_objects": 4,
        "max_objects": 9,
    },
    {
        "name": "intersection",
        "description": "A road intersection with traffic lights, vehicles, and crossing pedestrians.",
        "typical_objects": ["car", "truck", "person", "traffic_light", "bicycle", "building"],
        "min_objects": 6,
        "max_objects": 11,
    },
]

SPATIAL_POSITIONS = [
    "top-left", "top-center", "top-right",
    "middle-left", "center", "middle-right",
    "bottom-left", "bottom-center", "bottom-right",
]

# Region of the image in which an object's *center* is sampled for each named
# position, as (x_lo, x_hi, y_lo, y_hi). Built once instead of on every lookup.
_POSITION_REGIONS: Dict[str, Tuple[float, float, float, float]] = {
    "top-left": (0.1, 0.3, 0.1, 0.3),
    "top-center": (0.35, 0.65, 0.1, 0.3),
    "top-right": (0.7, 0.9, 0.1, 0.3),
    "middle-left": (0.1, 0.3, 0.35, 0.65),
    "center": (0.35, 0.65, 0.35, 0.65),
    "middle-right": (0.7, 0.9, 0.35, 0.65),
    "bottom-left": (0.1, 0.3, 0.7, 0.9),
    "bottom-center": (0.35, 0.65, 0.7, 0.9),
    "bottom-right": (0.7, 0.9, 0.7, 0.9),
}

# Fallback region used when a position name is not recognised.
_DEFAULT_REGION: Tuple[float, float, float, float] = (0.3, 0.7, 0.3, 0.7)


def _position_to_region(position: str) -> Tuple[float, float, float, float]:
    """Map a spatial position name to its center-sampling region.

    Returns ``(x_lo, x_hi, y_lo, y_hi)``; unknown names fall back to a broad
    central region.
    """
    return _POSITION_REGIONS.get(position, _DEFAULT_REGION)


def generate_scene(
    rng: random.Random, scene_id: str, n_objects: Optional[int] = None
) -> Dict[str, Any]:
    """Generate a single synthetic scene with objects and gold annotations.

    Args:
        rng: Random source; the scene is fully determined by its state.
        scene_id: Identifier stored verbatim in the result.
        n_objects: Number of objects to place. When ``None`` it is drawn from
            the chosen template's ``min_objects``..``max_objects`` range.

    Returns:
        A dict with ``scene_id``, ``scene_type``, ``scene_description``,
        ``objects`` and ``gold_annotations``. Bounding boxes are
        ``[x, y, w, h]`` rounded to 4 decimals.
    """
    template = rng.choice(SCENE_TEMPLATES)
    if n_objects is None:
        n_objects = rng.randint(template["min_objects"], template["max_objects"])

    objects = []
    annotations = []
    obj_descriptions = []

    for i in range(n_objects):
        # NOTE: the order of rng calls below determines the generated data;
        # do not reorder them.
        cls = rng.choice(template["typical_objects"])
        size_spec = OBJECT_CLASSES[cls]

        # Positions are drawn independently per object, so objects may share a
        # region and overlap; no overlap avoidance is performed.
        position = rng.choice(SPATIAL_POSITIONS)
        x_lo, x_hi, y_lo, y_hi = _position_to_region(position)

        w = rng.uniform(*size_spec["w_range"])
        h = rng.uniform(*size_spec["h_range"])

        # Place the object center within the position region, then convert to
        # a top-left corner clamped so the box stays inside the image.
        cx = rng.uniform(x_lo, x_hi)
        cy = rng.uniform(y_lo, y_hi)
        x = min(max(0.0, cx - w / 2), 1.0 - w)
        y = min(max(0.0, cy - h / 2), 1.0 - h)

        bbox = [round(x, 4), round(y, 4), round(w, 4), round(h, 4)]

        objects.append({
            "id": i,
            "class_label": cls,
            "position": position,
            "bbox": bbox,
        })
        annotations.append({
            "id": i,
            # Independent copy: mutating an annotation's bbox in memory must
            # not silently change the matching object's bbox (or vice versa).
            "bbox": list(bbox),
            "class_label": cls,
        })
        obj_descriptions.append(
            f"a {cls} at {position} "
            f"(bbox: x={bbox[0]:.2f}, y={bbox[1]:.2f}, "
            f"w={bbox[2]:.2f}, h={bbox[3]:.2f})"
        )

    # Build the natural language description.
    scene_text = (
        f"{template['description']} "
        f"The scene contains {len(objects)} objects: "
        + "; ".join(obj_descriptions)
        + "."
    )

    return {
        "scene_id": scene_id,
        "scene_type": template["name"],
        "scene_description": scene_text,
        "objects": objects,
        "gold_annotations": annotations,
    }


def generate_task_data(
    task_id: str,
    difficulty: str,
    n_samples: int,
    base_seed: int = 42,
) -> List[Dict[str, Any]]:
    """Generate all samples for a given task.

    Sample ``i`` is produced from ``random.Random(base_seed + i)`` and that
    seed is stored under ``"seed"``, so any sample can be regenerated alone.
    """
    samples = []
    for i in range(n_samples):
        seed = base_seed + i
        scene = generate_scene(random.Random(seed), f"{task_id}_sample_{i:03d}")
        scene["task_id"] = task_id
        scene["difficulty"] = difficulty
        scene["seed"] = seed
        samples.append(scene)
    return samples


def generate_batch_task_data(
    task_id: str = "batch_audit",
    difficulty: str = "hard",
    n_batches: int = 10,
    scenes_per_batch: int = 5,
    base_seed: int = 3000,
    batch_seed_stride: int = 100,
) -> List[Dict[str, Any]]:
    """Generate batches of scenes for the batch-consistency task.

    Scene ``s`` of batch ``b`` is produced from its own
    ``random.Random(base_seed + b * batch_seed_stride + s)`` and that value is
    stored under ``"seed"``, so the recorded seed actually regenerates the
    scene (matching the contract of ``generate_task_data``).

    Raises:
        ValueError: if several batches are requested and ``scenes_per_batch``
            exceeds ``batch_seed_stride``, which would make seeds of adjacent
            batches collide.
    """
    if n_batches > 1 and scenes_per_batch > batch_seed_stride:
        raise ValueError(
            "scenes_per_batch must not exceed batch_seed_stride, "
            "otherwise seeds of adjacent batches collide"
        )

    batches = []
    for batch_idx in range(n_batches):
        batch_scenes = []
        for scene_idx in range(scenes_per_batch):
            seed = base_seed + batch_idx * batch_seed_stride + scene_idx
            scene = generate_scene(
                random.Random(seed),
                f"{task_id}_batch{batch_idx:02d}_scene{scene_idx:02d}",
            )
            scene["batch_id"] = batch_idx
            scene["task_id"] = task_id
            scene["difficulty"] = difficulty
            scene["seed"] = seed
            batch_scenes.append(scene)
        batches.append({
            "batch_id": batch_idx,
            "scenes": batch_scenes,
        })
    return batches


def _write_samples(task_dir: Path, data: List[Dict[str, Any]]) -> None:
    """Create ``task_dir`` if needed and write ``data`` to ``samples.json``."""
    task_dir.mkdir(parents=True, exist_ok=True)
    # Explicit encoding so the output does not depend on the platform locale.
    with open(task_dir / "samples.json", "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)


def generate_all_tasks(output_dir: str) -> None:
    """Generate dataset for all 3 tasks and save to disk.

    Writes ``<output_dir>/<task dir>/samples.json`` for each task and prints a
    one-line summary per task.
    """
    output_path = Path(output_dir)

    # Task 1: Fix Bounding Boxes (Easy) — 50 samples
    task1_data = generate_task_data(
        task_id="fix_bboxes",
        difficulty="easy",
        n_samples=50,
        base_seed=1000,
    )
    task1_dir = output_path / "task1_fix_bboxes"
    _write_samples(task1_dir, task1_data)
    print(f" Task 1 (fix_bboxes): {len(task1_data)} samples → {task1_dir}")

    # Task 2: Fix Classes + Bboxes (Medium) — 30 samples
    task2_data = generate_task_data(
        task_id="fix_classes",
        difficulty="medium",
        n_samples=30,
        base_seed=2000,
    )
    task2_dir = output_path / "task2_fix_classes"
    _write_samples(task2_dir, task2_data)
    print(f" Task 2 (fix_classes): {len(task2_data)} samples → {task2_dir}")

    # Task 3: Batch Consistency Audit (Hard) — 10 batches of 5 scenes
    task3_data = generate_batch_task_data(
        task_id="batch_audit",
        difficulty="hard",
        n_batches=10,
        scenes_per_batch=5,
        base_seed=3000,
    )
    task3_dir = output_path / "task3_batch_audit"
    _write_samples(task3_dir, task3_data)
    print(f" Task 3 (batch_audit): {len(task3_data)} batches × 5 scenes → {task3_dir}")


if __name__ == "__main__":
    script_dir = Path(__file__).parent
    tasks_dir = script_dir / "tasks"
    print("Generating Annotation QA dataset...")
    generate_all_tasks(str(tasks_dir))
    print("Done!")