Spaces:
Sleeping
Sleeping
| """ | |
| Synthetic dataset generator for the Annotation QA Environment. | |
| Generates scene descriptions + gold annotations without requiring any external | |
| dataset (COCO, VOC, etc.). Everything is self-contained and deterministic. | |
| WHY NOT USE COCO IMAGES? | |
| ======================== | |
| The COCO dataset would NOT work within the hackathon's resource constraints: | |
| 1. STORAGE: COCO train2017 is ~18GB of images alone. The Docker container must | |
| run on HF Spaces free tier (16GB RAM, 2 vCPU). Just loading the images into | |
| the container would exceed the storage budget. | |
| 2. MEMORY: Serving base64-encoded images in observations would consume ~1-5MB | |
| per step. With concurrent WebSocket sessions, memory would spike past 8GB | |
| instantly. | |
| 3. DOCKER BUILD: The Dockerfile must build within the 600s timeout in the | |
| pre-validation script. Downloading 18GB of COCO images during Docker build | |
| would time out. | |
| 4. LLM COMPATIBILITY: The inference script uses text-only OpenAI API clients | |
| (e.g., Qwen2.5-72B-Instruct). Passing raw images would require a VLM | |
| (vision-language model), which is NOT guaranteed in the evaluation pipeline. | |
| The hackathon's evaluation uses "standard Open LLM agent (e.g. Nemotron 3 | |
| Super)" which is text-only. | |
| 5. REPRODUCIBILITY: COCO images introduce non-determinism via JPEG compression | |
| artifacts and OCR variations. Our synthetic scenes are 100% deterministic. | |
| OUR APPROACH: | |
| - Generate synthetic scenes as structured JSON + natural language descriptions | |
| - Objects have known classes and precise bounding boxes | |
| - The agent reasons about spatial relationships purely through text | |
| - Total dataset is <1MB — fits easily in the Docker image | |
| """ | |
import json
import os
import random
from pathlib import Path
from typing import Any, Dict, List, Optional
# Size envelope for every supported object class. Widths and heights are
# normalized, and each range is a (min, max) pair.
OBJECT_CLASSES = {
    label: {"w_range": width_range, "h_range": height_range}
    for label, width_range, height_range in (
        ("car", (0.10, 0.25), (0.08, 0.15)),
        ("truck", (0.15, 0.30), (0.10, 0.18)),
        ("person", (0.04, 0.08), (0.10, 0.25)),
        ("bicycle", (0.06, 0.12), (0.06, 0.12)),
        ("dog", (0.05, 0.10), (0.04, 0.08)),
        ("cat", (0.04, 0.08), (0.04, 0.07)),
        ("tree", (0.08, 0.15), (0.15, 0.35)),
        ("building", (0.15, 0.35), (0.20, 0.45)),
        ("traffic_light", (0.02, 0.04), (0.06, 0.10)),
        ("bench", (0.08, 0.15), (0.05, 0.08)),
    )
}
# Scene archetypes. Each template names the object classes that may appear in
# the scene and the inclusive bounds on how many objects are placed when the
# caller does not request a specific count.
SCENE_TEMPLATES = [
    dict(
        name="urban_street",
        description="A busy urban street scene with vehicles, pedestrians, and city infrastructure.",
        typical_objects=["car", "truck", "person", "bicycle", "traffic_light", "building", "tree", "bench"],
        min_objects=5,
        max_objects=10,
    ),
    dict(
        name="park",
        description="A peaceful park setting with trees, benches, and people walking their pets.",
        typical_objects=["person", "dog", "cat", "tree", "bench", "bicycle"],
        min_objects=4,
        max_objects=8,
    ),
    dict(
        name="parking_lot",
        description="A parking lot with various vehicles and some pedestrians.",
        typical_objects=["car", "truck", "person", "bicycle", "building"],
        min_objects=5,
        max_objects=12,
    ),
    dict(
        name="residential_area",
        description="A quiet residential neighborhood with houses, trees, and occasional pedestrians.",
        typical_objects=["building", "tree", "person", "car", "dog", "cat", "bench"],
        min_objects=4,
        max_objects=9,
    ),
    dict(
        name="intersection",
        description="A road intersection with traffic lights, vehicles, and crossing pedestrians.",
        typical_objects=["car", "truck", "person", "traffic_light", "bicycle", "building"],
        min_objects=6,
        max_objects=11,
    ),
]
# The nine cells of a 3x3 grid, listed row by row. The middle cell is named
# plain "center" rather than "middle-center".
SPATIAL_POSITIONS = [
    "center" if (row, col) == ("middle", "center") else f"{row}-{col}"
    for row in ("top", "middle", "bottom")
    for col in ("left", "center", "right")
]
| def _position_to_region(position: str) -> tuple: | |
| """Map spatial position name to approximate (x_center, y_center) range.""" | |
| mapping = { | |
| "top-left": (0.1, 0.3, 0.1, 0.3), | |
| "top-center": (0.35, 0.65, 0.1, 0.3), | |
| "top-right": (0.7, 0.9, 0.1, 0.3), | |
| "middle-left": (0.1, 0.3, 0.35, 0.65), | |
| "center": (0.35, 0.65, 0.35, 0.65), | |
| "middle-right": (0.7, 0.9, 0.35, 0.65), | |
| "bottom-left": (0.1, 0.3, 0.7, 0.9), | |
| "bottom-center": (0.35, 0.65, 0.7, 0.9), | |
| "bottom-right": (0.7, 0.9, 0.7, 0.9), | |
| } | |
| return mapping.get(position, (0.3, 0.7, 0.3, 0.7)) | |
def generate_scene(
    rng: random.Random, scene_id: str, n_objects: Optional[int] = None
) -> Dict[str, Any]:
    """Generate a single synthetic scene with objects and gold annotations.

    All randomness is drawn from ``rng``, so the same RNG state always
    produces the same scene.

    Args:
        rng: Random source used for every draw.
        scene_id: Identifier stored under ``"scene_id"`` in the result.
        n_objects: Number of objects to place. When ``None``, the count is
            drawn from the chosen template's ``min_objects``..``max_objects``
            range (inclusive).

    Returns:
        A dict with the keys ``scene_id``, ``scene_type``,
        ``scene_description``, ``objects`` and ``gold_annotations``. Bounding
        boxes are ``[x, y, w, h]`` rounded to 4 decimals.
    """
    template = rng.choice(SCENE_TEMPLATES)
    if n_objects is None:
        n_objects = rng.randint(template["min_objects"], template["max_objects"])

    objects = []
    annotations = []
    for i in range(n_objects):
        # NOTE: the order of the rng calls below determines the generated
        # data; do not reorder them.
        cls = rng.choice(template["typical_objects"])
        size_spec = OBJECT_CLASSES[cls]

        # Grid cells are drawn independently for each object, so several
        # objects may share a cell and their boxes may overlap.
        position = rng.choice(SPATIAL_POSITIONS)
        x_lo, x_hi, y_lo, y_hi = _position_to_region(position)
        w = rng.uniform(*size_spec["w_range"])
        h = rng.uniform(*size_spec["h_range"])

        # Place the object's center inside the cell, then convert to a
        # top-left corner clamped so the box satisfies x >= 0 and x + w <= 1
        # (likewise for y).
        cx = rng.uniform(x_lo, x_hi)
        cy = rng.uniform(y_lo, y_hi)
        x = min(max(0.0, cx - w / 2), 1.0 - w)
        y = min(max(0.0, cy - h / 2), 1.0 - h)

        bbox = [round(x, 4), round(y, 4), round(w, 4), round(h, 4)]
        objects.append({
            "id": i,
            "class_label": cls,
            "position": position,
            "bbox": bbox,
        })
        # The annotation gets its own copy of the box so that mutating one
        # record in place cannot silently change the other.
        annotations.append({
            "id": i,
            "bbox": list(bbox),
            "class_label": cls,
        })

    # Build the natural-language description from the placed objects.
    obj_descriptions = [
        f"a {obj['class_label']} at {obj['position']} "
        f"(bbox: x={obj['bbox'][0]:.2f}, y={obj['bbox'][1]:.2f}, "
        f"w={obj['bbox'][2]:.2f}, h={obj['bbox'][3]:.2f})"
        for obj in objects
    ]
    scene_text = (
        f"{template['description']} "
        f"The scene contains {len(objects)} objects: "
        + "; ".join(obj_descriptions)
        + "."
    )

    return {
        "scene_id": scene_id,
        "scene_type": template["name"],
        "scene_description": scene_text,
        "objects": objects,
        "gold_annotations": annotations,
    }
def generate_task_data(
    task_id: str,
    difficulty: str,
    n_samples: int,
    base_seed: int = 42,
) -> List[Dict[str, Any]]:
    """Build ``n_samples`` independently seeded scenes for one task.

    Sample ``i`` is generated from a fresh ``random.Random(base_seed + i)``
    and tagged with ``task_id``, ``difficulty`` and the seed that was used.
    """

    def _build(index: int) -> Dict[str, Any]:
        seed = base_seed + index
        sample = generate_scene(
            random.Random(seed), f"{task_id}_sample_{index:03d}"
        )
        sample.update(task_id=task_id, difficulty=difficulty, seed=seed)
        return sample

    return [_build(index) for index in range(n_samples)]
def _write_samples(task_dir: Path, data: List[Any]) -> None:
    """Create ``task_dir`` if needed and write ``data`` to ``samples.json`` in it."""
    task_dir.mkdir(parents=True, exist_ok=True)
    with open(task_dir / "samples.json", "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)


def generate_all_tasks(output_dir: str) -> None:
    """Generate the dataset for all 3 tasks and save it to disk.

    Writes one ``samples.json`` per task under ``output_dir``:

    - ``task1_fix_bboxes``: 50 scenes, seeds 1000..1049
    - ``task2_fix_classes``: 30 scenes, seeds 2000..2029
    - ``task3_batch_audit``: 10 batches of 5 scenes each

    Args:
        output_dir: Directory that receives the per-task subdirectories;
            missing directories are created.
    """
    output_path = Path(output_dir)

    # Task 1: Fix Bounding Boxes (Easy) — 50 samples
    task1_data = generate_task_data(
        task_id="fix_bboxes",
        difficulty="easy",
        n_samples=50,
        base_seed=1000,
    )
    task1_dir = output_path / "task1_fix_bboxes"
    _write_samples(task1_dir, task1_data)
    print(f" Task 1 (fix_bboxes): {len(task1_data)} samples → {task1_dir}")

    # Task 2: Fix Classes + Bboxes (Medium) — 30 samples
    task2_data = generate_task_data(
        task_id="fix_classes",
        difficulty="medium",
        n_samples=30,
        base_seed=2000,
    )
    task2_dir = output_path / "task2_fix_classes"
    _write_samples(task2_dir, task2_data)
    print(f" Task 2 (fix_classes): {len(task2_data)} samples → {task2_dir}")

    # Task 3: Batch Consistency Audit (Hard) — 10 batches of 5 scenes
    task3_data = []
    for batch_idx in range(10):
        batch_seed = 3000 + batch_idx * 100
        # One RNG is shared by all five scenes of a batch.
        batch_rng = random.Random(batch_seed)
        batch_scenes = []
        for scene_idx in range(5):
            scene = generate_scene(
                batch_rng,
                f"batch_audit_batch{batch_idx:02d}_scene{scene_idx:02d}",
            )
            scene["batch_id"] = batch_idx
            scene["task_id"] = "batch_audit"
            scene["difficulty"] = "hard"
            # NOTE: because the RNG is shared across the batch, this value is
            # a unique per-scene label; seeding a fresh RNG with it does not
            # regenerate the scene (except for the first scene of the batch).
            scene["seed"] = batch_seed + scene_idx
            batch_scenes.append(scene)
        task3_data.append({
            "batch_id": batch_idx,
            "scenes": batch_scenes,
        })
    task3_dir = output_path / "task3_batch_audit"
    _write_samples(task3_dir, task3_data)
    print(f" Task 3 (batch_audit): {len(task3_data)} batches × 5 scenes → {task3_dir}")
if __name__ == "__main__":
    # Script entry point: write the dataset into a "tasks" directory that
    # sits next to this file.
    print("Generating Annotation QA dataset...")
    generate_all_tasks(str(Path(__file__).parent / "tasks"))
    print("Done!")