# =====================================================================
# drones.yaml — example project config for the data labeling factory
# =====================================================================
#
# This is the canonical example. It captures EXACTLY what we built tonight
# for the fiber-optic drone detector. To make a new project (e.g. stop signs,
# fire hydrants, manufacturing defects), copy this file, change `target_object`,
# adjust the queries, and run:
#
#   data_label_factory pipeline --project projects/drones.yaml
#
# Generic shape:
#   project_name   → human-readable identifier (used in experiment dir names)
#   target_object  → the thing you're trying to detect (templated into prompts)
#   data_root      → where local images go
#   r2:            → cloud storage config (bucket, prefix per stage)
#   buckets:       → gather plan (5 buckets is conventional but any structure works)
#   falcon_queries:→ list of queries to run Falcon Perception with
#   prompts:       → optional overrides for templated prompts
#   backends:      → which model backend to use per stage
# =====================================================================
---
project_name: drones
target_object: "fiber optic drone"

description: |
  Auto-labeling pipeline for fiber-optic drone detection. Falcon Perception
  grounds bboxes for any drone, spool, or cable; Qwen2.5-VL verifies each.

# Where local images live (gitignored).
# Quoted because a leading "~" is the YAML null sigil; tilde expansion is
# presumably done by the consumer, not by YAML — TODO confirm in lib/project.py.
data_root: "~/drone-falcon-data/v2"

# Cloudflare R2 storage
r2:
  bucket: drone-falcon
  raw_prefix: raw_v2/                   # gathered images
  labels_prefix: labels/                # COCO + verified JSONs
  reviews_prefix: labels/reviews.json   # human verdicts saved by web UI

# What to gather, organized by bucket. Each bucket is a folder under data_root
# and a corresponding R2 prefix. Multiple queries are OR'd via DDG/Wikimedia.
buckets:
  positive/fiber_spool_drone:
    queries:
      - "fiber optic FPV drone"
      - "tethered fiber optic drone"
      - "Ukraine fiber optic drone war"
      - "fiber optic kamikaze drone"
      - "fiber optic drone payload"
      - "wired FPV drone Ukraine"
      - "fiber optic drone with spool"
      - "Russian fiber optic drone"
      - "fiber optic dispenser drone"
      - "fiber optic combat drone"
  positive/spool_only:
    queries:
      - "fiber optic cable spool"
      - "optical fiber reel"
      - "fiber optic winding machine"
      - "spooled optical fiber cable"
      - "fiber optic cable on reel"
      - "optical fiber cable drum"
  negative/drones_no_spool:
    queries:
      - "DJI Mavic 3 Pro photo"
      - "FPV racing drone closeup"
      - "consumer quadcopter flying"
      - "agricultural spraying drone"
      - "DJI Mini 4 Pro photo"
      - "Autel Evo drone"
      - "Skydio 2 drone"
      - "racing drone build"
  distractor/round_things:
    queries:
      - "garden hose reel"
      - "cable drum reel industrial"
      - "duct tape roll"
      - "fire hose reel"
      - "rope coil pile"
      - "extension cord reel"
      - "thread spool sewing"
  background/empty:
    queries:
      - "blue sky clouds"
      - "open field landscape"
      - "industrial workshop interior"
      - "outdoor military training"

# What Falcon Perception should look for in each image (the bbox grounding queries).
# These are the specific objects we want bounding boxes on.
falcon_queries:
  - "fiber optic spool"
  - "cable spool"
  - "drone"
  - "quadcopter"
  - "fiber optic drone"

# Optional: prompt overrides. Default templates use {target_object} substitution.
# Leave commented to use the defaults from lib/project.py.
prompts:
  filter: |
    Look at this image. Does it show a {target_object}, a related component
    (cable spool, fiber reel, wound cable), or any other relevant object?
    Answer with exactly one word: YES or NO.
    YES if you see ANY of: a {target_object}, a quadcopter, a cable reel, a fiber spool.
    NO if the main subject is something else.
  verify: |
    Look carefully at this image crop.
    Question: Is the main object in this crop actually a {query}?
    Answer first with one word: YES, NO, or UNSURE.
    Then briefly say what the object actually is in 5-10 words.

# Which model backend to use per stage.
#   qwen   = Qwen2.5-VL-3B via mlx-vlm server (M4 :8291) — fast, free
#   gemma  = Gemma 4 26B via mac_tensor (M4 :8500) — slow chained agent
#   falcon = Falcon Perception via mac_tensor /api/falcon — bbox grounding only
#   pod    = remote RunPod GPU pod — fast, ~$0.15-1.65/run
backends:
  filter: qwen
  label: pod    # Falcon on RunPod for production scale
  verify: pod   # Qwen on the same pod

# Pod settings (only used when a stage backend = pod)
pod:
  gpu_types: ["NVIDIA L40S"]
  data_centers: ["EU-RO-1", "EU-CZ-1", "EU-NL-1", "US-CA-2"]
  image: "runpod/pytorch:1.0.3-cu1290-torch291-ubuntu2204"
  container_disk_gb: 30
  volume_gb: 30