# waltgrace's picture
# Initial release: data-label-factory v0.1.0
# 4cda727 verified
# =====================================================================
# drones.yaml β€” example project config for the data labeling factory
# =====================================================================
#
# This is the canonical example. It captures EXACTLY what we built tonight
# for the fiber-optic drone detector. To make a new project (e.g. stop signs,
# fire hydrants, manufacturing defects), copy this file, change `target_object`,
# adjust the queries, and run:
#
# data_label_factory pipeline --project projects/drones.yaml
#
# Generic shape:
# project_name β†’ human-readable identifier (used in experiment dir names)
# target_object β†’ the thing you're trying to detect (templated into prompts)
# data_root β†’ where local images go
# r2: β†’ cloud storage config (bucket, prefix per stage)
# buckets: β†’ gather plan (5 buckets is conventional but any structure works)
# falcon_queries: β†’ list of queries to run Falcon Perception with
# prompts: β†’ optional overrides for templated prompts
# backends: β†’ which model backend to use per stage
# =====================================================================
project_name: drones
target_object: "fiber optic drone"
# Free-text summary; `|` preserves the line breaks.
description: |
  Auto-labeling pipeline for fiber-optic drone detection. Falcon Perception
  grounds bboxes for any drone, spool, or cable; Qwen2.5-VL verifies each.
# Where local images live (gitignored)
data_root: ~/drone-falcon-data/v2
# Cloudflare R2 storage
# R2 bucket plus one key prefix per pipeline stage.
r2:
  bucket: drone-falcon
  raw_prefix: raw_v2/  # gathered images
  labels_prefix: labels/  # COCO + verified JSONs
  reviews_prefix: labels/reviews.json  # human verdicts saved by web UI
# What to gather, organized by bucket. Each bucket is a folder under data_root
# and a corresponding R2 prefix. Multiple queries are OR'd via DDG/Wikimedia.
# Each bucket is a nested mapping with its own `queries` list; flat layout
# would collapse them into duplicate top-level `queries` keys (last-wins).
buckets:
  # Positives: the target object itself.
  positive/fiber_spool_drone:
    queries:
      - "fiber optic FPV drone"
      - "tethered fiber optic drone"
      - "Ukraine fiber optic drone war"
      - "fiber optic kamikaze drone"
      - "fiber optic drone payload"
      - "wired FPV drone Ukraine"
      - "fiber optic drone with spool"
      - "Russian fiber optic drone"
      - "fiber optic dispenser drone"
      - "fiber optic combat drone"
  # Positives: the spool component without a drone.
  positive/spool_only:
    queries:
      - "fiber optic cable spool"
      - "optical fiber reel"
      - "fiber optic winding machine"
      - "spooled optical fiber cable"
      - "fiber optic cable on reel"
      - "optical fiber cable drum"
  # Hard negatives: drones that lack the fiber spool.
  negative/drones_no_spool:
    queries:
      - "DJI Mavic 3 Pro photo"
      - "FPV racing drone closeup"
      - "consumer quadcopter flying"
      - "agricultural spraying drone"
      - "DJI Mini 4 Pro photo"
      - "Autel Evo drone"
      - "Skydio 2 drone"
      - "racing drone build"
  # Distractors: spool-shaped objects that are not fiber spools.
  distractor/round_things:
    queries:
      - "garden hose reel"
      - "cable drum reel industrial"
      - "duct tape roll"
      - "fire hose reel"
      - "rope coil pile"
      - "extension cord reel"
      - "thread spool sewing"
  # Backgrounds: scenes with no relevant object at all.
  background/empty:
    queries:
      - "blue sky clouds"
      - "open field landscape"
      - "industrial workshop interior"
      - "outdoor military training"
# What Falcon Perception should look for in each image (the bbox grounding queries).
# These are the specific objects we want bounding boxes on.
# Grounding queries passed to Falcon Perception — one bbox search per entry.
falcon_queries:
  - "fiber optic spool"
  - "cable spool"
  - "drone"
  - "quadcopter"
  - "fiber optic drone"
# Optional: prompt overrides. Default templates use {target_object} substitution.
# Leave commented to use the defaults from lib/project.py.
# Block-scalar bodies must be indented past their keys; `{target_object}` and
# {query} are substituted at runtime, so the prompt text is kept verbatim.
prompts:
  filter: |
    Look at this image. Does it show a {target_object}, a related component
    (cable spool, fiber reel, wound cable), or any other relevant object?
    Answer with exactly one word: YES or NO.
    YES if you see ANY of: a {target_object}, a quadcopter, a cable reel, a fiber spool.
    NO if the main subject is something else.
  verify: |
    Look carefully at this image crop.
    Question: Is the main object in this crop actually a {query}?
    Answer first with one word: YES, NO, or UNSURE.
    Then briefly say what the object actually is in 5-10 words.
# Which model backend to use per stage.
# qwen = Qwen2.5-VL-3B via mlx-vlm server (M4 :8291) β€” fast, free
# gemma = Gemma 4 26B via mac_tensor (M4 :8500) β€” slow chained agent
# falcon = Falcon Perception via mac_tensor /api/falcon β€” bbox grounding only
# pod = remote RunPod GPU pod β€” fast, ~$0.15-1.65/run
# One backend name per stage (see the legend in the comments above).
backends:
  filter: qwen
  label: pod  # Falcon on RunPod for production scale
  verify: pod  # Qwen on the same pod
# Pod settings (only used when a stage backend = pod)
# RunPod provisioning parameters; short literal lists kept in flow style.
pod:
  gpu_types: ["NVIDIA L40S"]
  data_centers: ["EU-RO-1", "EU-CZ-1", "EU-NL-1", "US-CA-2"]
  image: "runpod/pytorch:1.0.3-cu1290-torch291-ubuntu2204"
  container_disk_gb: 30
  volume_gb: 30