waltgrace's picture
feat: v2 provider registry, Expert Sniper integration, model benchmark
e019de9 verified
"""
auto.py — Smart project creation from samples + description.
The core v2 experience: a user says "label these as X", gives a few sample
images, and the factory figures out the best project config and backends.
Usage:
    # CLI
    data_label_factory auto \
        --samples ~/my-samples/ \
        --description "fire hydrants in urban settings" \
        --output projects/fire-hydrants.yaml

    # Python
    from data_label_factory.auto import auto_project
    project = auto_project(
        samples=["img1.jpg", "img2.jpg"],
        description="fiber optic drones with cable spools",
    )

How it works:
    1. Analyze sample images via available VLMs to understand the domain
    2. Pick the best backends per stage based on content type
    3. Generate search queries for the gather stage
    4. Create a complete project YAML ready to run
"""
from __future__ import annotations
import json
import os
import time
import yaml
from datetime import datetime
from pathlib import Path
from typing import Any
# ── Content type detection ──────────────────────────────────────
# Each profile maps a content type to:
#   keywords             words/phrases whose presence votes for this type
#   filter/label/verify  backend name used for that pipeline stage
#   gather_queries_hint  comma-separated query templates; "{target}" is
#                        replaced with the target object
#   synthetic            (optional) marks the domain as eligible for
#                        synthetic data generation
# Insertion order matters: detect_content_type() breaks ties in favour of
# the profile listed first.
CONTENT_PROFILES = {
    "document": {
        "keywords": ["text", "document", "form", "receipt", "invoice", "table",
                     "pdf", "paper", "letter", "report", "handwriting", "ocr"],
        "filter_backend": "chandra",
        "label_backend": "chandra",
        "verify_backend": "qwen",
        "gather_queries_hint": "scanned {target}, {target} photograph, {target} closeup",
    },
    "object": {
        "keywords": ["car", "dog", "cat", "drone", "bird", "person", "sign",
                     "hydrant", "bottle", "phone", "building", "animal",
                     "vehicle", "tool", "equipment", "furniture"],
        "filter_backend": "qwen",
        "label_backend": "falcon",
        "verify_backend": "qwen",
        "gather_queries_hint": "{target} photo, {target} outdoor, {target} closeup",
    },
    "card": {
        "keywords": ["card", "playing card", "trading card", "poker", "blackjack",
                     "yu-gi-oh", "yugioh", "pokemon", "magic the gathering", "deck"],
        "filter_backend": "qwen",
        "label_backend": "falcon",
        "verify_backend": "qwen",
        "gather_queries_hint": "{target} card photo, {target} cards spread",
        "synthetic": True,
    },
    "3d_scene": {
        "keywords": ["3d", "depth", "scene", "room", "outdoor", "street",
                     "driving", "autonomous", "lidar", "stereo"],
        "filter_backend": "qwen",
        "label_backend": "wilddet3d",
        "verify_backend": "qwen",
        "gather_queries_hint": "{target} photo, {target} outdoor scene",
    },
    "generic": {
        "keywords": [],
        "filter_backend": "qwen",
        "label_backend": "falcon",
        "verify_backend": "qwen",
        "gather_queries_hint": "{target} photo, {target} closeup, {target} example",
    },
}


def _mentions_keyword(text: str, keyword: str) -> bool:
    """Return True if *keyword* occurs in *text* as a whole word or phrase.

    A trailing plural "s" is tolerated ("hydrants" matches "hydrant").
    Lookarounds are used instead of ``\\b`` so keywords that begin or end
    with a non-word character would still be anchored correctly.
    """
    import re

    pattern = rf"(?<!\w){re.escape(keyword)}s?(?!\w)"
    return re.search(pattern, text) is not None


def detect_content_type(description: str, sample_analysis: str = "") -> str:
    """Classify the labeling task into a content profile.

    Every non-generic profile in ``CONTENT_PROFILES`` is scored by how many
    of its keywords appear in the lower-cased description plus sample
    analysis. Keywords must match whole words (optionally pluralised), so
    "car" no longer fires on "card" or "scarf" and "form" no longer fires
    on "information".

    Args:
        description: The user's description of what to label.
        sample_analysis: Optional free text produced by analyzing samples.

    Returns:
        The name of the highest-scoring profile. On a tie the profile
        defined first wins; if nothing matches, ``"generic"``.
    """
    text = f"{description} {sample_analysis}".lower()
    best_type = "generic"
    best_score = 0
    for ctype, profile in CONTENT_PROFILES.items():
        if ctype == "generic":
            continue
        score = sum(1 for kw in profile["keywords"] if _mentions_keyword(text, kw))
        # Strict ">" keeps the earliest profile on equal scores.
        if score > best_score:
            best_score = score
            best_type = ctype
    return best_type
def analyze_samples_with_vlm(sample_paths: list[str], description: str) -> dict:
    """Ask the first usable VLM backend to describe the sample images.

    Backends are tried in the order "qwen", then "gemma". A backend is
    skipped if creating it or querying its status raises, or if its status
    does not report "alive". At most the first five sample paths are sent;
    images whose request raises are skipped individually.

    Args:
        sample_paths: Paths of the sample images.
        description: What the user wants the dataset to show; embedded in
            the prompt sent with every image.

    Returns:
        A dict with "backend_used", "analyses" (one answer per image that
        succeeded) and "combined" (the answers joined by spaces). If no
        backend produced any answer, "backend_used" is "none" and the other
        two values are empty.
    """
    from .providers import create_provider

    # The prompt does not depend on the backend or the image, so build it once.
    prompt = (
        f"I want to build a dataset of images showing: {description}\n"
        f"Look at this sample image. In 2-3 sentences, describe:\n"
        f"1. What objects/elements are visible\n"
        f"2. What search queries would find similar images\n"
        f"3. What negative examples (things to exclude) would help\n"
        f"Be specific and concise."
    )

    # Try qwen first (fastest), then gemma
    for backend in ("qwen", "gemma"):
        try:
            provider = create_provider(backend)
            alive = provider.status().get("alive")
        except Exception:
            continue
        if not alive:
            continue

        analyses = []
        for image_path in sample_paths[:5]:  # analyze up to 5 samples
            try:
                answer = provider.filter_image(image_path, prompt).raw_answer
            except Exception:
                continue
            analyses.append(answer)

        if not analyses:
            # This backend answered nothing; fall through to the next one.
            continue
        return {
            "backend_used": backend,
            "analyses": analyses,
            "combined": " ".join(analyses),
        }

    return {"backend_used": "none", "analyses": [], "combined": ""}
def generate_queries(description: str, target_object: str,
                     content_type: str, vlm_analysis: str = "") -> dict:
    """Generate bucket queries for the gather stage.

    Positive queries are collected in priority order: the description, the
    target object, three fixed variations, the content profile's hint
    templates, and finally any double-quoted phrases found on lines of the
    VLM analysis that mention "search" or "query". Blank entries and exact
    duplicates are dropped (first occurrence wins) before the list is capped
    at 8, so a description identical to the target no longer wastes a slot.

    Args:
        description: The user's description of what to label.
        target_object: The object name substituted into query templates.
        content_type: Key into ``CONTENT_PROFILES``; unknown keys fall back
            to the "generic" profile.
        vlm_analysis: Optional free text from sample analysis.

    Returns:
        A dict keyed by bucket name ("positive/clear_view",
        "negative/other_objects", "background/empty"), each mapping to
        ``{"queries": [...]}``.
    """
    profile = CONTENT_PROFILES.get(content_type, CONTENT_PROFILES["generic"])

    # Base positive queries from description
    candidates = [
        description,
        target_object,
        f"{target_object} photo",
        f"{target_object} closeup",
        f"{target_object} high quality",
    ]

    # Add hint-based queries. The template is split *before* substitution so
    # a target that itself contains ", " is not broken into fragments.
    candidates.extend(
        template.format(target=target_object)
        for template in profile["gather_queries_hint"].split(", ")
    )

    # Extract additional queries from VLM analysis: quoted phrases on lines
    # that talk about searching.
    if vlm_analysis:
        for line in vlm_analysis.split("\n"):
            lowered = line.lower()
            if "search" in lowered or "query" in lowered:
                # Splitting on '"' leaves the quoted phrases at odd indices.
                quoted = line.split('"')[1::2]
                candidates.extend(phrase for phrase in quoted if len(phrase) > 3)

    # Order-preserving de-duplication; skip blank queries.
    positive_queries = list(dict.fromkeys(q for q in candidates if q.strip()))

    # Negative / background queries
    negative_queries = [
        f"not {target_object}",
        "empty background",
        "plain surface",
    ]

    return {
        "positive/clear_view": {"queries": positive_queries[:8]},
        "negative/other_objects": {"queries": negative_queries[:4]},
        "background/empty": {"queries": ["blue sky clouds", "empty room", "plain wall"]},
    }
# Filler words that never make useful label-stage queries.
_FALCON_STOPWORDS = frozenset((
    "with", "that", "this", "from", "into", "have", "been",
    "show", "shows", "showing", "image", "images", "photo",
))


def generate_falcon_queries(target_object: str, description: str) -> list[str]:
    """Generate Falcon Perception queries for the label stage.

    The target object is always the first query. Each word of the
    lower-cased description is then added if, after stripping surrounding
    punctuation, it is longer than three characters, is not a stopword, is
    not a substring of the lower-cased target, and has not been added
    already.

    Args:
        target_object: The primary object to label.
        description: Free-text description that may mention component parts.

    Returns:
        At most five queries, target object first.
    """
    import string

    target_lower = target_object.lower()
    queries = [target_object]
    for raw_word in description.lower().split():
        # "cable," and "cable." should both contribute the query "cable".
        word = raw_word.strip(string.punctuation)
        if len(word) <= 3 or word in _FALCON_STOPWORDS:
            continue
        # Substring test on purpose: skips words already covered by the target.
        if word in target_lower or word in queries:
            continue
        queries.append(word)
    return queries[:5]  # cap at 5 queries
def _resolve_sample_paths(samples: list[str] | str) -> list[str]:
    """Expand *samples* (a directory, a single path, or a list of paths) into a list of paths."""
    if not isinstance(samples, str):
        return [os.path.expanduser(s) for s in samples]
    root = os.path.expanduser(samples)
    if not os.path.isdir(root):
        return [root]
    # os.listdir returns entries in arbitrary order; sort so the samples that
    # get analyzed (only the first few are) are the same on every run.
    return [
        os.path.join(root, name)
        for name in sorted(os.listdir(root))
        if name.lower().endswith((".jpg", ".jpeg", ".png", ".webp"))
    ]


def _copy_samples(sample_paths: list[str], dest_dir: str) -> int:
    """Copy every existing sample file into *dest_dir*; return how many were copied."""
    import shutil

    os.makedirs(dest_dir, exist_ok=True)
    copied = 0
    for path in sample_paths:
        # Skip missing paths and directories; copy2 only handles files.
        if not os.path.isfile(path):
            continue
        try:
            shutil.copy2(path, dest_dir)
        except shutil.SameFileError:
            # The sample already lives in dest_dir (e.g. on a re-run).
            continue
        copied += 1
    return copied


def auto_project(
    samples: list[str] | str,
    description: str,
    project_name: str = "",
    output: str = "",
    analyze: bool = True,
) -> dict:
    """Create a complete project config from samples + description.

    Steps: resolve the sample paths, optionally analyze them with a VLM,
    detect the content type, generate gather and label queries, build the
    config, copy the samples under the project's data root, and optionally
    write the config as YAML. Progress is printed with an "[auto]" prefix.

    Args:
        samples: Path to directory of sample images, a single image path,
            or a list of image paths. "~" is expanded.
        description: What the user wants to label (e.g. "fire hydrants").
        project_name: Optional name (derived from the description if empty).
        output: Optional path to write YAML (nothing is written if empty).
        analyze: Whether to use a VLM to analyze samples (slower but better).

    Returns:
        Complete project config dict (also written to output if specified).

    Raises:
        ValueError: If the description is empty once surrounding whitespace
            and trailing periods are removed.
    """
    # Derive target object from description
    target_object = description.strip().rstrip(".")
    if not target_object:
        # An empty target would yield an empty project name and a data root
        # of "~/data-label-factory/" itself.
        raise ValueError("description must say what to label; got an empty description")

    sample_paths = _resolve_sample_paths(samples)

    if not project_name:
        project_name = target_object.lower().replace(" ", "-")[:30]

    print(f"[auto] Creating project: {project_name}")
    print(f"[auto] Target: {target_object}")
    print(f"[auto] Samples: {len(sample_paths)} images")

    # Analyze samples with VLM if available
    vlm_analysis = ""
    if analyze and sample_paths:
        print("[auto] Analyzing samples with VLM...")
        analysis = analyze_samples_with_vlm(sample_paths, description)
        vlm_analysis = analysis.get("combined", "")
        if vlm_analysis:
            print(f"[auto] Analysis ({analysis['backend_used']}): {vlm_analysis[:200]}...")

    # Detect content type
    content_type = detect_content_type(description, vlm_analysis)
    profile = CONTENT_PROFILES[content_type]
    print(f"[auto] Content type: {content_type}")
    print(f"[auto] Backends: filter={profile['filter_backend']}, "
          f"label={profile['label_backend']}, verify={profile['verify_backend']}")

    # Generate queries
    buckets = generate_queries(description, target_object, content_type, vlm_analysis)
    falcon_queries = generate_falcon_queries(target_object, description)

    # Build project config
    config = {
        "project_name": project_name,
        "target_object": target_object,
        "description": (
            f"Auto-generated project for labeling {target_object}. "
            f"Content type: {content_type}. "
            f"Created {datetime.now().strftime('%Y-%m-%d %H:%M')}."
        ),
        "data_root": f"~/data-label-factory/{project_name}",
        "buckets": buckets,
        "falcon_queries": falcon_queries,
        "backends": {
            "filter": profile["filter_backend"],
            "label": profile["label_backend"],
            "verify": profile["verify_backend"],
        },
    }

    # Add synthetic config if applicable
    if profile.get("synthetic"):
        config["synthetic"] = {
            "enabled": True,
            "type": "flywheel",
            "note": "This domain supports synthetic data generation via flywheel",
        }

    # Copy samples to data_root
    data_root = os.path.expanduser(config["data_root"])
    samples_dest = os.path.join(data_root, "samples")
    copied = _copy_samples(sample_paths, samples_dest)
    # Report what was actually copied, not how many paths were requested.
    print(f"[auto] Copied {copied} samples to {samples_dest}")

    # Write YAML if output specified
    if output:
        output = os.path.expanduser(output)
        os.makedirs(os.path.dirname(output) or ".", exist_ok=True)
        with open(output, "w", encoding="utf-8") as f:
            yaml.dump(config, f, default_flow_style=False, sort_keys=False)
        print(f"[auto] Saved project: {output}")

    print(f"[auto] Ready! Run: data_label_factory pipeline --project {output or '<path>'}")
    return config
def main(argv: list[str] | None = None):
    """CLI entry point for ``data_label_factory auto``.

    Parses the arguments, picks a default output path when none is given,
    and delegates to ``auto_project``.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.
    """
    import argparse

    parser = argparse.ArgumentParser(
        prog="data_label_factory auto",
        description=(
            "Create a labeling project automatically from sample images + description. "
            "The factory analyzes your samples, picks the best backends, generates "
            "search queries, and creates a ready-to-run project YAML."
        ),
    )
    parser.add_argument("--samples", required=True,
                        help="Directory of sample images or a single image path")
    parser.add_argument("--description", required=True,
                        help='What to label (e.g. "fire hydrants in urban settings")')
    parser.add_argument("--name", default="",
                        help="Project name (auto-generated from description if empty)")
    parser.add_argument("--output", default="",
                        help="Output YAML path (default: projects/<name>.yaml)")
    parser.add_argument("--no-analyze", action="store_true",
                        help="Skip VLM analysis of samples (faster, less accurate)")
    args = parser.parse_args(argv)

    output = args.output
    if not output:
        # Slug the description the same way auto_project derives its project
        # name (trim whitespace and a trailing period first), so the YAML
        # file name matches the project name, e.g. "fire hydrants." gives
        # projects/fire-hydrants.yaml rather than projects/fire-hydrants..yaml.
        name = args.name or (
            args.description.strip().rstrip(".").lower().replace(" ", "-")[:30]
        )
        output = f"projects/{name}.yaml"

    auto_project(
        samples=args.samples,
        description=args.description,
        project_name=args.name,
        output=output,
        analyze=not args.no_analyze,
    )