Image Feature Extraction
MLX
English
data-label-factory
vision
dataset-labeling
object-detection
apple-silicon
gemma
falcon-perception
openrouter
yolo
Instructions to use waltgrace/data-label-factory with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- MLX
How to use waltgrace/data-label-factory with MLX:
# Download the model from the Hub
pip install "huggingface_hub[hf_xet]"
huggingface-cli download --local-dir data-label-factory waltgrace/data-label-factory
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- LM Studio
| """ | |
| auto.py — Smart project creation from samples + description. | |
| The core v2 experience: a user says "label these as X", gives a few sample | |
| images, and the factory figures out the best project config and backends. | |
| Usage: | |
| # CLI | |
| data_label_factory auto \ | |
| --samples ~/my-samples/ \ | |
| --description "fire hydrants in urban settings" \ | |
| --output projects/fire-hydrants.yaml | |
| # Python | |
| from data_label_factory.auto import auto_project | |
| project = auto_project( | |
| samples=["img1.jpg", "img2.jpg"], | |
| description="fiber optic drones with cable spools", | |
| ) | |
| How it works: | |
| 1. Analyze sample images via available VLMs to understand the domain | |
| 2. Pick the best backends per stage based on content type | |
| 3. Generate search queries for the gather stage | |
| 4. Create a complete project YAML ready to run | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import os | |
| import time | |
| import yaml | |
| from datetime import datetime | |
| from pathlib import Path | |
| from typing import Any | |
# ── Content type detection ──────────────────────────────────────

# One profile per kind of labeling task. Each profile carries:
#   keywords             words/phrases that vote for this profile
#   filter_backend       backend name used for the filter stage
#   label_backend        backend name used for the label stage
#   verify_backend       backend name used for the verify stage
#   gather_queries_hint  ", "-separated query templates with a {target} slot
#   synthetic            (optional) marks the domain as supporting synthetic data
# Insertion order matters: on a tied score the earliest profile wins.
CONTENT_PROFILES: dict[str, dict[str, Any]] = {
    "document": {
        "keywords": ["text", "document", "form", "receipt", "invoice", "table",
                     "pdf", "paper", "letter", "report", "handwriting", "ocr"],
        "filter_backend": "chandra",
        "label_backend": "chandra",
        "verify_backend": "qwen",
        "gather_queries_hint": "scanned {target}, {target} photograph, {target} closeup",
    },
    "object": {
        "keywords": ["car", "dog", "cat", "drone", "bird", "person", "sign",
                     "hydrant", "bottle", "phone", "building", "animal",
                     "vehicle", "tool", "equipment", "furniture"],
        "filter_backend": "qwen",
        "label_backend": "falcon",
        "verify_backend": "qwen",
        "gather_queries_hint": "{target} photo, {target} outdoor, {target} closeup",
    },
    "card": {
        "keywords": ["card", "playing card", "trading card", "poker", "blackjack",
                     "yu-gi-oh", "yugioh", "pokemon", "magic the gathering", "deck"],
        "filter_backend": "qwen",
        "label_backend": "falcon",
        "verify_backend": "qwen",
        "gather_queries_hint": "{target} card photo, {target} cards spread",
        "synthetic": True,
    },
    "3d_scene": {
        "keywords": ["3d", "depth", "scene", "room", "outdoor", "street",
                     "driving", "autonomous", "lidar", "stereo"],
        "filter_backend": "qwen",
        "label_backend": "wilddet3d",
        "verify_backend": "qwen",
        "gather_queries_hint": "{target} photo, {target} outdoor scene",
    },
    "generic": {
        "keywords": [],
        "filter_backend": "qwen",
        "label_backend": "falcon",
        "verify_backend": "qwen",
        "gather_queries_hint": "{target} photo, {target} closeup, {target} example",
    },
}


def detect_content_type(description: str, sample_analysis: str = "") -> str:
    """Classify the labeling task into a content profile.

    The description and the optional VLM analysis are concatenated,
    lower-cased, and scored against each profile's keyword list; the profile
    with the most keyword hits wins.

    Keywords are matched as whole words (an optional trailing "s"/"es" is
    accepted so plurals still count). Plain substring matching would let
    "room" fire on "mushrooms", "table" on "vegetables" or "car" on "card",
    which selects the wrong backends.

    Args:
        description: What the user wants to label.
        sample_analysis: Optional free text describing the samples.

    Returns:
        A key of ``CONTENT_PROFILES``; ``"generic"`` when nothing matches.
        On a tie the profile defined first in ``CONTENT_PROFILES`` wins.
    """
    import re  # function-scope import, as done elsewhere in this module

    text = f"{description} {sample_analysis}".lower()

    def hits(keyword: str) -> bool:
        # Lookarounds instead of \b so the word edges are explicit and
        # multi-word / hyphenated keywords behave the same way.
        pattern = rf"(?<!\w){re.escape(keyword)}(?:s|es)?(?!\w)"
        return re.search(pattern, text) is not None

    best_type = "generic"
    best_score = 0
    for ctype, profile in CONTENT_PROFILES.items():
        if ctype == "generic":
            # The fallback profile has no keywords and never competes.
            continue
        score = sum(1 for kw in profile["keywords"] if hits(kw))
        # Strict ">" keeps the earliest profile on ties.
        if score > best_score:
            best_score = score
            best_type = ctype
    return best_type
def analyze_samples_with_vlm(sample_paths: list[str], description: str) -> dict:
    """Use an available VLM to analyze sample images and extract domain info.

    Backends are tried in the order ``qwen`` then ``gemma`` (the original
    author notes qwen is the fastest). The first backend that reports itself
    alive and yields at least one answer is used; at most the first five
    samples are sent to it.

    This is a best-effort step: a backend that cannot be created, or an
    image that cannot be analyzed, is reported on stdout and skipped rather
    than raised.

    Args:
        sample_paths: Paths of the sample images.
        description: What the user wants to label; embedded in the prompt.

    Returns:
        A dict with ``backend_used`` (backend name, or ``"none"``),
        ``analyses`` (one answer per analyzed image) and ``combined``
        (the answers joined with single spaces).
    """
    if not sample_paths:
        # Nothing to look at: do not spin up any provider.
        return {"backend_used": "none", "analyses": [], "combined": ""}

    from .providers import create_provider

    # The prompt does not depend on the backend, so build it once.
    prompt = (
        f"I want to build a dataset of images showing: {description}\n"
        f"Look at this sample image. In 2-3 sentences, describe:\n"
        f"1. What objects/elements are visible\n"
        f"2. What search queries would find similar images\n"
        f"3. What negative examples (things to exclude) would help\n"
        f"Be specific and concise."
    )

    for backend in ("qwen", "gemma"):
        try:
            provider = create_provider(backend)
            if not provider.status().get("alive"):
                continue
        except Exception as exc:  # provider errors are project-specific
            print(f"[auto] VLM backend '{backend}' unavailable: {exc}")
            continue

        analyses = []
        for path in sample_paths[:5]:  # analyze up to 5 samples
            try:
                result = provider.filter_image(path, prompt)
                analyses.append(result.raw_answer)
            except Exception as exc:
                print(f"[auto] {backend} could not analyze {path}: {exc}")
                continue

        if analyses:
            return {
                "backend_used": backend,
                "analyses": analyses,
                "combined": " ".join(analyses),
            }

    return {"backend_used": "none", "analyses": [], "combined": ""}
def generate_queries(description: str, target_object: str,
                     content_type: str, vlm_analysis: str = "") -> dict:
    """Generate bucket queries for the gather stage.

    Positive queries are collected, in priority order, from:
      1. the description and target themselves plus fixed suffix variants,
      2. the content profile's ``gather_queries_hint`` templates,
      3. double-quoted phrases on lines of ``vlm_analysis`` that mention
         "search" or "query".
    Duplicates and empty strings are dropped before the list is capped at 8,
    so a repeated query (the description is often identical to the target)
    does not waste a slot.

    Args:
        description: What the user wants to label.
        target_object: Object name substituted into the query templates.
        content_type: Key of ``CONTENT_PROFILES``; unknown keys fall back
            to the ``"generic"`` profile.
        vlm_analysis: Optional free-text analysis to mine for extra queries.

    Returns:
        A dict mapping bucket names (``positive/clear_view``,
        ``negative/other_objects``, ``background/empty``) to
        ``{"queries": [...]}``.
    """
    profile = CONTENT_PROFILES.get(content_type, CONTENT_PROFILES["generic"])

    candidates = [
        description,
        target_object,
        f"{target_object} photo",
        f"{target_object} closeup",
        f"{target_object} high quality",
    ]

    # Split the hint into templates BEFORE substituting the target: a target
    # that itself contains ", " would otherwise be cut into fragments.
    candidates.extend(
        template.format(target=target_object)
        for template in profile["gather_queries_hint"].split(", ")
    )

    if vlm_analysis:
        for line in vlm_analysis.split("\n"):
            lowered = line.lower()
            if "search" in lowered or "query" in lowered:
                # Odd-indexed pieces of a split on '"' are the quoted phrases.
                quoted = line.split('"')[1::2]
                candidates.extend(phrase for phrase in quoted if len(phrase) > 3)

    # Order-preserving de-duplication.
    positive_queries = list(dict.fromkeys(q for q in candidates if q))

    # Negative / background queries
    negative_queries = [
        f"not {target_object}",
        "empty background",
        "plain surface",
    ]

    return {
        "positive/clear_view": {"queries": positive_queries[:8]},
        "negative/other_objects": {"queries": negative_queries[:4]},
        "background/empty": {"queries": ["blue sky clouds", "empty room", "plain wall"]},
    }
def generate_falcon_queries(target_object: str, description: str) -> list[str]:
    """Generate Falcon Perception queries for the label stage.

    The target object is always the first query. Each further word of the
    description is added when it is longer than three characters, is not a
    filler word, and does not already occur inside the target. Surrounding
    punctuation is stripped first (so "spools." becomes "spools") and a word
    is added at most once.

    Args:
        target_object: Primary object to detect.
        description: Free-text description that may name component parts.

    Returns:
        At most five queries, target first, in order of appearance.
    """
    filler = frozenset((
        "with", "that", "this", "from", "into", "have", "been",
        "show", "shows", "showing", "image", "images", "photo",
    ))
    edge_punctuation = ".,;:!?\"'()[]{}"

    target_lower = target_object.lower()
    queries = [target_object]
    seen = set()
    for raw in description.lower().split():
        word = raw.strip(edge_punctuation)
        if len(word) <= 3 or word in filler or word in seen:
            continue
        if word in target_lower:
            # Substring test: already covered by the target query.
            continue
        seen.add(word)
        queries.append(word)
    return queries[:5]  # cap at 5 queries
def _resolve_sample_paths(samples: list[str] | str) -> list[str]:
    """Expand *samples* (directory, single path, or list of paths) to a path list."""
    image_suffixes = (".jpg", ".jpeg", ".png", ".webp")
    if isinstance(samples, str):
        root = os.path.expanduser(samples)
        if os.path.isdir(root):
            # os.listdir order is arbitrary; sort so the samples that get
            # analyzed are the same on every run.
            return [
                os.path.join(root, name)
                for name in sorted(os.listdir(root))
                if name.lower().endswith(image_suffixes)
            ]
        return [root]
    return [os.path.expanduser(s) for s in samples]


def auto_project(
    samples: list[str] | str,
    description: str,
    project_name: str = "",
    output: str = "",
    analyze: bool = True,
) -> dict:
    """Create a complete project config from samples + description.

    Steps: resolve the sample paths, optionally analyze them with a VLM,
    pick a content profile, generate gather/label queries, copy the samples
    under the project's data root, and optionally write the config as YAML.
    Progress is printed to stdout with an ``[auto]`` prefix.

    Args:
        samples: Path to directory of sample images, or list of image paths
        description: What the user wants to label (e.g. "fire hydrants")
        project_name: Optional name (auto-generated from description if empty)
        output: Optional path to write YAML (returns dict if empty)
        analyze: Whether to use VLM to analyze samples (slower but better)

    Returns:
        Complete project config dict (also written to output if specified)

    Raises:
        ValueError: If ``description`` is empty once whitespace and a
            trailing "." are removed; every query and the project name are
            derived from it.
    """
    import shutil

    # Derive target object from description
    target_object = description.strip().rstrip(".")
    if not target_object:
        raise ValueError("description must not be empty")

    sample_paths = _resolve_sample_paths(samples)

    if not project_name:
        project_name = target_object.lower().replace(" ", "-")[:30]

    print(f"[auto] Creating project: {project_name}")
    print(f"[auto] Target: {target_object}")
    print(f"[auto] Samples: {len(sample_paths)} images")

    # Analyze samples with VLM if available
    vlm_analysis = ""
    if analyze and sample_paths:
        print("[auto] Analyzing samples with VLM...")
        analysis = analyze_samples_with_vlm(sample_paths, description)
        vlm_analysis = analysis.get("combined", "")
        if vlm_analysis:
            print(f"[auto] Analysis ({analysis['backend_used']}): {vlm_analysis[:200]}...")

    # Detect content type
    content_type = detect_content_type(description, vlm_analysis)
    profile = CONTENT_PROFILES[content_type]
    print(f"[auto] Content type: {content_type}")
    print(f"[auto] Backends: filter={profile['filter_backend']}, "
          f"label={profile['label_backend']}, verify={profile['verify_backend']}")

    # Generate queries
    buckets = generate_queries(description, target_object, content_type, vlm_analysis)
    falcon_queries = generate_falcon_queries(target_object, description)

    # Build project config (key order is preserved in the YAML output)
    config = {
        "project_name": project_name,
        "target_object": target_object,
        "description": (
            f"Auto-generated project for labeling {target_object}. "
            f"Content type: {content_type}. "
            f"Created {datetime.now().strftime('%Y-%m-%d %H:%M')}."
        ),
        "data_root": f"~/data-label-factory/{project_name}",
        "buckets": buckets,
        "falcon_queries": falcon_queries,
        "backends": {
            "filter": profile["filter_backend"],
            "label": profile["label_backend"],
            "verify": profile["verify_backend"],
        },
    }

    # Add synthetic config if applicable
    if profile.get("synthetic"):
        config["synthetic"] = {
            "enabled": True,
            "type": "flywheel",
            "note": "This domain supports synthetic data generation via flywheel",
        }

    # Copy samples to data_root
    data_root = os.path.expanduser(config["data_root"])
    samples_dest = os.path.join(data_root, "samples")
    os.makedirs(samples_dest, exist_ok=True)
    copied = 0
    for sp in sample_paths:
        if not os.path.isfile(sp):
            # Missing paths and directories cannot be copied as samples.
            continue
        try:
            shutil.copy2(sp, samples_dest)
        except shutil.SameFileError:
            # The sample already lives in the destination; nothing to do.
            pass
        copied += 1
    # Report what actually ended up in the folder, not what was requested.
    print(f"[auto] Copied {copied} samples to {samples_dest}")

    # Write YAML if output specified
    if output:
        output = os.path.expanduser(output)
        os.makedirs(os.path.dirname(output) or ".", exist_ok=True)
        with open(output, "w", encoding="utf-8") as f:
            yaml.dump(config, f, default_flow_style=False, sort_keys=False)
        print(f"[auto] Saved project: {output}")

    print(f"[auto] Ready! Run: data_label_factory pipeline --project {output or '<path>'}")
    return config
| def main(argv: list[str] | None = None): | |
| import argparse | |
| p = argparse.ArgumentParser( | |
| prog="data_label_factory auto", | |
| description=( | |
| "Create a labeling project automatically from sample images + description. " | |
| "The factory analyzes your samples, picks the best backends, generates " | |
| "search queries, and creates a ready-to-run project YAML." | |
| ), | |
| ) | |
| p.add_argument("--samples", required=True, | |
| help="Directory of sample images or a single image path") | |
| p.add_argument("--description", required=True, | |
| help='What to label (e.g. "fire hydrants in urban settings")') | |
| p.add_argument("--name", default="", | |
| help="Project name (auto-generated from description if empty)") | |
| p.add_argument("--output", default="", | |
| help="Output YAML path (default: projects/<name>.yaml)") | |
| p.add_argument("--no-analyze", action="store_true", | |
| help="Skip VLM analysis of samples (faster, less accurate)") | |
| args = p.parse_args(argv) | |
| output = args.output | |
| if not output: | |
| name = args.name or args.description.lower().replace(" ", "-")[:30] | |
| output = f"projects/{name}.yaml" | |
| auto_project( | |
| samples=args.samples, | |
| description=args.description, | |
| project_name=args.name, | |
| output=output, | |
| analyze=not args.no_analyze, | |
| ) | |