Spaces:
Sleeping
Sleeping
Erick committed on
Upload folder using huggingface_hub
Browse files
- .env.example +7 -0
- .gitattributes +3 -0
- .gitignore +41 -0
- .gradio/certificate.pem +31 -0
- .python-version +1 -0
- CONTEXT.md +164 -0
- Makefile +47 -0
- README.md +204 -6
- app.py +501 -0
- autolabel/__init__.py +12 -0
- autolabel/config.py +177 -0
- autolabel/detect.py +188 -0
- autolabel/export.py +156 -0
- autolabel/finetune.py +554 -0
- autolabel/segment.py +127 -0
- autolabel/utils.py +51 -0
- pyproject.toml +47 -0
- samples/CREDITS.txt +17 -0
- samples/animals.jpg +3 -0
- samples/cat.jpg +3 -0
- samples/dog.jpg +0 -0
- samples/kitchen.jpg +3 -0
- scripts/export_coco.py +47 -0
- scripts/finetune_owlv2.py +152 -0
- scripts/run_detection.py +86 -0
.env.example
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Required on Apple Silicon: enables CPU fallback for MPS ops not yet in Metal
|
| 2 |
+
PYTORCH_ENABLE_MPS_FALLBACK=1
|
| 3 |
+
|
| 4 |
+
# Optional overrides (uncomment and edit as needed)
|
| 5 |
+
# AUTOLABEL_DEVICE=cpu
|
| 6 |
+
# AUTOLABEL_MODEL=google/owlv2-large-patch14-finetuned
|
| 7 |
+
# AUTOLABEL_THRESHOLD=0.1
|
.gitattributes
CHANGED
|
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
samples/animals.jpg filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
samples/cat.jpg filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
samples/kitchen.jpg filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Environment
|
| 2 |
+
.env
|
| 3 |
+
.venv/
|
| 4 |
+
__pycache__/
|
| 5 |
+
*.py[cod]
|
| 6 |
+
*.pyo
|
| 7 |
+
.pytest_cache/
|
| 8 |
+
*.egg-info/
|
| 9 |
+
dist/
|
| 10 |
+
build/
|
| 11 |
+
|
| 12 |
+
# Data directories (potentially large / private)
|
| 13 |
+
data
|
| 14 |
+
data/raw/
|
| 15 |
+
data/detections/
|
| 16 |
+
data/labeled/
|
| 17 |
+
|
| 18 |
+
# Model cache (downloaded from Hugging Face)
|
| 19 |
+
.cache/
|
| 20 |
+
models/
|
| 21 |
+
|
| 22 |
+
# Jupyter
|
| 23 |
+
.ipynb_checkpoints/
|
| 24 |
+
notebooks/.ipynb_checkpoints/
|
| 25 |
+
|
| 26 |
+
# macOS
|
| 27 |
+
.DS_Store
|
| 28 |
+
|
| 29 |
+
# Windows
|
| 30 |
+
Thumbs.db
|
| 31 |
+
ehthumbs.db
|
| 32 |
+
|
| 33 |
+
# IDE
|
| 34 |
+
.idea/
|
| 35 |
+
.vscode/
|
| 36 |
+
*.swp
|
| 37 |
+
*.swo
|
| 38 |
+
|
| 39 |
+
# uv
|
| 40 |
+
uv.lock
|
| 41 |
+
/.claude/
|
.gradio/certificate.pem
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-----BEGIN CERTIFICATE-----
|
| 2 |
+
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
|
| 3 |
+
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
|
| 4 |
+
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
|
| 5 |
+
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
|
| 6 |
+
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
|
| 7 |
+
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
|
| 8 |
+
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
|
| 9 |
+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
|
| 10 |
+
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
|
| 11 |
+
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
|
| 12 |
+
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
|
| 13 |
+
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
|
| 14 |
+
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
|
| 15 |
+
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
|
| 16 |
+
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
|
| 17 |
+
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
|
| 18 |
+
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
|
| 19 |
+
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
|
| 20 |
+
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
|
| 21 |
+
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
|
| 22 |
+
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
|
| 23 |
+
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
|
| 24 |
+
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
|
| 25 |
+
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
|
| 26 |
+
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
|
| 27 |
+
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
|
| 28 |
+
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
|
| 29 |
+
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
|
| 30 |
+
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
|
| 31 |
+
-----END CERTIFICATE-----
|
.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.11
|
CONTEXT.md
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CONTEXT.md — Technical Reference for autolabel
|
| 2 |
+
|
| 3 |
+
> Keep this file up to date as the project evolves. Read this first when
|
| 4 |
+
> resuming work after a break.
|
| 5 |
+
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
## What this project does
|
| 9 |
+
|
| 10 |
+
Uses **OWLv2** (open-vocabulary object detection) and **SAM2** (segment
|
| 11 |
+
anything) to auto-label images via text prompts, then exports a COCO dataset
|
| 12 |
+
for fine-tuning a detection or segmentation model.
|
| 13 |
+
|
| 14 |
+
**Current phase:** labeling — two modes available:
|
| 15 |
+
- **Detection** — OWLv2 only; produces bounding boxes.
|
| 16 |
+
- **Segmentation** — OWLv2 → boxes → SAM2 → pixel masks + COCO polygons.
|
| 17 |
+
|
| 18 |
+
**Future phase:** fine-tune OWLv2 on the exported COCO dataset using
|
| 19 |
+
`scripts/finetune_owlv2.py` (code is ready, not yet in active use).
|
| 20 |
+
|
| 21 |
+
---
|
| 22 |
+
|
| 23 |
+
## Architecture
|
| 24 |
+
|
| 25 |
+
### Primary interface — `app.py` (Gradio web UI)
|
| 26 |
+
|
| 27 |
+
Two-tab UI, all artifacts written to a session temp dir (nothing in the project):
|
| 28 |
+
|
| 29 |
+
| Tab | What it does |
|
| 30 |
+
|-----|-------------|
|
| 31 |
+
| 🧪 Test | Single image → instant annotated preview. Dial in prompts and threshold before a batch run. |
|
| 32 |
+
| 📂 Batch | Multiple images → annotated gallery + downloadable ZIP (resized images + `coco_export.json`). |
|
| 33 |
+
|
| 34 |
+
### CLI scripts (`scripts/`)
|
| 35 |
+
|
| 36 |
+
Independent entry points for headless / automation use:
|
| 37 |
+
|
| 38 |
+
| Script | Purpose |
|
| 39 |
+
|--------|---------|
|
| 40 |
+
| `run_detection.py` | Batch detect → `data/detections/` |
|
| 41 |
+
| `export_coco.py` | Build COCO JSON from `data/labeled/` |
|
| 42 |
+
| `finetune_owlv2.py` | Fine-tune OWLv2 (future) |
|
| 43 |
+
|
| 44 |
+
### `autolabel/` package
|
| 45 |
+
|
| 46 |
+
| Module | Responsibility |
|
| 47 |
+
|--------|---------------|
|
| 48 |
+
| `config.py` | Pydantic settings singleton, auto device detection |
|
| 49 |
+
| `detect.py` | OWLv2 inference — `infer()` (PIL, shared) + `detect_image()` (file) + `run_detection()` (batch CLI) |
|
| 50 |
+
| `segment.py` | SAM2 integration — `load_sam2()`, `segment_with_boxes()`, `_mask_to_polygon()` |
|
| 51 |
+
| `export.py` | COCO JSON builder (no pycocotools); supports both bbox-only and segmentation |
|
| 52 |
+
| `finetune.py` | Training loop, loss, dataset, scheduler |
|
| 53 |
+
| `utils.py` | `collect_images`, `save_json`, `load_json`, `setup_logging` |
|
| 54 |
+
|
| 55 |
+
**Key design:** `detect.infer()` is the single OWLv2 inference implementation.
|
| 56 |
+
`app.py` chains SAM2 on top when mode == "Segmentation" — no duplication.
|
| 57 |
+
|
| 58 |
+
---
|
| 59 |
+
|
| 60 |
+
## Device strategy
|
| 61 |
+
|
| 62 |
+
| Platform | Device | dtype |
|
| 63 |
+
|----------|--------|-------|
|
| 64 |
+
| Apple Silicon | `mps` | `float32` |
|
| 65 |
+
| Windows/Linux GPU | `cuda` | `float16` |
|
| 66 |
+
| CPU fallback | `cpu` | `float32` |
|
| 67 |
+
|
| 68 |
+
`PYTORCH_ENABLE_MPS_FALLBACK=1` must be set before torch is imported on MPS
|
| 69 |
+
(`.env` handles this). Without it, some OWLv2 ops raise `NotImplementedError`.
|
| 70 |
+
|
| 71 |
+
---
|
| 72 |
+
|
| 73 |
+
## OWLv2 model
|
| 74 |
+
|
| 75 |
+
Default: `google/owlv2-large-patch14-finetuned` (~700 MB, cached in
|
| 76 |
+
`~/.cache/huggingface` after first download).
|
| 77 |
+
|
| 78 |
+
Override via env var: `AUTOLABEL_MODEL=google/owlv2-base-patch16`
|
| 79 |
+
|
| 80 |
+
| Variant | Size | Notes |
|
| 81 |
+
|---------|------|-------|
|
| 82 |
+
| `owlv2-base-patch16` | ~300 MB | Faster, lower accuracy |
|
| 83 |
+
| `owlv2-large-patch14` | ~700 MB | Good balance |
|
| 84 |
+
| `owlv2-large-patch14-finetuned` | ~700 MB | Default — pre-trained on LVIS/Objects365 |
|
| 85 |
+
|
| 86 |
+
---
|
| 87 |
+
|
| 88 |
+
## Dependency decisions
|
| 89 |
+
|
| 90 |
+
| Package | Why kept |
|
| 91 |
+
|---------|---------|
|
| 92 |
+
| `torch` / `torchvision` | OWLv2 + SAM2 inference |
|
| 93 |
+
| `transformers>=4.45` | OWLv2 and SAM2 models & processors |
|
| 94 |
+
| `pillow` | Image I/O and annotation drawing |
|
| 95 |
+
| `numpy` | Gradio image array interchange; mask arrays |
|
| 96 |
+
| `opencv-python` | `cv2.findContours` for mask → COCO polygon (SAM2) |
|
| 97 |
+
| `pydantic` / `pydantic-settings` | Type-safe config with env-var loading |
|
| 98 |
+
| `click` | CLI option parsing |
|
| 99 |
+
| `tqdm` | Progress bars in CLI batch runner |
|
| 100 |
+
| `python-dotenv` | Load `.env` before torch (MPS fallback) |
|
| 101 |
+
| `gradio` | Web UI |
|
| 102 |
+
|
| 103 |
+
Removed: `supervision` (unused), `matplotlib` (fine-tune charts gone),
|
| 104 |
+
`requests` (Label Studio gone).
|
| 105 |
+
|
| 106 |
+
---
|
| 107 |
+
|
| 108 |
+
## Inference flow
|
| 109 |
+
|
| 110 |
+
```
|
| 111 |
+
PIL image
|
| 112 |
+
↓
|
| 113 |
+
detect.infer(image, processor, model, prompts, threshold, device, dtype)
|
| 114 |
+
↓
|
| 115 |
+
list[{label, score, box_xyxy}]
|
| 116 |
+
│
|
| 117 |
+
├─ Detection mode ──────────────────────────────────────────────────
|
| 118 |
+
│ ↓ used by app.py directly
|
| 119 |
+
│ ↓ (CLI: wrapped by detect_image → JSON)
|
| 120 |
+
│ ↓ export.build_coco → coco_export.json (bbox only, segmentation:[])
|
| 121 |
+
│
|
| 122 |
+
└─ Segmentation mode ───────────────────────────────────────────────
|
| 123 |
+
↓
|
| 124 |
+
segment.segment_with_boxes(image, detections, sam2_processor, sam2_model)
|
| 125 |
+
↓
|
| 126 |
+
list[{label, score, box_xyxy, mask (np.ndarray), segmentation (polygons)}]
|
| 127 |
+
↓ mask used for visualization overlay; dropped before JSON serialisation
|
| 128 |
+
↓ export.build_coco → coco_export.json (bbox + segmentation polygons)
|
| 129 |
+
```
|
| 130 |
+
|
| 131 |
+
---
|
| 132 |
+
|
| 133 |
+
## Batch export ZIP structure
|
| 134 |
+
|
| 135 |
+
```
|
| 136 |
+
autolabel_export.zip
|
| 137 |
+
├── coco_export.json # COCO format, dimensions match images below
|
| 138 |
+
└── images/
|
| 139 |
+
├── photo1.jpg # resized to chosen training size (e.g. 640×640)
|
| 140 |
+
└── photo2.jpg
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
+
COCO bounding boxes are in the coordinate space of the resized images.
|
| 144 |
+
|
| 145 |
+
---
|
| 146 |
+
|
| 147 |
+
## Known limitations
|
| 148 |
+
|
| 149 |
+
- OWLv2 itself is detection-only — bounding boxes, no masks (pixel masks come from SAM2 in Segmentation mode).
|
| 150 |
+
- Objects < 32×32 px are often missed at default resolution.
|
| 151 |
+
- MPS inference is slower than CUDA but fast enough for development.
|
| 152 |
+
- Threshold default is 0.1 (intentionally low — easier to discard false
|
| 153 |
+
positives than recover missed objects).
|
| 154 |
+
|
| 155 |
+
---
|
| 156 |
+
|
| 157 |
+
## Fine-tuning (future)
|
| 158 |
+
|
| 159 |
+
The fine-tuning infrastructure is complete (`autolabel/finetune.py`,
|
| 160 |
+
`scripts/finetune_owlv2.py`) but not in active use. Workflow when ready:
|
| 161 |
+
|
| 162 |
+
1. Use the Batch tab to generate a labeled `coco_export.json`
|
| 163 |
+
2. Run `make finetune` (or `uv run python scripts/finetune_owlv2.py --help`)
|
| 164 |
+
3. Evaluate the fine-tuned model in the Test tab
|
Makefile
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.PHONY: setup detect export finetune clean app help
|
| 2 |
+
|
| 3 |
+
PYTHON := python
|
| 4 |
+
UV := uv
|
| 5 |
+
DATA_RAW := data/raw
|
| 6 |
+
DATA_DET := data/detections
|
| 7 |
+
DATA_LAB := data/labeled
|
| 8 |
+
|
| 9 |
+
help:
|
| 10 |
+
@echo "autolabel — OWLv2 labeling pipeline"
|
| 11 |
+
@echo ""
|
| 12 |
+
@echo "Targets:"
|
| 13 |
+
@echo " setup Install dependencies"
|
| 14 |
+
@echo " app Launch the Gradio UI (primary workflow)"
|
| 15 |
+
@echo " detect Run OWLv2 batch detection via CLI → data/detections/"
|
| 16 |
+
@echo " export Build COCO JSON from data/labeled/ via CLI"
|
| 17 |
+
@echo " finetune Fine-tune OWLv2 via CLI (future use)"
|
| 18 |
+
@echo " clean Remove generated JSON files (raw images untouched)"
|
| 19 |
+
|
| 20 |
+
setup:
|
| 21 |
+
$(UV) sync
|
| 22 |
+
@cp -n .env.example .env 2>/dev/null || true
|
| 23 |
+
@echo "Done. Run: make app"
|
| 24 |
+
|
| 25 |
+
app:
|
| 26 |
+
PYTORCH_ENABLE_MPS_FALLBACK=1 $(UV) run python app.py
|
| 27 |
+
|
| 28 |
+
detect:
|
| 29 |
+
$(UV) run python scripts/run_detection.py \
|
| 30 |
+
--image-dir $(DATA_RAW) \
|
| 31 |
+
--output-dir $(DATA_DET)
|
| 32 |
+
|
| 33 |
+
export:
|
| 34 |
+
$(UV) run python scripts/export_coco.py \
|
| 35 |
+
--labeled-dir $(DATA_LAB) \
|
| 36 |
+
--output $(DATA_LAB)/coco_export.json
|
| 37 |
+
|
| 38 |
+
finetune:
|
| 39 |
+
PYTORCH_ENABLE_MPS_FALLBACK=1 $(UV) run python scripts/finetune_owlv2.py \
|
| 40 |
+
--coco-json $(DATA_LAB)/coco_export.json \
|
| 41 |
+
--image-dir $(DATA_RAW)
|
| 42 |
+
|
| 43 |
+
clean:
|
| 44 |
+
@echo "Removing generated files..."
|
| 45 |
+
find $(DATA_DET) -name "*.json" -delete 2>/dev/null || true
|
| 46 |
+
find $(DATA_LAB) -name "*.json" -delete 2>/dev/null || true
|
| 47 |
+
@echo "Done. Raw images in $(DATA_RAW) are untouched."
|
README.md
CHANGED
|
@@ -1,12 +1,210 @@
|
|
| 1 |
---
|
| 2 |
title: LabelPlayground
|
| 3 |
-
|
| 4 |
-
colorFrom: gray
|
| 5 |
-
colorTo: indigo
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 6.8.0
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
|
|
|
|
| 1 |
---
|
| 2 |
title: LabelPlayground
|
| 3 |
+
app_file: app.py
|
|
|
|
|
|
|
| 4 |
sdk: gradio
|
| 5 |
sdk_version: 6.8.0
|
|
|
|
|
|
|
| 6 |
---
|
| 7 |
+
# autolabel — OWLv2 + SAM2 labeling pipeline
|
| 8 |
+
|
| 9 |
+
Auto-label images using **OWLv2** (open-vocabulary object detection) and
|
| 10 |
+
optionally **SAM2** (instance segmentation), then export a COCO dataset ready
|
| 11 |
+
for model fine-tuning.
|
| 12 |
+
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
## Quickstart
|
| 16 |
+
|
| 17 |
+
```bash
|
| 18 |
+
# 1. Install
|
| 19 |
+
uv sync
|
| 20 |
+
|
| 21 |
+
# 2. Copy env file (sets PYTORCH_ENABLE_MPS_FALLBACK=1 for Apple Silicon)
|
| 22 |
+
cp .env.example .env
|
| 23 |
+
|
| 24 |
+
# 3. Launch
|
| 25 |
+
make app
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
Models download automatically on first use and are cached in
|
| 29 |
+
`~/.cache/huggingface`. Nothing else is written to the project directory.
|
| 30 |
+
|
| 31 |
+
| Model | Size | Purpose |
|
| 32 |
+
|-------|------|---------|
|
| 33 |
+
| `owlv2-large-patch14-finetuned` | ~700 MB | Text → bounding boxes |
|
| 34 |
+
| `sam2-hiera-tiny` | ~160 MB | Box prompts → pixel masks |
|
| 35 |
+
|
| 36 |
+
---
|
| 37 |
+
|
| 38 |
+
## How the app works
|
| 39 |
+
|
| 40 |
+
### Mode selector
|
| 41 |
+
|
| 42 |
+
Both tabs have a **Detection / Segmentation** radio button:
|
| 43 |
+
|
| 44 |
+
| Mode | What runs | COCO output |
|
| 45 |
+
|------|-----------|-------------|
|
| 46 |
+
| **Detection** | OWLv2 only | `bbox` + empty `segmentation: []` |
|
| 47 |
+
| **Segmentation** | OWLv2 → SAM2 | `bbox` + `segmentation` polygon list |
|
| 48 |
+
|
| 49 |
+
### How Detection and Segmentation work
|
| 50 |
+
|
| 51 |
+
**Detection** uses [OWLv2](https://huggingface.co/google/owlv2-large-patch14-finetuned) — an
|
| 52 |
+
open-vocabulary object detector. You give it a text prompt ("cup, bottle") and it returns
|
| 53 |
+
bounding boxes with confidence scores. No fixed class list, no retraining needed.
|
| 54 |
+
|
| 55 |
+
**Segmentation** uses the **Grounded SAM2** pattern — two models chained together:
|
| 56 |
+
|
| 57 |
+
```
|
| 58 |
+
Text prompts ("cup, bottle")
|
| 59 |
+
│
|
| 60 |
+
▼
|
| 61 |
+
OWLv2 ← understands text, produces bounding boxes
|
| 62 |
+
│
|
| 63 |
+
▼
|
| 64 |
+
Bounding boxes
|
| 65 |
+
│
|
| 66 |
+
▼
|
| 67 |
+
SAM2 ← understands spatial prompts, produces pixel masks
|
| 68 |
+
│
|
| 69 |
+
▼
|
| 70 |
+
Masks + COCO polygons
|
| 71 |
+
```
|
| 72 |
+
|
| 73 |
+
SAM2 (`sam2-hiera-tiny`) is a *prompt-based* segmenter — it accepts box, point, or mask
|
| 74 |
+
prompts but has no concept of text or class names. It can't answer "find me a cup"; it
|
| 75 |
+
can only answer "segment the object inside this box." OWLv2 is the **grounding** step
|
| 76 |
+
that translates your words into coordinates SAM2 can act on.
|
| 77 |
+
|
| 78 |
+
Both models run in Segmentation mode. Detection mode skips SAM2 entirely.
|
| 79 |
+
|
| 80 |
+
### 🧪 Test tab
|
| 81 |
+
|
| 82 |
+
Upload a single image, pick a mode, and type comma-separated object prompts.
|
| 83 |
+
Hit **Detect** to see an annotated preview alongside a results table (label,
|
| 84 |
+
confidence, bounding box). In Segmentation mode, pixel mask overlays are drawn
|
| 85 |
+
on top of the bounding boxes. Use this tab to dial in prompts and threshold
|
| 86 |
+
before a batch run — nothing is saved to disk.
|
| 87 |
+
|
| 88 |
+
### 📂 Batch tab
|
| 89 |
+
|
| 90 |
+
Upload multiple images and run the chosen mode on all of them at once. You get:
|
| 91 |
+
|
| 92 |
+
- An annotated **gallery** showing every image
|
| 93 |
+
- A **Download ZIP** button containing:
|
| 94 |
+
- `coco_export.json` — COCO-format annotations ready for fine-tuning
|
| 95 |
+
- `images/` — all images resized to your chosen training size
|
| 96 |
+
|
| 97 |
+
The size dropdown offers common YOLOX training resolutions (416 → 1024) plus
|
| 98 |
+
**As is** to keep the original dimensions. Coordinates in the COCO file match
|
| 99 |
+
the resized images exactly.
|
| 100 |
+
|
| 101 |
+
All artifacts live in a system temp directory — nothing is written to the project.
|
| 102 |
+
|
| 103 |
+
---
|
| 104 |
+
|
| 105 |
+
## Project layout
|
| 106 |
+
|
| 107 |
+
```
|
| 108 |
+
autolabel/
|
| 109 |
+
├── config.py # Pydantic settings, auto device detection (CUDA → MPS → CPU)
|
| 110 |
+
├── detect.py # OWLv2 inference — infer() shared by app + CLI
|
| 111 |
+
├── segment.py # SAM2 integration — box prompts → masks + COCO polygons
|
| 112 |
+
├── export.py # COCO JSON builder (no pycocotools); bbox + segmentation
|
| 113 |
+
├── finetune.py # Fine-tuning loop (future use)
|
| 114 |
+
└── utils.py # Shared helpers
|
| 115 |
+
scripts/
|
| 116 |
+
├── run_detection.py # CLI: batch detect → data/detections/
|
| 117 |
+
├── export_coco.py # CLI: build coco_export.json from data/labeled/
|
| 118 |
+
└── finetune_owlv2.py # CLI: fine-tune OWLv2 (future use)
|
| 119 |
+
app.py # Gradio web UI
|
| 120 |
+
```
|
| 121 |
+
|
| 122 |
+
---
|
| 123 |
+
|
| 124 |
+
## CLI workflow
|
| 125 |
+
|
| 126 |
+
Detection and export can be driven from the command line without the UI:
|
| 127 |
+
|
| 128 |
+
```bash
|
| 129 |
+
# Detect all images in data/raw/ → data/detections/
|
| 130 |
+
make detect
|
| 131 |
+
|
| 132 |
+
# Custom prompts
|
| 133 |
+
uv run python scripts/run_detection.py --prompts "cup,mug,bottle"
|
| 134 |
+
|
| 135 |
+
# Force re-run on already-processed images
|
| 136 |
+
uv run python scripts/run_detection.py --force
|
| 137 |
+
|
| 138 |
+
# Build COCO JSON from data/labeled/
|
| 139 |
+
make export
|
| 140 |
+
```
|
| 141 |
+
|
| 142 |
+
---
|
| 143 |
+
|
| 144 |
+
## Fine-tuning (future)
|
| 145 |
+
|
| 146 |
+
The fine-tuning infrastructure is already in place. Once you have a
|
| 147 |
+
`coco_export.json` from a Batch run:
|
| 148 |
+
|
| 149 |
+
```bash
|
| 150 |
+
make finetune
|
| 151 |
+
# or:
|
| 152 |
+
uv run python scripts/finetune_owlv2.py \
|
| 153 |
+
--coco-json data/labeled/coco_export.json \
|
| 154 |
+
--image-dir data/raw \
|
| 155 |
+
--epochs 10
|
| 156 |
+
```
|
| 157 |
+
|
| 158 |
+
### Key hyperparameters
|
| 159 |
+
|
| 160 |
+
| Parameter | Default | Notes |
|
| 161 |
+
|-----------|---------|-------|
|
| 162 |
+
| Epochs | 10 | More epochs → higher overfit risk on small datasets |
|
| 163 |
+
| Learning rate | 1e-4 | Applied to the detection head |
|
| 164 |
+
| Gradient accumulation | 4 | Effective batch size multiplier |
|
| 165 |
+
| Unfreeze backbone | off | Also trains the vision encoder — needs more data |
|
| 166 |
+
|
| 167 |
+
### Tips
|
| 168 |
+
|
| 169 |
+
- Start with **50–100 annotated images per class** minimum; 200–500 is better.
|
| 170 |
+
- Fine-tuned models are more confident — raise the threshold to 0.2–0.4.
|
| 171 |
+
- Leave the backbone frozen unless you have 500+ images per class.
|
| 172 |
+
|
| 173 |
+
---
|
| 174 |
+
|
| 175 |
+
## Prerequisites
|
| 176 |
+
|
| 177 |
+
| Tool | Version | Notes |
|
| 178 |
+
|------|---------|-------|
|
| 179 |
+
| Python | **3.11.x** | Managed by uv |
|
| 180 |
+
| [uv](https://docs.astral.sh/uv/) | latest | `curl -LsSf https://astral.sh/uv/install.sh \| sh` |
|
| 181 |
+
| CUDA toolkit | 11.8+ | Windows/Linux GPU users only |
|
| 182 |
+
|
| 183 |
+
**Apple Silicon:** `PYTORCH_ENABLE_MPS_FALLBACK=1` is pre-set in `.env.example`.
|
| 184 |
+
|
| 185 |
+
**Windows/CUDA:** remove `PYTORCH_ENABLE_MPS_FALLBACK` from `.env`. For a
|
| 186 |
+
specific CUDA build:
|
| 187 |
+
|
| 188 |
+
```powershell
|
| 189 |
+
uv pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118
|
| 190 |
+
uv sync
|
| 191 |
+
```
|
| 192 |
+
|
| 193 |
+
---
|
| 194 |
+
|
| 195 |
+
## Makefile targets
|
| 196 |
+
|
| 197 |
+
| Target | Description |
|
| 198 |
+
|--------|-------------|
|
| 199 |
+
| `make setup` | Install dependencies, copy `.env.example` |
|
| 200 |
+
| `make app` | Launch the Gradio UI |
|
| 201 |
+
| `make detect` | Batch detect via CLI → `data/detections/` |
|
| 202 |
+
| `make export` | Build COCO JSON via CLI |
|
| 203 |
+
| `make finetune` | Fine-tune OWLv2 via CLI |
|
| 204 |
+
| `make clean` | Delete generated JSONs (raw images untouched) |
|
| 205 |
+
|
| 206 |
+
---
|
| 207 |
+
|
| 208 |
+
## License
|
| 209 |
|
| 210 |
+
MIT
|
app.py
ADDED
|
@@ -0,0 +1,501 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
app.py — OWLv2 / SAM2 image labeling UI
|
| 3 |
+
|
| 4 |
+
Tab 1 — Test: upload one image, pick Detection or Segmentation mode,
|
| 5 |
+
tune prompts/threshold/size, see instant annotated results.
|
| 6 |
+
Tab 2 — Batch: upload multiple images, run in the chosen mode, download a ZIP
|
| 7 |
+
containing resized images + coco_export.json.
|
| 8 |
+
|
| 9 |
+
All artifacts live in a system temp directory — nothing is written to the project.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
|
| 16 |
+
|
| 17 |
+
import logging
|
| 18 |
+
import shutil
|
| 19 |
+
import tempfile
|
| 20 |
+
import zipfile
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
from typing import Optional
|
| 23 |
+
|
| 24 |
+
import gradio as gr
|
| 25 |
+
import numpy as np
|
| 26 |
+
from dotenv import load_dotenv
|
| 27 |
+
from PIL import Image, ImageDraw, ImageFont
|
| 28 |
+
|
| 29 |
+
load_dotenv()
|
| 30 |
+
|
| 31 |
+
from autolabel.config import settings
|
| 32 |
+
from autolabel.detect import infer as _owlv2_infer
|
| 33 |
+
from autolabel.export import build_coco
|
| 34 |
+
from autolabel.segment import load_sam2, segment_with_boxes
|
| 35 |
+
from autolabel.utils import save_json, setup_logging
|
| 36 |
+
|
| 37 |
+
setup_logging(logging.INFO)
|
| 38 |
+
logger = logging.getLogger(__name__)
|
| 39 |
+
|
| 40 |
+
# Temp directory for this session — cleaned up by the OS on reboot
|
| 41 |
+
_TMPDIR = Path(tempfile.mkdtemp(prefix="autolabel_"))
|
| 42 |
+
logger.info("Session temp dir: %s", _TMPDIR)
|
| 43 |
+
|
| 44 |
+
# ---------------------------------------------------------------------------
|
| 45 |
+
# Image sizing
|
| 46 |
+
# ---------------------------------------------------------------------------
|
| 47 |
+
_SIZE_OPTIONS = {
|
| 48 |
+
"As is": None,
|
| 49 |
+
"416 × 416": (416, 416),
|
| 50 |
+
"480 × 480": (480, 480),
|
| 51 |
+
"512 × 512": (512, 512),
|
| 52 |
+
"640 × 640": (640, 640),
|
| 53 |
+
"736 × 736": (736, 736),
|
| 54 |
+
"896 × 896": (896, 896),
|
| 55 |
+
"1024 × 1024": (1024, 1024),
|
| 56 |
+
}
|
| 57 |
+
_SIZE_LABELS = list(_SIZE_OPTIONS.keys())
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def _resize(pil: Image.Image, size_label: str) -> Image.Image:
    """Return *pil* resized to the dimensions mapped by *size_label*.

    *size_label* is a key of ``_SIZE_OPTIONS``; the "As is" option maps to
    ``None`` and returns the image unchanged. LANCZOS resampling is used
    for quality when downscaling.
    """
    dims = _SIZE_OPTIONS[size_label]
    return pil if dims is None else pil.resize(dims, Image.LANCZOS)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# ---------------------------------------------------------------------------
|
| 68 |
+
# Colours & annotation
|
| 69 |
+
# ---------------------------------------------------------------------------
|
| 70 |
+
_PALETTE = [
|
| 71 |
+
(52, 211, 153), (251, 146, 60), (96, 165, 250), (248, 113, 113),
|
| 72 |
+
(167, 139, 250),(250, 204, 21), (34, 211, 238), (244, 114, 182),
|
| 73 |
+
(74, 222, 128), (232, 121, 249), (125, 211, 252), (253, 186, 116),
|
| 74 |
+
(110, 231, 183),(196, 181, 253), (253, 164, 175), (134, 239, 172),
|
| 75 |
+
]
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def _colour_for(label: str, prompts: list[str]) -> tuple[int, int, int]:
|
| 79 |
+
try:
|
| 80 |
+
return _PALETTE[prompts.index(label) % len(_PALETTE)]
|
| 81 |
+
except ValueError:
|
| 82 |
+
return _PALETTE[hash(label) % len(_PALETTE)]
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def _annotate(
    pil_image: Image.Image,
    detections: list[dict],
    prompts: list[str],
    mode: str = "Detection",
) -> Image.Image:
    """Draw bounding boxes (+ mask overlays in Segmentation mode) on *pil_image*.

    Args:
        pil_image: Source image; it is copied, never modified in place.
        detections: Dicts with 'label', 'score', 'box_xyxy' and, in
            Segmentation mode, optionally a boolean ndarray under 'mask'.
        prompts: Prompt list used to pick a consistent per-class colour.
        mode: "Detection" (boxes only) or "Segmentation" (masks + boxes).

    Returns:
        A new RGB image with annotations drawn on it.
    """
    img = pil_image.copy().convert("RGBA")

    # --- Segmentation: paint semi-transparent mask overlays first ---
    # Masks go underneath the boxes so outlines and labels stay crisp.
    if mode == "Segmentation":
        overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
        for det in detections:
            mask = det.get("mask")
            if mask is None or not isinstance(mask, np.ndarray):
                continue  # detection without a usable mask: box only
            r, g, b = _colour_for(det["label"], prompts)
            # RGBA layer that is fully transparent except where mask is True.
            mask_rgba = np.zeros((mask.shape[0], mask.shape[1], 4), dtype=np.uint8)
            mask_rgba[mask] = [r, g, b, 100]  # semi-transparent fill
            overlay = Image.alpha_composite(overlay, Image.fromarray(mask_rgba, "RGBA"))
        img = Image.alpha_composite(img, overlay)

    # --- Bounding boxes and labels (both modes) ---
    draw = ImageDraw.Draw(img, "RGBA")
    try:
        # macOS system font; on other platforms the truetype load raises and
        # we fall back to PIL's built-in bitmap font.
        font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", size=18)
    except Exception:
        font = ImageFont.load_default()

    for det in detections:
        x1, y1, x2, y2 = det["box_xyxy"]
        r, g, b = _colour_for(det["label"], prompts)
        draw.rectangle([x1, y1, x2, y2], outline=(r, g, b), width=3)
        tag = f"{det['label']} {det['score']:.2f}"
        # Near-opaque chip behind the text keeps it readable on any background.
        bbox = draw.textbbox((x1, y1), tag, font=font)
        draw.rectangle([bbox[0]-3, bbox[1]-3, bbox[2]+3, bbox[3]+3], fill=(r, g, b, 210))
        draw.text((x1, y1), tag, fill=(255, 255, 255), font=font)

    return img.convert("RGB")
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
# ---------------------------------------------------------------------------
|
| 127 |
+
# OWLv2 model (cached)
|
| 128 |
+
# ---------------------------------------------------------------------------
|
| 129 |
+
# Single-entry cache: (processor, model) keyed by settings.model.
_owlv2_cache: dict = {}


def _get_owlv2():
    """Return the cached OWLv2 (processor, model) pair, loading it on demand.

    At most one model stays resident: if settings.model changed since the
    last call, the previous entry is evicted before the new one is loaded.
    """
    pair = _owlv2_cache.get(settings.model)
    if pair is not None:
        return pair

    _owlv2_cache.clear()  # evict any previously loaded model
    from transformers import Owlv2ForObjectDetection, Owlv2Processor
    logger.info("Loading OWLv2 %s on %s …", settings.model, settings.device)
    proc = Owlv2Processor.from_pretrained(settings.model)
    net = Owlv2ForObjectDetection.from_pretrained(
        settings.model, torch_dtype=settings.torch_dtype
    ).to(settings.device)
    net.eval()
    pair = (proc, net)
    _owlv2_cache[settings.model] = pair
    logger.info("OWLv2 ready.")
    return pair
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
# ---------------------------------------------------------------------------
|
| 148 |
+
# SAM2 model (cached)
|
| 149 |
+
# ---------------------------------------------------------------------------
|
| 150 |
+
# SAM2 is a fixed model id, so the cache holds at most one entry.
_sam2_cache: dict = {}
_SAM2_MODEL_ID = "facebook/sam2-hiera-tiny"


def _get_sam2():
    """Return the cached SAM2 (processor, model) pair, loading it once."""
    pair = _sam2_cache.get(_SAM2_MODEL_ID)
    if pair is None:
        pair = load_sam2(settings.device, _SAM2_MODEL_ID)
        _sam2_cache[_SAM2_MODEL_ID] = pair
    return pair
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
# ---------------------------------------------------------------------------
|
| 162 |
+
# Shared inference helpers
|
| 163 |
+
# ---------------------------------------------------------------------------
|
| 164 |
+
|
| 165 |
+
def _run_detection(
    pil_image: Image.Image,
    prompts: list[str],
    threshold: float,
    mode: str,
) -> list[dict]:
    """Run OWLv2 (and optionally SAM2) on *pil_image*.

    In "Segmentation" mode, detections are additionally refined by SAM2 and
    come back enriched with 'mask' and 'segmentation' entries; in
    "Detection" mode (or when nothing is detected) the OWLv2 boxes are
    returned as-is.
    """
    owl_processor, owl_model = _get_owlv2()
    dets = _owlv2_infer(
        pil_image, owl_processor, owl_model, prompts, threshold,
        settings.device, settings.torch_dtype,
    )

    # SAM2 only runs when there is at least one box to prompt it with.
    if mode != "Segmentation" or not dets:
        return dets

    sam_processor, sam_model = _get_sam2()
    return segment_with_boxes(pil_image, dets, sam_processor, sam_model, settings.device)
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def _parse_prompts(text: str) -> list[str]:
|
| 192 |
+
return [p.strip() for p in text.split(",") if p.strip()]
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
# ---------------------------------------------------------------------------
|
| 196 |
+
# Object crops
|
| 197 |
+
# ---------------------------------------------------------------------------
|
| 198 |
+
|
| 199 |
+
def _make_crops(
    pil_image: Image.Image,
    detections: list[dict],
    prompts: list[str],
    mode: str,
) -> list[tuple[Image.Image, str]]:
    """Return one (cropped PIL image, caption) pair per detection.

    Detection mode: plain bounding-box crop with a coloured border.
    Segmentation mode: tight crop around the mask's nonzero region; pixels
    outside the mask are set to white for a clean cutout.  A detection with
    a missing or empty mask falls back to the plain box crop.

    (Refactor note: the original duplicated the identical box-crop fallback
    in three nested else branches; the mask cutout now lives in a helper and
    the fallback exists once.)
    """
    crops: list[tuple[Image.Image, str]] = []
    img_w, img_h = pil_image.size

    for det in detections:
        x1, y1, x2, y2 = det["box_xyxy"]
        x1, y1 = max(0, int(x1)), max(0, int(y1))
        x2, y2 = min(img_w, int(x2)), min(img_h, int(y2))
        if x2 <= x1 or y2 <= y1:
            continue  # degenerate box after clamping to the image bounds

        r, g, b = _colour_for(det["label"], prompts)

        crop_rgb = None
        if mode == "Segmentation":
            crop_rgb = _mask_cutout(pil_image, det.get("mask"))
        if crop_rgb is None:
            # Detection mode, missing mask, or all-False mask.
            crop_rgb = pil_image.crop((x1, y1, x2, y2)).convert("RGB")

        # 3-pixel coloured border matching the detection's palette colour.
        bordered = Image.new("RGB", (crop_rgb.width + 6, crop_rgb.height + 6), (r, g, b))
        bordered.paste(crop_rgb, (3, 3))

        caption = f"{det['label']} {det['score']:.2f}"
        crops.append((bordered, caption))

    return crops


def _mask_cutout(pil_image: Image.Image, mask) -> Optional[Image.Image]:
    """Tight RGB crop of *mask*'s nonzero region with a white background.

    Returns None when *mask* is absent, not an ndarray, or entirely empty,
    so the caller can fall back to a plain bounding-box crop.
    """
    if mask is None or not isinstance(mask, np.ndarray):
        return None
    rows = np.any(mask, axis=1)
    cols = np.any(mask, axis=0)
    if not rows.any() or not cols.any():
        return None
    # Tight bounding box of the mask's nonzero region.
    r_min, r_max = int(np.where(rows)[0][0]), int(np.where(rows)[0][-1])
    c_min, c_max = int(np.where(cols)[0][0]), int(np.where(cols)[0][-1])
    tight = mask[r_min:r_max + 1, c_min:c_max + 1]
    region = np.array(
        pil_image.crop((c_min, r_min, c_max + 1, r_max + 1)).convert("RGB")
    )
    region[~tight] = [255, 255, 255]  # white background outside the mask
    return Image.fromarray(region)
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
# ---------------------------------------------------------------------------
|
| 257 |
+
# Tab 1 — Test
|
| 258 |
+
# ---------------------------------------------------------------------------
|
| 259 |
+
|
| 260 |
+
def run_test(
    image_np: Optional[np.ndarray],
    prompts_text: str,
    threshold: float,
    size_label: str,
    mode: str,
):
    """Handler for the Test tab.

    Runs detection (and optional segmentation) on a single image and returns
    (annotated image as ndarray, table rows, object crops). With no image or
    no prompts, echoes the input image and empty results.
    """
    if image_np is None or not prompts_text.strip():
        return image_np, [], []

    prompts = _parse_prompts(prompts_text)
    if not prompts:
        return image_np, [], []

    pil = _resize(Image.fromarray(image_np), size_label)
    detections = _run_detection(pil, prompts, threshold, mode)

    # One table row per detection: index, label, score, rounded box coords.
    table = []
    for idx, det in enumerate(detections, start=1):
        bx = det["box_xyxy"]
        table.append([
            idx,
            det["label"],
            f"{det['score']:.3f}",
            f"[{bx[0]:.0f}, {bx[1]:.0f}, {bx[2]:.0f}, {bx[3]:.0f}]",
        ])

    crops = _make_crops(pil, detections, prompts, mode)
    annotated = np.array(_annotate(pil, detections, prompts, mode))
    return annotated, table, crops
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
# ---------------------------------------------------------------------------
|
| 288 |
+
# Tab 2 — Batch
|
| 289 |
+
# ---------------------------------------------------------------------------
|
| 290 |
+
|
| 291 |
+
def run_batch(files, prompts_text: str, threshold: float, size_label: str, mode: str):
    """Handler for the Batch tab.

    Runs detection (and optional SAM2 segmentation) on every uploaded file,
    writes a per-image JSON plus a combined COCO export, and packages the
    resized images and COCO JSON into a downloadable ZIP.

    Returns:
        (gallery of annotated PIL images, stats string, zip path or None).

    Fix: uploads that share a basename (e.g. two different "img.jpg" files)
    previously overwrote each other's saved image and detection JSON; names
    are now uniquified before saving.
    """
    if not files or not prompts_text.strip():
        return [], "Upload images and enter prompts to get started.", None

    prompts = _parse_prompts(prompts_text)
    if not prompts:
        return [], "No valid prompts.", None

    # Fresh temp dir for this run
    run_dir = _TMPDIR / "current_run"
    if run_dir.exists():
        shutil.rmtree(run_dir)
    images_dir = run_dir / "images"
    images_dir.mkdir(parents=True)

    gallery: list[Image.Image] = []
    total_dets = 0
    used_names: set[str] = set()  # guards against same-named uploads clobbering each other

    for f in files:
        try:
            src = Path(f.name if hasattr(f, "name") else str(f))
            pil = _resize(Image.open(src).convert("RGB"), size_label)
            w, h = pil.size
            detections = _run_detection(pil, prompts, threshold, mode)
            total_dets += len(detections)

            # Uniquify the saved name so "a/img.jpg" and "b/img.jpg" both survive.
            img_name = src.name
            counter = 1
            while img_name in used_names:
                img_name = f"{src.stem}_{counter}{src.suffix}"
                counter += 1
            used_names.add(img_name)

            # Save resized image (included in the ZIP)
            pil.save(images_dir / img_name)

            # Per-image JSON consumed by build_coco.
            # Drop numpy mask arrays — they are not JSON-serialisable.
            json_dets = [
                {k: v for k, v in det.items() if k != "mask"}
                for det in detections
            ]
            save_json(
                {"image_path": img_name, "image_width": w,
                 "image_height": h, "detections": json_dets},
                run_dir / (Path(img_name).stem + ".json"),
            )
            gallery.append(_annotate(pil, detections, prompts, mode))
        except Exception:
            # Best-effort batch: one bad file must not abort the whole run.
            logger.exception("Failed to process %s", f)

    # Build COCO JSON (may be falsy when nothing was processed)
    coco = build_coco(run_dir)
    coco_path = run_dir / "coco_export.json"
    if coco:
        save_json(coco, coco_path)

    # Package everything into a ZIP
    zip_path = run_dir / "autolabel_export.zip"
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        if coco_path.exists():
            zf.write(coco_path, "coco_export.json")
        for img_file in sorted(images_dir.iterdir()):
            zf.write(img_file, f"images/{img_file.name}")

    n_ann = len(coco.get("annotations", [])) if coco else 0
    size_note = f" · resized to {size_label}" if size_label != "As is" else ""
    mode_note = f" · {mode.lower()}"
    stats = (
        f"{len(gallery)} image(s) · {total_dets} detection(s) · "
        f"{n_ann} annotations{size_note}{mode_note}"
    )
    return gallery, stats, str(zip_path)
|
| 358 |
+
|
| 359 |
+
|
| 360 |
+
# ---------------------------------------------------------------------------
|
| 361 |
+
# UI
|
| 362 |
+
# ---------------------------------------------------------------------------
|
| 363 |
+
# Default prompt string shown in both tabs: the first 8 configured prompts.
_DEFAULT_PROMPTS = ", ".join(settings.prompts[:8])

# Markdown body for the "How it works" accordion (rendered by gr.Markdown).
_HOW_IT_WORKS_MD = """\
## How it works

| Mode | Models | Output |
|------|--------|--------|
| **Detection** | OWLv2 | Bounding boxes + class labels |
| **Segmentation** | OWLv2 → SAM2 | Bounding boxes + pixel masks + COCO polygons |

**Detection** uses [OWLv2](https://huggingface.co/google/owlv2-large-patch14-finetuned), an
open-vocabulary detector that converts your text prompts directly into bounding boxes — no
fixed class list required.

**Segmentation** uses the **Grounded SAM2** pattern:

1. **OWLv2** reads your text prompts and produces bounding boxes
2. **SAM2** (`sam2-hiera-tiny`) takes each box as a spatial prompt and refines it into a
   pixel-level mask

SAM2 has no concept of text — it only understands spatial prompts (boxes, points, masks).
OWLv2 acts as the *grounding* step, translating words into coordinates that SAM2 can use.
Both models must run in Segmentation mode; Detection mode skips SAM2 entirely.
"""
|
| 387 |
+
|
| 388 |
+
# Gradio UI: two tabs sharing the same prompt/threshold/size/mode controls.
with gr.Blocks(title="autolabel") as demo:
    gr.Markdown("# autolabel — OWLv2 + SAM2")

    with gr.Accordion("ℹ️ How it works", open=False):
        gr.Markdown(_HOW_IT_WORKS_MD)

    with gr.Tabs():

        # ── Tab 1: Test ──────────────────────────────────────────────────
        # Single-image playground: left column holds inputs, right column
        # the annotated result, a detection table, and per-object crops.
        with gr.Tab("🧪 Test"):
            with gr.Row():
                with gr.Column(scale=1):
                    t1_image = gr.Image(label="Image — upload, paste, or pick a sample below",
                                        type="numpy", sources=["upload", "clipboard"])
                    t1_mode = gr.Radio(
                        ["Detection", "Segmentation"],
                        label="Mode", value="Detection",
                        info="Detection: OWLv2 → boxes only. "
                             "Segmentation: OWLv2 → boxes → SAM2 → pixel masks.",
                    )
                    t1_prompts = gr.Textbox(label="Prompts (comma-separated)",
                                            value=_DEFAULT_PROMPTS, lines=2)
                    t1_threshold = gr.Slider(label="Threshold", minimum=0.01,
                                             maximum=0.9, step=0.01, value=settings.threshold)
                    t1_size = gr.Dropdown(label="Input size", choices=_SIZE_LABELS,
                                          value="As is")
                    t1_btn = gr.Button("Detect", variant="primary")
                with gr.Column(scale=1):
                    t1_output = gr.Image(label="Result", type="numpy")
                    t1_table = gr.Dataframe(
                        headers=["#", "Label", "Score", "Box (xyxy)"],
                        row_count=(0, "dynamic"), column_count=(4, "fixed"),
                    )
                    t1_crops = gr.Gallery(
                        label="Object crops",
                        columns=4, height=220,
                        object_fit="contain", show_label=True,
                    )

            # Sample images — click any thumbnail to load it into the image input
            _SAMPLES_DIR = Path(__file__).parent / "samples"
            gr.Examples(
                label="Sample images (click to load)",
                examples=[
                    [str(_SAMPLES_DIR / "animals.jpg"), "Detection",
                     "crown, necklace, ball, animal eye", 0.40, "As is"],
                    [str(_SAMPLES_DIR / "kitchen.jpg"), "Detection",
                     "apple, banana, orange, broccoli, carrot, bottle, bowl", 0.40, "As is"],
                    [str(_SAMPLES_DIR / "dog.jpg"), "Detection",
                     "dog", 0.40, "As is"],
                    [str(_SAMPLES_DIR / "cat.jpg"), "Detection",
                     "cat", 0.40, "As is"]
                ],
                inputs=[t1_image, t1_mode, t1_prompts, t1_threshold, t1_size],
                examples_per_page=5,
                cache_examples=False,
            )

            # Both the button and Enter in the prompt box trigger run_test.
            t1_btn.click(
                run_test,
                inputs=[t1_image, t1_prompts, t1_threshold, t1_size, t1_mode],
                outputs=[t1_output, t1_table, t1_crops],
            )
            t1_prompts.submit(
                run_test,
                inputs=[t1_image, t1_prompts, t1_threshold, t1_size, t1_mode],
                outputs=[t1_output, t1_table, t1_crops],
            )

        # ── Tab 2: Batch ─────────────────────────────────────────────────
        # Multi-file run producing a gallery plus a downloadable ZIP export.
        with gr.Tab("📂 Batch"):
            with gr.Row():
                with gr.Column(scale=1):
                    t2_files = gr.File(label="Images", file_count="multiple",
                                       file_types=["image"])
                    t2_mode = gr.Radio(
                        ["Detection", "Segmentation"],
                        label="Mode", value="Detection",
                        info="Detection: OWLv2 → boxes only. "
                             "Segmentation: OWLv2 → boxes → SAM2 → pixel masks.",
                    )
                    t2_prompts = gr.Textbox(label="Prompts (comma-separated)",
                                            value=_DEFAULT_PROMPTS, lines=2)
                    t2_threshold = gr.Slider(label="Threshold", minimum=0.01,
                                             maximum=0.9, step=0.01, value=settings.threshold)
                    t2_size = gr.Dropdown(label="Input size", choices=_SIZE_LABELS,
                                          value="640 × 640")
                    t2_btn = gr.Button("Run", variant="primary")
                    t2_stats = gr.Textbox(label="Stats", interactive=False)
                    t2_download = gr.DownloadButton(
                        label="Download ZIP (images + COCO JSON)",
                        visible=False, variant="secondary", size="sm",
                    )
                with gr.Column(scale=2):
                    t2_gallery = gr.Gallery(label="Results", columns=3,
                                            height="auto", object_fit="contain")

            # Wrapper that also toggles the download button's visibility
            # based on whether run_batch produced a ZIP path.
            def _run_and_reveal(files, prompts_text, threshold, size_label, mode):
                gallery, stats, zip_path = run_batch(
                    files, prompts_text, threshold, size_label, mode
                )
                return gallery, stats, gr.update(value=zip_path, visible=zip_path is not None)

            t2_btn.click(
                _run_and_reveal,
                inputs=[t2_files, t2_prompts, t2_threshold, t2_size, t2_mode],
                outputs=[t2_gallery, t2_stats, t2_download],
            )
|
| 496 |
+
|
| 497 |
+
# Cap the request queue so long model runs don't pile up unbounded.
demo.queue(max_size=5)

if __name__ == "__main__":
    # NOTE(review): `theme` is conventionally a gr.Blocks(...) constructor
    # argument rather than a launch() argument — confirm the installed Gradio
    # version accepts (and honours) it here.
    demo.launch(server_name="0.0.0.0", server_port=7860,
                share=True, inbrowser=True, theme=gr.themes.Soft())
|
autolabel/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
autolabel — OWLv2-powered auto-labeling pipeline for household object detection.
|
| 3 |
+
|
| 4 |
+
Pipeline:
|
| 5 |
+
1. detect — run OWLv2 on images, produce per-image detection JSON
|
| 6 |
+
2. export — convert detections to COCO JSON for fine-tuning
|
| 7 |
+
|
| 8 |
+
Primary interface: app.py (Gradio web UI)
|
| 9 |
+
CLI interface: scripts/run_detection.py, scripts/export_coco.py
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
__version__ = "0.1.0"
|
autolabel/config.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
config.py — Pydantic settings for the autolabel pipeline.
|
| 3 |
+
|
| 4 |
+
Handles:
|
| 5 |
+
- Auto device detection: CUDA → MPS → CPU
|
| 6 |
+
- OWLv2 model selection
|
| 7 |
+
- Detection thresholds
|
| 8 |
+
- Data paths derived from project root
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import logging
|
| 14 |
+
import os
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
from typing import List
|
| 17 |
+
|
| 18 |
+
import torch
|
| 19 |
+
from pydantic import Field, field_validator, model_validator
|
| 20 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 21 |
+
|
| 22 |
+
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Project root — two levels up from this file (autolabel/config.py → project/)
# Used to anchor the .env file and the default data directories.
# ---------------------------------------------------------------------------
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _detect_device() -> str:
    """Return the best available torch device string.

    Preference order: "cuda" > "mps" > "cpu".  Logs the choice so the
    selected backend is visible at startup.
    """
    if torch.cuda.is_available():
        logger.info("Device selected: CUDA (%s)", torch.cuda.get_device_name(0))
        return "cuda"

    if torch.backends.mps.is_available():
        logger.info(
            "Device selected: MPS (Apple Silicon). "
            "Set PYTORCH_ENABLE_MPS_FALLBACK=1 for unsupported ops."
        )
        return "mps"

    logger.warning("Device selected: CPU — no CUDA or MPS found. Inference will be slow.")
    return "cpu"
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class Settings(BaseSettings):
    """Central configuration for the autolabel pipeline.

    All values can be overridden via environment variables prefixed with
    AUTOLABEL_ (e.g., AUTOLABEL_THRESHOLD=0.2).
    The .env file is loaded automatically from the project root.
    """

    model_config = SettingsConfigDict(
        env_prefix="AUTOLABEL_",
        env_file=str(PROJECT_ROOT / ".env"),
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",  # unknown AUTOLABEL_* variables are silently dropped
    )

    # ------------------------------------------------------------------
    # Device
    # ------------------------------------------------------------------
    # Empty string means "auto-detect" — resolved by _resolve_device below.
    device: str = Field(
        default="",
        description="Torch device override. Leave empty for auto-detection.",
    )

    # ------------------------------------------------------------------
    # OWLv2 model
    # ------------------------------------------------------------------
    model: str = Field(
        default="google/owlv2-large-patch14-finetuned",
        description="Hugging Face model identifier for OWLv2.",
    )

    # ------------------------------------------------------------------
    # Detection
    # ------------------------------------------------------------------
    threshold: float = Field(
        default=0.1,
        ge=0.0,
        le=1.0,
        description="Minimum confidence score to keep a detection.",
    )

    # Default open-vocabulary prompts (household objects). Can also be set
    # via AUTOLABEL_PROMPTS as a comma-separated string (see _parse_prompts).
    prompts: List[str] = Field(
        default=[
            "cup",
            "bottle",
            "keyboard",
            "computer mouse",
            "cell phone",
            "remote control",
            "book",
            "plant",
            "bowl",
            "mug",
            "laptop",
            "monitor",
            "pen",
            "scissors",
            "stapler",
            "headphones",
            "wallet",
            "keys",
            "glasses",
            "candle",
            "backpack",
            "notebook",
            "water bottle",
            "coffee cup",
            "charger",
        ],
        description="Text prompts sent to OWLv2 for open-vocabulary detection.",
    )

    # ------------------------------------------------------------------
    # Paths
    # ------------------------------------------------------------------
    raw_dir: Path = Field(
        default=PROJECT_ROOT / "data" / "raw",
        description="Input images directory.",
    )
    detections_dir: Path = Field(
        default=PROJECT_ROOT / "data" / "detections",
        description="OWLv2 output JSON files.",
    )
    labeled_dir: Path = Field(
        default=PROJECT_ROOT / "data" / "labeled",
        description="Reviewed and accepted annotation JSON files.",
    )

    # ------------------------------------------------------------------
    # Validators
    # ------------------------------------------------------------------
    @field_validator("threshold", mode="before")
    @classmethod
    def _coerce_threshold(cls, v: object) -> float:
        # Env vars arrive as strings; coerce before the ge/le bounds apply.
        return float(v)  # type: ignore[arg-type]

    @field_validator("prompts", mode="before")
    @classmethod
    def _parse_prompts(cls, v: object) -> List[str]:
        """Allow comma-separated string from env var."""
        if isinstance(v, str):
            return [p.strip() for p in v.split(",") if p.strip()]
        return list(v)  # type: ignore[arg-type]

    @model_validator(mode="after")
    def _resolve_device(self) -> "Settings":
        # Empty string → auto-detect; any explicit value is taken verbatim.
        if not self.device:
            self.device = _detect_device()
        else:
            logger.info("Device override from env/config: %s", self.device)
        return self

    @model_validator(mode="after")
    def _ensure_dirs(self) -> "Settings":
        # Side effect: creates the data directories when Settings is built
        # (i.e. at import time via the module-level singleton).
        for path in (self.raw_dir, self.detections_dir, self.labeled_dir):
            path.mkdir(parents=True, exist_ok=True)
        return self

    # ------------------------------------------------------------------
    # Convenience
    # ------------------------------------------------------------------
    @property
    def torch_dtype(self) -> torch.dtype:
        """fp16 on CUDA, fp32 everywhere else (MPS doesn't support fp16 fully)."""
        return torch.float16 if self.device == "cuda" else torch.float32
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
# Module-level singleton — import this everywhere.
# Constructing it triggers device detection and data-directory creation.
settings = Settings()
|
autolabel/detect.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
detect.py — OWLv2 runner.
|
| 3 |
+
|
| 4 |
+
Loads google/owlv2-* from Hugging Face, runs open-vocabulary detection on
|
| 5 |
+
every image in a folder, and saves per-image JSON with boxes, scores, and
|
| 6 |
+
labels. Already-processed images are skipped unless --force is used.
|
| 7 |
+
|
| 8 |
+
Output JSON schema (per image):
|
| 9 |
+
{
|
| 10 |
+
"image_path": "/abs/path/to/image.jpg",
|
| 11 |
+
"image_width": 1920,
|
| 12 |
+
"image_height": 1080,
|
| 13 |
+
"detections": [
|
| 14 |
+
{
|
| 15 |
+
"label": "cup",
|
| 16 |
+
"score": 0.83,
|
| 17 |
+
"box_xyxy": [x1, y1, x2, y2] # absolute pixel coords
|
| 18 |
+
},
|
| 19 |
+
...
|
| 20 |
+
]
|
| 21 |
+
}
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
from __future__ import annotations
|
| 25 |
+
|
| 26 |
+
import logging
|
| 27 |
+
from pathlib import Path
|
| 28 |
+
from typing import List, Optional
|
| 29 |
+
|
| 30 |
+
import torch
|
| 31 |
+
from PIL import Image
|
| 32 |
+
from tqdm import tqdm
|
| 33 |
+
from transformers import Owlv2ForObjectDetection, Owlv2Processor
|
| 34 |
+
|
| 35 |
+
from autolabel.config import Settings, settings as default_settings
|
| 36 |
+
from autolabel.utils import (
|
| 37 |
+
collect_images,
|
| 38 |
+
detection_json_path,
|
| 39 |
+
save_json,
|
| 40 |
+
setup_logging,
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
logger = logging.getLogger(__name__)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# ---------------------------------------------------------------------------
|
| 47 |
+
# Core runner
|
| 48 |
+
# ---------------------------------------------------------------------------
|
| 49 |
+
|
| 50 |
+
def load_model(cfg: Settings) -> tuple[Owlv2Processor, Owlv2ForObjectDetection]:
    """Download (or load from cache) OWLv2 processor and model.

    The model is moved to cfg.device, cast via cfg.torch_dtype, and put in
    eval mode before being returned.
    """
    logger.info("Loading OWLv2 model: %s", cfg.model)
    proc = Owlv2Processor.from_pretrained(cfg.model)
    net = Owlv2ForObjectDetection.from_pretrained(
        cfg.model,
        torch_dtype=cfg.torch_dtype,
    ).to(cfg.device)
    net.eval()
    logger.info("Model loaded on device: %s dtype: %s", cfg.device, cfg.torch_dtype)
    return proc, net
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def infer(
    image: Image.Image,
    processor: Owlv2Processor,
    model: Owlv2ForObjectDetection,
    prompts: List[str],
    threshold: float,
    device: str,
    torch_dtype: torch.dtype,
) -> List[dict]:
    """Run OWLv2 on a PIL image and return detection dicts sorted by score.

    This is the shared inference primitive used by both the web app and the
    CLI batch runner.  Each dict carries 'label', 'score' (rounded to 4 dp)
    and 'box_xyxy' in absolute pixel coordinates of the input image.
    """
    width, height = image.size

    batch = processor(text=[prompts], images=image, return_tensors="pt")
    moved = {}
    for key, tensor in batch.items():
        if tensor.is_floating_point():
            # Float inputs must match the model's dtype (fp16 on CUDA).
            moved[key] = tensor.to(device=device, dtype=torch_dtype)
        else:
            # Integer inputs (e.g. token ids) only change device.
            moved[key] = tensor.to(device=device)

    with torch.no_grad():
        outputs = model(**moved)

    # Post-processing rescales boxes back to the original (height, width).
    target_sizes = torch.tensor([[height, width]], device=device)
    result = processor.post_process_grounded_object_detection(
        outputs, target_sizes=target_sizes, threshold=threshold,
    )[0]

    boxes = result["boxes"].cpu().tolist()
    scores = result["scores"].cpu().tolist()
    label_ids = result["labels"].cpu().tolist()

    detections = []
    for box, score, idx in zip(boxes, scores, label_ids):
        detections.append(
            {
                "label": prompts[idx],
                "score": round(float(score), 4),
                "box_xyxy": [round(coord, 1) for coord in box],
            }
        )
    return sorted(detections, key=lambda d: d["score"], reverse=True)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def detect_image(
    image_path: Path,
    processor: Owlv2Processor,
    model: Owlv2ForObjectDetection,
    prompts: List[str],
    threshold: float,
    device: str,
    torch_dtype: torch.dtype,
) -> dict:
    """Run OWLv2 on an image file and return the structured detection dict."""
    img = Image.open(image_path).convert("RGB")
    w, h = img.size
    return {
        "image_path": str(image_path.resolve()),
        "image_width": w,
        "image_height": h,
        "detections": infer(img, processor, model, prompts, threshold, device, torch_dtype),
    }
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def run_detection(
    image_dir: Path,
    output_dir: Path,
    prompts: Optional[List[str]] = None,
    cfg: Optional[Settings] = None,
    force: bool = False,
) -> None:
    """
    Run OWLv2 detection on all images in *image_dir*.

    Args:
        image_dir: Folder containing input images.
        output_dir: Folder where per-image JSON files are written.
        prompts: Override text prompts (uses cfg.prompts if None).
        cfg: Settings instance (uses module default if None).
        force: Re-process images that already have a detection JSON.
    """
    cfg = cfg or default_settings
    active_prompts = prompts or cfg.prompts
    output_dir.mkdir(parents=True, exist_ok=True)

    images = collect_images(image_dir)
    if not images:
        logger.warning("No images found in %s", image_dir)
        return

    processor, model = load_model(cfg)

    skipped = 0
    for img_path in tqdm(images, desc="Detecting", unit="img"):
        dest = detection_json_path(img_path, output_dir)

        # Idempotent by default: an existing JSON means "already done".
        if dest.exists() and not force:
            logger.debug("Skipping (already processed): %s", img_path.name)
            skipped += 1
            continue

        try:
            record = detect_image(
                image_path=img_path,
                processor=processor,
                model=model,
                prompts=active_prompts,
                threshold=cfg.threshold,
                device=cfg.device,
                torch_dtype=cfg.torch_dtype,
            )
        except Exception:
            # One bad image must not abort the whole batch.
            logger.exception("Failed to process %s", img_path)
            continue

        save_json(record, dest)
        logger.debug(
            "%s → %d detection(s)", img_path.name, len(record["detections"])
        )

    logger.info(
        "Detection complete. Processed: %d Skipped: %d",
        len(images) - skipped,
        skipped,
    )
|
autolabel/export.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
export.py — COCO JSON exporter.
|
| 3 |
+
|
| 4 |
+
Reads all per-image labeled JSON files from the labeled/ directory and
|
| 5 |
+
assembles a valid COCO-format JSON file. No pycocotools dependency — the
|
| 6 |
+
format is built from scratch.
|
| 7 |
+
|
| 8 |
+
COCO format reference:
|
| 9 |
+
https://cocodataset.org/#format-data
|
| 10 |
+
|
| 11 |
+
Output structure:
|
| 12 |
+
{
|
| 13 |
+
"info": {...},
|
| 14 |
+
"licenses": [],
|
| 15 |
+
"categories": [{"id": 1, "name": "cup", "supercategory": "object"}, ...],
|
| 16 |
+
"images": [{"id": 1, "file_name": "img.jpg", "width": W, "height": H}, ...],
|
| 17 |
+
"annotations": [
|
| 18 |
+
{
|
| 19 |
+
"id": 1,
|
| 20 |
+
"image_id": 1,
|
| 21 |
+
"category_id": 2,
|
| 22 |
+
"bbox": [x, y, w, h], # COCO uses [x_min, y_min, width, height]
|
| 23 |
+
"area": w * h,
|
| 24 |
+
"iscrowd": 0
|
| 25 |
+
},
|
| 26 |
+
...
|
| 27 |
+
]
|
| 28 |
+
}
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
from __future__ import annotations
|
| 32 |
+
|
| 33 |
+
import logging
|
| 34 |
+
from datetime import datetime, timezone
|
| 35 |
+
from pathlib import Path
|
| 36 |
+
from typing import Optional
|
| 37 |
+
|
| 38 |
+
from autolabel.config import settings as default_settings, Settings
|
| 39 |
+
from autolabel.utils import load_json, save_json
|
| 40 |
+
|
| 41 |
+
logger = logging.getLogger(__name__)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _xyxy_to_xywh(box: list[float]) -> list[float]:
|
| 45 |
+
"""Convert [x1, y1, x2, y2] → [x, y, width, height] (COCO format)."""
|
| 46 |
+
x1, y1, x2, y2 = box
|
| 47 |
+
return [x1, y1, x2 - x1, y2 - y1]
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def build_coco(labeled_dir: Path) -> dict:
    """
    Read all JSON files in *labeled_dir* and build a COCO-format dict.

    Category ids are assigned 1-based in first-encounter order; image ids
    follow sorted filename order. Returns {} when there is nothing to export.
    """
    json_files = sorted(labeled_dir.glob("*.json"))
    # Exclude any existing coco_export.json to avoid self-inclusion
    json_files = [f for f in json_files if f.name != "coco_export.json"]

    if not json_files:
        logger.warning("No labeled JSON files found in %s", labeled_dir)
        return {}

    logger.info("Building COCO export from %d file(s)…", len(json_files))

    # Collect all category names in encounter order, deduplicating
    category_index: dict[str, int] = {}  # name → category_id
    images_list: list[dict] = []
    annotations_list: list[dict] = []

    ann_id = 1  # COCO annotation ids are global across the dataset

    for img_id, json_path in enumerate(json_files, start=1):
        data = load_json(json_path)

        image_path = Path(data["image_path"])
        images_list.append(
            {
                "id": img_id,
                "file_name": image_path.name,
                "width": data["image_width"],
                "height": data["image_height"],
            }
        )

        for det in data.get("detections", []):
            label: str = det["label"]
            if label not in category_index:
                category_index[label] = len(category_index) + 1

            cat_id = category_index[label]
            xywh = _xyxy_to_xywh(det["box_xyxy"])
            area = round(xywh[2] * xywh[3], 2)

            annotations_list.append(
                {
                    "id": ann_id,
                    "image_id": img_id,
                    "category_id": cat_id,
                    "bbox": [round(v, 1) for v in xywh],
                    "area": area,
                    "iscrowd": 0,
                    # Carry segmentation through if the labeler produced one
                    "segmentation": det.get("segmentation", []),
                }
            )
            ann_id += 1

    categories = [
        {"id": cat_id, "name": name, "supercategory": "object"}
        for name, cat_id in sorted(category_index.items(), key=lambda x: x[1])
    ]

    # Fix: take a single timestamp so "year" and "date_created" can never
    # disagree (the original called datetime.now() twice, racing a year
    # boundary).
    now = datetime.now(tz=timezone.utc)
    coco = {
        "info": {
            "description": "autolabel — OWLv2 household object dataset",
            "version": "1.0",
            "year": now.year,
            "date_created": now.isoformat(),
        },
        "licenses": [],
        "categories": categories,
        "images": images_list,
        "annotations": annotations_list,
    }

    logger.info(
        "COCO export: %d image(s), %d annotation(s), %d categor(ies)",
        len(images_list),
        len(annotations_list),
        len(categories),
    )
    return coco
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def run_export(
    labeled_dir: Path,
    output_path: Path,
    cfg: Optional[Settings] = None,
) -> None:
    """
    Build COCO JSON from *labeled_dir* and write to *output_path*.

    Args:
        labeled_dir: Directory containing per-image labeled JSON files.
        output_path: Destination path for the COCO JSON file.
        cfg: Settings instance (module default if None).
    """
    _ = cfg or default_settings  # reserved for future use

    payload = build_coco(labeled_dir)
    if not payload:
        logger.error("Nothing to export.")
        return

    save_json(payload, output_path)
    logger.info("COCO JSON written → %s", output_path)
|
autolabel/finetune.py
ADDED
|
@@ -0,0 +1,554 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
finetune.py — OWLv2 fine-tuning core: dataset, loss, and training loop.
|
| 3 |
+
|
| 4 |
+
Architecture notes
|
| 5 |
+
------------------
|
| 6 |
+
In transformers v5.x, Owlv2ForObjectDetection.forward() does NOT compute loss
|
| 7 |
+
internally — the loss/loss_dict output fields are always None. We compute it
|
| 8 |
+
ourselves using the DETR-style utilities already shipped in transformers:
|
| 9 |
+
|
| 10 |
+
• HungarianMatcher — optimal bipartite assignment (scipy lsa under the hood)
|
| 11 |
+
• generalized_box_iou / center_to_corners_format — from the same module
|
| 12 |
+
|
| 13 |
+
Loss used (OWLv2 uses sigmoid, not softmax, so binary CE fits better):
|
| 14 |
+
total = λ_cls * L_bce + λ_bbox * L_l1 + λ_giou * L_giou
|
| 15 |
+
|
| 16 |
+
Freezing strategy (default: train detection heads only)
|
| 17 |
+
Frozen : owlv2.vision_model, owlv2.text_model, owlv2.text_projection,
|
| 18 |
+
owlv2.visual_projection
|
| 19 |
+
Trained : box_head, class_head, objectness_head, layer_norm, owlv2.logit_scale
|
| 20 |
+
Optional: --unfreeze-vision also trains the ViT image encoder
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
from __future__ import annotations
|
| 24 |
+
|
| 25 |
+
import json
|
| 26 |
+
import logging
|
| 27 |
+
import time
|
| 28 |
+
from dataclasses import dataclass, field
|
| 29 |
+
from pathlib import Path
|
| 30 |
+
from typing import Callable, Optional
|
| 31 |
+
|
| 32 |
+
import torch
|
| 33 |
+
import torch.nn.functional as F
|
| 34 |
+
from PIL import Image
|
| 35 |
+
from torch.utils.data import DataLoader, Dataset, random_split
|
| 36 |
+
from tqdm import tqdm
|
| 37 |
+
from transformers import Owlv2ForObjectDetection, Owlv2Processor
|
| 38 |
+
from transformers.loss.loss_for_object_detection import (
|
| 39 |
+
HungarianMatcher,
|
| 40 |
+
center_to_corners_format,
|
| 41 |
+
generalized_box_iou,
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
logger = logging.getLogger(__name__)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class TrainingStoppedError(Exception):
    """Raised by a progress_callback to cancel training mid-epoch.

    NOTE(review): the handler lives in run_finetune's caller — confirm the
    UI catches this to surface a clean "stopped" state rather than a crash.
    """
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# ---------------------------------------------------------------------------
|
| 52 |
+
# Training hyperparameters (all overridable via the CLI)
|
| 53 |
+
# ---------------------------------------------------------------------------
|
| 54 |
+
|
| 55 |
+
@dataclass
class FinetuneConfig:
    """All knobs for one OWLv2 fine-tuning run (overridable via the CLI)."""

    # Required inputs/outputs
    coco_json: Path    # COCO-format annotation file
    image_dir: Path    # directory holding the images named in the JSON
    output_dir: Path   # checkpoints and label_map.json are written here

    # Model / device
    model_name: str = "google/owlv2-large-patch14-finetuned"
    device: str = "cpu"
    torch_dtype: torch.dtype = torch.float32

    # Optimisation
    epochs: int = 10
    batch_size: int = 1
    grad_accum_steps: int = 4  # effective batch = batch_size * grad_accum_steps
    lr: float = 1e-4  # for detection heads
    backbone_lr: float = 0.0  # for vision encoder (0 = frozen)
    weight_decay: float = 1e-4
    grad_clip: float = 0.1    # max gradient norm per optimizer step
    warmup_steps: int = 50    # linear LR warmup length

    val_split: float = 0.2    # fraction of the dataset held out for validation
    save_every: int = 1  # save checkpoint every N epochs

    # Loss weights
    lambda_cls: float = 1.0
    lambda_bbox: float = 5.0
    lambda_giou: float = 2.0

    # Hungarian matcher costs (separate from loss weights)
    class_cost: float = 1.0
    bbox_cost: float = 5.0
    giou_cost: float = 2.0

    resume_from: Optional[Path] = None  # checkpoint dir to resume from (else model_name)
    unfreeze_vision: bool = False       # also train the ViT image encoder
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
# ---------------------------------------------------------------------------
|
| 92 |
+
# COCO dataset
|
| 93 |
+
# ---------------------------------------------------------------------------
|
| 94 |
+
|
| 95 |
+
def _coco_xywh_to_cxcywh_norm(
|
| 96 |
+
bbox: list[float], img_w: int, img_h: int
|
| 97 |
+
) -> list[float]:
|
| 98 |
+
"""COCO [x, y, w, h] pixel → normalised [cx, cy, w, h] in [0, 1]."""
|
| 99 |
+
x, y, w, h = bbox
|
| 100 |
+
return [
|
| 101 |
+
(x + w / 2) / img_w,
|
| 102 |
+
(y + h / 2) / img_h,
|
| 103 |
+
w / img_w,
|
| 104 |
+
h / img_h,
|
| 105 |
+
]
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
class CocoOwlv2Dataset(Dataset):
    """
    Loads a COCO-format JSON and serves (image, boxes, class_labels) items.

    Args:
        coco_json_path: Path to the COCO JSON file.
        image_dir: Directory where image files live (matched by file_name).
        categories: List of category name strings (defines the query order).
            If None, derived from the JSON (sorted by category id).

    Raises:
        KeyError: if *categories* is supplied but omits a category name that
            appears in the JSON (the original silently mislabeled instead).
    """

    def __init__(
        self,
        coco_json_path: Path,
        image_dir: Path,
        categories: Optional[list[str]] = None,
    ) -> None:
        with coco_json_path.open() as fh:
            coco = json.load(fh)

        # Build category id → 0-based index into the text-query list.
        sorted_cats = sorted(coco["categories"], key=lambda c: c["id"])
        if categories is None:
            categories = [c["name"] for c in sorted_cats]
        self.categories = categories
        # Fix: map ids through *names* into self.categories so labels stay
        # correct even when a caller supplies `categories` in a different
        # order than the JSON's id order (the original assumed id order).
        name_to_idx = {name: i for i, name in enumerate(self.categories)}
        cat_id_to_idx = {c["id"]: name_to_idx[c["name"]] for c in sorted_cats}

        # Index annotations by image_id
        ann_by_image: dict[int, list[dict]] = {}
        for ann in coco["annotations"]:
            ann_by_image.setdefault(ann["image_id"], []).append(ann)

        # Build valid items (images that have at least one annotation)
        self.items: list[dict] = []
        for img_meta in coco["images"]:
            anns = ann_by_image.get(img_meta["id"], [])
            if not anns:
                continue

            img_path = image_dir / img_meta["file_name"]
            if not img_path.exists():
                logger.warning("Image not found, skipping: %s", img_path)
                continue

            boxes_norm = []
            class_labels = []
            w, h = img_meta["width"], img_meta["height"]
            for ann in anns:
                boxes_norm.append(_coco_xywh_to_cxcywh_norm(ann["bbox"], w, h))
                class_labels.append(cat_id_to_idx[ann["category_id"]])

            self.items.append(
                {
                    "image_path": img_path,
                    "boxes": boxes_norm,          # list of [cx, cy, w, h] normalised
                    "class_labels": class_labels, # list of int indices
                }
            )

        logger.info(
            "Dataset: %d images with annotations | %d categories",
            len(self.items),
            len(self.categories),
        )

    def __len__(self) -> int:
        return len(self.items)

    def __getitem__(self, idx: int) -> dict:
        item = self.items[idx]
        image = Image.open(item["image_path"]).convert("RGB")
        return {
            "image": image,
            "boxes": torch.tensor(item["boxes"], dtype=torch.float32),
            "class_labels": torch.tensor(item["class_labels"], dtype=torch.long),
        }
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
# ---------------------------------------------------------------------------
|
| 188 |
+
# Collate function
|
| 189 |
+
# ---------------------------------------------------------------------------
|
| 190 |
+
|
| 191 |
+
def make_collate_fn(processor: Owlv2Processor, categories: list[str], device: str, dtype: torch.dtype):
    """Returns a collate_fn that encodes images + text queries into model inputs."""

    def collate_fn(batch: list[dict]) -> dict:
        pil_images = [sample["image"] for sample in batch]

        # Every image in the batch is queried with the full category list.
        encoded = processor(
            text=[categories] * len(pil_images),
            images=pil_images,
            return_tensors="pt",
        )

        # Floating tensors get the training dtype; integer tensors keep theirs.
        encoded = {
            name: (
                tensor.to(device=device, dtype=dtype)
                if tensor.is_floating_point()
                else tensor.to(device=device)
            )
            for name, tensor in encoded.items()
        }

        # Targets ride along under "labels"; the training loop pops them off
        # before calling the model.
        encoded["labels"] = [
            {
                "boxes": sample["boxes"].to(device),
                "class_labels": sample["class_labels"].to(device),
            }
            for sample in batch
        ]
        return encoded

    return collate_fn
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
# ---------------------------------------------------------------------------
|
| 225 |
+
# Loss
|
| 226 |
+
# ---------------------------------------------------------------------------
|
| 227 |
+
|
| 228 |
+
def compute_detection_loss(
    logits: torch.Tensor,  # [B, N_patches, N_classes]
    pred_boxes: torch.Tensor,  # [B, N_patches, 4] normalised CxCyWH
    targets: list[dict],  # [{"boxes": [M,4], "class_labels": [M]}]
    matcher: HungarianMatcher,
    lambda_cls: float,
    lambda_bbox: float,
    lambda_giou: float,
) -> tuple[torch.Tensor, dict[str, float]]:
    """
    Compute combined detection loss using Hungarian matching.

    Classification uses sigmoid BCE (OWLv2 uses sigmoid, not softmax).
    Box regression uses L1 + GIoU on matched pairs only.

    Returns:
        (total, log): the differentiable scalar loss and a dict of detached
        float components ("loss", "cls", "bbox", "giou") for logging.
    """
    B, N, C = logits.shape

    # --- Hungarian matching ---
    # The assignment is discrete, so gradients must not flow through it.
    with torch.no_grad():
        indices = matcher({"logits": logits, "pred_boxes": pred_boxes}, targets)

    # --- Classification loss (sigmoid binary CE) ---
    # Target tensor: shape [B, N, C], all zeros (background),
    # set 1.0 at matched (prediction, class) positions.
    target_cls = torch.zeros(B, N, C, device=logits.device, dtype=logits.dtype)
    for b, (pred_idx, gt_idx) in enumerate(indices):
        if len(pred_idx) == 0:
            continue
        gt_labels = targets[b]["class_labels"][gt_idx]  # [M]
        target_cls[b, pred_idx, gt_labels] = 1.0

    # Mean over every (patch, class) logit — unmatched patches act as negatives.
    loss_cls = F.binary_cross_entropy_with_logits(logits, target_cls, reduction="mean")

    # --- Box losses (matched pairs only) ---
    loss_bbox = torch.tensor(0.0, device=logits.device)
    loss_giou = torch.tensor(0.0, device=logits.device)
    num_matched = sum(len(p) for p, _ in indices)

    if num_matched > 0:
        for b, (pred_idx, gt_idx) in enumerate(indices):
            if len(pred_idx) == 0:
                continue
            p_boxes = pred_boxes[b][pred_idx]  # [M, 4] CxCyWH norm
            g_boxes = targets[b]["boxes"][gt_idx]  # [M, 4] CxCyWH norm

            # Sum now, normalise by num_matched once after the loop.
            loss_bbox = loss_bbox + F.l1_loss(p_boxes, g_boxes, reduction="sum")

            p_xyxy = center_to_corners_format(p_boxes)
            g_xyxy = center_to_corners_format(g_boxes)

            # Clamp to [0, 1] to avoid degenerate boxes
            p_xyxy = p_xyxy.clamp(0, 1)
            g_xyxy = g_xyxy.clamp(0, 1)

            # generalized_box_iou returns the full pairwise [M, M] matrix;
            # only the diagonal corresponds to matched (pred, gt) pairs.
            giou_mat = generalized_box_iou(p_xyxy, g_xyxy)  # [M, M]
            loss_giou = loss_giou + (1 - torch.diag(giou_mat)).sum()

        loss_bbox = loss_bbox / num_matched
        loss_giou = loss_giou / num_matched

    total = lambda_cls * loss_cls + lambda_bbox * loss_bbox + lambda_giou * loss_giou

    log = {
        "loss": total.item(),
        "cls": loss_cls.item(),
        "bbox": loss_bbox.item(),
        "giou": loss_giou.item(),
    }
    return total, log
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
# ---------------------------------------------------------------------------
|
| 300 |
+
# Freeze / unfreeze helpers
|
| 301 |
+
# ---------------------------------------------------------------------------
|
| 302 |
+
|
| 303 |
+
# Parameter-name prefixes that make up the frozen CLIP backbone
# (see apply_freeze_strategy).
BACKBONE_PREFIXES = (
    "owlv2.vision_model",
    "owlv2.text_model",
    "owlv2.text_projection",
    "owlv2.visual_projection",
)
# Subset of the backbone that --unfreeze-vision re-enables for training.
VISION_PREFIXES = ("owlv2.vision_model", "owlv2.visual_projection")
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
def apply_freeze_strategy(model: Owlv2ForObjectDetection, unfreeze_vision: bool) -> None:
    """
    Freeze the CLIP backbone; only train the detection heads (+ layer_norm).
    With unfreeze_vision=True, also allow gradients through the ViT encoder.
    """
    for name, param in model.named_parameters():
        # str.startswith accepts a tuple of prefixes, so each membership
        # test is a single call.
        in_backbone = name.startswith(BACKBONE_PREFIXES)
        in_vision = name.startswith(VISION_PREFIXES)
        # Heads always train; backbone params only if vision is unfrozen
        # and the param belongs to the vision tower.
        param.requires_grad_((not in_backbone) or (unfreeze_vision and in_vision))

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    logger.info(
        "Trainable params: %s / %s (%.1f%%)",
        f"{trainable:,}", f"{total:,}", 100 * trainable / total,
    )
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
# ---------------------------------------------------------------------------
|
| 335 |
+
# LR scheduler with linear warmup
|
| 336 |
+
# ---------------------------------------------------------------------------
|
| 337 |
+
|
| 338 |
+
def build_scheduler(optimizer: torch.optim.Optimizer, warmup_steps: int, total_steps: int):
    """Linear warmup followed by cosine decay, as a LambdaLR multiplier.

    The multiplier ramps 0 → 1 over *warmup_steps*, then follows half a
    cosine from 1 → 0 over the remaining steps.

    Args:
        optimizer: Optimizer whose base LRs the multiplier scales.
        warmup_steps: Number of linear-warmup steps.
        total_steps: Total scheduler steps for the run.
    """
    import math  # local import keeps this fix self-contained

    def lr_lambda(step: int) -> float:
        if step < warmup_steps:
            return step / max(warmup_steps, 1)
        progress = (step - warmup_steps) / max(total_steps - warmup_steps, 1)
        # Fix: use math.cos with exact math.pi instead of the original's
        # round-trip through a torch scalar tensor with truncated 3.14159.
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))

    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
# ---------------------------------------------------------------------------
|
| 349 |
+
# Single-epoch helpers
|
| 350 |
+
# ---------------------------------------------------------------------------
|
| 351 |
+
|
| 352 |
+
def _run_epoch(
    model: Owlv2ForObjectDetection,
    loader: DataLoader,
    matcher: HungarianMatcher,
    cfg: FinetuneConfig,
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler=None,
    grad_accum: int = 1,
    desc: str = "train",
) -> dict[str, float]:
    """Run one pass over *loader*. If optimizer is None, runs in eval mode.

    Returns:
        Per-step mean of each loss component ("loss", "cls", "bbox", "giou").
    """
    training = optimizer is not None
    model.train(training)

    totals: dict[str, float] = {"loss": 0, "cls": 0, "bbox": 0, "giou": 0}
    steps = 0

    if training:
        optimizer.zero_grad()

    with tqdm(loader, desc=desc, leave=False) as pbar:
        for i, batch in enumerate(pbar):
            # collate_fn stores targets under "labels"; the remaining keys
            # are fed straight to the model.
            labels = batch.pop("labels")
            inputs = batch

            # Disable autograd only for the validation pass.
            ctx = torch.no_grad() if not training else torch.enable_grad()
            with ctx:
                outputs = model(**inputs)

                loss, log = compute_detection_loss(
                    logits=outputs.logits,
                    pred_boxes=outputs.pred_boxes,
                    targets=labels,
                    matcher=matcher,
                    lambda_cls=cfg.lambda_cls,
                    lambda_bbox=cfg.lambda_bbox,
                    lambda_giou=cfg.lambda_giou,
                )

            if training:
                # Scale so the accumulated gradient equals one larger batch.
                (loss / grad_accum).backward()
                if (i + 1) % grad_accum == 0:
                    torch.nn.utils.clip_grad_norm_(
                        model.parameters(), cfg.grad_clip
                    )
                    optimizer.step()
                    if scheduler is not None:
                        scheduler.step()
                    optimizer.zero_grad()
                # NOTE(review): a trailing partial accumulation (when
                # len(loader) % grad_accum != 0) never triggers a step here;
                # its gradient is discarded by the zero_grad at the start of
                # the next epoch. Confirm this is intended.

            for k, v in log.items():
                totals[k] += v
            steps += 1

            # Show running means in the progress bar.
            pbar.set_postfix({k: f"{v/steps:.4f}" for k, v in totals.items()})

    return {k: v / max(steps, 1) for k, v in totals.items()}
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
# ---------------------------------------------------------------------------
|
| 412 |
+
# Main training loop
|
| 413 |
+
# ---------------------------------------------------------------------------
|
| 414 |
+
|
| 415 |
+
def run_finetune(
    cfg: FinetuneConfig,
    progress_callback: Optional[Callable[[int, dict, dict], None]] = None,
) -> None:
    """Full fine-tuning run from a FinetuneConfig.

    Pipeline: load processor + model → apply freeze strategy → split the COCO
    dataset into train/val → AdamW with per-group LRs and a warmup scheduler →
    epoch loop that tracks the best validation loss, saves periodic
    checkpoints, and writes a JSON training history.

    Args:
        cfg: All hyper-parameters, paths, dtype and device for the run.
        progress_callback: Optional per-epoch hook called as
            ``callback(epoch, train_log, val_log)``. Raising
            TrainingStoppedError from it ends training cleanly; any other
            exception is logged and training continues.
    """
    cfg.output_dir.mkdir(parents=True, exist_ok=True)
    t0 = time.time()

    # --- Load processor and model ---
    # The processor always comes from the base model name; the weights come
    # from a checkpoint instead when resume_from is set.
    resume_path = cfg.resume_from or cfg.model_name
    logger.info("Loading processor from %s", cfg.model_name)
    processor = Owlv2Processor.from_pretrained(cfg.model_name)

    logger.info("Loading model from %s", resume_path)
    model = Owlv2ForObjectDetection.from_pretrained(
        str(resume_path),
        torch_dtype=cfg.torch_dtype,
        ignore_mismatched_sizes=True,
    ).to(cfg.device)

    apply_freeze_strategy(model, cfg.unfreeze_vision)

    # --- Dataset ---
    full_dataset = CocoOwlv2Dataset(cfg.coco_json, cfg.image_dir)
    categories = full_dataset.categories

    # Save category metadata alongside the model so inference code can map
    # predicted indices back to label names.
    meta_path = cfg.output_dir / "label_map.json"
    with meta_path.open("w") as fh:
        json.dump({"categories": categories}, fh, indent=2)
    logger.info("Label map saved → %s", meta_path)

    # Always hold out at least one image for validation.
    n_val = max(1, int(len(full_dataset) * cfg.val_split))
    n_train = len(full_dataset) - n_val
    if n_train < 1:
        raise ValueError(
            f"Not enough labeled images for training ({len(full_dataset)} total). "
            "Use the Batch tab in the web UI and run `make export` to build a larger dataset."
        )

    # Fixed seed keeps the train/val split stable across resumed runs.
    train_ds, val_ds = random_split(
        full_dataset,
        [n_train, n_val],
        generator=torch.Generator().manual_seed(42),
    )
    logger.info("Split: %d train / %d val", n_train, n_val)

    # num_workers=0 keeps collation in the main process (the collate fn is
    # handed the target device and dtype).
    collate = make_collate_fn(processor, categories, cfg.device, cfg.torch_dtype)
    train_loader = DataLoader(
        train_ds, batch_size=cfg.batch_size, shuffle=True,
        collate_fn=collate, num_workers=0,
    )
    val_loader = DataLoader(
        val_ds, batch_size=cfg.batch_size, shuffle=False,
        collate_fn=collate, num_workers=0,
    )

    # --- Optimizer (separate LR for heads vs backbone if unfrozen) ---
    head_params = [p for n, p in model.named_parameters()
                   if p.requires_grad and not any(n.startswith(pfx) for pfx in BACKBONE_PREFIXES)]
    vision_params = [p for n, p in model.named_parameters()
                     if p.requires_grad and any(n.startswith(pfx) for pfx in VISION_PREFIXES)]

    param_groups = [{"params": head_params, "lr": cfg.lr}]
    # The vision group only exists when the backbone is unfrozen AND a
    # positive backbone LR was requested.
    if vision_params and cfg.backbone_lr > 0:
        param_groups.append({"params": vision_params, "lr": cfg.backbone_lr})

    optimizer = torch.optim.AdamW(param_groups, weight_decay=cfg.weight_decay)

    # The scheduler is stepped once per optimizer update, hence the division
    # by grad_accum_steps.
    total_steps = (len(train_loader) // cfg.grad_accum_steps) * cfg.epochs
    scheduler = build_scheduler(optimizer, cfg.warmup_steps, total_steps)

    matcher = HungarianMatcher(
        class_cost=cfg.class_cost,
        bbox_cost=cfg.bbox_cost,
        giou_cost=cfg.giou_cost,
    )

    # --- Training loop ---
    best_val_loss = float("inf")
    history: list[dict] = []

    for epoch in range(1, cfg.epochs + 1):
        logger.info("─── Epoch %d / %d ───", epoch, cfg.epochs)

        train_log = _run_epoch(
            model, train_loader, matcher, cfg,
            optimizer=optimizer, scheduler=scheduler,
            grad_accum=cfg.grad_accum_steps,
            desc=f"train {epoch}/{cfg.epochs}",
        )
        # No optimizer → _run_epoch runs in eval mode for validation.
        val_log = _run_epoch(
            model, val_loader, matcher, cfg,
            desc=f"val {epoch}/{cfg.epochs}",
        )

        logger.info(
            "Epoch %d train_loss=%.4f val_loss=%.4f "
            "(cls=%.4f bbox=%.4f giou=%.4f)",
            epoch,
            train_log["loss"], val_log["loss"],
            val_log["cls"], val_log["bbox"], val_log["giou"],
        )

        history.append({"epoch": epoch, "train": train_log, "val": val_log})

        if progress_callback is not None:
            try:
                progress_callback(epoch, train_log, val_log)
            except TrainingStoppedError:
                # NOTE: breaking here skips this epoch's best/checkpoint
                # saves; the last "best" written to disk is what remains.
                logger.info("Training stopped by user at epoch %d.", epoch)
                break
            except Exception as exc:
                # A flaky UI callback must not kill the training run.
                logger.warning("progress_callback raised %s; continuing.", exc)

        # Save best
        if val_log["loss"] < best_val_loss:
            best_val_loss = val_log["loss"]
            best_path = cfg.output_dir / "best"
            model.save_pretrained(str(best_path))
            processor.save_pretrained(str(best_path))
            logger.info(" ✓ New best val_loss=%.4f saved → %s", best_val_loss, best_path)

        # Periodic checkpoint
        if epoch % cfg.save_every == 0:
            ckpt_path = cfg.output_dir / f"checkpoint-epoch-{epoch:03d}"
            model.save_pretrained(str(ckpt_path))
            processor.save_pretrained(str(ckpt_path))
            logger.info(" Checkpoint saved → %s", ckpt_path)

    # Save training history
    history_path = cfg.output_dir / "training_history.json"
    with history_path.open("w") as fh:
        json.dump(history, fh, indent=2)

    elapsed = time.time() - t0
    logger.info(
        "Fine-tuning complete in %.1f min. Best val_loss=%.4f. Model at %s/best",
        elapsed / 60, best_val_loss, cfg.output_dir,
    )
|
autolabel/segment.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
segment.py — SAM2 segmentation using bounding-box prompts.
|
| 3 |
+
|
| 4 |
+
Workflow (Grounded SAM2 pattern):
|
| 5 |
+
OWLv2 text prompts → bounding boxes
|
| 6 |
+
SAM2 box prompts → pixel masks
|
| 7 |
+
|
| 8 |
+
Model: facebook/sam2-hiera-tiny (~160 MB, fast enough for development)
|
| 9 |
+
|
| 10 |
+
Each detection returned by segment_with_boxes() gains two extra fields:
|
| 11 |
+
"mask": bool numpy array (H, W) — pixel mask in image space
|
| 12 |
+
"segmentation": COCO polygon list [[x, y, x, y, ...], ...]
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
import logging
|
| 18 |
+
from typing import Optional
|
| 19 |
+
|
| 20 |
+
import numpy as np
|
| 21 |
+
import torch
|
| 22 |
+
from PIL import Image
|
| 23 |
+
|
| 24 |
+
logger = logging.getLogger(__name__)
|
| 25 |
+
|
| 26 |
+
SAM2_DEFAULT_MODEL = "facebook/sam2-hiera-tiny"
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def load_sam2(device: str, model_id: str = SAM2_DEFAULT_MODEL):
    """Load a SAM2 processor/model pair onto *device*.

    Returns:
        (processor, model) — the model is in eval mode, float32.
    """
    # Imported lazily so the module can be loaded without transformers' SAM2
    # classes being available.
    from transformers import Sam2Model, Sam2Processor

    logger.info("Loading SAM2 %s on %s …", model_id, device)
    sam_processor = Sam2Processor.from_pretrained(model_id)
    # SAM2 runs in float32 — bfloat16/float16 not reliably supported on all backends
    sam_model = Sam2Model.from_pretrained(model_id, torch_dtype=torch.float32)
    sam_model = sam_model.to(device)
    sam_model.eval()
    logger.info("SAM2 ready.")
    return sam_processor, sam_model
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _mask_to_polygon(mask: np.ndarray) -> list[list[float]]:
    """Convert a boolean 2-D mask to a COCO polygon list.

    Returns a list of polygons; each polygon is a flat [x1,y1,x2,y2,…] list.
    Returns [] if cv2 is unavailable or no contour is found.
    """
    try:
        import cv2
    except ImportError:
        logger.warning("opencv-python not installed — segmentation polygons skipped.")
        return []

    binary = mask.astype(np.uint8) * 255
    found, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # A valid COCO polygon needs >= 3 points (size counts x and y entries).
    return [c.flatten().tolist() for c in found if c.size >= 6]
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def segment_with_boxes(
    pil_image: Image.Image,
    detections: list[dict],
    processor,
    model,
    device: str,
) -> list[dict]:
    """Run SAM2 on *pil_image* using the bounding box from each detection.

    Each detection in the returned list gains:
        "mask"         — bool numpy array (H, W)
        "segmentation" — COCO polygon list

    Detections without a valid box are passed through unchanged (no mask field).
    A SAM2 failure for an individual box is logged and replaced with an
    all-False mask rather than aborting the whole batch.
    """
    if not detections:
        return detections

    augmented: list[dict] = []
    h, w = pil_image.height, pil_image.width

    for det in detections:
        # Boxes are expected in absolute pixel xyxy coordinates.
        box = det.get("box_xyxy")
        if box is None:
            augmented.append(det)
            continue

        x1, y1, x2, y2 = box
        try:
            # input_boxes: [batch=1, n_boxes=1, 4]
            encoding = processor(
                images=pil_image,
                input_boxes=[[[x1, y1, x2, y2]]],
                return_tensors="pt",
            )
            # transformers 5.x Sam2Processor returns: pixel_values, original_sizes,
            # input_boxes — no reshaped_input_sizes. Move all tensors to device.
            inputs = {k: v.to(device) if hasattr(v, "to") else v for k, v in encoding.items()}

            with torch.no_grad():
                outputs = model(**inputs, multimask_output=False)

            # pred_masks shape: [batch, n_boxes, n_masks, H_low, W_low]
            # post_process_masks(masks, original_sizes) — transformers 5.x API:
            # iterates over batch; each masks[i] goes through F.interpolate to
            # original_size, then optional binarise. Expects 4-D per-image tensor.
            # We pass pred_masks directly; masks[0] = [n_boxes, n_masks, H_low, W_low]
            # which F.interpolate handles as [N, C, H, W].
            # Fallback to the PIL image size if the processor did not report one.
            original_sizes = encoding.get("original_sizes", torch.tensor([[h, w]]))
            masks = processor.post_process_masks(
                outputs.pred_masks,
                original_sizes,
            )
            # masks[0]: [n_boxes=1, n_masks=1, H_orig, W_orig]
            mask_np: np.ndarray = masks[0][0, 0].cpu().numpy().astype(bool)
        except Exception:
            # Best-effort per detection: keep going with an empty mask.
            logger.exception(
                "SAM2 failed for '%s' — using empty mask", det.get("label", "?")
            )
            mask_np = np.zeros((h, w), dtype=bool)

        polygons = _mask_to_polygon(mask_np)
        # Copy the detection so callers' original dicts are not mutated.
        augmented.append({**det, "mask": mask_np, "segmentation": polygons})

    return augmented
|
autolabel/utils.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
utils.py — shared helpers for the autolabel pipeline.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
import json
|
| 8 |
+
import logging
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Any
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp"}
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def collect_images(directory: Path) -> list[Path]:
    """Return sorted list of image files under *directory*."""
    found = [
        candidate
        for candidate in directory.rglob("*")
        if candidate.suffix.lower() in IMAGE_EXTENSIONS
    ]
    found.sort()
    logger.info("Found %d image(s) in %s", len(found), directory)
    return found
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def load_json(path: Path) -> Any:
    """Load and return JSON from *path*."""
    text = path.read_text(encoding="utf-8")
    return json.loads(text)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def save_json(data: Any, path: Path, indent: int = 2) -> None:
    """Serialise *data* to JSON at *path*, creating parent dirs as needed."""
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(data, indent=indent, ensure_ascii=False)
    path.write_text(payload, encoding="utf-8")
    logger.debug("Saved JSON → %s", path)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def detection_json_path(image_path: Path, detections_dir: Path) -> Path:
    """Return the expected detection JSON path for a given image."""
    json_name = f"{image_path.stem}.json"
    return detections_dir / json_name
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def setup_logging(level: int = logging.INFO) -> None:
    """Configure root logger with a sensible format."""
    log_format = "%(asctime)s [%(levelname)s] %(name)s — %(message)s"
    logging.basicConfig(level=level, format=log_format, datefmt="%H:%M:%S")
|
pyproject.toml
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "labelplayground"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Auto-labeling pipeline using OWLv2 + SAM2 for household object detection"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = "==3.11.*"
|
| 7 |
+
license = { text = "MIT" }
|
| 8 |
+
authors = [{ name = "Erick Rosas" }]
|
| 9 |
+
|
| 10 |
+
dependencies = [
|
| 11 |
+
# Deep learning — device-agnostic (CUDA / MPS / CPU)
|
| 12 |
+
"torch>=2.2.0",
|
| 13 |
+
"torchvision>=0.17.0",
|
| 14 |
+
|
| 15 |
+
# Hugging Face — OWLv2 + SAM2 models & processors (Apache 2.0)
|
| 16 |
+
"transformers>=4.45.0", # SAM2 support added in 4.45
|
| 17 |
+
|
| 18 |
+
# Computer vision
|
| 19 |
+
"pillow>=10.3.0",
|
| 20 |
+
"opencv-python>=4.9.0", # mask → COCO polygon via cv2.findContours (SAM2)
|
| 21 |
+
|
| 22 |
+
# Data & utilities
|
| 23 |
+
"numpy>=1.26.0",
|
| 24 |
+
"pydantic>=2.7.0",
|
| 25 |
+
"pydantic-settings>=2.3.0",
|
| 26 |
+
|
| 27 |
+
# CLI
|
| 28 |
+
"click>=8.1.7",
|
| 29 |
+
"tqdm>=4.66.0",
|
| 30 |
+
|
| 31 |
+
# Environment
|
| 32 |
+
"python-dotenv>=1.0.1",
|
| 33 |
+
|
| 34 |
+
# Web UI
|
| 35 |
+
"gradio>=6.0.0",
|
| 36 |
+
]
|
| 37 |
+
|
| 38 |
+
[project.scripts]
|
| 39 |
+
autolabel-detect = "scripts.run_detection:main"
|
| 40 |
+
autolabel-export = "scripts.export_coco:main"
|
| 41 |
+
|
| 42 |
+
[build-system]
|
| 43 |
+
requires = ["hatchling"]
|
| 44 |
+
build-backend = "hatchling.build"
|
| 45 |
+
|
| 46 |
+
[tool.hatch.build.targets.wheel]
|
| 47 |
+
packages = ["autolabel", "scripts"]
|
samples/CREDITS.txt
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Sample images used in the autolabel demo
|
| 2 |
+
=========================================
|
| 3 |
+
|
| 4 |
+
kitchen.jpg
|
| 5 |
+
Title: "Good Food Display - NCI Visuals Online"
|
| 6 |
+
Source: https://commons.wikimedia.org/wiki/File:Good_Food_Display_-_NCI_Visuals_Online.jpg
|
| 7 |
+
License: Public domain (National Cancer Institute / US Government)
|
| 8 |
+
|
| 9 |
+
dog.jpg
|
| 10 |
+
Title: "Yellow Labrador Looking"
|
| 11 |
+
Source: https://commons.wikimedia.org/wiki/File:YellowLabradorLooking_new.jpg
|
| 12 |
+
License: CC BY-SA 3.0 — Jenn Durfey
|
| 13 |
+
|
| 14 |
+
cat.jpg
|
| 15 |
+
Title: "Cat November 2010-1a"
|
| 16 |
+
Source: https://commons.wikimedia.org/wiki/File:Cat_November_2010-1a.jpg
|
| 17 |
+
License: CC BY-SA 3.0 — Alvesgaspar
|
samples/animals.jpg
ADDED
|
Git LFS Details
|
samples/cat.jpg
ADDED
|
Git LFS Details
|
samples/dog.jpg
ADDED
|
samples/kitchen.jpg
ADDED
|
Git LFS Details
|
scripts/export_coco.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
export_coco.py — CLI entrypoint for the COCO JSON export stage.
|
| 3 |
+
|
| 4 |
+
Usage:
|
| 5 |
+
uv run python scripts/export_coco.py
|
| 6 |
+
uv run python scripts/export_coco.py --labeled-dir data/labeled --output data/labeled/coco_export.json
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import logging
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
import click
|
| 15 |
+
from dotenv import load_dotenv
|
| 16 |
+
|
| 17 |
+
load_dotenv()
|
| 18 |
+
|
| 19 |
+
from autolabel.export import run_export
|
| 20 |
+
from autolabel.config import settings
|
| 21 |
+
from autolabel.utils import setup_logging
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@click.command()
@click.option(
    "--labeled-dir",
    # Default captured from settings at import time (after load_dotenv above).
    default=str(settings.labeled_dir),
    show_default=True,
    type=click.Path(exists=True, file_okay=False, path_type=Path),
    help="Directory containing accepted-annotation JSON files.",
)
@click.option(
    "--output",
    default=str(settings.labeled_dir / "coco_export.json"),
    show_default=True,
    type=click.Path(path_type=Path),
    help="Output path for the COCO JSON file.",
)
@click.option("--verbose", "-v", is_flag=True, default=False, help="Debug logging.")
def main(labeled_dir: Path, output: Path, verbose: bool) -> None:
    """Export accepted annotations from LABELED_DIR to COCO JSON format.

    Thin CLI wrapper: configures logging, then delegates all work to
    autolabel.export.run_export.
    """
    setup_logging(logging.DEBUG if verbose else logging.INFO)
    run_export(labeled_dir=labeled_dir, output_path=output, cfg=settings)


if __name__ == "__main__":
    main()
|
scripts/finetune_owlv2.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
finetune_owlv2.py — CLI for fine-tuning OWLv2 on a COCO-format dataset.
|
| 3 |
+
|
| 4 |
+
Usage:
|
| 5 |
+
uv run python scripts/finetune_owlv2.py
|
| 6 |
+
uv run python scripts/finetune_owlv2.py --epochs 20 --lr 5e-5
|
| 7 |
+
uv run python scripts/finetune_owlv2.py --unfreeze-vision --backbone-lr 1e-5
|
| 8 |
+
uv run python scripts/finetune_owlv2.py --resume models/owlv2-finetuned/checkpoint-epoch-005
|
| 9 |
+
|
| 10 |
+
Recommended hardware:
|
| 11 |
+
CUDA (Windows/Linux) — use fp16 for speed, set --device cuda
|
| 12 |
+
MPS (Apple Silicon) — fp32 only, slower but functional for small datasets
|
| 13 |
+
CPU — very slow, only for tiny sanity-check runs
|
| 14 |
+
|
| 15 |
+
Typical first run:
|
| 16 |
+
1. make export # build data/labeled/coco_export.json
|
| 17 |
+
2. make finetune # train with defaults
|
| 18 |
+
3. Update app.py to load from models/owlv2-finetuned/best
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
from __future__ import annotations
|
| 22 |
+
|
| 23 |
+
import logging
|
| 24 |
+
from pathlib import Path
|
| 25 |
+
|
| 26 |
+
import click
|
| 27 |
+
import torch
|
| 28 |
+
from dotenv import load_dotenv
|
| 29 |
+
|
| 30 |
+
load_dotenv()
|
| 31 |
+
|
| 32 |
+
from autolabel.config import settings
|
| 33 |
+
from autolabel.finetune import FinetuneConfig, run_finetune
|
| 34 |
+
from autolabel.utils import setup_logging
|
| 35 |
+
|
| 36 |
+
# Resolve paths relative to the repository root (scripts/ → project root) so
# the CLI works regardless of the current working directory.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DEFAULT_OUTPUT = PROJECT_ROOT / "models" / "owlv2-finetuned"
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@click.command()
@click.option(
    "--coco-json",
    default=str(settings.labeled_dir / "coco_export.json"),
    show_default=True,
    type=click.Path(exists=True, path_type=Path),
    help="COCO JSON file produced by `make export`.",
)
@click.option(
    "--image-dir",
    default=str(settings.raw_dir),
    show_default=True,
    type=click.Path(exists=True, file_okay=False, path_type=Path),
    help="Directory containing the source images (matched by file_name in COCO JSON).",
)
@click.option(
    "--output-dir",
    default=str(DEFAULT_OUTPUT),
    show_default=True,
    type=click.Path(file_okay=False, path_type=Path),
    help="Directory to save checkpoints and the best model.",
)
@click.option("--model", default=settings.model, show_default=True,
              help="Base model to fine-tune.")
@click.option("--epochs", default=10, show_default=True, type=int)
@click.option("--batch-size", default=1, show_default=True, type=int,
              help="Images per forward pass. Keep at 1 for OWLv2-large on ≤8 GB VRAM.")
@click.option("--grad-accum", default=4, show_default=True, type=int,
              help="Gradient accumulation steps. Effective batch = batch_size * grad_accum.")
@click.option("--lr", default=1e-4, show_default=True, type=float,
              help="Learning rate for detection heads.")
@click.option("--val-split", default=0.2, show_default=True, type=float,
              help="Fraction of data to use for validation.")
@click.option("--warmup-steps", default=50, show_default=True, type=int)
@click.option("--save-every", default=1, show_default=True, type=int,
              help="Save a checkpoint every N epochs.")
@click.option(
    "--unfreeze-vision", is_flag=True, default=False,
    help="Also fine-tune the ViT image encoder (needs more VRAM, slower).",
)
@click.option(
    "--backbone-lr", default=1e-5, show_default=True, type=float,
    help="LR for the vision encoder when --unfreeze-vision is set.",
)
@click.option(
    "--resume",
    default=None,
    type=click.Path(path_type=Path),
    help="Path to a saved checkpoint to resume from.",
)
@click.option(
    "--device",
    default=settings.device,
    show_default=True,
    help="Torch device: cuda | mps | cpu.",
)
@click.option("--verbose", "-v", is_flag=True, default=False)
def main(
    coco_json: Path,
    image_dir: Path,
    output_dir: Path,
    model: str,
    epochs: int,
    batch_size: int,
    grad_accum: int,
    lr: float,
    val_split: float,
    warmup_steps: int,
    save_every: int,
    unfreeze_vision: bool,
    backbone_lr: float,
    resume: Path | None,
    device: str,
    verbose: bool,
) -> None:
    """Fine-tune OWLv2 on your labeled COCO dataset.

    Thin CLI wrapper: maps the click options onto a FinetuneConfig, prints a
    short summary, and delegates to autolabel.finetune.run_finetune.
    """
    setup_logging(logging.DEBUG if verbose else logging.INFO)

    # fp16 only on CUDA; other backends train in fp32 (see module docstring).
    dtype = torch.float16 if device == "cuda" else torch.float32

    cfg = FinetuneConfig(
        coco_json=coco_json,
        image_dir=image_dir,
        output_dir=output_dir,
        model_name=model,
        device=device,
        torch_dtype=dtype,
        epochs=epochs,
        batch_size=batch_size,
        grad_accum_steps=grad_accum,
        lr=lr,
        # A zero backbone LR keeps the vision encoder out of the optimizer
        # unless --unfreeze-vision was requested.
        backbone_lr=backbone_lr if unfreeze_vision else 0.0,
        val_split=val_split,
        warmup_steps=warmup_steps,
        save_every=save_every,
        unfreeze_vision=unfreeze_vision,
        resume_from=resume,
    )

    click.echo(f"Fine-tuning OWLv2 on {coco_json}")
    click.echo(f"  device       : {device} ({dtype})")
    click.echo(f"  epochs       : {epochs}")
    click.echo(f"  effective bs : {batch_size * grad_accum}")
    click.echo(f"  heads lr     : {lr}")
    click.echo(f"  unfreeze ViT : {unfreeze_vision}")
    click.echo(f"  output       : {output_dir}")
    click.echo()

    run_finetune(cfg)


if __name__ == "__main__":
    main()
|
scripts/run_detection.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
run_detection.py — CLI entrypoint for the OWLv2 detection stage.
|
| 3 |
+
|
| 4 |
+
Usage:
|
| 5 |
+
uv run python scripts/run_detection.py --image-dir data/raw --output-dir data/detections
|
| 6 |
+
uv run python scripts/run_detection.py --help
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import logging
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from typing import Optional
|
| 14 |
+
|
| 15 |
+
import click
|
| 16 |
+
from dotenv import load_dotenv
|
| 17 |
+
|
| 18 |
+
load_dotenv() # picks up PYTORCH_ENABLE_MPS_FALLBACK and other vars
|
| 19 |
+
|
| 20 |
+
from autolabel.detect import run_detection
|
| 21 |
+
from autolabel.config import settings
|
| 22 |
+
from autolabel.utils import setup_logging
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@click.command()
@click.option(
    "--image-dir",
    # Defaults captured from settings at import time (after load_dotenv above).
    default=str(settings.raw_dir),
    show_default=True,
    type=click.Path(exists=True, file_okay=False, path_type=Path),
    help="Directory containing input images.",
)
@click.option(
    "--output-dir",
    default=str(settings.detections_dir),
    show_default=True,
    type=click.Path(file_okay=False, path_type=Path),
    help="Directory to write per-image detection JSON files.",
)
@click.option(
    "--prompts",
    default=None,
    help="Comma-separated list of text prompts (overrides config defaults).",
)
@click.option(
    "--threshold",
    default=None,
    type=float,
    help="Score threshold override (0.0–1.0).",
)
@click.option(
    "--force",
    is_flag=True,
    default=False,
    help="Re-process images even if a detection JSON already exists.",
)
@click.option("--verbose", "-v", is_flag=True, default=False, help="Debug logging.")
def main(
    image_dir: Path,
    output_dir: Path,
    prompts: Optional[str],
    threshold: Optional[float],
    force: bool,
    verbose: bool,
) -> None:
    """Run OWLv2 open-vocabulary detection on IMAGE_DIR images.

    Thin CLI wrapper: parses the prompt list, applies an optional threshold
    override, and delegates to autolabel.detect.run_detection.
    """
    setup_logging(logging.DEBUG if verbose else logging.INFO)

    # Split "a, b, c" into ["a", "b", "c"], dropping empty entries.
    prompt_list = None
    if prompts:
        prompt_list = [p.strip() for p in prompts.split(",") if p.strip()]

    if threshold is not None:
        # NOTE: mutates the shared settings object so run_detection (which
        # receives cfg=settings) sees the override.
        settings.threshold = threshold

    run_detection(
        image_dir=image_dir,
        output_dir=output_dir,
        prompts=prompt_list,
        cfg=settings,
        force=force,
    )


if __name__ == "__main__":
    main()
|