Spaces:
Sleeping
Sleeping
Commit ·
82551bb
0
Parent(s):
Log device, Jina CPU warning, pin revision
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .dockerignore +27 -0
- .gitattributes +2 -0
- .gitignore +14 -0
- Dockerfile +35 -0
- README.md +38 -0
- app.py +202 -0
- dfine_jina_pipeline.py +565 -0
- jina_fewshot.py +399 -0
- models/README.md +6 -0
- models/v1/best.pt +3 -0
- nomic_fewshot.py +147 -0
- refs/cigarette/c2.png +3 -0
- refs/cigarette/c3.png +3 -0
- refs/cigarette/c4.png +3 -0
- refs/cigarette/c5.png +3 -0
- refs/cigarette/c6.png +3 -0
- refs/cigarette/c7.png +3 -0
- refs/cigarette/c9.png +3 -0
- refs/cigarette/cigarette.jpg +3 -0
- refs/gun/g1.png +3 -0
- refs/gun/g2.png +3 -0
- refs/gun/g3.png +3 -0
- refs/gun/g4.png +3 -0
- refs/gun/g5.png +3 -0
- refs/gun/g6.png +3 -0
- refs/gun/g7.png +3 -0
- refs/gun/g8.png +3 -0
- refs/gun/g9.png +3 -0
- refs/gun/pistol.jpeg +3 -0
- refs/knife/k1.png +3 -0
- refs/knife/k2.png +3 -0
- refs/knife/k3.png +3 -0
- refs/knife/k4.png +3 -0
- refs/knife/k5.png +3 -0
- refs/knife/k6.png +3 -0
- refs/knife/k7.png +3 -0
- refs/knife/k8.png +3 -0
- refs/knife/k9.png +3 -0
- refs/knife/knife.jpeg +3 -0
- refs/phone/p1.png +3 -0
- refs/phone/p2.png +3 -0
- refs/phone/p3.png +3 -0
- refs/phone/p4.png +3 -0
- refs/phone/p5.png +3 -0
- refs/phone/p6.png +3 -0
- refs/phone/p7.png +3 -0
- refs/phone/p8.png +3 -0
- refs/phone/p9.jpg +3 -0
- refs/phone/phone.jpg +3 -0
- requirements-lock.txt +17 -0
.dockerignore
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Git and env
|
| 2 |
+
.git
|
| 3 |
+
.gitignore
|
| 4 |
+
.gitattributes
|
| 5 |
+
.venv
|
| 6 |
+
venv
|
| 7 |
+
env
|
| 8 |
+
.env
|
| 9 |
+
|
| 10 |
+
# Build / cache
|
| 11 |
+
__pycache__
|
| 12 |
+
*.py[cod]
|
| 13 |
+
*.pyo
|
| 14 |
+
.pytest_cache
|
| 15 |
+
.mypy_cache
|
| 16 |
+
|
| 17 |
+
# Large or generated (not needed in image)
|
| 18 |
+
full_frames_GT
|
| 19 |
+
threshold_tuning
|
| 20 |
+
*.pt
|
| 21 |
+
models/*.pt
|
| 22 |
+
models/*.onnx
|
| 23 |
+
|
| 24 |
+
# IDE / OS
|
| 25 |
+
.cursor
|
| 26 |
+
.DS_Store
|
| 27 |
+
*.swp
|
.gitattributes
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
models/v1/best.pt filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
refs/** filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.py[cod]
|
| 3 |
+
.env
|
| 4 |
+
.venv/
|
| 5 |
+
venv/
|
| 6 |
+
|
| 7 |
+
# Binary / large assets (refs/ tracked with Git LFS; models not pushed to Space)
|
| 8 |
+
models/*.pt
|
| 9 |
+
models/*.onnx
|
| 10 |
+
full_frames_GT/
|
| 11 |
+
threshold_tuning/crops/
|
| 12 |
+
threshold_tuning/jina_crops/
|
| 13 |
+
threshold_tuning/nomic_crops/
|
| 14 |
+
threshold_tuning/detection_crops/
|
Dockerfile
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Small Object Detection — Docker Space
|
| 2 |
+
# Match local: Python 3.10, pinned deps (requirements-lock.txt). Gradio on 7860.
|
| 3 |
+
FROM python:3.10-slim-bookworm
|
| 4 |
+
|
| 5 |
+
# System deps: font for draw_label; opencv/ultralytics headless (libxcb, glib, etc.)
|
| 6 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 7 |
+
fonts-dejavu-core \
|
| 8 |
+
libglib2.0-0 \
|
| 9 |
+
libxcb1 \
|
| 10 |
+
libxcb-shm0 \
|
| 11 |
+
libxcb-xfixes0 \
|
| 12 |
+
libxrender1 \
|
| 13 |
+
libsm6 \
|
| 14 |
+
libxext6 \
|
| 15 |
+
libgl1-mesa-glx \
|
| 16 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 17 |
+
|
| 18 |
+
# HF Spaces run as user 1000
|
| 19 |
+
RUN useradd -m -u 1000 user
|
| 20 |
+
ENV HOME=/home/user PATH=/home/user/.local/bin:$PATH
|
| 21 |
+
WORKDIR $HOME/app
|
| 22 |
+
USER user
|
| 23 |
+
|
| 24 |
+
# Install Python deps from lock file so Space matches local versions (no GPU at build time)
|
| 25 |
+
COPY --chown=user requirements-lock.txt .
|
| 26 |
+
RUN pip install --no-cache-dir --upgrade pip \
|
| 27 |
+
&& pip install --no-cache-dir -r requirements-lock.txt
|
| 28 |
+
|
| 29 |
+
# App code (refs/ and code)
|
| 30 |
+
COPY --chown=user . .
|
| 31 |
+
|
| 32 |
+
# Gradio must listen on 0.0.0.0 for Docker
|
| 33 |
+
ENV GRADIO_SERVER_NAME=0.0.0.0
|
| 34 |
+
EXPOSE 7860
|
| 35 |
+
CMD ["python", "app.py"]
|
README.md
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Small Object Detection
|
| 3 |
+
emoji: 🔍
|
| 4 |
+
sdk: docker
|
| 5 |
+
app_port: 7860
|
| 6 |
+
pinned: false
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
# Small Object Detection
|
| 10 |
+
|
| 11 |
+
Upload an image to detect objects using the trained YOLO model **`best.pt`** in this repo. **CPU-only** — runs on basic (free) Hugging Face Spaces. The `train26m` folder is not part of this repo; only `best.pt` is included.
|
| 12 |
+
|
| 13 |
+
## Run locally
|
| 14 |
+
|
| 15 |
+
**Using uv (recommended):**
|
| 16 |
+
```bash
|
| 17 |
+
pip install uv
|
| 18 |
+
uv pip install -r requirements.txt
|
| 19 |
+
python app.py
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
**Or with pip only:**
|
| 23 |
+
```bash
|
| 24 |
+
pip install -r requirements.txt
|
| 25 |
+
python app.py
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
Then open the URL shown in the terminal (e.g. http://127.0.0.1:7860).
|
| 29 |
+
|
| 30 |
+
## Docker (Space)
|
| 31 |
+
|
| 32 |
+
The Space builds from the Dockerfile using **Python 3.10** and **requirements-lock.txt** so the container matches a known set of versions. To match your local env exactly, from your venv run:
|
| 33 |
+
|
| 34 |
+
```bash
|
| 35 |
+
pip freeze > requirements-lock.txt
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
Then commit and push; the next Space build will use those exact versions.
|
app.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Gradio app: Tab 1 = Object Detection (YOLO models/v1), Tab 2 = D-FINE + Classify (Jina or Nomic).
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
os.environ["YOLO_CONFIG_DIR"] = os.environ.get("YOLO_CONFIG_DIR", "/tmp")
|
| 6 |
+
import json
|
| 7 |
+
import numpy as np
|
| 8 |
+
import gradio as gr
|
| 9 |
+
from ultralytics import YOLO
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
# Tab 2: D-FINE runs first, then user chooses Jina or Nomic for crop classification
|
| 13 |
+
from dfine_jina_pipeline import run_single_image
|
| 14 |
+
|
| 15 |
+
# --- Object Detection (Tab 1) ---
|
| 16 |
+
PERSON_CLASS = 0
|
| 17 |
+
CAR_CLASS = 2
|
| 18 |
+
KNIFE_CLASS = 80
|
| 19 |
+
WEAPON_CLASS = 81
|
| 20 |
+
DRAW_CLASSES = [PERSON_CLASS, CAR_CLASS, KNIFE_CLASS, WEAPON_CLASS]
|
| 21 |
+
|
| 22 |
+
CLASS_NAMES = {
|
| 23 |
+
PERSON_CLASS: "person",
|
| 24 |
+
CAR_CLASS: "car",
|
| 25 |
+
KNIFE_CLASS: "knife",
|
| 26 |
+
WEAPON_CLASS: "weapon",
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
CONF = 0.25
|
| 30 |
+
IMGSZ = 640
|
| 31 |
+
|
| 32 |
+
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 33 |
+
MODELS_DIR = os.path.join(BASE_DIR, "models")
|
| 34 |
+
REFS_DIR = os.path.join(BASE_DIR, "refs")
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def _load_model(version: str):
|
| 38 |
+
path = os.path.join(MODELS_DIR, version, "best.pt")
|
| 39 |
+
if not os.path.isfile(path):
|
| 40 |
+
raise FileNotFoundError(f"Model not found: {path}")
|
| 41 |
+
return YOLO(path)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
MODELS = {"v1": _load_model("v1")}
|
| 45 |
+
MODEL_CLASSES = {"v1": ["person", "car", "knife", "weapon"]}
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def run_detection(image, model):
|
| 49 |
+
if image is None:
|
| 50 |
+
return None, "{}"
|
| 51 |
+
|
| 52 |
+
img = image if isinstance(image, np.ndarray) else np.array(image)
|
| 53 |
+
if img.ndim == 2:
|
| 54 |
+
img = np.stack([img] * 3, axis=-1)
|
| 55 |
+
|
| 56 |
+
results = model.predict(
|
| 57 |
+
source=img,
|
| 58 |
+
imgsz=IMGSZ,
|
| 59 |
+
conf=CONF,
|
| 60 |
+
device="cpu",
|
| 61 |
+
verbose=False,
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
r = results[0]
|
| 65 |
+
if r.boxes is None or len(r.boxes) == 0:
|
| 66 |
+
return image, json.dumps({"detections": []}, indent=2)
|
| 67 |
+
|
| 68 |
+
clss = r.boxes.cls.cpu().numpy()
|
| 69 |
+
confs = r.boxes.conf.cpu().numpy()
|
| 70 |
+
keep = [i for i in range(len(r.boxes)) if int(clss[i]) in DRAW_CLASSES]
|
| 71 |
+
|
| 72 |
+
if not keep:
|
| 73 |
+
return image, json.dumps({"detections": []}, indent=2)
|
| 74 |
+
|
| 75 |
+
detections = []
|
| 76 |
+
for i in keep:
|
| 77 |
+
cls_id = int(clss[i])
|
| 78 |
+
detections.append({
|
| 79 |
+
"class": CLASS_NAMES.get(cls_id, str(cls_id)),
|
| 80 |
+
"confidence": round(float(confs[i]), 3),
|
| 81 |
+
"bbox": r.boxes.xyxy[i].cpu().numpy().tolist(),
|
| 82 |
+
})
|
| 83 |
+
|
| 84 |
+
r.boxes = r.boxes[keep]
|
| 85 |
+
out_img = r.plot()
|
| 86 |
+
det_json = json.dumps({"detections": detections}, indent=2)
|
| 87 |
+
return out_img, det_json
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def run_dfine_classify(image, encoder_choice, refs_path):
|
| 91 |
+
"""Tab 2: D-FINE first, then classify crops with Jina or Nomic."""
|
| 92 |
+
if image is None:
|
| 93 |
+
return None, "Upload an image."
|
| 94 |
+
refs = Path(refs_path.strip()) if refs_path and refs_path.strip() else Path(REFS_DIR)
|
| 95 |
+
if not refs.is_dir():
|
| 96 |
+
return None, f"Refs folder not found: {refs}"
|
| 97 |
+
# Tuned on COCO GT: conf=0.5, gap=0.02. Lower det_threshold/min_side so D-FINE picks up more objects (gun, phone, etc.) like local.
|
| 98 |
+
out_img, text = run_single_image(
|
| 99 |
+
image,
|
| 100 |
+
refs_dir=refs,
|
| 101 |
+
encoder_choice=encoder_choice.lower(),
|
| 102 |
+
det_threshold=0.15,
|
| 103 |
+
conf_threshold=0.5,
|
| 104 |
+
gap_threshold=0.02,
|
| 105 |
+
min_side=24,
|
| 106 |
+
crop_dedup_iou=0.4,
|
| 107 |
+
)
|
| 108 |
+
if out_img is None:
|
| 109 |
+
return None, text
|
| 110 |
+
return out_img, text
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
IMG_HEIGHT = 400
|
| 114 |
+
|
| 115 |
+
TAB_STYLE = """
|
| 116 |
+
<style>
|
| 117 |
+
[data-testid="tabs"] > div:first-child,
|
| 118 |
+
.gr-tabs > div:first-child,
|
| 119 |
+
div[class*="tabs"] > div:first-child {
|
| 120 |
+
display: flex !important;
|
| 121 |
+
width: 100% !important;
|
| 122 |
+
}
|
| 123 |
+
[data-testid="tabs"] button,
|
| 124 |
+
.gr-tabs button,
|
| 125 |
+
div[class*="tabs"] > div:first-child button {
|
| 126 |
+
flex: 1 !important;
|
| 127 |
+
min-width: 0 !important;
|
| 128 |
+
min-height: 40px !important;
|
| 129 |
+
color: white !important;
|
| 130 |
+
font-weight: 700 !important;
|
| 131 |
+
font-size: 1rem !important;
|
| 132 |
+
text-align: center !important;
|
| 133 |
+
justify-content: center !important;
|
| 134 |
+
}
|
| 135 |
+
[data-testid="tabs"] button:not([aria-selected="true"]),
|
| 136 |
+
.gr-tabs button:not([aria-selected="true"]),
|
| 137 |
+
div[class*="tabs"] > div:first-child button:not([aria-selected="true"]) {
|
| 138 |
+
background: #6b7280 !important;
|
| 139 |
+
border-color: #6b7280 !important;
|
| 140 |
+
}
|
| 141 |
+
[data-testid="tabs"] button[aria-selected="true"],
|
| 142 |
+
.gr-tabs button[aria-selected="true"],
|
| 143 |
+
div[class*="tabs"] > div:first-child button[aria-selected="true"] {
|
| 144 |
+
background: var(--primary-500, #f97316) !important;
|
| 145 |
+
border-color: var(--primary-500, #f97316) !important;
|
| 146 |
+
}
|
| 147 |
+
</style>
|
| 148 |
+
"""
|
| 149 |
+
|
| 150 |
+
with gr.Blocks(title="Small Object Detection") as app:
|
| 151 |
+
gr.HTML(TAB_STYLE)
|
| 152 |
+
gr.Markdown("# Small Object Detection")
|
| 153 |
+
|
| 154 |
+
with gr.Tabs():
|
| 155 |
+
with gr.TabItem("Object Detection"):
|
| 156 |
+
gr.Markdown("**Classes:** " + ", ".join(MODEL_CLASSES["v1"]))
|
| 157 |
+
with gr.Row():
|
| 158 |
+
with gr.Column(scale=1):
|
| 159 |
+
inp_det = gr.Image(label="Input image", height=IMG_HEIGHT)
|
| 160 |
+
btn_det = gr.Button("Detect", variant="primary")
|
| 161 |
+
out_img_det = gr.Image(label="Output", height=IMG_HEIGHT)
|
| 162 |
+
det_output = gr.JSON(label="Detections")
|
| 163 |
+
btn_det.click(
|
| 164 |
+
fn=lambda img: run_detection(img, MODELS["v1"]),
|
| 165 |
+
inputs=inp_det,
|
| 166 |
+
outputs=[out_img_det, det_output],
|
| 167 |
+
)
|
| 168 |
+
|
| 169 |
+
with gr.TabItem("D-FINE + Classify"):
|
| 170 |
+
gr.Markdown(
|
| 171 |
+
"**D-FINE** runs first (person/car grouping), then small-object crops are classified. "
|
| 172 |
+
"Choose **Jina** or **Nomic** for the embedding/classification model. "
|
| 173 |
+
"Uses the **refs** folder (one subfolder per class, e.g. refs/phone/, refs/cigarette/) with reference images."
|
| 174 |
+
)
|
| 175 |
+
with gr.Row():
|
| 176 |
+
with gr.Column(scale=1):
|
| 177 |
+
inp_dfine = gr.Image(type="pil", label="Input image", height=IMG_HEIGHT)
|
| 178 |
+
encoder_choice = gr.Radio(
|
| 179 |
+
choices=["Jina", "Nomic"],
|
| 180 |
+
value="Jina",
|
| 181 |
+
label="Embedding / classification model",
|
| 182 |
+
)
|
| 183 |
+
refs_path = gr.Textbox(
|
| 184 |
+
label="Refs folder path",
|
| 185 |
+
value=REFS_DIR,
|
| 186 |
+
placeholder="e.g. refs or /path/to/refs",
|
| 187 |
+
)
|
| 188 |
+
btn_dfine = gr.Button("Run D-FINE + Classify", variant="primary")
|
| 189 |
+
with gr.Column(scale=1):
|
| 190 |
+
out_img_dfine = gr.Image(label="Output (crops with labels)", height=IMG_HEIGHT)
|
| 191 |
+
out_text_dfine = gr.Textbox(label="Crop predictions", lines=10, interactive=False)
|
| 192 |
+
btn_dfine.click(
|
| 193 |
+
fn=run_dfine_classify,
|
| 194 |
+
inputs=[inp_dfine, encoder_choice, refs_path],
|
| 195 |
+
outputs=[out_img_dfine, out_text_dfine],
|
| 196 |
+
concurrency_limit=1,
|
| 197 |
+
)
|
| 198 |
+
|
| 199 |
+
app.launch(
|
| 200 |
+
server_name=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"),
|
| 201 |
+
server_port=int(os.environ.get("PORT", os.environ.get("GRADIO_SERVER_PORT", 7860))),
|
| 202 |
+
)
|
dfine_jina_pipeline.py
ADDED
|
@@ -0,0 +1,565 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Pipeline: D-FINE (person/car only) → group detections → crop regions →
|
| 3 |
+
find all bboxes inside each crop → Jina-CLIP-v2 and Nomic embeddings on those crops.
|
| 4 |
+
|
| 5 |
+
Outputs separate crop folders per model (jina_crops, nomic_crops) for visual comparison.
|
| 6 |
+
"""
|
| 7 |
+
import argparse
|
| 8 |
+
import csv
|
| 9 |
+
import time
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
import numpy as np
|
| 13 |
+
import torch
|
| 14 |
+
import torch.nn.functional as F
|
| 15 |
+
from PIL import Image
|
| 16 |
+
from transformers import AutoImageProcessor, DFineForObjectDetection
|
| 17 |
+
|
| 18 |
+
# Jina-CLIP-v2 few-shot (same refs + classify as jina_fewshot.py)
|
| 19 |
+
from jina_fewshot import (
|
| 20 |
+
IMAGE_EXTS,
|
| 21 |
+
TRUNCATE_DIM,
|
| 22 |
+
JinaCLIPv2Encoder,
|
| 23 |
+
build_refs,
|
| 24 |
+
classify as jina_classify,
|
| 25 |
+
draw_label_on_image,
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
# -----------------------------------------------------------------------------
|
| 29 |
+
# Detection + grouping (from reference_detection.py)
|
| 30 |
+
# -----------------------------------------------------------------------------
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def get_box_dist(box1, box2):
|
| 34 |
+
"""Euclidean distance between box centers. box = [x1, y1, x2, y2]."""
|
| 35 |
+
c1 = np.array([(box1[0] + box1[2]) / 2, (box1[1] + box1[3]) / 2])
|
| 36 |
+
c2 = np.array([(box2[0] + box2[2]) / 2, (box2[1] + box2[3]) / 2])
|
| 37 |
+
return np.linalg.norm(c1 - c2)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def group_detections(detections, threshold):
|
| 41 |
+
"""
|
| 42 |
+
Group detections by proximity (center distance < threshold).
|
| 43 |
+
detections: list of {"box": [x1,y1,x2,y2], "conf", "cls", ...}
|
| 44 |
+
Returns list of {"box": merged [x1,y1,x2,y2], "conf": best in group, "cls": best in group}.
|
| 45 |
+
"""
|
| 46 |
+
if not detections:
|
| 47 |
+
return []
|
| 48 |
+
|
| 49 |
+
boxes = [d["box"] for d in detections]
|
| 50 |
+
n = len(boxes)
|
| 51 |
+
adj = {i: [] for i in range(n)}
|
| 52 |
+
for i in range(n):
|
| 53 |
+
for j in range(i + 1, n):
|
| 54 |
+
if get_box_dist(boxes[i], boxes[j]) < threshold:
|
| 55 |
+
adj[i].append(j)
|
| 56 |
+
adj[j].append(i)
|
| 57 |
+
|
| 58 |
+
groups = []
|
| 59 |
+
visited = [False] * n
|
| 60 |
+
for i in range(n):
|
| 61 |
+
if not visited[i]:
|
| 62 |
+
group_indices = []
|
| 63 |
+
stack = [i]
|
| 64 |
+
visited[i] = True
|
| 65 |
+
while stack:
|
| 66 |
+
curr = stack.pop()
|
| 67 |
+
group_indices.append(curr)
|
| 68 |
+
for neighbor in adj[curr]:
|
| 69 |
+
if not visited[neighbor]:
|
| 70 |
+
visited[neighbor] = True
|
| 71 |
+
stack.append(neighbor)
|
| 72 |
+
|
| 73 |
+
group_dets = [detections[k] for k in group_indices]
|
| 74 |
+
x1 = min(d["box"][0] for d in group_dets)
|
| 75 |
+
y1 = min(d["box"][1] for d in group_dets)
|
| 76 |
+
x2 = max(d["box"][2] for d in group_dets)
|
| 77 |
+
y2 = max(d["box"][3] for d in group_dets)
|
| 78 |
+
best_det = max(group_dets, key=lambda x: x["conf"])
|
| 79 |
+
|
| 80 |
+
groups.append({
|
| 81 |
+
"box": [x1, y1, x2, y2],
|
| 82 |
+
"conf": best_det["conf"],
|
| 83 |
+
"cls": best_det["cls"],
|
| 84 |
+
"label": best_det.get("label", str(best_det["cls"])),
|
| 85 |
+
})
|
| 86 |
+
return groups
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def box_center_inside(box, crop_box):
|
| 90 |
+
"""True if center of box is inside crop_box. All [x1,y1,x2,y2]."""
|
| 91 |
+
cx = (box[0] + box[2]) / 2
|
| 92 |
+
cy = (box[1] + box[3]) / 2
|
| 93 |
+
return (
|
| 94 |
+
crop_box[0] <= cx <= crop_box[2]
|
| 95 |
+
and crop_box[1] <= cy <= crop_box[3]
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def squarify_crop_box(bx1, by1, bx2, by2, img_w, img_h):
|
| 100 |
+
"""
|
| 101 |
+
Expand the shorter side to match the longer (same ratio / square), centered, clamped to image.
|
| 102 |
+
If height > width: expand width. If width >= height: expand height.
|
| 103 |
+
Returns (bx1, by1, bx2, by2) as integers.
|
| 104 |
+
"""
|
| 105 |
+
orig = (int(bx1), int(by1), int(bx2), int(by2))
|
| 106 |
+
w = bx2 - bx1
|
| 107 |
+
h = by2 - by1
|
| 108 |
+
if w <= 0 or h <= 0:
|
| 109 |
+
return orig
|
| 110 |
+
if h > w:
|
| 111 |
+
add = (h - w) / 2.0
|
| 112 |
+
bx1 = max(0, bx1 - add)
|
| 113 |
+
bx2 = min(img_w, bx2 + add)
|
| 114 |
+
else:
|
| 115 |
+
add = (w - h) / 2.0
|
| 116 |
+
by1 = max(0, by1 - add)
|
| 117 |
+
by2 = min(img_h, by2 + add)
|
| 118 |
+
bx1, by1, bx2, by2 = int(bx1), int(by1), int(bx2), int(by2)
|
| 119 |
+
if bx2 <= bx1 or by2 <= by1:
|
| 120 |
+
return orig
|
| 121 |
+
return bx1, by1, bx2, by2
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def box_iou(box1, box2):
|
| 125 |
+
"""IoU of two boxes [x1,y1,x2,y2]. Returns float in [0, 1]."""
|
| 126 |
+
ix1 = max(box1[0], box2[0])
|
| 127 |
+
iy1 = max(box1[1], box2[1])
|
| 128 |
+
ix2 = min(box1[2], box2[2])
|
| 129 |
+
iy2 = min(box1[3], box2[3])
|
| 130 |
+
inter_w = max(0, ix2 - ix1)
|
| 131 |
+
inter_h = max(0, iy2 - iy1)
|
| 132 |
+
inter = inter_w * inter_h
|
| 133 |
+
a1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
|
| 134 |
+
a2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
|
| 135 |
+
union = a1 + a2 - inter
|
| 136 |
+
return inter / union if union > 0 else 0.0
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def deduplicate_by_iou(detections, iou_threshold=0.9):
|
| 140 |
+
"""Keep one detection per overlapping group (IoU >= iou_threshold). Prefer higher confidence."""
|
| 141 |
+
if not detections:
|
| 142 |
+
return []
|
| 143 |
+
# Sort by confidence descending; keep first, then add only if no kept box overlaps >= threshold
|
| 144 |
+
sorted_d = sorted(detections, key=lambda x: -x["conf"])
|
| 145 |
+
kept = []
|
| 146 |
+
for d in sorted_d:
|
| 147 |
+
if not any(box_iou(d["box"], k["box"]) >= iou_threshold for k in kept):
|
| 148 |
+
kept.append(d)
|
| 149 |
+
return kept
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
from nomic_fewshot import NomicTextEncoder, NomicVisionEncoder, build_refs_nomic
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def parse_args():
|
| 156 |
+
p = argparse.ArgumentParser(
|
| 157 |
+
description="D-FINE (person/car) → group → Jina-CLIP-v2 on crops inside groups"
|
| 158 |
+
)
|
| 159 |
+
p.add_argument("--refs", required=True, help="Reference images folder for Jina and Nomic (e.g. refs/)")
|
| 160 |
+
p.add_argument("--input", required=True, help="Full-frame images folder")
|
| 161 |
+
p.add_argument("--output", default="pipeline_results", help="Output folder (CSV, etc.)")
|
| 162 |
+
p.add_argument("--det-threshold", type=float, default=0.13, help="D-FINE score threshold")
|
| 163 |
+
p.add_argument("--group-dist", type=float, default=None,
|
| 164 |
+
help="Group distance (default: 0.1 * max(H,W))")
|
| 165 |
+
p.add_argument("--min-side", type=int, default=40, help="Min side of expanded bbox in px (skip smaller)")
|
| 166 |
+
p.add_argument("--crop-dedup-iou", type=float, default=0.35, help="Min IoU to treat two crops as same object (keep larger)")
|
| 167 |
+
p.add_argument("--no-squarify", action="store_true", help="Skip squarify; use expanded bbox only (tighter crops, often better recognition)")
|
| 168 |
+
p.add_argument("--padding", type=float, default=0.2, help="Crop padding around group box (0.2 = 20%%)")
|
| 169 |
+
p.add_argument("--conf-threshold", type=float, default=0.75, help="Jina accept confidence")
|
| 170 |
+
p.add_argument("--gap-threshold", type=float, default=0.05, help="Jina accept gap")
|
| 171 |
+
p.add_argument("--text-weight", type=float, default=0.3)
|
| 172 |
+
p.add_argument("--max-images", type=int, default=None)
|
| 173 |
+
p.add_argument("--device", default=None)
|
| 174 |
+
return p.parse_args()
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def get_person_car_label_ids(model):
|
| 178 |
+
"""Return set of label IDs for person and car (Objects365: Person, Car, SUV, etc.)."""
|
| 179 |
+
id2label = getattr(model.config, "id2label", None) or {}
|
| 180 |
+
ids = set()
|
| 181 |
+
for idx, name in id2label.items():
|
| 182 |
+
try:
|
| 183 |
+
i = int(idx)
|
| 184 |
+
except (ValueError, TypeError):
|
| 185 |
+
continue
|
| 186 |
+
n = (name or "").lower()
|
| 187 |
+
if "person" in n or n in ("car", "suv"):
|
| 188 |
+
ids.add(i)
|
| 189 |
+
return ids
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def run_dfine(image, processor, model, device, score_threshold):
|
| 193 |
+
"""Run D-FINE, return all detections as list of {box, score, label_id, label}."""
|
| 194 |
+
from PIL import Image
|
| 195 |
+
if isinstance(image, Image.Image):
|
| 196 |
+
pil = image.convert("RGB")
|
| 197 |
+
else:
|
| 198 |
+
pil = Image.fromarray(image).convert("RGB")
|
| 199 |
+
w, h = pil.size
|
| 200 |
+
target_size = torch.tensor([[h, w]], device=device)
|
| 201 |
+
inputs = processor(images=pil, return_tensors="pt")
|
| 202 |
+
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 203 |
+
|
| 204 |
+
with torch.no_grad():
|
| 205 |
+
outputs = model(**inputs)
|
| 206 |
+
|
| 207 |
+
target_sizes = target_size.to(outputs["logits"].device)
|
| 208 |
+
results = processor.post_process_object_detection(
|
| 209 |
+
outputs, target_sizes=target_sizes, threshold=score_threshold
|
| 210 |
+
)
|
| 211 |
+
id2label = getattr(model.config, "id2label", {}) or {}
|
| 212 |
+
|
| 213 |
+
detections = []
|
| 214 |
+
for result in results:
|
| 215 |
+
for score, label_id, box in zip(
|
| 216 |
+
result["scores"], result["labels"], result["boxes"]
|
| 217 |
+
):
|
| 218 |
+
sid = int(label_id.item())
|
| 219 |
+
detections.append({
|
| 220 |
+
"box": [float(x) for x in box.cpu().tolist()],
|
| 221 |
+
"conf": float(score.item()),
|
| 222 |
+
"cls": sid,
|
| 223 |
+
"label": id2label.get(sid, str(sid)),
|
| 224 |
+
})
|
| 225 |
+
return detections
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def main():
|
| 229 |
+
args = parse_args()
|
| 230 |
+
device = args.device or ("cuda" if torch.cuda.is_available() else "cpu")
|
| 231 |
+
input_dir = Path(args.input)
|
| 232 |
+
output_dir = Path(args.output)
|
| 233 |
+
refs_dir = Path(args.refs)
|
| 234 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 235 |
+
|
| 236 |
+
if not refs_dir.is_dir():
|
| 237 |
+
raise SystemExit(f"Refs folder not found: {refs_dir}")
|
| 238 |
+
if not input_dir.is_dir():
|
| 239 |
+
raise SystemExit(f"Input folder not found: {input_dir}")
|
| 240 |
+
|
| 241 |
+
paths = sorted(
|
| 242 |
+
p for p in input_dir.iterdir()
|
| 243 |
+
if p.suffix.lower() in IMAGE_EXTS
|
| 244 |
+
)
|
| 245 |
+
if args.max_images is not None:
|
| 246 |
+
paths = paths[: args.max_images]
|
| 247 |
+
if not paths:
|
| 248 |
+
raise SystemExit(f"No images in {input_dir}")
|
| 249 |
+
|
| 250 |
+
# Load D-FINE
|
| 251 |
+
print("[*] Loading D-FINE (dfine-medium-obj365)...")
|
| 252 |
+
t0 = time.perf_counter()
|
| 253 |
+
image_processor = AutoImageProcessor.from_pretrained("ustc-community/dfine-medium-obj365")
|
| 254 |
+
dfine_model = DFineForObjectDetection.from_pretrained("ustc-community/dfine-medium-obj365")
|
| 255 |
+
dfine_model = dfine_model.to(device).eval()
|
| 256 |
+
person_car_ids = get_person_car_label_ids(dfine_model)
|
| 257 |
+
print(f" Person/car label IDs: {person_car_ids} ({time.perf_counter()-t0:.1f}s)")
|
| 258 |
+
|
| 259 |
+
# Load Jina-CLIP-v2 + build refs
|
| 260 |
+
print("[*] Loading Jina-CLIP-v2 and building refs...")
|
| 261 |
+
t0 = time.perf_counter()
|
| 262 |
+
jina_encoder = JinaCLIPv2Encoder(device)
|
| 263 |
+
ref_labels, ref_embs = build_refs(
|
| 264 |
+
jina_encoder, refs_dir, TRUNCATE_DIM, args.text_weight, batch_size=16
|
| 265 |
+
)
|
| 266 |
+
print(f" Jina refs: {ref_labels} ({time.perf_counter()-t0:.1f}s)\n")
|
| 267 |
+
|
| 268 |
+
# Load Nomic vision + text, build refs (same as Jina: image + text prompts, text_weight 0.3)
|
| 269 |
+
print("[*] Loading Nomic embed-vision + embed-text and building refs...")
|
| 270 |
+
t0 = time.perf_counter()
|
| 271 |
+
nomic_encoder = NomicVisionEncoder(device)
|
| 272 |
+
nomic_text_encoder = NomicTextEncoder(device)
|
| 273 |
+
ref_labels_nomic, ref_embs_nomic = build_refs_nomic(
|
| 274 |
+
nomic_encoder, refs_dir, batch_size=16,
|
| 275 |
+
text_encoder=nomic_text_encoder, text_weight=args.text_weight,
|
| 276 |
+
)
|
| 277 |
+
print(f" Nomic refs: {ref_labels_nomic} ({time.perf_counter()-t0:.1f}s)\n")
|
| 278 |
+
|
| 279 |
+
# Separate output folders per model for visual comparison
|
| 280 |
+
jina_crops_dir = output_dir / "jina_crops"
|
| 281 |
+
nomic_crops_dir = output_dir / "nomic_crops"
|
| 282 |
+
jina_crops_dir.mkdir(parents=True, exist_ok=True)
|
| 283 |
+
nomic_crops_dir.mkdir(parents=True, exist_ok=True)
|
| 284 |
+
|
| 285 |
+
# CSV
|
| 286 |
+
csv_path = output_dir / "results.csv"
|
| 287 |
+
f = open(csv_path, "w", newline="")
|
| 288 |
+
w = csv.writer(f)
|
| 289 |
+
w.writerow([
|
| 290 |
+
"image", "crop_filename", "group_idx", "crop_x1", "crop_y1", "crop_x2", "crop_y2",
|
| 291 |
+
"bbox_x1", "bbox_y1", "bbox_x2", "bbox_y2", "dfine_label", "dfine_conf",
|
| 292 |
+
"jina_prediction", "jina_confidence", "jina_status",
|
| 293 |
+
"nomic_prediction", "nomic_confidence", "nomic_status",
|
| 294 |
+
])
|
| 295 |
+
|
| 296 |
+
for img_path in paths:
|
| 297 |
+
pil = Image.open(img_path).convert("RGB")
|
| 298 |
+
img_w, img_h = pil.size
|
| 299 |
+
group_dist = args.group_dist if args.group_dist is not None else 0.1 * max(img_h, img_w)
|
| 300 |
+
|
| 301 |
+
# 1) D-FINE: detect everything, keep all bboxes for the image
|
| 302 |
+
detections = run_dfine(
|
| 303 |
+
pil, image_processor, dfine_model, device, args.det_threshold
|
| 304 |
+
)
|
| 305 |
+
person_car = [d for d in detections if d["cls"] in person_car_ids]
|
| 306 |
+
if not person_car:
|
| 307 |
+
continue
|
| 308 |
+
|
| 309 |
+
# 2) Group person/car detections (same as reference)
|
| 310 |
+
grouped = group_detections(person_car, group_dist)
|
| 311 |
+
grouped.sort(key=lambda x: x["conf"], reverse=True)
|
| 312 |
+
top_groups = grouped[:10] # limit groups per image
|
| 313 |
+
|
| 314 |
+
# 3) Collect all candidate crops (bboxes inside person/car groups)
|
| 315 |
+
# Each: (crop_box, crop_pil, d, gidx, crop_idx, x1, y1, x2, y2)
|
| 316 |
+
candidates = []
|
| 317 |
+
for gidx, grp in enumerate(top_groups):
|
| 318 |
+
x1, y1, x2, y2 = grp["box"]
|
| 319 |
+
group_box = [x1, y1, x2, y2]
|
| 320 |
+
inside = [
|
| 321 |
+
d for d in detections
|
| 322 |
+
if box_center_inside(d["box"], group_box)
|
| 323 |
+
and d["cls"] not in person_car_ids
|
| 324 |
+
]
|
| 325 |
+
inside = deduplicate_by_iou(inside, iou_threshold=0.9)
|
| 326 |
+
|
| 327 |
+
for crop_idx, d in enumerate(inside):
|
| 328 |
+
bx1, by1, bx2, by2 = [float(x) for x in d["box"]]
|
| 329 |
+
obj_w, obj_h = bx2 - bx1, by2 - by1
|
| 330 |
+
if obj_w <= 0 or obj_h <= 0:
|
| 331 |
+
continue
|
| 332 |
+
pad_x = obj_w * 0.3
|
| 333 |
+
pad_y = obj_h * 0.3
|
| 334 |
+
bx1 = max(0, int(bx1 - pad_x))
|
| 335 |
+
by1 = max(0, int(by1 - pad_y))
|
| 336 |
+
bx2 = min(img_w, int(bx2 + pad_x))
|
| 337 |
+
by2 = min(img_h, int(by2 + pad_y))
|
| 338 |
+
if bx2 <= bx1 or by2 <= by1:
|
| 339 |
+
continue
|
| 340 |
+
if min(bx2 - bx1, by2 - by1) < args.min_side:
|
| 341 |
+
continue
|
| 342 |
+
expanded_box = [bx1, by1, bx2, by2]
|
| 343 |
+
candidates.append((expanded_box, d, gidx, crop_idx, x1, y1, x2, y2))
|
| 344 |
+
|
| 345 |
+
# 4) Dedup on EXPANDED boxes (before squarify), keep larger; then squarify only kept
|
| 346 |
+
def crop_area(box):
|
| 347 |
+
return (box[2] - box[0]) * (box[3] - box[1])
|
| 348 |
+
|
| 349 |
+
candidates.sort(key=lambda c: -crop_area(c[0]))
|
| 350 |
+
kept = []
|
| 351 |
+
for c in candidates:
|
| 352 |
+
expanded_box = c[0]
|
| 353 |
+
def is_same_object(box_a, box_b):
|
| 354 |
+
if box_iou(box_a, box_b) >= args.crop_dedup_iou:
|
| 355 |
+
return True
|
| 356 |
+
if box_center_inside(box_a, box_b) or box_center_inside(box_b, box_a):
|
| 357 |
+
return True
|
| 358 |
+
return False
|
| 359 |
+
if not any(is_same_object(expanded_box, k[0]) for k in kept):
|
| 360 |
+
kept.append(c)
|
| 361 |
+
|
| 362 |
+
# 5) Optionally squarify, then run Jina and Nomic only on kept crops
|
| 363 |
+
for i, (expanded_box, d, gidx, crop_idx, x1, y1, x2, y2) in enumerate(kept):
|
| 364 |
+
if not args.no_squarify:
|
| 365 |
+
bx1, by1, bx2, by2 = squarify_crop_box(
|
| 366 |
+
expanded_box[0], expanded_box[1], expanded_box[2], expanded_box[3], img_w, img_h
|
| 367 |
+
)
|
| 368 |
+
else:
|
| 369 |
+
bx1, by1, bx2, by2 = expanded_box[0], expanded_box[1], expanded_box[2], expanded_box[3]
|
| 370 |
+
crop_pil = pil.crop((bx1, by1, bx2, by2))
|
| 371 |
+
crop_name = f"{img_path.stem}_g{gidx}_{i}_{bx1}_{by1}_{bx2}_{by2}{img_path.suffix}"
|
| 372 |
+
|
| 373 |
+
q_jina = jina_encoder.encode_images([crop_pil], TRUNCATE_DIM)
|
| 374 |
+
result_jina = jina_classify(
|
| 375 |
+
q_jina, ref_labels, ref_embs,
|
| 376 |
+
args.conf_threshold, args.gap_threshold
|
| 377 |
+
)
|
| 378 |
+
if result_jina["prediction"] in ref_labels:
|
| 379 |
+
label_jina = result_jina["prediction"]
|
| 380 |
+
conf_jina = result_jina["confidence"]
|
| 381 |
+
else:
|
| 382 |
+
label_jina = f"unnamed (dfine: {d['label']})"
|
| 383 |
+
conf_jina = 0.0
|
| 384 |
+
ann_jina = draw_label_on_image(crop_pil, label_jina, conf_jina)
|
| 385 |
+
ann_jina.save(jina_crops_dir / crop_name)
|
| 386 |
+
|
| 387 |
+
q_nomic = nomic_encoder.encode_images([crop_pil])
|
| 388 |
+
result_nomic = jina_classify(
|
| 389 |
+
q_nomic, ref_labels_nomic, ref_embs_nomic,
|
| 390 |
+
args.conf_threshold, args.gap_threshold
|
| 391 |
+
)
|
| 392 |
+
if result_nomic["prediction"] in ref_labels_nomic:
|
| 393 |
+
label_nomic = result_nomic["prediction"]
|
| 394 |
+
conf_nomic = result_nomic["confidence"]
|
| 395 |
+
else:
|
| 396 |
+
label_nomic = f"unnamed (dfine: {d['label']})"
|
| 397 |
+
conf_nomic = 0.0
|
| 398 |
+
ann_nomic = draw_label_on_image(crop_pil, label_nomic, conf_nomic)
|
| 399 |
+
ann_nomic.save(nomic_crops_dir / crop_name)
|
| 400 |
+
|
| 401 |
+
w.writerow([
|
| 402 |
+
img_path.name, crop_name, gidx,
|
| 403 |
+
x1, y1, x2, y2,
|
| 404 |
+
bx1, by1, bx2, by2,
|
| 405 |
+
d["label"], f"{d['conf']:.4f}",
|
| 406 |
+
result_jina["prediction"], f"{result_jina['confidence']:.4f}", result_jina["status"],
|
| 407 |
+
result_nomic["prediction"], f"{result_nomic['confidence']:.4f}", result_nomic["status"],
|
| 408 |
+
])
|
| 409 |
+
|
| 410 |
+
f.close()
|
| 411 |
+
print(f"[*] Wrote {csv_path}")
|
| 412 |
+
print(f"[*] Jina crops: {jina_crops_dir}")
|
| 413 |
+
print(f"[*] Nomic crops: {nomic_crops_dir}")
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
# -----------------------------------------------------------------------------
|
| 417 |
+
# Single-image runner for Gradio app: D-FINE first, then Jina or Nomic (user choice)
|
| 418 |
+
# -----------------------------------------------------------------------------
|
| 419 |
+
# Lazy-initialized singletons for the Gradio app: heavy models and reference
# embeddings are loaded on first request and reused on subsequent calls.
_APP_DFINE = None        # (image_processor, dfine_model, person_car_ids)
_APP_JINA = None         # (jina_encoder, ref_labels, ref_embs)
_APP_NOMIC = None        # (nomic_encoder, ref_labels, ref_embs)
_APP_REFS_JINA = None    # refs dir path the cached Jina refs were built from
_APP_REFS_NOMIC = None   # refs dir path the cached Nomic refs were built from
|
| 424 |
+
|
| 425 |
+
|
| 426 |
+
def run_single_image(
    pil_image,
    refs_dir,
    device=None,
    encoder_choice="jina",
    det_threshold=0.3,
    conf_threshold=0.75,
    gap_threshold=0.05,
    min_side=40,
    crop_dedup_iou=0.35,
    squarify=True,
):
    """
    Run D-FINE on one image, then classify small-object crops with Jina or Nomic.

    Pipeline: detect all objects -> group person/car detections -> collect
    padded crops of the non-person/car detections inside those groups ->
    dedup overlapping crops -> classify each kept crop against the few-shot
    references using the chosen encoder.

    pil_image: PIL image or numpy array (converted to RGB).
    refs_dir: path to refs folder (str or Path). encoder_choice: "jina" or "nomic".
    device: torch device string; auto-selects "cuda" when available if None.
    det_threshold: D-FINE detection confidence cutoff.
    conf_threshold / gap_threshold: dual-threshold params for crop classification.
    min_side: minimum crop side length (pixels) after 30% padding.
    crop_dedup_iou: IoU at or above which two crops count as the same object.
    squarify: expand each kept crop to a square before classification.
    Returns (annotated_pil, result_text) for display in app; returns
    (None, error message) when refs_dir does not exist.
    """
    import numpy as np
    from PIL import Image
    # Module-level caches so repeated app calls reuse loaded models/refs.
    global _APP_DFINE, _APP_JINA, _APP_NOMIC, _APP_REFS_JINA, _APP_REFS_NOMIC
    refs_dir = Path(refs_dir)
    if not refs_dir.is_dir():
        return None, f"Refs folder not found: {refs_dir}"
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[*] Device: {device}")
    pil = pil_image.convert("RGB") if isinstance(pil_image, Image.Image) else Image.fromarray(pil_image).convert("RGB")
    img_w, img_h = pil.size
    # Grouping radius scales with image size (10% of the longer side).
    group_dist = 0.1 * max(img_h, img_w)

    # Load D-FINE once
    if _APP_DFINE is None:
        image_processor = AutoImageProcessor.from_pretrained("ustc-community/dfine-medium-obj365")
        dfine_model = DFineForObjectDetection.from_pretrained("ustc-community/dfine-medium-obj365")
        dfine_model = dfine_model.to(device).eval()
        person_car_ids = get_person_car_label_ids(dfine_model)
        _APP_DFINE = (image_processor, dfine_model, person_car_ids)
    image_processor, dfine_model, person_car_ids = _APP_DFINE

    # 1) Detect everything; keep only person/car detections for grouping.
    detections = run_dfine(pil, image_processor, dfine_model, device, det_threshold)
    person_car = [d for d in detections if d["cls"] in person_car_ids]
    if not person_car:
        return np.array(pil), "No person/car detected. No small-object crops."

    # 2) Group person/car boxes and keep the 10 most confident groups.
    grouped = group_detections(person_car, group_dist)
    grouped.sort(key=lambda x: x["conf"], reverse=True)
    top_groups = grouped[:10]
    # 3) Candidate crops: non-person/car detections whose center falls inside a group.
    candidates = []
    for gidx, grp in enumerate(top_groups):
        x1, y1, x2, y2 = grp["box"]
        group_box = [x1, y1, x2, y2]
        inside = [
            d for d in detections
            if box_center_inside(d["box"], group_box) and d["cls"] not in person_car_ids
        ]
        inside = deduplicate_by_iou(inside, iou_threshold=0.9)
        for crop_idx, d in enumerate(inside):
            bx1, by1, bx2, by2 = [float(x) for x in d["box"]]
            obj_w, obj_h = bx2 - bx1, by2 - by1
            if obj_w <= 0 or obj_h <= 0:
                continue
            # Pad the box by 30% on each axis, clamped to the image bounds.
            pad_x, pad_y = obj_w * 0.3, obj_h * 0.3
            bx1 = max(0, int(bx1 - pad_x))
            by1 = max(0, int(by1 - pad_y))
            bx2 = min(img_w, int(bx2 + pad_x))
            by2 = min(img_h, int(by2 + pad_y))
            if bx2 <= bx1 or by2 <= by1:
                continue
            if min(bx2 - bx1, by2 - by1) < min_side:
                continue
            expanded_box = [bx1, by1, bx2, by2]
            candidates.append((expanded_box, d, gidx, crop_idx))

    def crop_area(box):
        # Area of an [x1, y1, x2, y2] box.
        return (box[2] - box[0]) * (box[3] - box[1])

    # 4) Dedup on the expanded boxes, largest first (larger crop wins a tie).
    candidates.sort(key=lambda c: -crop_area(c[0]))
    kept = []
    for c in candidates:
        # NOTE: is_same_object is (re)defined each iteration; it closes over
        # crop_dedup_iou only, so behavior is identical across iterations.
        def is_same_object(box_a, box_b):
            if box_iou(box_a, box_b) >= crop_dedup_iou:
                return True
            if box_center_inside(box_a, box_b) or box_center_inside(box_b, box_a):
                return True
            return False
        if not any(is_same_object(c[0], k[0]) for k in kept):
            kept.append(c)

    if not kept:
        if not candidates:
            return np.array(pil), "No small-object crops: D-FINE did not detect any object (gun/phone/etc.) inside person/car areas, or all were below min size. Try a higher-resolution image."
        return np.array(pil), "No small-object crops (after dedup)."

    # Load encoder + refs for chosen model (cached per refs_dir path).
    if encoder_choice == "jina":
        if _APP_JINA is None or _APP_REFS_JINA != str(refs_dir):
            jina_encoder = JinaCLIPv2Encoder(device)
            ref_labels, ref_embs = build_refs(jina_encoder, refs_dir, TRUNCATE_DIM, 0.3, batch_size=16)
            _APP_JINA = (jina_encoder, ref_labels, ref_embs)
            _APP_REFS_JINA = str(refs_dir)
        jina_encoder, ref_labels, ref_embs = _APP_JINA
    else:
        if _APP_NOMIC is None or _APP_REFS_NOMIC != str(refs_dir):
            nomic_encoder = NomicVisionEncoder(device)
            nomic_text_encoder = NomicTextEncoder(device)
            ref_labels, ref_embs = build_refs_nomic(
                nomic_encoder, refs_dir, batch_size=16,
                text_encoder=nomic_text_encoder, text_weight=0.3,
            )
            _APP_NOMIC = (nomic_encoder, ref_labels, ref_embs)
            _APP_REFS_NOMIC = str(refs_dir)
        nomic_encoder, ref_labels, ref_embs = _APP_NOMIC

    # 5) Optionally squarify each kept crop, classify it, and paste the
    # labeled crop back onto a copy of the original image.
    lines = []
    out_img = pil.copy()
    for i, (expanded_box, d, gidx, crop_idx) in enumerate(kept):
        if squarify:
            bx1, by1, bx2, by2 = squarify_crop_box(
                expanded_box[0], expanded_box[1], expanded_box[2], expanded_box[3], img_w, img_h
            )
        else:
            bx1, by1, bx2, by2 = expanded_box[0], expanded_box[1], expanded_box[2], expanded_box[3]
        crop_pil = pil.crop((bx1, by1, bx2, by2))
        if encoder_choice == "jina":
            q = jina_encoder.encode_images([crop_pil], TRUNCATE_DIM)
            result = jina_classify(q, ref_labels, ref_embs, conf_threshold, gap_threshold)
        else:
            # Nomic embeddings are scored with the same dual-threshold classifier.
            q = nomic_encoder.encode_images([crop_pil])
            result = jina_classify(q, ref_labels, ref_embs, conf_threshold, gap_threshold)
        pred = result["prediction"] if result["prediction"] in ref_labels else f"unknown ({d['label']})"
        conf = result["confidence"]
        lines.append(f"Crop {i+1}: {pred} ({conf:.2f})")
        # draw_label_on_image adds a label bar, so the pasted crop is taller
        # than the region it came from; it may overlap neighboring content.
        labeled = draw_label_on_image(crop_pil, pred, conf)
        out_img.paste(labeled, (bx1, by1))

    result_text = "\n".join(lines) if lines else "No crops"
    return np.array(out_img), result_text
|
| 562 |
+
|
| 563 |
+
|
| 564 |
+
# CLI entry point: run the batch pipeline defined in main().
if __name__ == "__main__":
    main()
jina_fewshot.py
ADDED
|
@@ -0,0 +1,399 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Few-shot object classification using jina-clip-v2 (EVA02-L, 304M).
|
| 3 |
+
|
| 4 |
+
Combines IMAGE embeddings from reference photos + TEXT embeddings
|
| 5 |
+
from class names. Dual threshold: confidence + gap between top-1 and top-2.
|
| 6 |
+
|
| 7 |
+
Usage:
|
| 8 |
+
python jina_fewshot.py \
|
| 9 |
+
--refs refs/ \
|
| 10 |
+
--input crops/ \
|
| 11 |
+
--output results/ \
|
| 12 |
+
--text-weight 0.3 \
|
| 13 |
+
--conf-threshold 0.75 \
|
| 14 |
+
--gap-threshold 0.05
|
| 15 |
+
|
| 16 |
+
refs/ folder structure (3-10 images per class recommended):
|
| 17 |
+
refs/
|
| 18 |
+
├── cigarette/
|
| 19 |
+
├── gun/
|
| 20 |
+
├── knife/
|
| 21 |
+
├── phone/
|
| 22 |
+
└── nothing/ (empty hands, random objects)
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
import argparse
|
| 26 |
+
import csv
|
| 27 |
+
import json
|
| 28 |
+
import time
|
| 29 |
+
from pathlib import Path
|
| 30 |
+
|
| 31 |
+
import numpy as np
|
| 32 |
+
import torch
|
| 33 |
+
from PIL import Image, ImageDraw, ImageFont
|
| 34 |
+
from transformers import AutoModel
|
| 35 |
+
|
| 36 |
+
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".webp", ".tiff"}
|
| 37 |
+
TRUNCATE_DIM = 1024
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _to_numpy(embs):
|
| 41 |
+
"""Convert to numpy; Jina may return tensor on some code paths."""
|
| 42 |
+
if hasattr(embs, "cpu"):
|
| 43 |
+
embs = embs.cpu().float().numpy()
|
| 44 |
+
return np.asarray(embs, dtype=np.float64)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def draw_label_on_image(img: Image.Image, label: str, confidence: float) -> Image.Image:
    """Return a new image with a full-width black label bar stacked on top.

    The bar shows "<label> (<confidence>)" centered in white text; the
    original image is pasted unchanged below the bar.
    """
    img = img.convert("RGB")
    w, h = img.width, img.height
    text = f"{label} ({confidence:.2f})"
    margin = 8
    max_text_w = max(1, w - 2 * margin)

    font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
    try:
        font_size = max(10, min(h, w) // 12)
        font = ImageFont.truetype(font_path, size=font_size)
    except OSError:
        # DejaVu not installed: fall back to PIL's built-in bitmap font,
        # which has a fixed size (font_size=None disables the shrink loop).
        font = ImageFont.load_default()
        font_size = None

    # Measure the text on a throwaway 1x1 canvas.
    measurer = ImageDraw.Draw(Image.new("RGB", (1, 1)))
    box = measurer.textbbox((0, 0), text, font=font)
    tw, th = box[2] - box[0], box[3] - box[1]
    # Shrink the truetype font in steps of 2 until the text fits the width.
    if font_size is not None:
        while tw > max_text_w and font_size > 8:
            font_size = max(8, font_size - 2)
            font = ImageFont.truetype(font_path, size=font_size)
            box = measurer.textbbox((0, 0), text, font=font)
            tw, th = box[2] - box[0], box[3] - box[1]
    bar_height = th + 2 * margin

    # Compose: black bar with centered white text on top, image below.
    canvas = Image.new("RGB", (w, bar_height + h), color=(255, 255, 255))
    painter = ImageDraw.Draw(canvas)
    painter.rectangle([0, 0, w, bar_height], fill=(0, 0, 0))
    painter.text(((w - tw) // 2, margin), text, fill=(255, 255, 255), font=font)
    canvas.paste(img, (0, bar_height))
    return canvas
|
| 85 |
+
|
| 86 |
+
# Per-class text prompts. These are encoded and averaged into each class
# prototype alongside the reference-image embeddings (weighted by
# --text-weight); classes without an entry fall back to generic
# "a {name}" / "a person holding a {name}" prompts.
CLASS_PROMPTS = {
    "knife": [
        "a knife",
        "a person holding a knife",
        "a sharp blade knife",
    ],
    "gun": [
        "a gun",
        "a pistol",
        "a handgun",
        "a person holding a gun",
        "a person holding a pistol",
        "a firearm weapon",
    ],
    "cigarette": [
        "a cigarette",
        "a person smoking a cigarette",
        "a lit cigarette in hand",
    ],
    "phone": [
        "a phone",
        "a person holding a smartphone",
        "a mobile phone cell phone",
    ],
    "nothing": [
        "a person with empty hands",
        "a person standing with no objects",
        "empty hands no weapon",
    ],
}
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def parse_args():
    """Define and parse the CLI arguments for the few-shot classifier."""
    parser = argparse.ArgumentParser(description="Jina-CLIP-v2 few-shot classifier")
    # Required input/output locations.
    parser.add_argument("--refs", required=True, help="Reference images folder")
    parser.add_argument("--input", required=True, help="Query crop images folder")
    parser.add_argument("--output", default="jinaclip_results", help="Output folder")
    # Embedding and scoring knobs.
    parser.add_argument("--dim", type=int, default=TRUNCATE_DIM,
                        help="Embedding dim (64-1024)")
    parser.add_argument("--text-weight", type=float, default=0.3,
                        help="Text embedding weight (0.0=image only, default 0.3)")
    parser.add_argument("--conf-threshold", type=float, default=0.75,
                        help="Min confidence to accept prediction (default 0.75)")
    parser.add_argument("--gap-threshold", type=float, default=0.05,
                        help="Min gap between top-1 and top-2 (default 0.05)")
    parser.add_argument("--batch-size", type=int, default=16)
    parser.add_argument("--save-refs", action="store_true",
                        help="Save reference embeddings to .npy for fast reload")
    return parser.parse_args()
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
class JinaCLIPv2Encoder:
    """Wrapper around jinaai/jina-clip-v2 for image/text embeddings.

    All encode_* methods return L2-normalized float32 numpy arrays with
    Matryoshka truncation to `dim`, so cosine similarity reduces to a dot
    product. NaN/inf values from the model are zeroed defensively.
    """

    def __init__(self, device="cuda"):
        """Load the model once and move it to `device` ("cuda" or "cpu")."""
        self.device = device
        print("[*] Loading jina-clip-v2...")
        t0 = time.perf_counter()
        # On HF Spaces, accelerate is pre-installed and transformers uses its meta-device
        # context during init, so set_default_device("cpu") is overridden. Jina's
        # eva_model.py:606 does torch.linspace(0, drop_path_rate, depth).item() and
        # crashes on meta tensors. Monkey-patch linspace to force device="cpu" for init.
        _orig_linspace = torch.linspace

        def _safe_linspace(*args, **kwargs):
            kwargs.pop("device", None)
            return _orig_linspace(*args, **kwargs, device="cpu")

        torch.linspace = _safe_linspace
        try:
            self.model = AutoModel.from_pretrained(
                "jinaai/jina-clip-v2",
                trust_remote_code=True,
                low_cpu_mem_usage=False,
                revision="main",
            )
        finally:
            # Always restore the real torch.linspace, even if loading fails.
            torch.linspace = _orig_linspace
        self.model = self.model.to(device).eval()
        if device == "cpu":
            # Half-precision weights on CPU can yield zeros/NaNs; force fp32.
            self.model = self.model.float()
        print(f"[*] Loaded in {time.perf_counter() - t0:.1f}s (device={device})\n")

    def encode_images(self, images: list[Image.Image], dim: int = TRUNCATE_DIM) -> np.ndarray:
        """Encode PIL images -> (N, dim) L2-normalized float32 embeddings."""
        rgb = [img.convert("RGB") for img in images]
        with torch.no_grad():
            embs = self.model.encode_image(rgb, truncate_dim=dim)
        embs = _to_numpy(embs)
        embs = np.nan_to_num(embs, nan=0.0, posinf=0.0, neginf=0.0)
        norms = np.linalg.norm(embs, axis=-1, keepdims=True)
        norms = np.maximum(norms, 1e-12)  # guard against division by zero
        return (embs / norms).astype(np.float32)

    def encode_texts(self, texts: list[str], dim: int = TRUNCATE_DIM) -> np.ndarray:
        """Encode text strings -> (N, dim) L2-normalized float32 embeddings."""
        with torch.no_grad():
            embs = self.model.encode_text(texts, truncate_dim=dim)
        embs = _to_numpy(embs)
        embs = np.nan_to_num(embs, nan=0.0, posinf=0.0, neginf=0.0)
        norms = np.linalg.norm(embs, axis=-1, keepdims=True)
        norms = np.maximum(norms, 1e-12)  # guard against division by zero
        return (embs / norms).astype(np.float32)

    def encode_image_paths(self, paths: list[str], dim: int = TRUNCATE_DIM,
                           batch_size: int = 16) -> np.ndarray:
        """Encode image files in batches -> (N, dim) embeddings.

        Fix: the original opened files with a bare list comprehension and
        never closed them, leaking file descriptors. Each file is now opened
        in a context manager; convert("RGB") copies the pixel data, so the
        resulting image remains usable after the file is closed.
        """
        all_embs = []
        for i in range(0, len(paths), batch_size):
            batch = []
            for p in paths[i:i + batch_size]:
                with Image.open(p) as im:
                    batch.append(im.convert("RGB"))
            all_embs.append(self.encode_images(batch, dim))
        return np.concatenate(all_embs, axis=0)
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def build_refs(encoder: JinaCLIPv2Encoder, refs_dir: Path,
               dim: int, text_weight: float, batch_size: int):
    """Build one prototype embedding per class subfolder of refs_dir.

    Each prototype is (1 - text_weight) * mean(image embeddings)
    + text_weight * mean(text-prompt embeddings), re-normalized to unit
    length. Returns (labels, (C, dim) stacked embedding matrix).
    Raises ValueError when refs_dir contains no class subfolders.
    NOTE(review): if every class folder is empty, labels stays empty and the
    final np.stack([]) raises — assumes at least one class has images.
    """
    class_dirs = sorted(d for d in refs_dir.iterdir() if d.is_dir())
    if not class_dirs:
        raise ValueError(f"No subfolders in {refs_dir}")

    labels, embeddings = [], []
    _device = getattr(encoder, "device", "?")
    print(f" Device: {_device} | Text weight: {text_weight:.1f} | Image weight: {1 - text_weight:.1f}\n")
    if _device == "cpu":
        print(" [WARNING] Jina is on CPU. Ref embeddings are often all zeros on CPU. Use a Space with GPU (e.g. T4) for D-FINE + Classify.\n")

    for d in class_dirs:
        name = d.name
        paths = sorted(str(p) for p in d.iterdir() if p.suffix.lower() in IMAGE_EXTS)
        if not paths:
            # Skip classes that have no usable reference images.
            continue

        # Image embeddings: mean of all reference photos for this class.
        img_embs = encoder.encode_image_paths(paths, dim, batch_size)
        img_avg = np.nan_to_num(img_embs.mean(axis=0), nan=0.0, posinf=0.0, neginf=0.0)

        # Text embeddings: class-specific prompts, or generic fallbacks.
        prompts = CLASS_PROMPTS.get(name, [f"a {name}", f"a person holding a {name}"])
        text_embs = encoder.encode_texts(prompts, dim)
        text_avg = np.nan_to_num(text_embs.mean(axis=0), nan=0.0, posinf=0.0, neginf=0.0)

        # Combine image and text prototypes, then re-normalize to unit length.
        combined = (1.0 - text_weight) * img_avg + text_weight * text_avg
        combined = np.nan_to_num(combined, nan=0.0, posinf=0.0, neginf=0.0)
        combined = combined / (np.linalg.norm(combined) + 1e-12)

        labels.append(name)
        embeddings.append(combined)

        # Diagnostic: cosine similarity between image and text prototypes.
        img_norm = img_avg / (np.linalg.norm(img_avg) + 1e-12)
        text_norm = text_avg / (np.linalg.norm(text_avg) + 1e-12)
        sim = float(np.nan_to_num(np.dot(img_norm, text_norm), nan=0.0))
        print(f" {name:<14}: {len(paths)} imgs + {len(prompts)} prompts | "
              f"img-text sim: {sim:.4f}")
    # All-zero prototypes indicate the encoder silently failed (observed on CPU).
    if labels and np.allclose(np.stack(embeddings), 0.0):
        print("\n [WARNING] All ref embeddings are zero. Jina-CLIP often returns zeros on CPU. "
              "Use a Space with GPU (e.g. T4) for D-FINE + Classify to work correctly.")

    return labels, np.stack(embeddings)
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
def classify(query_emb: np.ndarray, ref_labels: list[str], ref_embs: np.ndarray,
             conf_threshold: float, gap_threshold: float) -> dict:
    """Classify one query embedding against class prototypes with a dual threshold.

    A prediction is accepted only when the top-1 cosine similarity is at
    least conf_threshold AND the margin (top-1 minus top-2) is at least
    gap_threshold; otherwise the prediction is "unknown".

    query_emb: (1, D) normalized embedding. ref_labels / ref_embs: C labels
    and a (C, D) normalized prototype matrix.
    Returns a dict with prediction, raw_prediction, confidence, gap,
    runner-up info, a status string, and per-class similarities.

    Fix: the original indexed sorted_idx[1] unconditionally and raised
    IndexError for a single-class reference set; with one class the runner-up
    is now reported as ("", 0.0) and the gap equals the top confidence.
    """
    sims = (query_emb @ ref_embs.T).squeeze(0)
    sims = np.nan_to_num(sims.astype(np.float64), nan=0.0, posinf=0.0, neginf=0.0)
    sims = np.atleast_1d(sims)  # keep indexing valid when C == 1
    sorted_idx = np.argsort(sims)[::-1]

    best_idx = sorted_idx[0]
    conf = float(sims[best_idx])
    if len(sorted_idx) > 1:
        second_idx = sorted_idx[1]
        second_label = ref_labels[second_idx]
        second_conf = float(sims[second_idx])
    else:
        # Single-class reference set: no runner-up exists.
        second_label = ""
        second_conf = 0.0
    gap = conf - second_conf

    # Dual threshold
    conf_ok = conf >= conf_threshold
    gap_ok = gap >= gap_threshold

    if conf_ok and gap_ok:
        prediction = ref_labels[best_idx]
        status = "accepted"
    else:
        prediction = "unknown"
        reasons = []
        if not conf_ok:
            reasons.append(f"conf {conf:.4f} < {conf_threshold}")
        if not gap_ok:
            reasons.append(f"gap {gap:.4f} < {gap_threshold}")
        status = "rejected: " + ", ".join(reasons)

    return {
        "prediction": prediction,
        "raw_prediction": ref_labels[best_idx],
        "confidence": conf,
        "gap": gap,
        "second_best": second_label,
        "second_conf": second_conf,
        "status": status,
        "all_sims": {ref_labels[j]: float(sims[j]) for j in range(len(ref_labels))},
    }
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
def main():
    """CLI entry point: build class refs, classify every crop in --input,
    write classifications.csv and annotated images to --output, and print a
    per-image table plus a summary.

    Fixes vs. original: CSV is written through a context manager (the bare
    open()/close() pair leaked the handle on any exception), query images are
    closed promptly, and the encoder falls back to CPU instead of crashing
    when CUDA is unavailable.
    """
    args = parse_args()
    input_dir, output_dir = Path(args.input), Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    paths = sorted(p for p in input_dir.iterdir() if p.suffix.lower() in IMAGE_EXTS)
    if not paths:
        print(f"[!] No images in {input_dir}")
        return

    print(f"[*] {len(paths)} query images")
    print(f"[*] Conf threshold: {args.conf_threshold} | Gap threshold: {args.gap_threshold}\n")

    # Fall back to CPU when CUDA is unavailable (original hard-coded "cuda").
    encoder = JinaCLIPv2Encoder("cuda" if torch.cuda.is_available() else "cpu")

    # Build references
    print("[*] Building references...")
    ref_labels, ref_embs = build_refs(
        encoder, Path(args.refs), args.dim, args.text_weight, args.batch_size
    )
    print(f"\n[*] {len(ref_labels)} classes: {ref_labels}\n")

    # Save refs if requested (fast reload by downstream tools)
    if args.save_refs:
        np.save(output_dir / "ref_embeddings.npy", ref_embs)
        with open(output_dir / "ref_labels.json", "w") as jf:
            json.dump(ref_labels, jf)
        print(f"[*] Saved refs to {output_dir}\n")

    # CSV — context manager guarantees the file is closed even on errors.
    csv_path = output_dir / "classifications.csv"
    with open(csv_path, "w", newline="") as f:
        w = csv.writer(f)
        w.writerow(["image", "prediction", "raw_prediction", "confidence", "gap",
                    "second_best", "second_conf", "status"] +
                   [f"sim_{l}" for l in ref_labels] + ["time_ms"])

        # Stats
        times = []
        counts = {"unknown": 0}
        for l in ref_labels:
            counts[l] = 0
        accepted, rejected = 0, 0

        # Header for the per-image console table
        hdr = " ".join(f"{l:>10}" for l in ref_labels)
        print(f"{'Image':<30} {'Result':<10} {'Conf':>6} {'Gap':>6} {hdr} {'Status'}")
        print("=" * (30 + 10 + 14 + len(hdr) + 40))

        # Classify each crop
        for p in paths:
            t0 = time.perf_counter()
            with Image.open(p) as img:
                q = encoder.encode_images([img], args.dim)
                ms = (time.perf_counter() - t0) * 1000
                times.append(ms)

                result = classify(q, ref_labels, ref_embs, args.conf_threshold, args.gap_threshold)
                counts[result["prediction"]] += 1

                if result["prediction"] != "unknown":
                    accepted += 1
                else:
                    rejected += 1

                # Draw label on image and save to output folder
                annotated = draw_label_on_image(img, result["prediction"], result["confidence"])
            out_path = output_dir / p.name
            annotated.save(out_path)

            sim_str = " ".join(f"{result['all_sims'][l]:>10.4f}" for l in ref_labels)
            print(f"{p.name:<30} {result['prediction']:<10} "
                  f"{result['confidence']:>6.4f} {result['gap']:>6.4f} "
                  f"{sim_str} {result['status']}")

            w.writerow([
                p.name,
                result["prediction"],
                result["raw_prediction"],
                f"{result['confidence']:.4f}",
                f"{result['gap']:.4f}",
                result["second_best"],
                f"{result['second_conf']:.4f}",
                result["status"],
            ] + [f"{result['all_sims'][l]:.4f}" for l in ref_labels] +
                [f"{ms:.1f}"])

    # Summary
    n = len(times)
    total = sum(times)
    print(f"\n{'='*70}")
    print("SUMMARY")
    print(f"{'='*70}")
    print(f" Model : jina-clip-v2 (EVA02-L, 304M, CLS pooling)")
    print(f" Embed dim : {args.dim}")
    print(f" Text weight : {args.text_weight}")
    print(f" Conf threshold : {args.conf_threshold}")
    print(f" Gap threshold : {args.gap_threshold}")
    print(f" Images : {n}")
    if n:
        print(f" Accepted : {accepted} ({accepted/n*100:.1f}%)")
        print(f" Rejected : {rejected} ({rejected/n*100:.1f}%)")
    print(f" ──────────────────────────────────────────")
    for l in ref_labels + ["unknown"]:
        c = counts.get(l, 0)
        pct = (c / n * 100) if n else 0
        print(f" {l:<14}: {c:>4} ({pct:.1f}%)")
    print(f" ──────────────────────────────────────────")
    if n:
        print(f" Total : {total:.0f}ms ({total/1000:.2f}s)")
        print(f" Avg/image : {total/n:.1f}ms")
        print(f" Throughput : {n/(total/1000):.1f} img/s")
    print(f" CSV : {csv_path}")
    print(f" Annotated imgs : {output_dir}")
    print(f"{'='*70}")
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
# Allow running this module directly as a CLI script.
if __name__ == "__main__":
    main()
|
models/README.md
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Model versions
|
| 2 |
+
|
| 3 |
+
- **v1** — `v1/best.pt` (current)
|
| 4 |
+
- **v2** — add `v2/best.pt` and a new tab in `app.py` when ready
|
| 5 |
+
|
| 6 |
+
Each version folder should contain `best.pt` (or the weights file used by the app).
|
models/v1/best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7ce9c6f1f6193256572eae61176c486e52a11aa7f5885778aa8f3a445e04d1e5
|
| 3 |
+
size 44256473
|
nomic_fewshot.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Few-shot object classification using Nomic embed-vision-v1.5 + embed-text-v1.5.
|
| 3 |
+
|
| 4 |
+
Same treatment as Jina: image refs + text prompts, combined with text_weight (default 0.3).
|
| 5 |
+
Used by dfine_jina_pipeline.py and tune_thresholds.py for Nomic crop classification.
|
| 6 |
+
"""
|
| 7 |
+
import time
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
import numpy as np
|
| 11 |
+
import torch
|
| 12 |
+
import torch.nn.functional as F
|
| 13 |
+
from PIL import Image
|
| 14 |
+
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer
|
| 15 |
+
from transformers import modeling_utils
|
| 16 |
+
|
| 17 |
+
from jina_fewshot import CLASS_PROMPTS, IMAGE_EXTS
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _patch_tied_weights_for_nomic():
    """Shim ``PreTrainedModel.mark_tied_weights_as_initialized`` so Nomic models load.

    NomicVisionModel (loaded via ``trust_remote_code``) still sets the legacy
    ``_tied_weights_keys`` attribute, while newer transformers versions expect
    ``all_tied_weights_keys``. The patched method backfills the new attribute from
    the legacy one (defaulting to ``{}``) before delegating to the original.

    No-ops on older transformers where the method does not exist, and is
    idempotent: repeated calls (one per encoder instantiation) no longer stack
    additional wrappers around the previous patch.
    """
    if not hasattr(modeling_utils.PreTrainedModel, "mark_tied_weights_as_initialized"):
        return
    # Guard against double-patching: without this, every NomicVisionEncoder
    # __init__ would wrap the already-wrapped method again, growing the call
    # chain on each instantiation.
    current = modeling_utils.PreTrainedModel.mark_tied_weights_as_initialized
    if getattr(current, "_nomic_tied_weights_patch", False):
        return
    _orig = current

    def _patched(self, loading_info):
        if not hasattr(self, "all_tied_weights_keys"):
            # Fall back to the legacy attribute; an empty dict means "no tied weights".
            self.all_tied_weights_keys = getattr(self, "_tied_weights_keys", None) or {}
        return _orig(self, loading_info)

    _patched._nomic_tied_weights_patch = True  # sentinel checked above
    modeling_utils.PreTrainedModel.mark_tied_weights_as_initialized = _patched
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _nomic_mean_pool(last_hidden_state, attention_mask):
|
| 36 |
+
mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
|
| 37 |
+
return torch.sum(last_hidden_state * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class NomicTextEncoder:
    """Nomic embed-text-v1.5: text → normalized embedding (aligned to vision space)."""

    def __init__(self, device="cuda"):
        """Load tokenizer and model from the Hub and move the model to `device`.

        The model is loaded with the default torch device forced to CPU (see
        inline comment) and put in eval mode; `device` is where inference runs.
        """
        self.device = device
        print("[*] Loading nomic-embed-text-v1.5...")
        t0 = time.perf_counter()
        self.tokenizer = AutoTokenizer.from_pretrained("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
        # Force CPU as the default device during from_pretrained so the
        # trust_remote_code model materializes on CPU before the explicit
        # .to(device) below.
        if hasattr(torch, "set_default_device"):
            torch.set_default_device("cpu")
        try:
            self.model = AutoModel.from_pretrained(
                "nomic-ai/nomic-embed-text-v1.5",
                trust_remote_code=True,
                # low_cpu_mem_usage=False avoids meta-device init paths that the
                # remote-code model may not support.
                low_cpu_mem_usage=False,
            )
        finally:
            # NOTE(review): this re-sets the default to "cpu" rather than
            # restoring whatever it was before the try — presumably intentional
            # (keep CPU as the process default); confirm if a CUDA default is
            # ever expected elsewhere.
            if hasattr(torch, "set_default_device"):
                torch.set_default_device("cpu")
        self.model = self.model.to(device).eval()
        print(f"[*] Loaded in {time.perf_counter() - t0:.1f}s\n")

    def encode_texts(self, texts: list[str]) -> np.ndarray:
        """Embed `texts` and return L2-normalized float32 embeddings, shape (len(texts), dim).

        Each text is prefixed with "classification: " — the task prefix the
        Nomic text model expects for classification-style embeddings.
        """
        prefixed = [f"classification: {t}" for t in texts]
        inputs = self.tokenizer(prefixed, padding=True, truncation=True, return_tensors="pt", max_length=512)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            out = self.model(**inputs)
            # Masked mean-pool over tokens, then unit-normalize rows.
            embs = _nomic_mean_pool(out.last_hidden_state, inputs["attention_mask"])
            embs = F.normalize(embs, p=2, dim=1)
        return embs.cpu().float().numpy()
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class NomicVisionEncoder:
    """Nomic embed-vision-v1.5: image → normalized CLS embedding."""

    def __init__(self, device="cuda"):
        """Load image processor and model from the Hub and move the model to `device`.

        Applies the tied-weights compatibility patch before loading (the
        remote-code vision model needs it on newer transformers), and forces
        CPU as the default torch device during from_pretrained.
        """
        self.device = device
        print("[*] Loading nomic-embed-vision-v1.5...")
        t0 = time.perf_counter()
        self.processor = AutoImageProcessor.from_pretrained("nomic-ai/nomic-embed-vision-v1.5")
        # Compatibility shim for newer transformers (see helper's docstring).
        _patch_tied_weights_for_nomic()
        # Force CPU as the default device while materializing the model; the
        # explicit .to(device) below moves it afterwards.
        if hasattr(torch, "set_default_device"):
            torch.set_default_device("cpu")
        try:
            self.model = AutoModel.from_pretrained(
                "nomic-ai/nomic-embed-vision-v1.5",
                trust_remote_code=True,
                # low_cpu_mem_usage=False avoids meta-device init paths that the
                # remote-code model may not support.
                low_cpu_mem_usage=False,
            )
        finally:
            # NOTE(review): re-sets the default to "cpu" rather than restoring
            # the prior default — presumably intentional; mirrors NomicTextEncoder.
            if hasattr(torch, "set_default_device"):
                torch.set_default_device("cpu")
        self.model = self.model.to(device).eval()
        print(f"[*] Loaded in {time.perf_counter() - t0:.1f}s\n")

    def encode_images(self, images: list) -> np.ndarray:
        """Encode images to L2-normalized embeddings (CLS token).

        `images` is a list of PIL images (or anything the processor accepts);
        returns a float32 array of shape (len(images), dim).
        """
        inputs = self.processor(images=images, return_tensors="pt")
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            out = self.model(**inputs).last_hidden_state
            # CLS token, then normalize
            embs = F.normalize(out[:, 0], p=2, dim=1)
        return embs.cpu().float().numpy()
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def build_refs_nomic(
    encoder: NomicVisionEncoder,
    refs_dir: Path,
    batch_size: int = 16,
    text_encoder: NomicTextEncoder | None = None,
    text_weight: float = 0.3,
):
    """Build one reference embedding per class subfolder of `refs_dir`.

    Same treatment as Jina: the per-class embedding is the mean of the image
    reference embeddings, optionally blended with the mean of text-prompt
    embeddings at `text_weight` (image weight is ``1 - text_weight``), then
    re-normalized to unit length.

    Args:
        encoder: vision encoder used for the reference crops.
        refs_dir: directory whose immediate subfolders are class names, each
            containing reference images (extensions in IMAGE_EXTS).
        batch_size: images encoded per forward pass.
        text_encoder: optional text encoder; when given, class prompts from
            CLASS_PROMPTS (or generic fallbacks) are blended in.
        text_weight: blend weight for the text embedding, default 0.3.

    Returns:
        (labels, embeddings): list of class names and a (num_classes, dim)
        array of unit-norm embeddings, in the same order.

    Raises:
        ValueError: if `refs_dir` has no subfolders, or no subfolder contains
            any usable reference image.
    """
    class_dirs = sorted(d for d in refs_dir.iterdir() if d.is_dir())
    if not class_dirs:
        raise ValueError(f"No subfolders in {refs_dir}")
    labels = []
    embeddings = []
    if text_encoder is not None:
        print(f" Text weight: {text_weight:.1f} | Image weight: {1 - text_weight:.1f}\n")
    for d in class_dirs:
        name = d.name
        paths = sorted(str(p) for p in d.iterdir() if p.suffix.lower() in IMAGE_EXTS)
        if not paths:
            continue  # class folder with no usable images — skip it
        all_embs = []
        for i in range(0, len(paths), batch_size):
            batch = [Image.open(p).convert("RGB") for p in paths[i : i + batch_size]]
            all_embs.append(encoder.encode_images(batch))
        img_avg = np.concatenate(all_embs, axis=0).mean(axis=0)
        if text_encoder is not None:
            prompts = CLASS_PROMPTS.get(name, [f"a {name}", f"a person holding a {name}"])
            text_avg = text_encoder.encode_texts(prompts).mean(axis=0)
            combined = (1.0 - text_weight) * img_avg + text_weight * text_avg
            print(f" {name:<14}: {len(paths)} imgs + {len(prompts)} prompts")
        else:
            combined = img_avg
            print(f" {name:<14}: {len(paths)} imgs")
        # Means of unit vectors are not unit-length; re-normalize (epsilon
        # guards against a zero vector).
        combined = combined / (np.linalg.norm(combined) + 1e-12)
        labels.append(name)
        embeddings.append(combined)
    if not embeddings:
        # Previously np.stack([]) raised an opaque "need at least one array"
        # error; fail with an actionable message instead.
        raise ValueError(f"No reference images found in any subfolder of {refs_dir}")
    return labels, np.stack(embeddings)
|
refs/cigarette/c2.png
ADDED
|
Git LFS Details
|
refs/cigarette/c3.png
ADDED
|
Git LFS Details
|
refs/cigarette/c4.png
ADDED
|
Git LFS Details
|
refs/cigarette/c5.png
ADDED
|
Git LFS Details
|
refs/cigarette/c6.png
ADDED
|
Git LFS Details
|
refs/cigarette/c7.png
ADDED
|
Git LFS Details
|
refs/cigarette/c9.png
ADDED
|
Git LFS Details
|
refs/cigarette/cigarette.jpg
ADDED
|
Git LFS Details
|
refs/gun/g1.png
ADDED
|
Git LFS Details
|
refs/gun/g2.png
ADDED
|
Git LFS Details
|
refs/gun/g3.png
ADDED
|
Git LFS Details
|
refs/gun/g4.png
ADDED
|
Git LFS Details
|
refs/gun/g5.png
ADDED
|
Git LFS Details
|
refs/gun/g6.png
ADDED
|
Git LFS Details
|
refs/gun/g7.png
ADDED
|
Git LFS Details
|
refs/gun/g8.png
ADDED
|
Git LFS Details
|
refs/gun/g9.png
ADDED
|
Git LFS Details
|
refs/gun/pistol.jpeg
ADDED
|
Git LFS Details
|
refs/knife/k1.png
ADDED
|
Git LFS Details
|
refs/knife/k2.png
ADDED
|
Git LFS Details
|
refs/knife/k3.png
ADDED
|
Git LFS Details
|
refs/knife/k4.png
ADDED
|
Git LFS Details
|
refs/knife/k5.png
ADDED
|
Git LFS Details
|
refs/knife/k6.png
ADDED
|
Git LFS Details
|
refs/knife/k7.png
ADDED
|
Git LFS Details
|
refs/knife/k8.png
ADDED
|
Git LFS Details
|
refs/knife/k9.png
ADDED
|
Git LFS Details
|
refs/knife/knife.jpeg
ADDED
|
Git LFS Details
|
refs/phone/p1.png
ADDED
|
Git LFS Details
|
refs/phone/p2.png
ADDED
|
Git LFS Details
|
refs/phone/p3.png
ADDED
|
Git LFS Details
|
refs/phone/p4.png
ADDED
|
Git LFS Details
|
refs/phone/p5.png
ADDED
|
Git LFS Details
|
refs/phone/p6.png
ADDED
|
Git LFS Details
|
refs/phone/p7.png
ADDED
|
Git LFS Details
|
refs/phone/p8.png
ADDED
|
Git LFS Details
|
refs/phone/p9.jpg
ADDED
|
Git LFS Details
|
refs/phone/phone.jpg
ADDED
|
Git LFS Details
|
requirements-lock.txt
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Pinned versions for Docker so Space matches local. Regenerate from your venv with:
|
| 2 |
+
# pip freeze > requirements-lock.txt
|
| 3 |
+
# Then rebuild the Docker image.
|
| 4 |
+
gradio==6.0.0
|
| 5 |
+
ultralytics==8.3.0
|
| 6 |
+
torch==2.2.2
|
| 7 |
+
torchvision==0.17.2
|
| 8 |
+
transformers==4.44.2
|
| 9 |
+
accelerate==0.33.0
|
| 10 |
+
pillow>=9.0.0
|
| 11 |
+
numpy>=1.24.0
|
| 12 |
+
huggingface_hub>=0.20.0
|
| 13 |
+
matplotlib>=3.5.0
|
| 14 |
+
requests>=2.28.0
|
| 15 |
+
einops>=0.7.0
|
| 16 |
+
timm>=0.9.0
|
| 17 |
+
sentencepiece>=0.1.99
|