Napron commited on
Commit
d6dee9e
·
verified ·
1 Parent(s): cd64594

Update dfine_jina_pipeline.py

Browse files
Files changed (1) hide show
  1. dfine_jina_pipeline.py +327 -379
dfine_jina_pipeline.py CHANGED
@@ -1,8 +1,6 @@
1
- """
2
- Pipeline: D-FINE (person/car only)group detections crop group regions
3
- classify all inner object detections with Jina-CLIP-v2 or Nomic.
4
  Outputs separate crop folders per model (jina_crops, nomic_crops) for visual comparison.
5
- Each saved image is the D-FINE group crop, with bboxes drawn only for known classes.
6
  """
7
 
8
  import argparse
@@ -12,7 +10,8 @@ from pathlib import Path
12
 
13
  import numpy as np
14
  import torch
15
- from PIL import Image, ImageDraw, ImageFont
 
16
  from transformers import AutoImageProcessor, DFineForObjectDetection
17
 
18
  # Jina-CLIP-v2 few-shot (same refs + classify as jina_fewshot.py)
@@ -22,16 +21,16 @@ from jina_fewshot import (
22
  JinaCLIPv2Encoder,
23
  build_refs,
24
  classify as jina_classify,
 
25
  )
26
 
27
  from nomic_fewshot import NomicTextEncoder, NomicVisionEncoder, build_refs_nomic
28
 
29
 
30
  # -----------------------------------------------------------------------------
31
- # Detection + grouping
32
  # -----------------------------------------------------------------------------
33
 
34
-
35
  def get_box_dist(box1, box2):
36
  """Euclidean distance between box centers. box = [x1, y1, x2, y2]."""
37
  c1 = np.array([(box1[0] + box1[2]) / 2, (box1[1] + box1[3]) / 2])
@@ -42,6 +41,7 @@ def get_box_dist(box1, box2):
42
  def group_detections(detections, threshold):
43
  """
44
  Group detections by proximity (center distance < threshold).
 
45
  detections: list of {"box": [x1,y1,x2,y2], "conf", "cls", ...}
46
  Returns list of {"box": merged [x1,y1,x2,y2], "conf": best in group, "cls": best in group}.
47
  """
@@ -51,6 +51,7 @@ def group_detections(detections, threshold):
51
  boxes = [d["box"] for d in detections]
52
  n = len(boxes)
53
  adj = {i: [] for i in range(n)}
 
54
  for i in range(n):
55
  for j in range(i + 1, n):
56
  if get_box_dist(boxes[i], boxes[j]) < threshold:
@@ -59,14 +60,17 @@ def group_detections(detections, threshold):
59
 
60
  groups = []
61
  visited = [False] * n
 
62
  for i in range(n):
63
  if not visited[i]:
64
  group_indices = []
65
  stack = [i]
66
  visited[i] = True
 
67
  while stack:
68
  curr = stack.pop()
69
  group_indices.append(curr)
 
70
  for neighbor in adj[curr]:
71
  if not visited[neighbor]:
72
  visited[neighbor] = True
@@ -77,14 +81,15 @@ def group_detections(detections, threshold):
77
  y1 = min(d["box"][1] for d in group_dets)
78
  x2 = max(d["box"][2] for d in group_dets)
79
  y2 = max(d["box"][3] for d in group_dets)
80
- best_det = max(group_dets, key=lambda x: x["conf"])
81
 
 
82
  groups.append({
83
  "box": [x1, y1, x2, y2],
84
  "conf": best_det["conf"],
85
  "cls": best_det["cls"],
86
  "label": best_det.get("label", str(best_det["cls"])),
87
  })
 
88
  return groups
89
 
90
 
@@ -92,7 +97,10 @@ def box_center_inside(box, crop_box):
92
  """True if center of box is inside crop_box. All [x1,y1,x2,y2]."""
93
  cx = (box[0] + box[2]) / 2
94
  cy = (box[1] + box[3]) / 2
95
- return crop_box[0] <= cx <= crop_box[2] and crop_box[1] <= cy <= crop_box[3]
 
 
 
96
 
97
 
98
  def squarify_crop_box(bx1, by1, bx2, by2, img_w, img_h):
@@ -104,8 +112,10 @@ def squarify_crop_box(bx1, by1, bx2, by2, img_w, img_h):
104
  orig = (int(bx1), int(by1), int(bx2), int(by2))
105
  w = bx2 - bx1
106
  h = by2 - by1
 
107
  if w <= 0 or h <= 0:
108
  return orig
 
109
  if h > w:
110
  add = (h - w) / 2.0
111
  bx1 = max(0, bx1 - add)
@@ -114,9 +124,12 @@ def squarify_crop_box(bx1, by1, bx2, by2, img_w, img_h):
114
  add = (w - h) / 2.0
115
  by1 = max(0, by1 - add)
116
  by2 = min(img_h, by2 + add)
 
117
  bx1, by1, bx2, by2 = int(bx1), int(by1), int(bx2), int(by2)
 
118
  if bx2 <= bx1 or by2 <= by1:
119
  return orig
 
120
  return bx1, by1, bx2, by2
121
 
122
 
@@ -126,12 +139,15 @@ def box_iou(box1, box2):
126
  iy1 = max(box1[1], box2[1])
127
  ix2 = min(box1[2], box2[2])
128
  iy2 = min(box1[3], box2[3])
 
129
  inter_w = max(0, ix2 - ix1)
130
  inter_h = max(0, iy2 - iy1)
131
  inter = inter_w * inter_h
 
132
  a1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
133
  a2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
134
  union = a1 + a2 - inter
 
135
  return inter / union if union > 0 else 0.0
136
 
137
 
@@ -139,190 +155,33 @@ def deduplicate_by_iou(detections, iou_threshold=0.9):
139
  """Keep one detection per overlapping group (IoU >= iou_threshold). Prefer higher confidence."""
140
  if not detections:
141
  return []
 
 
142
  sorted_d = sorted(detections, key=lambda x: -x["conf"])
143
  kept = []
 
144
  for d in sorted_d:
145
  if not any(box_iou(d["box"], k["box"]) >= iou_threshold for k in kept):
146
  kept.append(d)
147
- return kept
148
-
149
-
150
- # -----------------------------------------------------------------------------
151
- # Drawing / layout helpers
152
- # -----------------------------------------------------------------------------
153
-
154
-
155
- def _load_font_for_box(img_w, img_h):
156
- font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
157
- size = max(10, min(img_w, img_h) // 14)
158
- try:
159
- return ImageFont.truetype(font_path, size=size)
160
- except OSError:
161
- return ImageFont.load_default()
162
-
163
-
164
- def draw_bbox_with_optional_label(
165
- img,
166
- box,
167
- label=None,
168
- confidence=None,
169
- color=(255, 0, 0),
170
- width=3,
171
- ):
172
- """
173
- Draw bbox on the given image. If label is None, draw only the box.
174
- box is [x1, y1, x2, y2] in this image's coordinate space.
175
- """
176
- out = img.copy().convert("RGB")
177
- draw = ImageDraw.Draw(out)
178
- x1, y1, x2, y2 = [int(v) for v in box]
179
-
180
- x1 = max(0, min(x1, out.width - 1))
181
- y1 = max(0, min(y1, out.height - 1))
182
- x2 = max(0, min(x2, out.width - 1))
183
- y2 = max(0, min(y2, out.height - 1))
184
-
185
- for i in range(width):
186
- draw.rectangle([x1 - i, y1 - i, x2 + i, y2 + i], outline=color)
187
-
188
- if label:
189
- font = _load_font_for_box(out.width, out.height)
190
- text = f"{label} ({confidence:.2f})" if confidence is not None else label
191
- bbox = draw.textbbox((0, 0), text, font=font)
192
- tw = bbox[2] - bbox[0]
193
- th = bbox[3] - bbox[1]
194
-
195
- tx = max(0, min(x1, out.width - tw - 8))
196
- ty = y1 - th - 8
197
- if ty < 0:
198
- ty = min(out.height - th - 4, y1 + 4)
199
-
200
- draw.rectangle([tx, ty, tx + tw + 8, ty + th + 6], fill=color)
201
- draw.text((tx + 4, ty + 3), text, fill=(255, 255, 255), font=font)
202
-
203
- return out
204
-
205
-
206
- def stack_images_vertical(images, bg=(255, 255, 255), pad=10):
207
- """Stack PIL images vertically into one output image."""
208
- if not images:
209
- return None
210
- widths = [img.width for img in images]
211
- heights = [img.height for img in images]
212
- out_w = max(widths)
213
- out_h = sum(heights) + pad * (len(images) - 1)
214
- canvas = Image.new("RGB", (out_w, out_h), color=bg)
215
-
216
- y = 0
217
- for img in images:
218
- x = (out_w - img.width) // 2
219
- canvas.paste(img, (x, y))
220
- y += img.height + pad
221
- return canvas
222
-
223
-
224
- # -----------------------------------------------------------------------------
225
- # Shared crop/object preparation
226
- # -----------------------------------------------------------------------------
227
-
228
-
229
- def expand_box_with_padding(box, img_w, img_h, padding):
230
- x1, y1, x2, y2 = [float(v) for v in box]
231
- w = x2 - x1
232
- h = y2 - y1
233
- if w <= 0 or h <= 0:
234
- return None
235
- pad_x = w * padding
236
- pad_y = h * padding
237
- ex1 = max(0, int(x1 - pad_x))
238
- ey1 = max(0, int(y1 - pad_y))
239
- ex2 = min(img_w, int(x2 + pad_x))
240
- ey2 = min(img_h, int(y2 + pad_y))
241
- if ex2 <= ex1 or ey2 <= ey1:
242
- return None
243
- return [ex1, ey1, ex2, ey2]
244
-
245
-
246
- def build_group_crop_box(group_box, img_w, img_h, padding=0.2, squarify=True):
247
- expanded = expand_box_with_padding(group_box, img_w, img_h, padding)
248
- if expanded is None:
249
- return None
250
- if squarify:
251
- return list(squarify_crop_box(expanded[0], expanded[1], expanded[2], expanded[3], img_w, img_h))
252
- return expanded
253
-
254
-
255
- def collect_group_object_candidates(
256
- detections,
257
- person_car_ids,
258
- group_box,
259
- img_w,
260
- img_h,
261
- min_side,
262
- crop_dedup_iou,
263
- object_padding=0.3,
264
- ):
265
- """
266
- For one group crop, collect and deduplicate all non-person/car detections inside it.
267
- Returns list of dicts with:
268
- {
269
- "det": original detection,
270
- "expanded_box": expanded object crop box in full-image coords
271
- }
272
- """
273
- inside = [
274
- d for d in detections
275
- if box_center_inside(d["box"], group_box) and d["cls"] not in person_car_ids
276
- ]
277
- inside = deduplicate_by_iou(inside, iou_threshold=0.9)
278
-
279
- candidates = []
280
- for d in inside:
281
- expanded_box = expand_box_with_padding(d["box"], img_w, img_h, object_padding)
282
- if expanded_box is None:
283
- continue
284
- if min(expanded_box[2] - expanded_box[0], expanded_box[3] - expanded_box[1]) < min_side:
285
- continue
286
- candidates.append({
287
- "det": d,
288
- "expanded_box": expanded_box,
289
- })
290
-
291
- def crop_area(box):
292
- return (box[2] - box[0]) * (box[3] - box[1])
293
-
294
- candidates.sort(key=lambda c: -crop_area(c["expanded_box"]))
295
- kept = []
296
-
297
- def is_same_object(box_a, box_b):
298
- if box_iou(box_a, box_b) >= crop_dedup_iou:
299
- return True
300
- if box_center_inside(box_a, box_b) or box_center_inside(box_b, box_a):
301
- return True
302
- return False
303
-
304
- for c in candidates:
305
- if not any(is_same_object(c["expanded_box"], k["expanded_box"]) for k in kept):
306
- kept.append(c)
307
 
308
  return kept
309
 
310
 
311
  def parse_args():
312
  p = argparse.ArgumentParser(
313
- description="D-FINE (person/car) → group → classify objects inside each group crop"
314
  )
315
  p.add_argument("--refs", required=True, help="Reference images folder for Jina and Nomic (e.g. refs/)")
316
  p.add_argument("--input", required=True, help="Full-frame images folder")
317
  p.add_argument("--output", default="pipeline_results", help="Output folder (CSV, etc.)")
318
  p.add_argument("--det-threshold", type=float, default=0.13, help="D-FINE score threshold")
319
  p.add_argument("--group-dist", type=float, default=None, help="Group distance (default: 0.1 * max(H,W))")
320
- p.add_argument("--min-side", type=int, default=40, help="Min side of expanded object bbox in px (skip smaller)")
321
- p.add_argument("--crop-dedup-iou", type=float, default=0.35, help="Min IoU to treat two object crops as same object")
322
- p.add_argument("--no-squarify", action="store_true", help="Skip squarify on group crop")
323
- p.add_argument("--padding", type=float, default=0.2, help="Padding around D-FINE group crop box")
324
- p.add_argument("--conf-threshold", type=float, default=0.75, help="Accept confidence")
325
- p.add_argument("--gap-threshold", type=float, default=0.05, help="Accept gap")
326
  p.add_argument("--text-weight", type=float, default=0.3)
327
  p.add_argument("--max-images", type=int, default=None)
328
  p.add_argument("--device", default=None)
@@ -333,25 +192,32 @@ def get_person_car_label_ids(model):
333
  """Return set of label IDs for person and car (Objects365: Person, Car, SUV, etc.)."""
334
  id2label = getattr(model.config, "id2label", None) or {}
335
  ids = set()
 
336
  for idx, name in id2label.items():
337
  try:
338
  i = int(idx)
339
  except (ValueError, TypeError):
340
  continue
 
341
  n = (name or "").lower()
342
  if "person" in n or n in ("car", "suv"):
343
  ids.add(i)
 
344
  return ids
345
 
346
 
347
  def run_dfine(image, processor, model, device, score_threshold):
348
  """Run D-FINE, return all detections as list of {box, score, label_id, label}."""
 
 
349
  if isinstance(image, Image.Image):
350
  pil = image.convert("RGB")
351
  else:
352
  pil = Image.fromarray(image).convert("RGB")
 
353
  w, h = pil.size
354
  target_size = torch.tensor([[h, w]], device=device)
 
355
  inputs = processor(images=pil, return_tensors="pt")
356
  inputs = {k: v.to(device) for k, v in inputs.items()}
357
 
@@ -360,13 +226,20 @@ def run_dfine(image, processor, model, device, score_threshold):
360
 
361
  target_sizes = target_size.to(outputs["logits"].device)
362
  results = processor.post_process_object_detection(
363
- outputs, target_sizes=target_sizes, threshold=score_threshold
 
 
364
  )
365
- id2label = getattr(model.config, "id2label", {}) or {}
366
 
 
367
  detections = []
 
368
  for result in results:
369
- for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]):
 
 
 
 
370
  sid = int(label_id.item())
371
  detections.append({
372
  "box": [float(x) for x in box.cpu().tolist()],
@@ -374,54 +247,14 @@ def run_dfine(image, processor, model, device, score_threshold):
374
  "cls": sid,
375
  "label": id2label.get(sid, str(sid)),
376
  })
377
- return detections
378
-
379
-
380
- def annotate_group_crop(
381
- pil,
382
- detections,
383
- person_car_ids,
384
- group_box,
385
- crop_box,
386
- encoder_choice,
387
- encoder,
388
- ref_labels,
389
- ref_embs,
390
- conf_threshold,
391
- gap_threshold,
392
- ):
393
- """
394
- Build one D-FINE group crop, classify all object candidates inside it,
395
- and draw bbox+label only for known classes.
396
-
397
- Returns:
398
- annotated_crop_pil,
399
- rows_for_csv,
400
- known_lines
401
- """
402
- crop_x1, crop_y1, crop_x2, crop_y2 = crop_box
403
- group_crop = pil.crop((crop_x1, crop_y1, crop_x2, crop_y2)).convert("RGB")
404
-
405
- obj_candidates = collect_group_object_candidates(
406
- detections=detections,
407
- person_car_ids=person_car_ids,
408
- group_box=group_box,
409
- img_w=pil.width,
410
- img_h=pil.height,
411
- min_side=1, # not used here; caller filters before this if needed
412
- crop_dedup_iou=1.0, # not used here; caller passes already-filtered set if needed
413
- object_padding=0.0,
414
- )
415
- # This helper is not used directly because caller already builds filtered candidates.
416
- # Kept here only for API symmetry.
417
- _ = obj_candidates
418
 
419
- return group_crop, [], []
420
 
421
 
422
  def main():
423
  args = parse_args()
424
  device = args.device or ("cuda" if torch.cuda.is_available() else "cpu")
 
425
  input_dir = Path(args.input)
426
  output_dir = Path(args.output)
427
  refs_dir = Path(args.refs)
@@ -432,9 +265,13 @@ def main():
432
  if not input_dir.is_dir():
433
  raise SystemExit(f"Input folder not found: {input_dir}")
434
 
435
- paths = sorted(p for p in input_dir.iterdir() if p.suffix.lower() in IMAGE_EXTS)
 
 
 
436
  if args.max_images is not None:
437
  paths = paths[: args.max_images]
 
438
  if not paths:
439
  raise SystemExit(f"No images in {input_dir}")
440
 
@@ -445,16 +282,22 @@ def main():
445
  dfine_model = DFineForObjectDetection.from_pretrained("ustc-community/dfine-medium-obj365")
446
  dfine_model = dfine_model.to(device).eval()
447
  person_car_ids = get_person_car_label_ids(dfine_model)
448
- print(f" Person/car label IDs: {person_car_ids} ({time.perf_counter() - t0:.1f}s)")
449
 
450
- # Load Jina refs
451
  print("[*] Loading Jina-CLIP-v2 and building refs...")
452
  t0 = time.perf_counter()
453
  jina_encoder = JinaCLIPv2Encoder(device)
454
- ref_labels, ref_embs = build_refs(jina_encoder, refs_dir, TRUNCATE_DIM, args.text_weight, batch_size=16)
455
- print(f" Jina refs: {ref_labels} ({time.perf_counter() - t0:.1f}s)\n")
 
 
 
 
 
 
456
 
457
- # Load Nomic refs
458
  print("[*] Loading Nomic embed-vision + embed-text and building refs...")
459
  t0 = time.perf_counter()
460
  nomic_encoder = NomicVisionEncoder(device)
@@ -466,21 +309,38 @@ def main():
466
  text_encoder=nomic_text_encoder,
467
  text_weight=args.text_weight,
468
  )
469
- print(f" Nomic refs: {ref_labels_nomic} ({time.perf_counter() - t0:.1f}s)\n")
470
 
 
471
  jina_crops_dir = output_dir / "jina_crops"
472
  nomic_crops_dir = output_dir / "nomic_crops"
473
  jina_crops_dir.mkdir(parents=True, exist_ok=True)
474
  nomic_crops_dir.mkdir(parents=True, exist_ok=True)
475
 
 
476
  csv_path = output_dir / "results.csv"
477
  f = open(csv_path, "w", newline="")
478
  w = csv.writer(f)
479
  w.writerow([
480
- "image", "crop_filename", "group_idx", "crop_x1", "crop_y1", "crop_x2", "crop_y2",
481
- "bbox_x1", "bbox_y1", "bbox_x2", "bbox_y2", "dfine_label", "dfine_conf",
482
- "jina_prediction", "jina_confidence", "jina_status",
483
- "nomic_prediction", "nomic_confidence", "nomic_status",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
484
  ])
485
 
486
  for img_path in paths:
@@ -488,101 +348,167 @@ def main():
488
  img_w, img_h = pil.size
489
  group_dist = args.group_dist if args.group_dist is not None else 0.1 * max(img_h, img_w)
490
 
491
- detections = run_dfine(pil, image_processor, dfine_model, device, args.det_threshold)
 
 
 
 
 
 
 
 
492
  person_car = [d for d in detections if d["cls"] in person_car_ids]
493
  if not person_car:
494
  continue
495
 
 
496
  grouped = group_detections(person_car, group_dist)
497
  grouped.sort(key=lambda x: x["conf"], reverse=True)
498
- top_groups = grouped[:10]
 
 
 
 
499
 
500
  for gidx, grp in enumerate(top_groups):
501
- group_box = grp["box"]
502
- crop_box = build_group_crop_box(
503
- group_box,
504
- img_w,
505
- img_h,
506
- padding=args.padding,
507
- squarify=not args.no_squarify,
508
- )
509
- if crop_box is None:
510
- continue
511
 
512
- crop_x1, crop_y1, crop_x2, crop_y2 = crop_box
513
- crop_name = f"{img_path.stem}_g{gidx}_{crop_x1}_{crop_y1}_{crop_x2}_{crop_y2}{img_path.suffix}"
514
- base_group_crop = pil.crop((crop_x1, crop_y1, crop_x2, crop_y2)).convert("RGB")
515
-
516
- obj_candidates = collect_group_object_candidates(
517
- detections=detections,
518
- person_car_ids=person_car_ids,
519
- group_box=group_box,
520
- img_w=img_w,
521
- img_h=img_h,
522
- min_side=args.min_side,
523
- crop_dedup_iou=args.crop_dedup_iou,
524
- object_padding=0.3,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
525
  )
526
 
527
- ann_jina = base_group_crop.copy()
528
- ann_nomic = base_group_crop.copy()
529
-
530
- for item in obj_candidates:
531
- d = item["det"]
532
- ex1, ey1, ex2, ey2 = item["expanded_box"]
533
-
534
- obj_crop = pil.crop((ex1, ey1, ex2, ey2)).convert("RGB")
535
-
536
- q_jina = jina_encoder.encode_images([obj_crop], TRUNCATE_DIM)
537
- result_jina = jina_classify(q_jina, ref_labels, ref_embs, args.conf_threshold, args.gap_threshold)
538
-
539
- q_nomic = nomic_encoder.encode_images([obj_crop])
540
- result_nomic = jina_classify(q_nomic, ref_labels_nomic, ref_embs_nomic, args.conf_threshold, args.gap_threshold)
541
-
542
- rel_box = [
543
- max(0, int(round(d["box"][0] - crop_x1))),
544
- max(0, int(round(d["box"][1] - crop_y1))),
545
- min(base_group_crop.width, int(round(d["box"][2] - crop_x1))),
546
- min(base_group_crop.height, int(round(d["box"][3] - crop_y1))),
547
- ]
548
-
549
- if result_jina["prediction"] in ref_labels:
550
- ann_jina = draw_bbox_with_optional_label(
551
- ann_jina,
552
- rel_box,
553
- label=result_jina["prediction"],
554
- confidence=result_jina["confidence"],
555
- )
556
-
557
- if result_nomic["prediction"] in ref_labels_nomic:
558
- ann_nomic = draw_bbox_with_optional_label(
559
- ann_nomic,
560
- rel_box,
561
- label=result_nomic["prediction"],
562
- confidence=result_nomic["confidence"],
563
- )
564
-
565
- w.writerow([
566
- img_path.name, crop_name, gidx,
567
- crop_x1, crop_y1, crop_x2, crop_y2,
568
- d["box"][0], d["box"][1], d["box"][2], d["box"][3],
569
- d["label"], f"{d['conf']:.4f}",
570
- result_jina["prediction"], f"{result_jina['confidence']:.4f}", result_jina["status"],
571
- result_nomic["prediction"], f"{result_nomic['confidence']:.4f}", result_nomic["status"],
572
- ])
573
 
 
574
  ann_jina.save(jina_crops_dir / crop_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
  ann_nomic.save(nomic_crops_dir / crop_name)
576
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
577
  f.close()
578
  print(f"[*] Wrote {csv_path}")
579
- print(f"[*] Jina crops: {jina_crops_dir}")
580
  print(f"[*] Nomic crops: {nomic_crops_dir}")
581
 
582
 
583
  # -----------------------------------------------------------------------------
584
- # Single-image runner for Gradio app: D-FINE first, then Jina or Nomic
585
  # -----------------------------------------------------------------------------
 
586
  _APP_DFINE = None
587
  _APP_JINA = None
588
  _APP_NOMIC = None
@@ -603,11 +529,16 @@ def run_single_image(
603
  squarify=True,
604
  ):
605
  """
606
- Run D-FINE on one image, then classify small-object detections inside each group crop.
607
- Returns:
608
- - one vertically stacked image of all D-FINE group crops
609
- - text containing only known-class predictions
 
 
610
  """
 
 
 
611
  global _APP_DFINE, _APP_JINA, _APP_NOMIC, _APP_REFS_JINA, _APP_REFS_NOMIC
612
 
613
  refs_dir = Path(refs_dir)
@@ -640,6 +571,61 @@ def run_single_image(
640
  grouped.sort(key=lambda x: x["conf"], reverse=True)
641
  top_groups = grouped[:10]
642
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
643
  # Load encoder + refs for chosen model
644
  if encoder_choice == "jina":
645
  if _APP_JINA is None or _APP_REFS_JINA != str(refs_dir):
@@ -647,7 +633,8 @@ def run_single_image(
647
  ref_labels, ref_embs = build_refs(jina_encoder, refs_dir, TRUNCATE_DIM, 0.3, batch_size=16)
648
  _APP_JINA = (jina_encoder, ref_labels, ref_embs)
649
  _APP_REFS_JINA = str(refs_dir)
650
- encoder, ref_labels, ref_embs = _APP_JINA
 
651
  else:
652
  if _APP_NOMIC is None or _APP_REFS_NOMIC != str(refs_dir):
653
  nomic_encoder = NomicVisionEncoder(device)
@@ -661,83 +648,44 @@ def run_single_image(
661
  )
662
  _APP_NOMIC = (nomic_encoder, ref_labels, ref_embs)
663
  _APP_REFS_NOMIC = str(refs_dir)
664
- encoder, ref_labels, ref_embs = _APP_NOMIC
665
 
666
- output_crops = []
667
- lines = []
668
-
669
- for gidx, grp in enumerate(top_groups):
670
- group_box = grp["box"]
671
- crop_box = build_group_crop_box(
672
- group_box,
673
- img_w,
674
- img_h,
675
- padding=0.2,
676
- squarify=squarify,
677
- )
678
- if crop_box is None:
679
- continue
680
-
681
- crop_x1, crop_y1, crop_x2, crop_y2 = crop_box
682
- group_crop = pil.crop((crop_x1, crop_y1, crop_x2, crop_y2)).convert("RGB")
683
-
684
- obj_candidates = collect_group_object_candidates(
685
- detections=detections,
686
- person_car_ids=person_car_ids,
687
- group_box=group_box,
688
- img_w=img_w,
689
- img_h=img_h,
690
- min_side=min_side,
691
- crop_dedup_iou=crop_dedup_iou,
692
- object_padding=0.3,
693
- )
694
-
695
- ann_crop = group_crop.copy()
696
- group_known_lines = []
697
-
698
- for item in obj_candidates:
699
- d = item["det"]
700
- ex1, ey1, ex2, ey2 = item["expanded_box"]
701
- obj_crop = pil.crop((ex1, ey1, ex2, ey2)).convert("RGB")
702
 
703
- if encoder_choice == "jina":
704
- q = encoder.encode_images([obj_crop], TRUNCATE_DIM)
705
- result = jina_classify(q, ref_labels, ref_embs, conf_threshold, gap_threshold)
706
- else:
707
- q = encoder.encode_images([obj_crop])
708
- result = jina_classify(q, ref_labels, ref_embs, conf_threshold, gap_threshold)
709
-
710
- known = result["prediction"] in ref_labels
711
- if not known:
712
- continue
 
 
 
 
 
713
 
714
- rel_box = [
715
- max(0, int(round(d["box"][0] - crop_x1))),
716
- max(0, int(round(d["box"][1] - crop_y1))),
717
- min(group_crop.width, int(round(d["box"][2] - crop_x1))),
718
- min(group_crop.height, int(round(d["box"][3] - crop_y1))),
719
- ]
720
 
721
- ann_crop = draw_bbox_with_optional_label(
722
- ann_crop,
723
- rel_box,
724
- label=result["prediction"],
725
- confidence=result["confidence"],
726
- )
727
- group_known_lines.append(f"{result['prediction']} ({result['confidence']:.2f})")
728
 
729
- output_crops.append(ann_crop)
 
730
 
731
- if group_known_lines:
732
- lines.append(f"Crop {gidx + 1}:")
733
- lines.extend(group_known_lines)
734
 
735
- if not output_crops:
736
- return np.array(pil), "No D-FINE group crops."
737
 
738
- stacked = stack_images_vertical(output_crops, pad=10)
739
- result_text = "\n".join(lines) if lines else ""
740
- return np.array(stacked), result_text
741
 
742
 
743
  if __name__ == "__main__":
 
1
+ """ Pipeline: D-FINE (person/car only) → group detections → crop regions →
2
+ find all bboxes inside each crop Jina-CLIP-v2 and Nomic embeddings on those crops.
 
3
  Outputs separate crop folders per model (jina_crops, nomic_crops) for visual comparison.
 
4
  """
5
 
6
  import argparse
 
10
 
11
  import numpy as np
12
  import torch
13
+ import torch.nn.functional as F
14
+ from PIL import Image
15
  from transformers import AutoImageProcessor, DFineForObjectDetection
16
 
17
  # Jina-CLIP-v2 few-shot (same refs + classify as jina_fewshot.py)
 
21
  JinaCLIPv2Encoder,
22
  build_refs,
23
  classify as jina_classify,
24
+ draw_label_on_image,
25
  )
26
 
27
  from nomic_fewshot import NomicTextEncoder, NomicVisionEncoder, build_refs_nomic
28
 
29
 
30
  # -----------------------------------------------------------------------------
31
+ # Detection + grouping (from reference_detection.py)
32
  # -----------------------------------------------------------------------------
33
 
 
34
  def get_box_dist(box1, box2):
35
  """Euclidean distance between box centers. box = [x1, y1, x2, y2]."""
36
  c1 = np.array([(box1[0] + box1[2]) / 2, (box1[1] + box1[3]) / 2])
 
41
  def group_detections(detections, threshold):
42
  """
43
  Group detections by proximity (center distance < threshold).
44
+
45
  detections: list of {"box": [x1,y1,x2,y2], "conf", "cls", ...}
46
  Returns list of {"box": merged [x1,y1,x2,y2], "conf": best in group, "cls": best in group}.
47
  """
 
51
  boxes = [d["box"] for d in detections]
52
  n = len(boxes)
53
  adj = {i: [] for i in range(n)}
54
+
55
  for i in range(n):
56
  for j in range(i + 1, n):
57
  if get_box_dist(boxes[i], boxes[j]) < threshold:
 
60
 
61
  groups = []
62
  visited = [False] * n
63
+
64
  for i in range(n):
65
  if not visited[i]:
66
  group_indices = []
67
  stack = [i]
68
  visited[i] = True
69
+
70
  while stack:
71
  curr = stack.pop()
72
  group_indices.append(curr)
73
+
74
  for neighbor in adj[curr]:
75
  if not visited[neighbor]:
76
  visited[neighbor] = True
 
81
  y1 = min(d["box"][1] for d in group_dets)
82
  x2 = max(d["box"][2] for d in group_dets)
83
  y2 = max(d["box"][3] for d in group_dets)
 
84
 
85
+ best_det = max(group_dets, key=lambda x: x["conf"])
86
  groups.append({
87
  "box": [x1, y1, x2, y2],
88
  "conf": best_det["conf"],
89
  "cls": best_det["cls"],
90
  "label": best_det.get("label", str(best_det["cls"])),
91
  })
92
+
93
  return groups
94
 
95
 
 
97
  """True if center of box is inside crop_box. All [x1,y1,x2,y2]."""
98
  cx = (box[0] + box[2]) / 2
99
  cy = (box[1] + box[3]) / 2
100
+ return (
101
+ crop_box[0] <= cx <= crop_box[2]
102
+ and crop_box[1] <= cy <= crop_box[3]
103
+ )
104
 
105
 
106
  def squarify_crop_box(bx1, by1, bx2, by2, img_w, img_h):
 
112
  orig = (int(bx1), int(by1), int(bx2), int(by2))
113
  w = bx2 - bx1
114
  h = by2 - by1
115
+
116
  if w <= 0 or h <= 0:
117
  return orig
118
+
119
  if h > w:
120
  add = (h - w) / 2.0
121
  bx1 = max(0, bx1 - add)
 
124
  add = (w - h) / 2.0
125
  by1 = max(0, by1 - add)
126
  by2 = min(img_h, by2 + add)
127
+
128
  bx1, by1, bx2, by2 = int(bx1), int(by1), int(bx2), int(by2)
129
+
130
  if bx2 <= bx1 or by2 <= by1:
131
  return orig
132
+
133
  return bx1, by1, bx2, by2
134
 
135
 
 
139
  iy1 = max(box1[1], box2[1])
140
  ix2 = min(box1[2], box2[2])
141
  iy2 = min(box1[3], box2[3])
142
+
143
  inter_w = max(0, ix2 - ix1)
144
  inter_h = max(0, iy2 - iy1)
145
  inter = inter_w * inter_h
146
+
147
  a1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
148
  a2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
149
  union = a1 + a2 - inter
150
+
151
  return inter / union if union > 0 else 0.0
152
 
153
 
 
155
  """Keep one detection per overlapping group (IoU >= iou_threshold). Prefer higher confidence."""
156
  if not detections:
157
  return []
158
+
159
+ # Sort by confidence descending; keep first, then add only if no kept box overlaps >= threshold
160
  sorted_d = sorted(detections, key=lambda x: -x["conf"])
161
  kept = []
162
+
163
  for d in sorted_d:
164
  if not any(box_iou(d["box"], k["box"]) >= iou_threshold for k in kept):
165
  kept.append(d)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
  return kept
168
 
169
 
170
  def parse_args():
171
  p = argparse.ArgumentParser(
172
+ description="D-FINE (person/car) → group → Jina-CLIP-v2 on crops inside groups"
173
  )
174
  p.add_argument("--refs", required=True, help="Reference images folder for Jina and Nomic (e.g. refs/)")
175
  p.add_argument("--input", required=True, help="Full-frame images folder")
176
  p.add_argument("--output", default="pipeline_results", help="Output folder (CSV, etc.)")
177
  p.add_argument("--det-threshold", type=float, default=0.13, help="D-FINE score threshold")
178
  p.add_argument("--group-dist", type=float, default=None, help="Group distance (default: 0.1 * max(H,W))")
179
+ p.add_argument("--min-side", type=int, default=40, help="Min side of expanded bbox in px (skip smaller)")
180
+ p.add_argument("--crop-dedup-iou", type=float, default=0.35, help="Min IoU to treat two crops as same object (keep larger)")
181
+ p.add_argument("--no-squarify", action="store_true", help="Skip squarify; use expanded bbox only (tighter crops, often better recognition)")
182
+ p.add_argument("--padding", type=float, default=0.2, help="Crop padding around group box (0.2 = 20%%)")
183
+ p.add_argument("--conf-threshold", type=float, default=0.75, help="Jina accept confidence")
184
+ p.add_argument("--gap-threshold", type=float, default=0.05, help="Jina accept gap")
185
  p.add_argument("--text-weight", type=float, default=0.3)
186
  p.add_argument("--max-images", type=int, default=None)
187
  p.add_argument("--device", default=None)
 
192
  """Return set of label IDs for person and car (Objects365: Person, Car, SUV, etc.)."""
193
  id2label = getattr(model.config, "id2label", None) or {}
194
  ids = set()
195
+
196
  for idx, name in id2label.items():
197
  try:
198
  i = int(idx)
199
  except (ValueError, TypeError):
200
  continue
201
+
202
  n = (name or "").lower()
203
  if "person" in n or n in ("car", "suv"):
204
  ids.add(i)
205
+
206
  return ids
207
 
208
 
209
  def run_dfine(image, processor, model, device, score_threshold):
210
  """Run D-FINE, return all detections as list of {box, score, label_id, label}."""
211
+ from PIL import Image
212
+
213
  if isinstance(image, Image.Image):
214
  pil = image.convert("RGB")
215
  else:
216
  pil = Image.fromarray(image).convert("RGB")
217
+
218
  w, h = pil.size
219
  target_size = torch.tensor([[h, w]], device=device)
220
+
221
  inputs = processor(images=pil, return_tensors="pt")
222
  inputs = {k: v.to(device) for k, v in inputs.items()}
223
 
 
226
 
227
  target_sizes = target_size.to(outputs["logits"].device)
228
  results = processor.post_process_object_detection(
229
+ outputs,
230
+ target_sizes=target_sizes,
231
+ threshold=score_threshold,
232
  )
 
233
 
234
+ id2label = getattr(model.config, "id2label", {}) or {}
235
  detections = []
236
+
237
  for result in results:
238
+ for score, label_id, box in zip(
239
+ result["scores"],
240
+ result["labels"],
241
+ result["boxes"]
242
+ ):
243
  sid = int(label_id.item())
244
  detections.append({
245
  "box": [float(x) for x in box.cpu().tolist()],
 
247
  "cls": sid,
248
  "label": id2label.get(sid, str(sid)),
249
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
 
251
+ return detections
252
 
253
 
254
  def main():
255
  args = parse_args()
256
  device = args.device or ("cuda" if torch.cuda.is_available() else "cpu")
257
+
258
  input_dir = Path(args.input)
259
  output_dir = Path(args.output)
260
  refs_dir = Path(args.refs)
 
265
  if not input_dir.is_dir():
266
  raise SystemExit(f"Input folder not found: {input_dir}")
267
 
268
+ paths = sorted(
269
+ p for p in input_dir.iterdir()
270
+ if p.suffix.lower() in IMAGE_EXTS
271
+ )
272
  if args.max_images is not None:
273
  paths = paths[: args.max_images]
274
+
275
  if not paths:
276
  raise SystemExit(f"No images in {input_dir}")
277
 
 
282
  dfine_model = DFineForObjectDetection.from_pretrained("ustc-community/dfine-medium-obj365")
283
  dfine_model = dfine_model.to(device).eval()
284
  person_car_ids = get_person_car_label_ids(dfine_model)
285
+ print(f" Person/car label IDs: {person_car_ids} ({time.perf_counter()-t0:.1f}s)")
286
 
287
+ # Load Jina-CLIP-v2 + build refs
288
  print("[*] Loading Jina-CLIP-v2 and building refs...")
289
  t0 = time.perf_counter()
290
  jina_encoder = JinaCLIPv2Encoder(device)
291
+ ref_labels, ref_embs = build_refs(
292
+ jina_encoder,
293
+ refs_dir,
294
+ TRUNCATE_DIM,
295
+ args.text_weight,
296
+ batch_size=16
297
+ )
298
+ print(f" Jina refs: {ref_labels} ({time.perf_counter()-t0:.1f}s)\n")
299
 
300
+ # Load Nomic vision + text, build refs (same as Jina: image + text prompts, text_weight 0.3)
301
  print("[*] Loading Nomic embed-vision + embed-text and building refs...")
302
  t0 = time.perf_counter()
303
  nomic_encoder = NomicVisionEncoder(device)
 
309
  text_encoder=nomic_text_encoder,
310
  text_weight=args.text_weight,
311
  )
312
+ print(f" Nomic refs: {ref_labels_nomic} ({time.perf_counter()-t0:.1f}s)\n")
313
 
314
+ # Separate output folders per model for visual comparison
315
  jina_crops_dir = output_dir / "jina_crops"
316
  nomic_crops_dir = output_dir / "nomic_crops"
317
  jina_crops_dir.mkdir(parents=True, exist_ok=True)
318
  nomic_crops_dir.mkdir(parents=True, exist_ok=True)
319
 
320
+ # CSV
321
  csv_path = output_dir / "results.csv"
322
  f = open(csv_path, "w", newline="")
323
  w = csv.writer(f)
324
  w.writerow([
325
+ "image",
326
+ "crop_filename",
327
+ "group_idx",
328
+ "crop_x1",
329
+ "crop_y1",
330
+ "crop_x2",
331
+ "crop_y2",
332
+ "bbox_x1",
333
+ "bbox_y1",
334
+ "bbox_x2",
335
+ "bbox_y2",
336
+ "dfine_label",
337
+ "dfine_conf",
338
+ "jina_prediction",
339
+ "jina_confidence",
340
+ "jina_status",
341
+ "nomic_prediction",
342
+ "nomic_confidence",
343
+ "nomic_status",
344
  ])
345
 
346
  for img_path in paths:
 
348
  img_w, img_h = pil.size
349
  group_dist = args.group_dist if args.group_dist is not None else 0.1 * max(img_h, img_w)
350
 
351
+ # 1) D-FINE: detect everything, keep all bboxes for the image
352
+ detections = run_dfine(
353
+ pil,
354
+ image_processor,
355
+ dfine_model,
356
+ device,
357
+ args.det_threshold
358
+ )
359
+
360
  person_car = [d for d in detections if d["cls"] in person_car_ids]
361
  if not person_car:
362
  continue
363
 
364
+ # 2) Group person/car detections (same as reference)
365
  grouped = group_detections(person_car, group_dist)
366
  grouped.sort(key=lambda x: x["conf"], reverse=True)
367
+ top_groups = grouped[:10] # limit groups per image
368
+
369
+ # 3) Collect all candidate crops (bboxes inside person/car groups)
370
+ # Each: (crop_box, crop_pil, d, gidx, crop_idx, x1, y1, x2, y2)
371
+ candidates = []
372
 
373
  for gidx, grp in enumerate(top_groups):
374
+ x1, y1, x2, y2 = grp["box"]
375
+ group_box = [x1, y1, x2, y2]
 
 
 
 
 
 
 
 
376
 
377
+ inside = [
378
+ d for d in detections
379
+ if box_center_inside(d["box"], group_box) and d["cls"] not in person_car_ids
380
+ ]
381
+ inside = deduplicate_by_iou(inside, iou_threshold=0.9)
382
+
383
+ for crop_idx, d in enumerate(inside):
384
+ bx1, by1, bx2, by2 = [float(x) for x in d["box"]]
385
+ obj_w, obj_h = bx2 - bx1, by2 - by1
386
+ if obj_w <= 0 or obj_h <= 0:
387
+ continue
388
+
389
+ pad_x = obj_w * 0.3
390
+ pad_y = obj_h * 0.3
391
+ bx1 = max(0, int(bx1 - pad_x))
392
+ by1 = max(0, int(by1 - pad_y))
393
+ bx2 = min(img_w, int(bx2 + pad_x))
394
+ by2 = min(img_h, int(by2 + pad_y))
395
+
396
+ if bx2 <= bx1 or by2 <= by1:
397
+ continue
398
+
399
+ if min(bx2 - bx1, by2 - by1) < args.min_side:
400
+ continue
401
+
402
+ expanded_box = [bx1, by1, bx2, by2]
403
+ candidates.append((expanded_box, d, gidx, crop_idx, x1, y1, x2, y2))
404
+
405
+ # 4) Dedup on EXPANDED boxes (before squarify), keep larger; then squarify only kept
406
def crop_area(box):
    """Area of an axis-aligned box given as [x1, y1, x2, y2]."""
    x1, y1, x2, y2 = box
    return (x2 - x1) * (y2 - y1)
408
+
409
+ candidates.sort(key=lambda c: -crop_area(c[0]))
410
+ kept = []
411
+
412
+ for c in candidates:
413
+ expanded_box = c[0]
414
+
415
def is_same_object(box_a, box_b):
    """True if the two boxes likely cover the same physical object:
    heavy overlap (IoU above the dedup threshold) or either box's
    center lying inside the other."""
    return (
        box_iou(box_a, box_b) >= args.crop_dedup_iou
        or box_center_inside(box_a, box_b)
        or box_center_inside(box_b, box_a)
    )
421
+
422
+ if not any(is_same_object(expanded_box, k[0]) for k in kept):
423
+ kept.append(c)
424
+
425
+ # 5) Optionally squarify, then run Jina and Nomic only on kept crops
426
+ for i, (expanded_box, d, gidx, crop_idx, x1, y1, x2, y2) in enumerate(kept):
427
+ if not args.no_squarify:
428
+ bx1, by1, bx2, by2 = squarify_crop_box(
429
+ expanded_box[0],
430
+ expanded_box[1],
431
+ expanded_box[2],
432
+ expanded_box[3],
433
+ img_w,
434
+ img_h
435
+ )
436
+ else:
437
+ bx1, by1, bx2, by2 = expanded_box[0], expanded_box[1], expanded_box[2], expanded_box[3]
438
+
439
+ crop_pil = pil.crop((bx1, by1, bx2, by2))
440
+ crop_name = f"{img_path.stem}_g{gidx}_{i}_{bx1}_{by1}_{bx2}_{by2}{img_path.suffix}"
441
+
442
+ q_jina = jina_encoder.encode_images([crop_pil], TRUNCATE_DIM)
443
+ result_jina = jina_classify(
444
+ q_jina,
445
+ ref_labels,
446
+ ref_embs,
447
+ args.conf_threshold,
448
+ args.gap_threshold
449
  )
450
 
451
+ if result_jina["prediction"] in ref_labels:
452
+ label_jina = result_jina["prediction"]
453
+ conf_jina = result_jina["confidence"]
454
+ else:
455
+ label_jina = f"unnamed (dfine: {d['label']})"
456
+ conf_jina = 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
457
 
458
+ ann_jina = draw_label_on_image(crop_pil, label_jina, conf_jina)
459
  ann_jina.save(jina_crops_dir / crop_name)
460
+
461
+ q_nomic = nomic_encoder.encode_images([crop_pil])
462
+ result_nomic = jina_classify(
463
+ q_nomic,
464
+ ref_labels_nomic,
465
+ ref_embs_nomic,
466
+ args.conf_threshold,
467
+ args.gap_threshold
468
+ )
469
+
470
+ if result_nomic["prediction"] in ref_labels_nomic:
471
+ label_nomic = result_nomic["prediction"]
472
+ conf_nomic = result_nomic["confidence"]
473
+ else:
474
+ label_nomic = f"unnamed (dfine: {d['label']})"
475
+ conf_nomic = 0.0
476
+
477
+ ann_nomic = draw_label_on_image(crop_pil, label_nomic, conf_nomic)
478
  ann_nomic.save(nomic_crops_dir / crop_name)
479
 
480
+ w.writerow([
481
+ img_path.name,
482
+ crop_name,
483
+ gidx,
484
+ x1,
485
+ y1,
486
+ x2,
487
+ y2,
488
+ bx1,
489
+ by1,
490
+ bx2,
491
+ by2,
492
+ d["label"],
493
+ f"{d['conf']:.4f}",
494
+ result_jina["prediction"],
495
+ f"{result_jina['confidence']:.4f}",
496
+ result_jina["status"],
497
+ result_nomic["prediction"],
498
+ f"{result_nomic['confidence']:.4f}",
499
+ result_nomic["status"],
500
+ ])
501
+
502
  f.close()
503
  print(f"[*] Wrote {csv_path}")
504
+ print(f"[*] Jina crops: {jina_crops_dir}")
505
  print(f"[*] Nomic crops: {nomic_crops_dir}")
506
 
507
 
508
  # -----------------------------------------------------------------------------
509
+ # Single-image runner for Gradio app: D-FINE first, then Jina or Nomic (user choice)
510
  # -----------------------------------------------------------------------------
511
+
512
  _APP_DFINE = None
513
  _APP_JINA = None
514
  _APP_NOMIC = None
 
529
  squarify=True,
530
  ):
531
  """
532
+ Run D-FINE on one image, then classify small-object crops with Jina or Nomic.
533
+
534
+ refs_dir: path to refs folder (str or Path).
535
+ encoder_choice: "jina" or "nomic".
536
+
537
+ Returns (annotated_pil, result_text) for display in app.
538
  """
539
+ import numpy as np
540
+ from PIL import Image
541
+
542
  global _APP_DFINE, _APP_JINA, _APP_NOMIC, _APP_REFS_JINA, _APP_REFS_NOMIC
543
 
544
  refs_dir = Path(refs_dir)
 
571
  grouped.sort(key=lambda x: x["conf"], reverse=True)
572
  top_groups = grouped[:10]
573
 
574
+ candidates = []
575
+
576
+ for gidx, grp in enumerate(top_groups):
577
+ x1, y1, x2, y2 = grp["box"]
578
+ group_box = [x1, y1, x2, y2]
579
+
580
+ inside = [
581
+ d for d in detections
582
+ if box_center_inside(d["box"], group_box) and d["cls"] not in person_car_ids
583
+ ]
584
+ inside = deduplicate_by_iou(inside, iou_threshold=0.9)
585
+
586
+ for crop_idx, d in enumerate(inside):
587
+ bx1, by1, bx2, by2 = [float(x) for x in d["box"]]
588
+ obj_w, obj_h = bx2 - bx1, by2 - by1
589
+ if obj_w <= 0 or obj_h <= 0:
590
+ continue
591
+
592
+ pad_x, pad_y = obj_w * 0.3, obj_h * 0.3
593
+ bx1 = max(0, int(bx1 - pad_x))
594
+ by1 = max(0, int(by1 - pad_y))
595
+ bx2 = min(img_w, int(bx2 + pad_x))
596
+ by2 = min(img_h, int(by2 + pad_y))
597
+
598
+ if bx2 <= bx1 or by2 <= by1:
599
+ continue
600
+
601
+ if min(bx2 - bx1, by2 - by1) < min_side:
602
+ continue
603
+
604
+ expanded_box = [bx1, by1, bx2, by2]
605
+ candidates.append((expanded_box, d, gidx, crop_idx))
606
+
607
def crop_area(box):
    """Area of an axis-aligned box given as [x1, y1, x2, y2]."""
    width = box[2] - box[0]
    height = box[3] - box[1]
    return width * height
609
+
610
+ candidates.sort(key=lambda c: -crop_area(c[0]))
611
+ kept = []
612
+
613
+ for c in candidates:
614
def is_same_object(box_a, box_b):
    """True if the two boxes likely cover the same physical object
    (IoU above the dedup threshold, or either center contained in
    the other box)."""
    if box_center_inside(box_a, box_b) or box_center_inside(box_b, box_a):
        return True
    return box_iou(box_a, box_b) >= crop_dedup_iou
620
+
621
+ if not any(is_same_object(c[0], k[0]) for k in kept):
622
+ kept.append(c)
623
+
624
+ if not kept:
625
+ if not candidates:
626
+ return np.array(pil), "No small-object crops: D-FINE did not detect any object (gun/phone/etc.) inside person/car areas, or all were below min size. Try a higher-resolution image."
627
+ return np.array(pil), "No small-object crops (after dedup)."
628
+
629
  # Load encoder + refs for chosen model
630
  if encoder_choice == "jina":
631
  if _APP_JINA is None or _APP_REFS_JINA != str(refs_dir):
 
633
  ref_labels, ref_embs = build_refs(jina_encoder, refs_dir, TRUNCATE_DIM, 0.3, batch_size=16)
634
  _APP_JINA = (jina_encoder, ref_labels, ref_embs)
635
  _APP_REFS_JINA = str(refs_dir)
636
+
637
+ jina_encoder, ref_labels, ref_embs = _APP_JINA
638
  else:
639
  if _APP_NOMIC is None or _APP_REFS_NOMIC != str(refs_dir):
640
  nomic_encoder = NomicVisionEncoder(device)
 
648
  )
649
  _APP_NOMIC = (nomic_encoder, ref_labels, ref_embs)
650
  _APP_REFS_NOMIC = str(refs_dir)
 
651
 
652
+ nomic_encoder, ref_labels, ref_embs = _APP_NOMIC
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
653
 
654
+ lines = []
655
+ out_img = pil.copy()
656
+
657
+ for i, (expanded_box, d, gidx, crop_idx) in enumerate(kept):
658
+ if squarify:
659
+ bx1, by1, bx2, by2 = squarify_crop_box(
660
+ expanded_box[0],
661
+ expanded_box[1],
662
+ expanded_box[2],
663
+ expanded_box[3],
664
+ img_w,
665
+ img_h
666
+ )
667
+ else:
668
+ bx1, by1, bx2, by2 = expanded_box[0], expanded_box[1], expanded_box[2], expanded_box[3]
669
 
670
+ crop_pil = pil.crop((bx1, by1, bx2, by2))
 
 
 
 
 
671
 
672
+ if encoder_choice == "jina":
673
+ q = jina_encoder.encode_images([crop_pil], TRUNCATE_DIM)
674
+ result = jina_classify(q, ref_labels, ref_embs, conf_threshold, gap_threshold)
675
+ else:
676
+ q = nomic_encoder.encode_images([crop_pil])
677
+ result = jina_classify(q, ref_labels, ref_embs, conf_threshold, gap_threshold)
 
678
 
679
+ pred = result["prediction"] if result["prediction"] in ref_labels else f"unknown ({d['label']})"
680
+ conf = result["confidence"]
681
 
682
+ lines.append(f"Crop {i+1}: {pred} ({conf:.2f})")
 
 
683
 
684
+ labeled = draw_label_on_image(crop_pil, pred, conf)
685
+ out_img.paste(labeled, (bx1, by1))
686
 
687
+ result_text = "\n".join(lines) if lines else "No crops"
688
+ return np.array(out_img), result_text
 
689
 
690
 
691
  if __name__ == "__main__":