Spaces:
Running
Running
Orkhan Hasanli committed on
Commit ·
be400de
1
Parent(s): d6dee9e
D-FINE: person/car crop galleries, known-object bboxes only
Browse files- app.py +26 -17
- dfine_jina_pipeline.py +53 -15
- jina_fewshot.py +22 -0
app.py
CHANGED
|
@@ -109,20 +109,20 @@ def run_detection(image, model):
|
|
| 109 |
|
| 110 |
|
| 111 |
def run_dfine_classify(image, encoder_choice, refs_path):
|
| 112 |
-
"""Tab 2: D-FINE first, then classify crops with Jina or Nomic.
|
| 113 |
-
|
|
|
|
| 114 |
if image is None:
|
| 115 |
-
return
|
| 116 |
|
| 117 |
refs = Path(refs_path.strip()) if refs_path and refs_path.strip() else Path(REFS_DIR)
|
| 118 |
|
| 119 |
if not refs.is_dir():
|
| 120 |
-
return
|
| 121 |
|
| 122 |
# Tuned on COCO GT: conf=0.5, gap=0.02.
|
| 123 |
# Lower det_threshold/min_side so D-FINE picks up more objects (gun, phone, etc.) like local.
|
| 124 |
-
|
| 125 |
-
out_img, text = run_single_image(
|
| 126 |
image,
|
| 127 |
refs_dir=refs,
|
| 128 |
encoder_choice=encoder_choice.lower(),
|
|
@@ -133,10 +133,10 @@ def run_dfine_classify(image, encoder_choice, refs_path):
|
|
| 133 |
crop_dedup_iou=0.4,
|
| 134 |
)
|
| 135 |
|
| 136 |
-
if
|
| 137 |
-
return None,
|
| 138 |
|
| 139 |
-
return
|
| 140 |
|
| 141 |
|
| 142 |
IMG_HEIGHT = 400
|
|
@@ -264,21 +264,30 @@ with gr.Blocks(title="Small Object Detection") as app:
|
|
| 264 |
|
| 265 |
with gr.Column(scale=1):
|
| 266 |
|
| 267 |
-
|
| 268 |
-
label="
|
| 269 |
-
height=IMG_HEIGHT
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
)
|
| 271 |
|
| 272 |
-
|
| 273 |
-
label="
|
| 274 |
-
lines=
|
| 275 |
-
interactive=False
|
| 276 |
)
|
| 277 |
|
| 278 |
btn_dfine.click(
|
| 279 |
fn=run_dfine_classify,
|
| 280 |
inputs=[inp_dfine, encoder_choice, refs_path],
|
| 281 |
-
outputs=[
|
| 282 |
concurrency_limit=1,
|
| 283 |
)
|
| 284 |
|
|
|
|
| 109 |
|
| 110 |
|
| 111 |
def run_dfine_classify(image, encoder_choice, refs_path):
|
| 112 |
+
"""Tab 2: D-FINE first, then classify crops with Jina or Nomic.
|
| 113 |
+
Returns (group_crop_gallery, known_crop_gallery, status_message).
|
| 114 |
+
"""
|
| 115 |
if image is None:
|
| 116 |
+
return [], [], "Upload an image."
|
| 117 |
|
| 118 |
refs = Path(refs_path.strip()) if refs_path and refs_path.strip() else Path(REFS_DIR)
|
| 119 |
|
| 120 |
if not refs.is_dir():
|
| 121 |
+
return [], [], f"Refs folder not found: {refs}"
|
| 122 |
|
| 123 |
# Tuned on COCO GT: conf=0.5, gap=0.02.
|
| 124 |
# Lower det_threshold/min_side so D-FINE picks up more objects (gun, phone, etc.) like local.
|
| 125 |
+
group_crops, known_crops, status = run_single_image(
|
|
|
|
| 126 |
image,
|
| 127 |
refs_dir=refs,
|
| 128 |
encoder_choice=encoder_choice.lower(),
|
|
|
|
| 133 |
crop_dedup_iou=0.4,
|
| 134 |
)
|
| 135 |
|
| 136 |
+
if status is not None:
|
| 137 |
+
return [(g, None) for g in (group_crops or [])], [(k, None) for k in (known_crops or [])], status
|
| 138 |
|
| 139 |
+
return [(g, None) for g in group_crops], [(k, None) for k in known_crops], ""
|
| 140 |
|
| 141 |
|
| 142 |
IMG_HEIGHT = 400
|
|
|
|
| 264 |
|
| 265 |
with gr.Column(scale=1):
|
| 266 |
|
| 267 |
+
out_gallery_dfine = gr.Gallery(
|
| 268 |
+
label="Person/car crops (bboxes: gun, knife, cigarette, phone only)",
|
| 269 |
+
height=IMG_HEIGHT,
|
| 270 |
+
columns=2,
|
| 271 |
+
object_fit="contain",
|
| 272 |
+
)
|
| 273 |
+
|
| 274 |
+
out_gallery_known = gr.Gallery(
|
| 275 |
+
label="Known objects (class + score above each crop)",
|
| 276 |
+
height=IMG_HEIGHT,
|
| 277 |
+
columns=4,
|
| 278 |
+
object_fit="contain",
|
| 279 |
)
|
| 280 |
|
| 281 |
+
out_status_dfine = gr.Textbox(
|
| 282 |
+
label="Status",
|
| 283 |
+
lines=2,
|
| 284 |
+
interactive=False,
|
| 285 |
)
|
| 286 |
|
| 287 |
btn_dfine.click(
|
| 288 |
fn=run_dfine_classify,
|
| 289 |
inputs=[inp_dfine, encoder_choice, refs_path],
|
| 290 |
+
outputs=[out_gallery_dfine, out_gallery_known, out_status_dfine],
|
| 291 |
concurrency_limit=1,
|
| 292 |
)
|
| 293 |
|
dfine_jina_pipeline.py
CHANGED
|
@@ -21,9 +21,13 @@ from jina_fewshot import (
|
|
| 21 |
JinaCLIPv2Encoder,
|
| 22 |
build_refs,
|
| 23 |
classify as jina_classify,
|
|
|
|
| 24 |
draw_label_on_image,
|
| 25 |
)
|
| 26 |
|
|
|
|
|
|
|
|
|
|
| 27 |
from nomic_fewshot import NomicTextEncoder, NomicVisionEncoder, build_refs_nomic
|
| 28 |
|
| 29 |
|
|
@@ -534,7 +538,10 @@ def run_single_image(
|
|
| 534 |
refs_dir: path to refs folder (str or Path).
|
| 535 |
encoder_choice: "jina" or "nomic".
|
| 536 |
|
| 537 |
-
Returns (
|
|
|
|
|
|
|
|
|
|
| 538 |
"""
|
| 539 |
import numpy as np
|
| 540 |
from PIL import Image
|
|
@@ -543,7 +550,7 @@ def run_single_image(
|
|
| 543 |
|
| 544 |
refs_dir = Path(refs_dir)
|
| 545 |
if not refs_dir.is_dir():
|
| 546 |
-
return
|
| 547 |
|
| 548 |
device = device or ("cuda" if torch.cuda.is_available() else "cpu")
|
| 549 |
print(f"[*] Device: {device}")
|
|
@@ -565,7 +572,7 @@ def run_single_image(
|
|
| 565 |
detections = run_dfine(pil, image_processor, dfine_model, device, det_threshold)
|
| 566 |
person_car = [d for d in detections if d["cls"] in person_car_ids]
|
| 567 |
if not person_car:
|
| 568 |
-
return
|
| 569 |
|
| 570 |
grouped = group_detections(person_car, group_dist)
|
| 571 |
grouped.sort(key=lambda x: x["conf"], reverse=True)
|
|
@@ -623,8 +630,8 @@ def run_single_image(
|
|
| 623 |
|
| 624 |
if not kept:
|
| 625 |
if not candidates:
|
| 626 |
-
return
|
| 627 |
-
return
|
| 628 |
|
| 629 |
# Load encoder + refs for chosen model
|
| 630 |
if encoder_choice == "jina":
|
|
@@ -651,10 +658,9 @@ def run_single_image(
|
|
| 651 |
|
| 652 |
nomic_encoder, ref_labels, ref_embs = _APP_NOMIC
|
| 653 |
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
for i, (expanded_box, d, gidx, crop_idx) in enumerate(kept):
|
| 658 |
if squarify:
|
| 659 |
bx1, by1, bx2, by2 = squarify_crop_box(
|
| 660 |
expanded_box[0],
|
|
@@ -662,7 +668,7 @@ def run_single_image(
|
|
| 662 |
expanded_box[2],
|
| 663 |
expanded_box[3],
|
| 664 |
img_w,
|
| 665 |
-
img_h
|
| 666 |
)
|
| 667 |
else:
|
| 668 |
bx1, by1, bx2, by2 = expanded_box[0], expanded_box[1], expanded_box[2], expanded_box[3]
|
|
@@ -678,14 +684,46 @@ def run_single_image(
|
|
| 678 |
|
| 679 |
pred = result["prediction"] if result["prediction"] in ref_labels else f"unknown ({d['label']})"
|
| 680 |
conf = result["confidence"]
|
|
|
|
| 681 |
|
| 682 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 683 |
|
| 684 |
-
|
| 685 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 686 |
|
| 687 |
-
|
| 688 |
-
return np.array(out_img), result_text
|
| 689 |
|
| 690 |
|
| 691 |
if __name__ == "__main__":
|
|
|
|
| 21 |
JinaCLIPv2Encoder,
|
| 22 |
build_refs,
|
| 23 |
classify as jina_classify,
|
| 24 |
+
draw_bboxes_on_image,
|
| 25 |
draw_label_on_image,
|
| 26 |
)
|
| 27 |
|
| 28 |
+
# Only these ref classes get bboxes on group crops and appear in the known-object gallery
|
| 29 |
+
KNOWN_DISPLAY_CLASSES = {"gun", "knife", "cigarette", "phone"}
|
| 30 |
+
|
| 31 |
from nomic_fewshot import NomicTextEncoder, NomicVisionEncoder, build_refs_nomic
|
| 32 |
|
| 33 |
|
|
|
|
| 538 |
refs_dir: path to refs folder (str or Path).
|
| 539 |
encoder_choice: "jina" or "nomic".
|
| 540 |
|
| 541 |
+
Returns (group_crop_images, known_crop_composites, status_message).
|
| 542 |
+
- group_crop_images: list of PIL/numpy (one per person/car group, with bboxes for known objects only).
|
| 543 |
+
- known_crop_composites: list of PIL/numpy (label+score above + crop) for known classes only.
|
| 544 |
+
- status_message: None on success, or error/empty-state string.
|
| 545 |
"""
|
| 546 |
import numpy as np
|
| 547 |
from PIL import Image
|
|
|
|
| 550 |
|
| 551 |
refs_dir = Path(refs_dir)
|
| 552 |
if not refs_dir.is_dir():
|
| 553 |
+
return [], [], f"Refs folder not found: {refs_dir}"
|
| 554 |
|
| 555 |
device = device or ("cuda" if torch.cuda.is_available() else "cpu")
|
| 556 |
print(f"[*] Device: {device}")
|
|
|
|
| 572 |
detections = run_dfine(pil, image_processor, dfine_model, device, det_threshold)
|
| 573 |
person_car = [d for d in detections if d["cls"] in person_car_ids]
|
| 574 |
if not person_car:
|
| 575 |
+
return [], [], "No person/car detected. No small-object crops."
|
| 576 |
|
| 577 |
grouped = group_detections(person_car, group_dist)
|
| 578 |
grouped.sort(key=lambda x: x["conf"], reverse=True)
|
|
|
|
| 630 |
|
| 631 |
if not kept:
|
| 632 |
if not candidates:
|
| 633 |
+
return [], [], "No small-object crops: D-FINE did not detect any object (gun/phone/etc.) inside person/car areas, or all were below min size. Try a higher-resolution image."
|
| 634 |
+
return [], [], "No small-object crops (after dedup)."
|
| 635 |
|
| 636 |
# Load encoder + refs for chosen model
|
| 637 |
if encoder_choice == "jina":
|
|
|
|
| 658 |
|
| 659 |
nomic_encoder, ref_labels, ref_embs = _APP_NOMIC
|
| 660 |
|
| 661 |
+
# Classify each kept crop and store (gidx, box_in_full_image, crop_pil, pred, conf)
|
| 662 |
+
results_per_crop = []
|
| 663 |
+
for expanded_box, d, gidx, crop_idx in kept:
|
|
|
|
| 664 |
if squarify:
|
| 665 |
bx1, by1, bx2, by2 = squarify_crop_box(
|
| 666 |
expanded_box[0],
|
|
|
|
| 668 |
expanded_box[2],
|
| 669 |
expanded_box[3],
|
| 670 |
img_w,
|
| 671 |
+
img_h,
|
| 672 |
)
|
| 673 |
else:
|
| 674 |
bx1, by1, bx2, by2 = expanded_box[0], expanded_box[1], expanded_box[2], expanded_box[3]
|
|
|
|
| 684 |
|
| 685 |
pred = result["prediction"] if result["prediction"] in ref_labels else f"unknown ({d['label']})"
|
| 686 |
conf = result["confidence"]
|
| 687 |
+
results_per_crop.append((gidx, (bx1, by1, bx2, by2), crop_pil, pred, conf))
|
| 688 |
|
| 689 |
+
# Build group crop images: one per person/car group, with bboxes only for known objects
|
| 690 |
+
group_crop_images = []
|
| 691 |
+
for gidx, grp in enumerate(top_groups):
|
| 692 |
+
gx1, gy1, gx2, gy2 = grp["box"]
|
| 693 |
+
gx1, gy1 = int(gx1), int(gy1)
|
| 694 |
+
gx2, gy2 = int(gx2), int(gy2)
|
| 695 |
+
gx1, gy1 = max(0, gx1), max(0, gy1)
|
| 696 |
+
gx2, gy2 = min(img_w, gx2), min(img_h, gy2)
|
| 697 |
+
if gx2 <= gx1 or gy2 <= gy1:
|
| 698 |
+
continue
|
| 699 |
+
group_crop = pil.crop((gx1, gy1, gx2, gy2)).copy().convert("RGB")
|
| 700 |
+
crop_w, crop_h = group_crop.size
|
| 701 |
|
| 702 |
+
boxes_to_draw = []
|
| 703 |
+
for (gidx2, (bx1, by1, bx2, by2), _crop_pil, pred, conf) in results_per_crop:
|
| 704 |
+
if gidx2 != gidx or pred not in KNOWN_DISPLAY_CLASSES:
|
| 705 |
+
continue
|
| 706 |
+
# Convert to group-crop-relative coords and clamp
|
| 707 |
+
rx1 = max(0, min(crop_w, bx1 - gx1))
|
| 708 |
+
ry1 = max(0, min(crop_h, by1 - gy1))
|
| 709 |
+
rx2 = max(0, min(crop_w, bx2 - gx1))
|
| 710 |
+
ry2 = max(0, min(crop_h, by2 - gy1))
|
| 711 |
+
if rx2 > rx1 and ry2 > ry1:
|
| 712 |
+
boxes_to_draw.append((rx1, ry1, rx2, ry2, pred, conf))
|
| 713 |
+
|
| 714 |
+
if boxes_to_draw:
|
| 715 |
+
group_crop = draw_bboxes_on_image(group_crop, boxes_to_draw)
|
| 716 |
+
group_crop_images.append(np.array(group_crop))
|
| 717 |
+
|
| 718 |
+
# Build known-only gallery: composite (label above + crop) for each accepted known class
|
| 719 |
+
known_crop_composites = []
|
| 720 |
+
for (_gidx, _box, crop_pil, pred, conf) in results_per_crop:
|
| 721 |
+
if pred not in KNOWN_DISPLAY_CLASSES:
|
| 722 |
+
continue
|
| 723 |
+
composite = draw_label_on_image(crop_pil, pred, conf)
|
| 724 |
+
known_crop_composites.append(np.array(composite))
|
| 725 |
|
| 726 |
+
return group_crop_images, known_crop_composites, None
|
|
|
|
| 727 |
|
| 728 |
|
| 729 |
if __name__ == "__main__":
|
jina_fewshot.py
CHANGED
|
@@ -78,6 +78,28 @@ def draw_label_on_image(img: Image.Image, label: str, confidence: float) -> Imag
|
|
| 78 |
return out
|
| 79 |
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
CLASS_PROMPTS = {
|
| 82 |
"knife": [
|
| 83 |
"a knife",
|
|
|
|
| 78 |
return out
|
| 79 |
|
| 80 |
|
| 81 |
+
def draw_bboxes_on_image(
|
| 82 |
+
img: Image.Image,
|
| 83 |
+
boxes: list[tuple[float, float, float, float, str, float]],
|
| 84 |
+
) -> Image.Image:
|
| 85 |
+
"""Draw bboxes and labels (label conf) on image. boxes: list of (x1, y1, x2, y2, label, conf)."""
|
| 86 |
+
img = img.convert("RGB")
|
| 87 |
+
draw = ImageDraw.Draw(img)
|
| 88 |
+
w, h = img.width, img.height
|
| 89 |
+
font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
|
| 90 |
+
try:
|
| 91 |
+
font = ImageFont.truetype(font_path, size=max(10, min(h, w) // 20))
|
| 92 |
+
except OSError:
|
| 93 |
+
font = ImageFont.load_default()
|
| 94 |
+
|
| 95 |
+
for (x1, y1, x2, y2, label, conf) in boxes:
|
| 96 |
+
x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
|
| 97 |
+
draw.rectangle([x1, y1, x2, y2], outline=(0, 255, 0), width=2)
|
| 98 |
+
text = f"{label} {conf:.2f}"
|
| 99 |
+
draw.text((x1, max(0, y1 - 16)), text, fill=(0, 255, 0), font=font)
|
| 100 |
+
return img
|
| 101 |
+
|
| 102 |
+
|
| 103 |
CLASS_PROMPTS = {
|
| 104 |
"knife": [
|
| 105 |
"a knife",
|