Orkhan Hasanli committed on
Commit
be400de
·
1 Parent(s): d6dee9e

D-FINE: person/car crop galleries, known-object bboxes only

Browse files
Files changed (3) hide show
  1. app.py +26 -17
  2. dfine_jina_pipeline.py +53 -15
  3. jina_fewshot.py +22 -0
app.py CHANGED
@@ -109,20 +109,20 @@ def run_detection(image, model):
109
 
110
 
111
  def run_dfine_classify(image, encoder_choice, refs_path):
112
- """Tab 2: D-FINE first, then classify crops with Jina or Nomic."""
113
-
 
114
  if image is None:
115
- return None, "Upload an image."
116
 
117
  refs = Path(refs_path.strip()) if refs_path and refs_path.strip() else Path(REFS_DIR)
118
 
119
  if not refs.is_dir():
120
- return None, f"Refs folder not found: {refs}"
121
 
122
  # Tuned on COCO GT: conf=0.5, gap=0.02.
123
  # Lower det_threshold/min_side so D-FINE picks up more objects (gun, phone, etc.) like local.
124
-
125
- out_img, text = run_single_image(
126
  image,
127
  refs_dir=refs,
128
  encoder_choice=encoder_choice.lower(),
@@ -133,10 +133,10 @@ def run_dfine_classify(image, encoder_choice, refs_path):
133
  crop_dedup_iou=0.4,
134
  )
135
 
136
- if out_img is None:
137
- return None, text
138
 
139
- return out_img, text
140
 
141
 
142
  IMG_HEIGHT = 400
@@ -264,21 +264,30 @@ with gr.Blocks(title="Small Object Detection") as app:
264
 
265
  with gr.Column(scale=1):
266
 
267
- out_img_dfine = gr.Image(
268
- label="Output (crops with labels)",
269
- height=IMG_HEIGHT
 
 
 
 
 
 
 
 
 
270
  )
271
 
272
- out_text_dfine = gr.Textbox(
273
- label="Crop predictions",
274
- lines=10,
275
- interactive=False
276
  )
277
 
278
  btn_dfine.click(
279
  fn=run_dfine_classify,
280
  inputs=[inp_dfine, encoder_choice, refs_path],
281
- outputs=[out_img_dfine, out_text_dfine],
282
  concurrency_limit=1,
283
  )
284
 
 
109
 
110
 
111
  def run_dfine_classify(image, encoder_choice, refs_path):
112
+ """Tab 2: D-FINE first, then classify crops with Jina or Nomic.
113
+ Returns (group_crop_gallery, known_crop_gallery, status_message).
114
+ """
115
  if image is None:
116
+ return [], [], "Upload an image."
117
 
118
  refs = Path(refs_path.strip()) if refs_path and refs_path.strip() else Path(REFS_DIR)
119
 
120
  if not refs.is_dir():
121
+ return [], [], f"Refs folder not found: {refs}"
122
 
123
  # Tuned on COCO GT: conf=0.5, gap=0.02.
124
  # Lower det_threshold/min_side so D-FINE picks up more objects (gun, phone, etc.) like local.
125
+ group_crops, known_crops, status = run_single_image(
 
126
  image,
127
  refs_dir=refs,
128
  encoder_choice=encoder_choice.lower(),
 
133
  crop_dedup_iou=0.4,
134
  )
135
 
136
+ if status is not None:
137
+ return [(g, None) for g in (group_crops or [])], [(k, None) for k in (known_crops or [])], status
138
 
139
+ return [(g, None) for g in group_crops], [(k, None) for k in known_crops], ""
140
 
141
 
142
  IMG_HEIGHT = 400
 
264
 
265
  with gr.Column(scale=1):
266
 
267
+ out_gallery_dfine = gr.Gallery(
268
+ label="Person/car crops (bboxes: gun, knife, cigarette, phone only)",
269
+ height=IMG_HEIGHT,
270
+ columns=2,
271
+ object_fit="contain",
272
+ )
273
+
274
+ out_gallery_known = gr.Gallery(
275
+ label="Known objects (class + score above each crop)",
276
+ height=IMG_HEIGHT,
277
+ columns=4,
278
+ object_fit="contain",
279
  )
280
 
281
+ out_status_dfine = gr.Textbox(
282
+ label="Status",
283
+ lines=2,
284
+ interactive=False,
285
  )
286
 
287
  btn_dfine.click(
288
  fn=run_dfine_classify,
289
  inputs=[inp_dfine, encoder_choice, refs_path],
290
+ outputs=[out_gallery_dfine, out_gallery_known, out_status_dfine],
291
  concurrency_limit=1,
292
  )
293
 
dfine_jina_pipeline.py CHANGED
@@ -21,9 +21,13 @@ from jina_fewshot import (
21
  JinaCLIPv2Encoder,
22
  build_refs,
23
  classify as jina_classify,
 
24
  draw_label_on_image,
25
  )
26
 
 
 
 
27
  from nomic_fewshot import NomicTextEncoder, NomicVisionEncoder, build_refs_nomic
28
 
29
 
@@ -534,7 +538,10 @@ def run_single_image(
534
  refs_dir: path to refs folder (str or Path).
535
  encoder_choice: "jina" or "nomic".
536
 
537
- Returns (annotated_pil, result_text) for display in app.
 
 
 
538
  """
539
  import numpy as np
540
  from PIL import Image
@@ -543,7 +550,7 @@ def run_single_image(
543
 
544
  refs_dir = Path(refs_dir)
545
  if not refs_dir.is_dir():
546
- return None, f"Refs folder not found: {refs_dir}"
547
 
548
  device = device or ("cuda" if torch.cuda.is_available() else "cpu")
549
  print(f"[*] Device: {device}")
@@ -565,7 +572,7 @@ def run_single_image(
565
  detections = run_dfine(pil, image_processor, dfine_model, device, det_threshold)
566
  person_car = [d for d in detections if d["cls"] in person_car_ids]
567
  if not person_car:
568
- return np.array(pil), "No person/car detected. No small-object crops."
569
 
570
  grouped = group_detections(person_car, group_dist)
571
  grouped.sort(key=lambda x: x["conf"], reverse=True)
@@ -623,8 +630,8 @@ def run_single_image(
623
 
624
  if not kept:
625
  if not candidates:
626
- return np.array(pil), "No small-object crops: D-FINE did not detect any object (gun/phone/etc.) inside person/car areas, or all were below min size. Try a higher-resolution image."
627
- return np.array(pil), "No small-object crops (after dedup)."
628
 
629
  # Load encoder + refs for chosen model
630
  if encoder_choice == "jina":
@@ -651,10 +658,9 @@ def run_single_image(
651
 
652
  nomic_encoder, ref_labels, ref_embs = _APP_NOMIC
653
 
654
- lines = []
655
- out_img = pil.copy()
656
-
657
- for i, (expanded_box, d, gidx, crop_idx) in enumerate(kept):
658
  if squarify:
659
  bx1, by1, bx2, by2 = squarify_crop_box(
660
  expanded_box[0],
@@ -662,7 +668,7 @@ def run_single_image(
662
  expanded_box[2],
663
  expanded_box[3],
664
  img_w,
665
- img_h
666
  )
667
  else:
668
  bx1, by1, bx2, by2 = expanded_box[0], expanded_box[1], expanded_box[2], expanded_box[3]
@@ -678,14 +684,46 @@ def run_single_image(
678
 
679
  pred = result["prediction"] if result["prediction"] in ref_labels else f"unknown ({d['label']})"
680
  conf = result["confidence"]
 
681
 
682
- lines.append(f"Crop {i+1}: {pred} ({conf:.2f})")
 
 
 
 
 
 
 
 
 
 
 
683
 
684
- labeled = draw_label_on_image(crop_pil, pred, conf)
685
- out_img.paste(labeled, (bx1, by1))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
686
 
687
- result_text = "\n".join(lines) if lines else "No crops"
688
- return np.array(out_img), result_text
689
 
690
 
691
  if __name__ == "__main__":
 
21
  JinaCLIPv2Encoder,
22
  build_refs,
23
  classify as jina_classify,
24
+ draw_bboxes_on_image,
25
  draw_label_on_image,
26
  )
27
 
28
+ # Only these ref classes get bboxes on group crops and appear in the known-object gallery
29
+ KNOWN_DISPLAY_CLASSES = {"gun", "knife", "cigarette", "phone"}
30
+
31
  from nomic_fewshot import NomicTextEncoder, NomicVisionEncoder, build_refs_nomic
32
 
33
 
 
538
  refs_dir: path to refs folder (str or Path).
539
  encoder_choice: "jina" or "nomic".
540
 
541
+ Returns (group_crop_images, known_crop_composites, status_message).
542
+ - group_crop_images: list of PIL/numpy (one per person/car group, with bboxes for known objects only).
543
+ - known_crop_composites: list of PIL/numpy (label+score above + crop) for known classes only.
544
+ - status_message: None on success, or error/empty-state string.
545
  """
546
  import numpy as np
547
  from PIL import Image
 
550
 
551
  refs_dir = Path(refs_dir)
552
  if not refs_dir.is_dir():
553
+ return [], [], f"Refs folder not found: {refs_dir}"
554
 
555
  device = device or ("cuda" if torch.cuda.is_available() else "cpu")
556
  print(f"[*] Device: {device}")
 
572
  detections = run_dfine(pil, image_processor, dfine_model, device, det_threshold)
573
  person_car = [d for d in detections if d["cls"] in person_car_ids]
574
  if not person_car:
575
+ return [], [], "No person/car detected. No small-object crops."
576
 
577
  grouped = group_detections(person_car, group_dist)
578
  grouped.sort(key=lambda x: x["conf"], reverse=True)
 
630
 
631
  if not kept:
632
  if not candidates:
633
+ return [], [], "No small-object crops: D-FINE did not detect any object (gun/phone/etc.) inside person/car areas, or all were below min size. Try a higher-resolution image."
634
+ return [], [], "No small-object crops (after dedup)."
635
 
636
  # Load encoder + refs for chosen model
637
  if encoder_choice == "jina":
 
658
 
659
  nomic_encoder, ref_labels, ref_embs = _APP_NOMIC
660
 
661
+ # Classify each kept crop and store (gidx, box_in_full_image, crop_pil, pred, conf)
662
+ results_per_crop = []
663
+ for expanded_box, d, gidx, crop_idx in kept:
 
664
  if squarify:
665
  bx1, by1, bx2, by2 = squarify_crop_box(
666
  expanded_box[0],
 
668
  expanded_box[2],
669
  expanded_box[3],
670
  img_w,
671
+ img_h,
672
  )
673
  else:
674
  bx1, by1, bx2, by2 = expanded_box[0], expanded_box[1], expanded_box[2], expanded_box[3]
 
684
 
685
  pred = result["prediction"] if result["prediction"] in ref_labels else f"unknown ({d['label']})"
686
  conf = result["confidence"]
687
+ results_per_crop.append((gidx, (bx1, by1, bx2, by2), crop_pil, pred, conf))
688
 
689
+ # Build group crop images: one per person/car group, with bboxes only for known objects
690
+ group_crop_images = []
691
+ for gidx, grp in enumerate(top_groups):
692
+ gx1, gy1, gx2, gy2 = grp["box"]
693
+ gx1, gy1 = int(gx1), int(gy1)
694
+ gx2, gy2 = int(gx2), int(gy2)
695
+ gx1, gy1 = max(0, gx1), max(0, gy1)
696
+ gx2, gy2 = min(img_w, gx2), min(img_h, gy2)
697
+ if gx2 <= gx1 or gy2 <= gy1:
698
+ continue
699
+ group_crop = pil.crop((gx1, gy1, gx2, gy2)).copy().convert("RGB")
700
+ crop_w, crop_h = group_crop.size
701
 
702
+ boxes_to_draw = []
703
+ for (gidx2, (bx1, by1, bx2, by2), _crop_pil, pred, conf) in results_per_crop:
704
+ if gidx2 != gidx or pred not in KNOWN_DISPLAY_CLASSES:
705
+ continue
706
+ # Convert to group-crop-relative coords and clamp
707
+ rx1 = max(0, min(crop_w, bx1 - gx1))
708
+ ry1 = max(0, min(crop_h, by1 - gy1))
709
+ rx2 = max(0, min(crop_w, bx2 - gx1))
710
+ ry2 = max(0, min(crop_h, by2 - gy1))
711
+ if rx2 > rx1 and ry2 > ry1:
712
+ boxes_to_draw.append((rx1, ry1, rx2, ry2, pred, conf))
713
+
714
+ if boxes_to_draw:
715
+ group_crop = draw_bboxes_on_image(group_crop, boxes_to_draw)
716
+ group_crop_images.append(np.array(group_crop))
717
+
718
+ # Build known-only gallery: composite (label above + crop) for each accepted known class
719
+ known_crop_composites = []
720
+ for (_gidx, _box, crop_pil, pred, conf) in results_per_crop:
721
+ if pred not in KNOWN_DISPLAY_CLASSES:
722
+ continue
723
+ composite = draw_label_on_image(crop_pil, pred, conf)
724
+ known_crop_composites.append(np.array(composite))
725
 
726
+ return group_crop_images, known_crop_composites, None
 
727
 
728
 
729
  if __name__ == "__main__":
jina_fewshot.py CHANGED
@@ -78,6 +78,28 @@ def draw_label_on_image(img: Image.Image, label: str, confidence: float) -> Imag
78
  return out
79
 
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  CLASS_PROMPTS = {
82
  "knife": [
83
  "a knife",
 
78
  return out
79
 
80
 
81
def draw_bboxes_on_image(
    img: Image.Image,
    boxes: list[tuple[float, float, float, float, str, float]],
) -> Image.Image:
    """Draw green bounding boxes with "label conf" captions on a copy of *img*.

    Args:
        img: source image; converted to RGB (``convert`` returns a copy, so the
            caller's image is never mutated).
        boxes: list of (x1, y1, x2, y2, label, conf) tuples in pixel coordinates.

    Returns:
        A new RGB PIL image with the boxes and captions drawn.
    """
    img = img.convert("RGB")
    draw = ImageDraw.Draw(img)
    w, h = img.width, img.height
    # Scale the font with the image so captions stay legible on large crops.
    font_size = max(10, min(h, w) // 20)
    font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
    try:
        font = ImageFont.truetype(font_path, size=font_size)
    except OSError:
        # Font file missing (non-Debian host): fall back to PIL's builtin
        # bitmap font, which renders at roughly 10 px.
        font = ImageFont.load_default()
        font_size = 10

    for (x1, y1, x2, y2, label, conf) in boxes:
        x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
        draw.rectangle([x1, y1, x2, y2], outline=(0, 255, 0), width=2)
        text = f"{label} {conf:.2f}"
        # Offset the caption by the actual font size: the previous fixed 16 px
        # overlapped the box whenever the scaled font was larger than 16 px.
        draw.text((x1, max(0, y1 - font_size - 2)), text, fill=(0, 255, 0), font=font)
    return img
101
+
102
+
103
  CLASS_PROMPTS = {
104
  "knife": [
105
  "a knife",