Spaces:
Running
Running
Orkhan Hasanli committed on
Commit ·
be400de
1
Parent(s): d6dee9e
D-FINE: person/car crop galleries, known-object bboxes only
Browse files- app.py +26 -17
- dfine_jina_pipeline.py +53 -15
- jina_fewshot.py +22 -0
app.py
CHANGED
|
@@ -109,20 +109,20 @@ def run_detection(image, model):
|
|
| 109 |
|
| 110 |
|
| 111 |
def run_dfine_classify(image, encoder_choice, refs_path):
|
| 112 |
-
"""Tab 2: D-FINE first, then classify crops with Jina or Nomic.
|
| 113 |
-
|
|
|
|
| 114 |
if image is None:
|
| 115 |
-
return
|
| 116 |
|
| 117 |
refs = Path(refs_path.strip()) if refs_path and refs_path.strip() else Path(REFS_DIR)
|
| 118 |
|
| 119 |
if not refs.is_dir():
|
| 120 |
-
return
|
| 121 |
|
| 122 |
# Tuned on COCO GT: conf=0.5, gap=0.02.
|
| 123 |
# Lower det_threshold/min_side so D-FINE picks up more objects (gun, phone, etc.) like local.
|
| 124 |
-
|
| 125 |
-
out_img, text = run_single_image(
|
| 126 |
image,
|
| 127 |
refs_dir=refs,
|
| 128 |
encoder_choice=encoder_choice.lower(),
|
|
@@ -133,10 +133,10 @@ def run_dfine_classify(image, encoder_choice, refs_path):
|
|
| 133 |
crop_dedup_iou=0.4,
|
| 134 |
)
|
| 135 |
|
| 136 |
-
if
|
| 137 |
-
return None,
|
| 138 |
|
| 139 |
-
return
|
| 140 |
|
| 141 |
|
| 142 |
IMG_HEIGHT = 400
|
|
@@ -264,21 +264,30 @@ with gr.Blocks(title="Small Object Detection") as app:
|
|
| 264 |
|
| 265 |
with gr.Column(scale=1):
|
| 266 |
|
| 267 |
-
|
| 268 |
-
label="
|
| 269 |
-
height=IMG_HEIGHT
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
)
|
| 271 |
|
| 272 |
-
|
| 273 |
-
label="
|
| 274 |
-
lines=
|
| 275 |
-
interactive=False
|
| 276 |
)
|
| 277 |
|
| 278 |
btn_dfine.click(
|
| 279 |
fn=run_dfine_classify,
|
| 280 |
inputs=[inp_dfine, encoder_choice, refs_path],
|
| 281 |
-
outputs=[
|
| 282 |
concurrency_limit=1,
|
| 283 |
)
|
| 284 |
|
|
|
|
| 109 |
|
| 110 |
|
| 111 |
def run_dfine_classify(image, encoder_choice, refs_path):
|
| 112 |
+
"""Tab 2: D-FINE first, then classify crops with Jina or Nomic.
|
| 113 |
+
Returns (group_crop_gallery, known_crop_gallery, status_message).
|
| 114 |
+
"""
|
| 115 |
if image is None:
|
| 116 |
+
return [], [], "Upload an image."
|
| 117 |
|
| 118 |
refs = Path(refs_path.strip()) if refs_path and refs_path.strip() else Path(REFS_DIR)
|
| 119 |
|
| 120 |
if not refs.is_dir():
|
| 121 |
+
return [], [], f"Refs folder not found: {refs}"
|
| 122 |
|
| 123 |
# Tuned on COCO GT: conf=0.5, gap=0.02.
|
| 124 |
# Lower det_threshold/min_side so D-FINE picks up more objects (gun, phone, etc.) like local.
|
| 125 |
+
group_crops, known_crops, status = run_single_image(
|
|
|
|
| 126 |
image,
|
| 127 |
refs_dir=refs,
|
| 128 |
encoder_choice=encoder_choice.lower(),
|
|
|
|
| 133 |
crop_dedup_iou=0.4,
|
| 134 |
)
|
| 135 |
|
| 136 |
+
if status is not None:
|
| 137 |
+
return [(g, None) for g in (group_crops or [])], [(k, None) for k in (known_crops or [])], status
|
| 138 |
|
| 139 |
+
return [(g, None) for g in group_crops], [(k, None) for k in known_crops], ""
|
| 140 |
|
| 141 |
|
| 142 |
IMG_HEIGHT = 400
|
|
|
|
| 264 |
|
| 265 |
with gr.Column(scale=1):
|
| 266 |
|
| 267 |
+
out_gallery_dfine = gr.Gallery(
|
| 268 |
+
label="Person/car crops (bboxes: gun, knife, cigarette, phone only)",
|
| 269 |
+
height=IMG_HEIGHT,
|
| 270 |
+
columns=2,
|
| 271 |
+
object_fit="contain",
|
| 272 |
+
)
|
| 273 |
+
|
| 274 |
+
out_gallery_known = gr.Gallery(
|
| 275 |
+
label="Known objects (class + score above each crop)",
|
| 276 |
+
height=IMG_HEIGHT,
|
| 277 |
+
columns=4,
|
| 278 |
+
object_fit="contain",
|
| 279 |
)
|
| 280 |
|
| 281 |
+
out_status_dfine = gr.Textbox(
|
| 282 |
+
label="Status",
|
| 283 |
+
lines=2,
|
| 284 |
+
interactive=False,
|
| 285 |
)
|
| 286 |
|
| 287 |
btn_dfine.click(
|
| 288 |
fn=run_dfine_classify,
|
| 289 |
inputs=[inp_dfine, encoder_choice, refs_path],
|
| 290 |
+
outputs=[out_gallery_dfine, out_gallery_known, out_status_dfine],
|
| 291 |
concurrency_limit=1,
|
| 292 |
)
|
| 293 |
|
dfine_jina_pipeline.py
CHANGED
|
@@ -21,9 +21,13 @@ from jina_fewshot import (
|
|
| 21 |
JinaCLIPv2Encoder,
|
| 22 |
build_refs,
|
| 23 |
classify as jina_classify,
|
|
|
|
| 24 |
draw_label_on_image,
|
| 25 |
)
|
| 26 |
|
|
|
|
|
|
|
|
|
|
| 27 |
from nomic_fewshot import NomicTextEncoder, NomicVisionEncoder, build_refs_nomic
|
| 28 |
|
| 29 |
|
|
@@ -534,7 +538,10 @@ def run_single_image(
|
|
| 534 |
refs_dir: path to refs folder (str or Path).
|
| 535 |
encoder_choice: "jina" or "nomic".
|
| 536 |
|
| 537 |
-
Returns (
|
|
|
|
|
|
|
|
|
|
| 538 |
"""
|
| 539 |
import numpy as np
|
| 540 |
from PIL import Image
|
|
@@ -543,7 +550,7 @@ def run_single_image(
|
|
| 543 |
|
| 544 |
refs_dir = Path(refs_dir)
|
| 545 |
if not refs_dir.is_dir():
|
| 546 |
-
return
|
| 547 |
|
| 548 |
device = device or ("cuda" if torch.cuda.is_available() else "cpu")
|
| 549 |
print(f"[*] Device: {device}")
|
|
@@ -565,7 +572,7 @@ def run_single_image(
|
|
| 565 |
detections = run_dfine(pil, image_processor, dfine_model, device, det_threshold)
|
| 566 |
person_car = [d for d in detections if d["cls"] in person_car_ids]
|
| 567 |
if not person_car:
|
| 568 |
-
return
|
| 569 |
|
| 570 |
grouped = group_detections(person_car, group_dist)
|
| 571 |
grouped.sort(key=lambda x: x["conf"], reverse=True)
|
|
@@ -623,8 +630,8 @@ def run_single_image(
|
|
| 623 |
|
| 624 |
if not kept:
|
| 625 |
if not candidates:
|
| 626 |
-
return
|
| 627 |
-
return
|
| 628 |
|
| 629 |
# Load encoder + refs for chosen model
|
| 630 |
if encoder_choice == "jina":
|
|
@@ -651,10 +658,9 @@ def run_single_image(
|
|
| 651 |
|
| 652 |
nomic_encoder, ref_labels, ref_embs = _APP_NOMIC
|
| 653 |
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
for i, (expanded_box, d, gidx, crop_idx) in enumerate(kept):
|
| 658 |
if squarify:
|
| 659 |
bx1, by1, bx2, by2 = squarify_crop_box(
|
| 660 |
expanded_box[0],
|
|
@@ -662,7 +668,7 @@ def run_single_image(
|
|
| 662 |
expanded_box[2],
|
| 663 |
expanded_box[3],
|
| 664 |
img_w,
|
| 665 |
-
img_h
|
| 666 |
)
|
| 667 |
else:
|
| 668 |
bx1, by1, bx2, by2 = expanded_box[0], expanded_box[1], expanded_box[2], expanded_box[3]
|
|
@@ -678,14 +684,46 @@ def run_single_image(
|
|
| 678 |
|
| 679 |
pred = result["prediction"] if result["prediction"] in ref_labels else f"unknown ({d['label']})"
|
| 680 |
conf = result["confidence"]
|
|
|
|
| 681 |
|
| 682 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 683 |
|
| 684 |
-
|
| 685 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 686 |
|
| 687 |
-
|
| 688 |
-
return np.array(out_img), result_text
|
| 689 |
|
| 690 |
|
| 691 |
if __name__ == "__main__":
|
|
|
|
| 21 |
JinaCLIPv2Encoder,
|
| 22 |
build_refs,
|
| 23 |
classify as jina_classify,
|
| 24 |
+
draw_bboxes_on_image,
|
| 25 |
draw_label_on_image,
|
| 26 |
)
|
| 27 |
|
| 28 |
+
# Only these ref classes get bboxes on group crops and appear in the known-object gallery
|
| 29 |
+
KNOWN_DISPLAY_CLASSES = {"gun", "knife", "cigarette", "phone"}
|
| 30 |
+
|
| 31 |
from nomic_fewshot import NomicTextEncoder, NomicVisionEncoder, build_refs_nomic
|
| 32 |
|
| 33 |
|
|
|
|
| 538 |
refs_dir: path to refs folder (str or Path).
|
| 539 |
encoder_choice: "jina" or "nomic".
|
| 540 |
|
| 541 |
+
Returns (group_crop_images, known_crop_composites, status_message).
|
| 542 |
+
- group_crop_images: list of PIL/numpy (one per person/car group, with bboxes for known objects only).
|
| 543 |
+
- known_crop_composites: list of PIL/numpy (label+score above + crop) for known classes only.
|
| 544 |
+
- status_message: None on success, or error/empty-state string.
|
| 545 |
"""
|
| 546 |
import numpy as np
|
| 547 |
from PIL import Image
|
|
|
|
| 550 |
|
| 551 |
refs_dir = Path(refs_dir)
|
| 552 |
if not refs_dir.is_dir():
|
| 553 |
+
return [], [], f"Refs folder not found: {refs_dir}"
|
| 554 |
|
| 555 |
device = device or ("cuda" if torch.cuda.is_available() else "cpu")
|
| 556 |
print(f"[*] Device: {device}")
|
|
|
|
| 572 |
detections = run_dfine(pil, image_processor, dfine_model, device, det_threshold)
|
| 573 |
person_car = [d for d in detections if d["cls"] in person_car_ids]
|
| 574 |
if not person_car:
|
| 575 |
+
return [], [], "No person/car detected. No small-object crops."
|
| 576 |
|
| 577 |
grouped = group_detections(person_car, group_dist)
|
| 578 |
grouped.sort(key=lambda x: x["conf"], reverse=True)
|
|
|
|
| 630 |
|
| 631 |
if not kept:
|
| 632 |
if not candidates:
|
| 633 |
+
return [], [], "No small-object crops: D-FINE did not detect any object (gun/phone/etc.) inside person/car areas, or all were below min size. Try a higher-resolution image."
|
| 634 |
+
return [], [], "No small-object crops (after dedup)."
|
| 635 |
|
| 636 |
# Load encoder + refs for chosen model
|
| 637 |
if encoder_choice == "jina":
|
|
|
|
| 658 |
|
| 659 |
nomic_encoder, ref_labels, ref_embs = _APP_NOMIC
|
| 660 |
|
| 661 |
+
# Classify each kept crop and store (gidx, box_in_full_image, crop_pil, pred, conf)
|
| 662 |
+
results_per_crop = []
|
| 663 |
+
for expanded_box, d, gidx, crop_idx in kept:
|
|
|
|
| 664 |
if squarify:
|
| 665 |
bx1, by1, bx2, by2 = squarify_crop_box(
|
| 666 |
expanded_box[0],
|
|
|
|
| 668 |
expanded_box[2],
|
| 669 |
expanded_box[3],
|
| 670 |
img_w,
|
| 671 |
+
img_h,
|
| 672 |
)
|
| 673 |
else:
|
| 674 |
bx1, by1, bx2, by2 = expanded_box[0], expanded_box[1], expanded_box[2], expanded_box[3]
|
|
|
|
| 684 |
|
| 685 |
pred = result["prediction"] if result["prediction"] in ref_labels else f"unknown ({d['label']})"
|
| 686 |
conf = result["confidence"]
|
| 687 |
+
results_per_crop.append((gidx, (bx1, by1, bx2, by2), crop_pil, pred, conf))
|
| 688 |
|
| 689 |
+
# Build group crop images: one per person/car group, with bboxes only for known objects
|
| 690 |
+
group_crop_images = []
|
| 691 |
+
for gidx, grp in enumerate(top_groups):
|
| 692 |
+
gx1, gy1, gx2, gy2 = grp["box"]
|
| 693 |
+
gx1, gy1 = int(gx1), int(gy1)
|
| 694 |
+
gx2, gy2 = int(gx2), int(gy2)
|
| 695 |
+
gx1, gy1 = max(0, gx1), max(0, gy1)
|
| 696 |
+
gx2, gy2 = min(img_w, gx2), min(img_h, gy2)
|
| 697 |
+
if gx2 <= gx1 or gy2 <= gy1:
|
| 698 |
+
continue
|
| 699 |
+
group_crop = pil.crop((gx1, gy1, gx2, gy2)).copy().convert("RGB")
|
| 700 |
+
crop_w, crop_h = group_crop.size
|
| 701 |
|
| 702 |
+
boxes_to_draw = []
|
| 703 |
+
for (gidx2, (bx1, by1, bx2, by2), _crop_pil, pred, conf) in results_per_crop:
|
| 704 |
+
if gidx2 != gidx or pred not in KNOWN_DISPLAY_CLASSES:
|
| 705 |
+
continue
|
| 706 |
+
# Convert to group-crop-relative coords and clamp
|
| 707 |
+
rx1 = max(0, min(crop_w, bx1 - gx1))
|
| 708 |
+
ry1 = max(0, min(crop_h, by1 - gy1))
|
| 709 |
+
rx2 = max(0, min(crop_w, bx2 - gx1))
|
| 710 |
+
ry2 = max(0, min(crop_h, by2 - gy1))
|
| 711 |
+
if rx2 > rx1 and ry2 > ry1:
|
| 712 |
+
boxes_to_draw.append((rx1, ry1, rx2, ry2, pred, conf))
|
| 713 |
+
|
| 714 |
+
if boxes_to_draw:
|
| 715 |
+
group_crop = draw_bboxes_on_image(group_crop, boxes_to_draw)
|
| 716 |
+
group_crop_images.append(np.array(group_crop))
|
| 717 |
+
|
| 718 |
+
# Build known-only gallery: composite (label above + crop) for each accepted known class
|
| 719 |
+
known_crop_composites = []
|
| 720 |
+
for (_gidx, _box, crop_pil, pred, conf) in results_per_crop:
|
| 721 |
+
if pred not in KNOWN_DISPLAY_CLASSES:
|
| 722 |
+
continue
|
| 723 |
+
composite = draw_label_on_image(crop_pil, pred, conf)
|
| 724 |
+
known_crop_composites.append(np.array(composite))
|
| 725 |
|
| 726 |
+
return group_crop_images, known_crop_composites, None
|
|
|
|
| 727 |
|
| 728 |
|
| 729 |
if __name__ == "__main__":
|
jina_fewshot.py
CHANGED
|
@@ -78,6 +78,28 @@ def draw_label_on_image(img: Image.Image, label: str, confidence: float) -> Imag
|
|
| 78 |
return out
|
| 79 |
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
CLASS_PROMPTS = {
|
| 82 |
"knife": [
|
| 83 |
"a knife",
|
|
|
|
| 78 |
return out
|
| 79 |
|
| 80 |
|
| 81 |
+
def draw_bboxes_on_image(
|
| 82 |
+
img: Image.Image,
|
| 83 |
+
boxes: list[tuple[float, float, float, float, str, float]],
|
| 84 |
+
) -> Image.Image:
|
| 85 |
+
"""Draw bboxes and labels (label conf) on image. boxes: list of (x1, y1, x2, y2, label, conf)."""
|
| 86 |
+
img = img.convert("RGB")
|
| 87 |
+
draw = ImageDraw.Draw(img)
|
| 88 |
+
w, h = img.width, img.height
|
| 89 |
+
font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
|
| 90 |
+
try:
|
| 91 |
+
font = ImageFont.truetype(font_path, size=max(10, min(h, w) // 20))
|
| 92 |
+
except OSError:
|
| 93 |
+
font = ImageFont.load_default()
|
| 94 |
+
|
| 95 |
+
for (x1, y1, x2, y2, label, conf) in boxes:
|
| 96 |
+
x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
|
| 97 |
+
draw.rectangle([x1, y1, x2, y2], outline=(0, 255, 0), width=2)
|
| 98 |
+
text = f"{label} {conf:.2f}"
|
| 99 |
+
draw.text((x1, max(0, y1 - 16)), text, fill=(0, 255, 0), font=font)
|
| 100 |
+
return img
|
| 101 |
+
|
| 102 |
+
|
| 103 |
CLASS_PROMPTS = {
|
| 104 |
"knife": [
|
| 105 |
"a knife",
|