Spaces:
Sleeping
Sleeping
Updated siglip labels
Browse files
- app.py +3 -6
- dfine_jina_pipeline.py +14 -2
- siglip2_onnx_zeroshot.py +3 -9
- siglip_zeroshot.py +3 -14
app.py
CHANGED
|
@@ -142,10 +142,7 @@ def run_dfine_classify(image, refs_path, dfine_threshold, dfine_model_choice, mi
|
|
| 142 |
classifier=classifier,
|
| 143 |
)
|
| 144 |
|
| 145 |
-
|
| 146 |
-
return [(g, None) for g in (group_crops or [])], [(k, None) for k in (known_crops or [])], status
|
| 147 |
-
|
| 148 |
-
return [(g, None) for g in group_crops], [(k, None) for k in known_crops], ""
|
| 149 |
|
| 150 |
|
| 151 |
IMG_HEIGHT = 400
|
|
@@ -332,8 +329,8 @@ with gr.Blocks(title="Small Object Detection") as app:
|
|
| 332 |
)
|
| 333 |
|
| 334 |
out_status_dfine = gr.Textbox(
|
| 335 |
-
label="
|
| 336 |
-
lines=
|
| 337 |
interactive=False,
|
| 338 |
)
|
| 339 |
|
|
|
|
| 142 |
classifier=classifier,
|
| 143 |
)
|
| 144 |
|
| 145 |
+
return [(g, None) for g in (group_crops or [])], [(k, None) for k in (known_crops or [])], status or ""
|
|
|
|
|
|
|
|
|
|
| 146 |
|
| 147 |
|
| 148 |
IMG_HEIGHT = 400
|
|
|
|
| 329 |
)
|
| 330 |
|
| 331 |
out_status_dfine = gr.Textbox(
|
| 332 |
+
label="Classification details",
|
| 333 |
+
lines=8,
|
| 334 |
interactive=False,
|
| 335 |
)
|
| 336 |
|
dfine_jina_pipeline.py
CHANGED
|
@@ -618,6 +618,7 @@ def run_single_image(
|
|
| 618 |
|
| 619 |
results_per_crop = []
|
| 620 |
group_crop_images = []
|
|
|
|
| 621 |
|
| 622 |
# For each person/car group: crop (with 10% margin), run D-FINE on crop, detect objects, then classify each
|
| 623 |
for gidx, grp in enumerate(top_groups):
|
|
@@ -702,6 +703,14 @@ def run_single_image(
|
|
| 702 |
conf = result["confidence"]
|
| 703 |
results_per_crop.append((gidx, (bx1, by1, bx2, by2), small_crop, pred, conf))
|
| 704 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 705 |
# Draw bboxes on this group crop (bboxes already in crop coords)
|
| 706 |
boxes_to_draw = [
|
| 707 |
(bx1, by1, bx2, by2, pred, conf)
|
|
@@ -714,8 +723,11 @@ def run_single_image(
|
|
| 714 |
crop_pil_drawn = crop_pil
|
| 715 |
group_crop_images.append(np.array(crop_pil_drawn))
|
| 716 |
|
|
|
|
|
|
|
|
|
|
| 717 |
if not results_per_crop:
|
| 718 |
-
return group_crop_images if group_crop_images else [], [], "
|
| 719 |
|
| 720 |
# Build known-only gallery: only objects with conf >= min_display_conf
|
| 721 |
known_crop_composites = []
|
|
@@ -725,7 +737,7 @@ def run_single_image(
|
|
| 725 |
composite = draw_label_on_image(crop_pil, pred, conf)
|
| 726 |
known_crop_composites.append(np.array(composite))
|
| 727 |
|
| 728 |
-
return group_crop_images, known_crop_composites,
|
| 729 |
|
| 730 |
|
| 731 |
if __name__ == "__main__":
|
|
|
|
| 618 |
|
| 619 |
results_per_crop = []
|
| 620 |
group_crop_images = []
|
| 621 |
+
classification_log = []
|
| 622 |
|
| 623 |
# For each person/car group: crop (with 10% margin), run D-FINE on crop, detect objects, then classify each
|
| 624 |
for gidx, grp in enumerate(top_groups):
|
|
|
|
| 703 |
conf = result["confidence"]
|
| 704 |
results_per_crop.append((gidx, (bx1, by1, bx2, by2), small_crop, pred, conf))
|
| 705 |
|
| 706 |
+
# Build per-crop log line
|
| 707 |
+
sims_str = ", ".join(f"{k}: {v:.4f}" for k, v in result.get("all_sims", {}).items())
|
| 708 |
+
classification_log.append(
|
| 709 |
+
f"[group {gidx}] dfine: {d['label']} ({d['conf']:.3f}) → "
|
| 710 |
+
f"{pred} (conf={conf:.4f}, gap={result['gap']:.4f}, 2nd={result.get('second_best','?')}) "
|
| 711 |
+
f"| {result['status']} | {sims_str}"
|
| 712 |
+
)
|
| 713 |
+
|
| 714 |
# Draw bboxes on this group crop (bboxes already in crop coords)
|
| 715 |
boxes_to_draw = [
|
| 716 |
(bx1, by1, bx2, by2, pred, conf)
|
|
|
|
| 723 |
crop_pil_drawn = crop_pil
|
| 724 |
group_crop_images.append(np.array(crop_pil_drawn))
|
| 725 |
|
| 726 |
+
log_text = f"Classifier: {classifier} | {len(results_per_crop)} crops classified\n"
|
| 727 |
+
log_text += "\n".join(classification_log) if classification_log else "(no crops)"
|
| 728 |
+
|
| 729 |
if not results_per_crop:
|
| 730 |
+
return group_crop_images if group_crop_images else [], [], log_text + "\nNo small-object crops: D-FINE on person/car crops did not detect any object (gun/phone/etc.), or all were below min size."
|
| 731 |
|
| 732 |
# Build known-only gallery: only objects with conf >= min_display_conf
|
| 733 |
known_crop_composites = []
|
|
|
|
| 737 |
composite = draw_label_on_image(crop_pil, pred, conf)
|
| 738 |
known_crop_composites.append(np.array(composite))
|
| 739 |
|
| 740 |
+
return group_crop_images, known_crop_composites, log_text
|
| 741 |
|
| 742 |
|
| 743 |
if __name__ == "__main__":
|
siglip2_onnx_zeroshot.py
CHANGED
|
@@ -13,7 +13,7 @@ from PIL import Image
|
|
| 13 |
from huggingface_hub import hf_hub_download
|
| 14 |
from transformers import AutoProcessor
|
| 15 |
|
| 16 |
-
from jina_fewshot import
|
| 17 |
|
| 18 |
|
| 19 |
REPO_ID = "onnx-community/siglip2-large-patch16-256-ONNX"
|
|
@@ -137,15 +137,9 @@ class SigLIP2ONNXClassifier:
|
|
| 137 |
if not self.labels:
|
| 138 |
raise ValueError(f"No subfolders in {refs_dir}")
|
| 139 |
|
| 140 |
-
|
| 141 |
-
for name in self.labels:
|
| 142 |
-
prompts = CLASS_PROMPTS.get(name, [f"a {name}"])
|
| 143 |
-
text_prompts.append(prompts[0])
|
| 144 |
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
print(f" SigLIP2 ONNX classes: {self.labels}")
|
| 148 |
-
print(f" Text prompts: {text_prompts}")
|
| 149 |
print(f" Text embeds shape: {self._text_embeds.shape}")
|
| 150 |
|
| 151 |
def classify_crop(self, crop, conf_threshold, gap_threshold):
|
|
|
|
| 13 |
from huggingface_hub import hf_hub_download
|
| 14 |
from transformers import AutoProcessor
|
| 15 |
|
| 16 |
+
from jina_fewshot import IMAGE_EXTS
|
| 17 |
|
| 18 |
|
| 19 |
REPO_ID = "onnx-community/siglip2-large-patch16-256-ONNX"
|
|
|
|
| 137 |
if not self.labels:
|
| 138 |
raise ValueError(f"No subfolders in {refs_dir}")
|
| 139 |
|
| 140 |
+
self._text_embeds = self._encode_texts(self.labels)
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
+
print(f" SigLIP2 ONNX labels: {self.labels}")
|
|
|
|
|
|
|
|
|
|
| 143 |
print(f" Text embeds shape: {self._text_embeds.shape}")
|
| 144 |
|
| 145 |
def classify_crop(self, crop, conf_threshold, gap_threshold):
|
siglip_zeroshot.py
CHANGED
|
@@ -11,9 +11,6 @@ import numpy as np
|
|
| 11 |
import torch
|
| 12 |
from transformers import SiglipModel, AutoProcessor
|
| 13 |
|
| 14 |
-
from jina_fewshot import CLASS_PROMPTS
|
| 15 |
-
|
| 16 |
-
|
| 17 |
class SigLIPClassifier:
|
| 18 |
"""Zero-shot crop classifier using SigLIP (PyTorch)."""
|
| 19 |
|
|
@@ -27,25 +24,17 @@ class SigLIPClassifier:
|
|
| 27 |
self.processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
|
| 28 |
|
| 29 |
self.labels = []
|
| 30 |
-
self._text_prompts = []
|
| 31 |
|
| 32 |
print(f"[*] SigLIP loaded in {time.perf_counter() - t0:.1f}s (device={device})")
|
| 33 |
|
| 34 |
def build_refs(self, refs_dir, **kwargs):
|
| 35 |
-
"""Extract class names from refs_dir subfolders
|
| 36 |
refs_dir = Path(refs_dir)
|
| 37 |
self.labels = sorted(d.name for d in refs_dir.iterdir() if d.is_dir())
|
| 38 |
if not self.labels:
|
| 39 |
raise ValueError(f"No subfolders in {refs_dir}")
|
| 40 |
|
| 41 |
-
|
| 42 |
-
self._text_prompts = []
|
| 43 |
-
for name in self.labels:
|
| 44 |
-
prompts = CLASS_PROMPTS.get(name, [f"a {name}"])
|
| 45 |
-
self._text_prompts.append(prompts[0])
|
| 46 |
-
|
| 47 |
-
print(f" SigLIP classes: {self.labels}")
|
| 48 |
-
print(f" Text prompts: {self._text_prompts}")
|
| 49 |
|
| 50 |
def classify_crop(self, crop, conf_threshold, gap_threshold):
|
| 51 |
"""
|
|
@@ -53,7 +42,7 @@ class SigLIPClassifier:
|
|
| 53 |
Returns dict matching jina_fewshot.classify() format.
|
| 54 |
"""
|
| 55 |
inputs = self.processor(
|
| 56 |
-
text=self._text_prompts,
|
| 57 |
images=crop,
|
| 58 |
return_tensors="pt",
|
| 59 |
padding="max_length",
|
|
|
|
| 11 |
import torch
|
| 12 |
from transformers import SiglipModel, AutoProcessor
|
| 13 |
|
|
|
|
|
|
|
|
|
|
| 14 |
class SigLIPClassifier:
|
| 15 |
"""Zero-shot crop classifier using SigLIP (PyTorch)."""
|
| 16 |
|
|
|
|
| 24 |
self.processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
|
| 25 |
|
| 26 |
self.labels = []
|
|
|
|
| 27 |
|
| 28 |
print(f"[*] SigLIP loaded in {time.perf_counter() - t0:.1f}s (device={device})")
|
| 29 |
|
| 30 |
def build_refs(self, refs_dir, **kwargs):
|
| 31 |
+
"""Extract class names from refs_dir subfolders as plain labels."""
|
| 32 |
refs_dir = Path(refs_dir)
|
| 33 |
self.labels = sorted(d.name for d in refs_dir.iterdir() if d.is_dir())
|
| 34 |
if not self.labels:
|
| 35 |
raise ValueError(f"No subfolders in {refs_dir}")
|
| 36 |
|
| 37 |
+
print(f" SigLIP labels: {self.labels}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
def classify_crop(self, crop, conf_threshold, gap_threshold):
|
| 40 |
"""
|
|
|
|
| 42 |
Returns dict matching jina_fewshot.classify() format.
|
| 43 |
"""
|
| 44 |
inputs = self.processor(
|
| 45 |
+
text=self.labels,
|
| 46 |
images=crop,
|
| 47 |
return_tensors="pt",
|
| 48 |
padding="max_length",
|