Spaces:
Sleeping
Sleeping
Added all D-FINE models and removed Jina and SigLIP2
Browse files
- app.py +33 -91
- dfine_jina_pipeline.py +18 -5
app.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
""" Gradio app: Tab 1 = Object Detection (YOLO models/v1), Tab 2 = D-FINE +
|
| 2 |
|
| 3 |
import os
|
| 4 |
os.environ["YOLO_CONFIG_DIR"] = os.environ.get("YOLO_CONFIG_DIR", "/tmp")
|
|
@@ -9,7 +9,7 @@ import gradio as gr
|
|
| 9 |
from ultralytics import YOLO
|
| 10 |
from pathlib import Path
|
| 11 |
|
| 12 |
-
# Tab 2: D-FINE runs first, then
|
| 13 |
from dfine_jina_pipeline import run_single_image
|
| 14 |
|
| 15 |
|
|
@@ -108,17 +108,8 @@ def run_detection(image, model):
|
|
| 108 |
return out_img, det_json
|
| 109 |
|
| 110 |
|
| 111 |
-
|
| 112 |
-
"
|
| 113 |
-
"SigLIP (zero-shot)": "siglip",
|
| 114 |
-
"SigLIP2 ONNX (zero-shot)": "siglip2_onnx",
|
| 115 |
-
}
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
def run_dfine_classify(image, refs_path, dfine_threshold, dfine_model_choice,
|
| 119 |
-
min_display_conf, gap_threshold, siglip_threshold,
|
| 120 |
-
classifier_choice="Jina-CLIP-v2 (few-shot)"):
|
| 121 |
-
"""Tab 2: D-FINE first, then classify crops.
|
| 122 |
Returns (group_crop_gallery, known_crop_gallery, status_message).
|
| 123 |
"""
|
| 124 |
if image is None:
|
|
@@ -129,18 +120,8 @@ def run_dfine_classify(image, refs_path, dfine_threshold, dfine_model_choice,
|
|
| 129 |
if not refs.is_dir():
|
| 130 |
return [], [], f"Refs folder not found: {refs}"
|
| 131 |
|
| 132 |
-
dfine_model =
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
# SigLIP models: use their own threshold, no gap check
|
| 136 |
-
if classifier in ("siglip", "siglip2_onnx"):
|
| 137 |
-
conf_thresh = float(siglip_threshold)
|
| 138 |
-
gap_thresh = 0.0
|
| 139 |
-
display_conf = float(siglip_threshold)
|
| 140 |
-
else:
|
| 141 |
-
conf_thresh = 0.5
|
| 142 |
-
gap_thresh = float(gap_threshold)
|
| 143 |
-
display_conf = float(min_display_conf)
|
| 144 |
|
| 145 |
group_crops, known_crops, status = run_single_image(
|
| 146 |
image,
|
|
@@ -148,11 +129,11 @@ def run_dfine_classify(image, refs_path, dfine_threshold, dfine_model_choice,
|
|
| 148 |
dfine_model=dfine_model,
|
| 149 |
det_threshold=float(dfine_threshold),
|
| 150 |
conf_threshold=conf_thresh,
|
| 151 |
-
gap_threshold=
|
| 152 |
min_side=24,
|
| 153 |
crop_dedup_iou=0.4,
|
| 154 |
-
min_display_conf=
|
| 155 |
-
classifier=
|
| 156 |
)
|
| 157 |
|
| 158 |
return [(g, None) for g in (group_crops or [])], [(k, None) for k in (known_crops or [])], status or ""
|
|
@@ -248,14 +229,9 @@ with gr.Blocks(title="Small Object Detection") as app:
|
|
| 248 |
with gr.TabItem("D-FINE + Classify"):
|
| 249 |
|
| 250 |
gr.Markdown(
|
| 251 |
-
"**D-FINE** runs first (person/car grouping), then small-object crops are classified. "
|
| 252 |
-
"Choose a
|
| 253 |
-
"
|
| 254 |
-
"Choose D-FINE model size (Medium or Large). "
|
| 255 |
-
"Uses the **refs** folder (one subfolder per class, e.g. refs/phone/, refs/cigarette/) "
|
| 256 |
-
"— Jina uses reference images; SigLIP models use only the folder names as class labels.\n\n"
|
| 257 |
-
"**Gap** = how much the top class (e.g. gun) must beat the next-best class (e.g. phone). "
|
| 258 |
-
"Bigger gap means the model is more sure; we only accept the label if both confidence and gap are high enough."
|
| 259 |
)
|
| 260 |
|
| 261 |
with gr.Row():
|
|
@@ -268,19 +244,16 @@ with gr.Blocks(title="Small Object Detection") as app:
|
|
| 268 |
height=IMG_HEIGHT
|
| 269 |
)
|
| 270 |
|
| 271 |
-
|
| 272 |
-
choices=
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
choices=["Medium", "Large"],
|
| 279 |
-
value="Large",
|
| 280 |
label="D-FINE model",
|
| 281 |
)
|
| 282 |
|
| 283 |
-
# Default threshold: Large=0.2, Medium=0.15 (slider updates when model changes)
|
| 284 |
dfine_threshold_slider = gr.Slider(
|
| 285 |
minimum=0.05,
|
| 286 |
maximum=0.5,
|
|
@@ -290,7 +263,11 @@ with gr.Blocks(title="Small Object Detection") as app:
|
|
| 290 |
)
|
| 291 |
|
| 292 |
def update_dfine_threshold_default(choice):
|
| 293 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
|
| 295 |
dfine_model_radio.change(
|
| 296 |
fn=update_dfine_threshold_default,
|
|
@@ -298,6 +275,14 @@ with gr.Blocks(title="Small Object Detection") as app:
|
|
| 298 |
outputs=[dfine_threshold_slider],
|
| 299 |
)
|
| 300 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
refs_path = gr.Textbox(
|
| 302 |
label="Refs folder path",
|
| 303 |
value=REFS_DIR,
|
|
@@ -311,33 +296,6 @@ with gr.Blocks(title="Small Object Detection") as app:
|
|
| 311 |
|
| 312 |
with gr.Column(scale=1):
|
| 313 |
|
| 314 |
-
# --- Jina thresholds (visible when Jina selected) ---
|
| 315 |
-
threshold_slider = gr.Slider(
|
| 316 |
-
minimum=0.0,
|
| 317 |
-
maximum=1.0,
|
| 318 |
-
value=0.703,
|
| 319 |
-
step=0.005,
|
| 320 |
-
label="Jina: min display confidence",
|
| 321 |
-
)
|
| 322 |
-
|
| 323 |
-
gap_slider = gr.Slider(
|
| 324 |
-
minimum=0.0,
|
| 325 |
-
maximum=0.02,
|
| 326 |
-
value=0.005,
|
| 327 |
-
step=0.001,
|
| 328 |
-
label="Jina: gap (top class must beat runner-up by this much)",
|
| 329 |
-
)
|
| 330 |
-
|
| 331 |
-
# --- SigLIP threshold (visible when SigLIP selected) ---
|
| 332 |
-
siglip_threshold_slider = gr.Slider(
|
| 333 |
-
minimum=0.0,
|
| 334 |
-
maximum=1.0,
|
| 335 |
-
value=0.05,
|
| 336 |
-
step=0.01,
|
| 337 |
-
label="SigLIP: min confidence threshold",
|
| 338 |
-
visible=False,
|
| 339 |
-
)
|
| 340 |
-
|
| 341 |
out_gallery_dfine = gr.Gallery(
|
| 342 |
label="Person/car crops (all D-FINE objects inside drawn with label + score)",
|
| 343 |
height=IMG_HEIGHT,
|
|
@@ -358,25 +316,9 @@ with gr.Blocks(title="Small Object Detection") as app:
|
|
| 358 |
interactive=False,
|
| 359 |
)
|
| 360 |
|
| 361 |
-
# Show/hide threshold sliders based on classifier choice
|
| 362 |
-
def update_threshold_visibility(choice):
|
| 363 |
-
is_jina = (choice == "Jina-CLIP-v2 (few-shot)")
|
| 364 |
-
return (
|
| 365 |
-
gr.update(visible=is_jina), # threshold_slider
|
| 366 |
-
gr.update(visible=is_jina), # gap_slider
|
| 367 |
-
gr.update(visible=not is_jina), # siglip_threshold_slider
|
| 368 |
-
)
|
| 369 |
-
|
| 370 |
-
classifier_radio.change(
|
| 371 |
-
fn=update_threshold_visibility,
|
| 372 |
-
inputs=[classifier_radio],
|
| 373 |
-
outputs=[threshold_slider, gap_slider, siglip_threshold_slider],
|
| 374 |
-
)
|
| 375 |
-
|
| 376 |
btn_dfine.click(
|
| 377 |
fn=run_dfine_classify,
|
| 378 |
-
inputs=[inp_dfine, refs_path, dfine_threshold_slider, dfine_model_radio,
|
| 379 |
-
threshold_slider, gap_slider, siglip_threshold_slider, classifier_radio],
|
| 380 |
outputs=[out_gallery_dfine, out_gallery_known, out_status_dfine],
|
| 381 |
concurrency_limit=1,
|
| 382 |
)
|
|
|
|
| 1 |
+
""" Gradio app: Tab 1 = Object Detection (YOLO models/v1), Tab 2 = D-FINE + SigLIP Classify. """
|
| 2 |
|
| 3 |
import os
|
| 4 |
os.environ["YOLO_CONFIG_DIR"] = os.environ.get("YOLO_CONFIG_DIR", "/tmp")
|
|
|
|
| 9 |
from ultralytics import YOLO
|
| 10 |
from pathlib import Path
|
| 11 |
|
| 12 |
+
# Tab 2: D-FINE runs first, then SigLIP for crop classification
|
| 13 |
from dfine_jina_pipeline import run_single_image
|
| 14 |
|
| 15 |
|
|
|
|
| 108 |
return out_img, det_json
|
| 109 |
|
| 110 |
|
| 111 |
+
def run_dfine_classify(image, refs_path, dfine_threshold, dfine_model_choice, siglip_threshold):
|
| 112 |
+
"""Tab 2: D-FINE first, then classify crops with SigLIP.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
Returns (group_crop_gallery, known_crop_gallery, status_message).
|
| 114 |
"""
|
| 115 |
if image is None:
|
|
|
|
| 120 |
if not refs.is_dir():
|
| 121 |
return [], [], f"Refs folder not found: {refs}"
|
| 122 |
|
| 123 |
+
dfine_model = dfine_model_choice.strip().lower() if dfine_model_choice else "large-obj365"
|
| 124 |
+
conf_thresh = float(siglip_threshold)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
group_crops, known_crops, status = run_single_image(
|
| 127 |
image,
|
|
|
|
| 129 |
dfine_model=dfine_model,
|
| 130 |
det_threshold=float(dfine_threshold),
|
| 131 |
conf_threshold=conf_thresh,
|
| 132 |
+
gap_threshold=0.0,
|
| 133 |
min_side=24,
|
| 134 |
crop_dedup_iou=0.4,
|
| 135 |
+
min_display_conf=conf_thresh,
|
| 136 |
+
classifier="siglip",
|
| 137 |
)
|
| 138 |
|
| 139 |
return [(g, None) for g in (group_crops or [])], [(k, None) for k in (known_crops or [])], status or ""
|
|
|
|
| 229 |
with gr.TabItem("D-FINE + Classify"):
|
| 230 |
|
| 231 |
gr.Markdown(
|
| 232 |
+
"**D-FINE** runs first (person/car grouping), then small-object crops are classified with **SigLIP** (zero-shot). "
|
| 233 |
+
"Choose a D-FINE model (obj365, coco, or obj2coco variants in small/medium/large). "
|
| 234 |
+
"Uses the **refs** folder names as class labels (e.g. refs/phone/, refs/cigarette/)."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
)
|
| 236 |
|
| 237 |
with gr.Row():
|
|
|
|
| 244 |
height=IMG_HEIGHT
|
| 245 |
)
|
| 246 |
|
| 247 |
+
dfine_model_radio = gr.Dropdown(
|
| 248 |
+
choices=[
|
| 249 |
+
"small-obj365", "medium-obj365", "large-obj365",
|
| 250 |
+
"small-coco", "medium-coco", "large-coco",
|
| 251 |
+
"small-obj2coco", "medium-obj2coco", "large-obj2coco",
|
| 252 |
+
],
|
| 253 |
+
value="large-obj365",
|
|
|
|
|
|
|
| 254 |
label="D-FINE model",
|
| 255 |
)
|
| 256 |
|
|
|
|
| 257 |
dfine_threshold_slider = gr.Slider(
|
| 258 |
minimum=0.05,
|
| 259 |
maximum=0.5,
|
|
|
|
| 263 |
)
|
| 264 |
|
| 265 |
def update_dfine_threshold_default(choice):
|
| 266 |
+
if not choice:
|
| 267 |
+
return gr.update(value=0.15)
|
| 268 |
+
size = choice.strip().lower().split("-")[0]
|
| 269 |
+
defaults = {"large": 0.2, "medium": 0.15, "small": 0.1}
|
| 270 |
+
return gr.update(value=defaults.get(size, 0.15))
|
| 271 |
|
| 272 |
dfine_model_radio.change(
|
| 273 |
fn=update_dfine_threshold_default,
|
|
|
|
| 275 |
outputs=[dfine_threshold_slider],
|
| 276 |
)
|
| 277 |
|
| 278 |
+
siglip_threshold_slider = gr.Slider(
|
| 279 |
+
minimum=0.001,
|
| 280 |
+
maximum=0.1,
|
| 281 |
+
value=0.01,
|
| 282 |
+
step=0.001,
|
| 283 |
+
label="SigLIP: min confidence threshold",
|
| 284 |
+
)
|
| 285 |
+
|
| 286 |
refs_path = gr.Textbox(
|
| 287 |
label="Refs folder path",
|
| 288 |
value=REFS_DIR,
|
|
|
|
| 296 |
|
| 297 |
with gr.Column(scale=1):
|
| 298 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
out_gallery_dfine = gr.Gallery(
|
| 300 |
label="Person/car crops (all D-FINE objects inside drawn with label + score)",
|
| 301 |
height=IMG_HEIGHT,
|
|
|
|
| 316 |
interactive=False,
|
| 317 |
)
|
| 318 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
btn_dfine.click(
|
| 320 |
fn=run_dfine_classify,
|
| 321 |
+
inputs=[inp_dfine, refs_path, dfine_threshold_slider, dfine_model_radio, siglip_threshold_slider],
|
|
|
|
| 322 |
outputs=[out_gallery_dfine, out_gallery_known, out_status_dfine],
|
| 323 |
concurrency_limit=1,
|
| 324 |
)
|
dfine_jina_pipeline.py
CHANGED
|
@@ -211,7 +211,7 @@ def parse_args():
|
|
| 211 |
p.add_argument("--text-weight", type=float, default=0.3)
|
| 212 |
p.add_argument("--max-images", type=int, default=None)
|
| 213 |
p.add_argument("--device", default=None)
|
| 214 |
-
p.add_argument("--dfine-model", choices=
|
| 215 |
return p.parse_args()
|
| 216 |
|
| 217 |
|
|
@@ -303,7 +303,7 @@ def main():
|
|
| 303 |
raise SystemExit(f"No images in {input_dir}")
|
| 304 |
|
| 305 |
# Load D-FINE
|
| 306 |
-
dfine_model_id = DFINE_MODEL_IDS.get(args.dfine_model, DFINE_MODEL_IDS["large"])
|
| 307 |
print(f"[*] Loading D-FINE ({dfine_model_id})...")
|
| 308 |
t0 = time.perf_counter()
|
| 309 |
image_processor = AutoImageProcessor.from_pretrained(dfine_model_id)
|
|
@@ -501,7 +501,20 @@ def main():
|
|
| 501 |
_APP_DFINE = None # (model_id, image_processor, dfine_model, person_car_ids)
|
| 502 |
_APP_CLASSIFIERS = {} # {classifier_name: (classifier_instance, refs_dir_str)}
|
| 503 |
|
| 504 |
-
DFINE_MODEL_IDS = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 505 |
|
| 506 |
CLASSIFIER_CHOICES = ["jina", "siglip", "siglip2_onnx"]
|
| 507 |
|
|
@@ -574,9 +587,9 @@ def run_single_image(
|
|
| 574 |
if not refs_dir.is_dir():
|
| 575 |
return [], [], f"Refs folder not found: {refs_dir}"
|
| 576 |
|
| 577 |
-
dfine_model = (dfine_model or "large").strip().lower()
|
| 578 |
if dfine_model not in DFINE_MODEL_IDS:
|
| 579 |
-
dfine_model = "large"
|
| 580 |
model_id = DFINE_MODEL_IDS[dfine_model]
|
| 581 |
|
| 582 |
device = device or ("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
|
| 211 |
p.add_argument("--text-weight", type=float, default=0.3)
|
| 212 |
p.add_argument("--max-images", type=int, default=None)
|
| 213 |
p.add_argument("--device", default=None)
|
| 214 |
+
p.add_argument("--dfine-model", choices=list(DFINE_MODEL_IDS.keys()), default="large-obj365", help="D-FINE model")
|
| 215 |
return p.parse_args()
|
| 216 |
|
| 217 |
|
|
|
|
| 303 |
raise SystemExit(f"No images in {input_dir}")
|
| 304 |
|
| 305 |
# Load D-FINE
|
| 306 |
+
dfine_model_id = DFINE_MODEL_IDS.get(args.dfine_model, DFINE_MODEL_IDS["large-obj365"])
|
| 307 |
print(f"[*] Loading D-FINE ({dfine_model_id})...")
|
| 308 |
t0 = time.perf_counter()
|
| 309 |
image_processor = AutoImageProcessor.from_pretrained(dfine_model_id)
|
|
|
|
| 501 |
_APP_DFINE = None # (model_id, image_processor, dfine_model, person_car_ids)
|
| 502 |
_APP_CLASSIFIERS = {} # {classifier_name: (classifier_instance, refs_dir_str)}
|
| 503 |
|
| 504 |
+
DFINE_MODEL_IDS = {
|
| 505 |
+
# obj365
|
| 506 |
+
"small-obj365": "ustc-community/dfine-small-obj365",
|
| 507 |
+
"medium-obj365": "ustc-community/dfine-medium-obj365",
|
| 508 |
+
"large-obj365": "ustc-community/dfine-large-obj365",
|
| 509 |
+
# coco
|
| 510 |
+
"small-coco": "ustc-community/dfine-small-coco",
|
| 511 |
+
"medium-coco": "ustc-community/dfine-medium-coco",
|
| 512 |
+
"large-coco": "ustc-community/dfine-large-coco",
|
| 513 |
+
# obj2coco
|
| 514 |
+
"small-obj2coco": "ustc-community/dfine-small-obj2coco",
|
| 515 |
+
"medium-obj2coco": "ustc-community/dfine-medium-obj2coco",
|
| 516 |
+
"large-obj2coco": "ustc-community/dfine-large-obj2coco-e25",
|
| 517 |
+
}
|
| 518 |
|
| 519 |
CLASSIFIER_CHOICES = ["jina", "siglip", "siglip2_onnx"]
|
| 520 |
|
|
|
|
| 587 |
if not refs_dir.is_dir():
|
| 588 |
return [], [], f"Refs folder not found: {refs_dir}"
|
| 589 |
|
| 590 |
+
dfine_model = (dfine_model or "large-obj365").strip().lower()
|
| 591 |
if dfine_model not in DFINE_MODEL_IDS:
|
| 592 |
+
dfine_model = "large-obj365"
|
| 593 |
model_id = DFINE_MODEL_IDS[dfine_model]
|
| 594 |
|
| 595 |
device = device or ("cuda" if torch.cuda.is_available() else "cpu")
|