orik-ss committed on
Commit
2455309
·
1 Parent(s): 1319df4

Added all D-FINE model variants and removed the Jina and SigLIP2 classifier options

Browse files
Files changed (2) hide show
  1. app.py +33 -91
  2. dfine_jina_pipeline.py +18 -5
app.py CHANGED
@@ -1,4 +1,4 @@
1
- """ Gradio app: Tab 1 = Object Detection (YOLO models/v1), Tab 2 = D-FINE + Classify (Jina). """
2
 
3
  import os
4
  os.environ["YOLO_CONFIG_DIR"] = os.environ.get("YOLO_CONFIG_DIR", "/tmp")
@@ -9,7 +9,7 @@ import gradio as gr
9
  from ultralytics import YOLO
10
  from pathlib import Path
11
 
12
- # Tab 2: D-FINE runs first, then Jina for crop classification
13
  from dfine_jina_pipeline import run_single_image
14
 
15
 
@@ -108,17 +108,8 @@ def run_detection(image, model):
108
  return out_img, det_json
109
 
110
 
111
- CLASSIFIER_MAP = {
112
- "Jina-CLIP-v2 (few-shot)": "jina",
113
- "SigLIP (zero-shot)": "siglip",
114
- "SigLIP2 ONNX (zero-shot)": "siglip2_onnx",
115
- }
116
-
117
-
118
- def run_dfine_classify(image, refs_path, dfine_threshold, dfine_model_choice,
119
- min_display_conf, gap_threshold, siglip_threshold,
120
- classifier_choice="Jina-CLIP-v2 (few-shot)"):
121
- """Tab 2: D-FINE first, then classify crops.
122
  Returns (group_crop_gallery, known_crop_gallery, status_message).
123
  """
124
  if image is None:
@@ -129,18 +120,8 @@ def run_dfine_classify(image, refs_path, dfine_threshold, dfine_model_choice,
129
  if not refs.is_dir():
130
  return [], [], f"Refs folder not found: {refs}"
131
 
132
- dfine_model = "large" if dfine_model_choice.strip().lower() == "large" else "medium"
133
- classifier = CLASSIFIER_MAP.get(classifier_choice, "jina")
134
-
135
- # SigLIP models: use their own threshold, no gap check
136
- if classifier in ("siglip", "siglip2_onnx"):
137
- conf_thresh = float(siglip_threshold)
138
- gap_thresh = 0.0
139
- display_conf = float(siglip_threshold)
140
- else:
141
- conf_thresh = 0.5
142
- gap_thresh = float(gap_threshold)
143
- display_conf = float(min_display_conf)
144
 
145
  group_crops, known_crops, status = run_single_image(
146
  image,
@@ -148,11 +129,11 @@ def run_dfine_classify(image, refs_path, dfine_threshold, dfine_model_choice,
148
  dfine_model=dfine_model,
149
  det_threshold=float(dfine_threshold),
150
  conf_threshold=conf_thresh,
151
- gap_threshold=gap_thresh,
152
  min_side=24,
153
  crop_dedup_iou=0.4,
154
- min_display_conf=display_conf,
155
- classifier=classifier,
156
  )
157
 
158
  return [(g, None) for g in (group_crops or [])], [(k, None) for k in (known_crops or [])], status or ""
@@ -248,14 +229,9 @@ with gr.Blocks(title="Small Object Detection") as app:
248
  with gr.TabItem("D-FINE + Classify"):
249
 
250
  gr.Markdown(
251
- "**D-FINE** runs first (person/car grouping), then small-object crops are classified. "
252
- "Choose a **classifier**: Jina-CLIP-v2 (few-shot, uses reference images), "
253
- "SigLIP (zero-shot, PyTorch), or SigLIP2 ONNX (zero-shot, larger model). "
254
- "Choose D-FINE model size (Medium or Large). "
255
- "Uses the **refs** folder (one subfolder per class, e.g. refs/phone/, refs/cigarette/) "
256
- "— Jina uses reference images; SigLIP models use only the folder names as class labels.\n\n"
257
- "**Gap** = how much the top class (e.g. gun) must beat the next-best class (e.g. phone). "
258
- "Bigger gap means the model is more sure; we only accept the label if both confidence and gap are high enough."
259
  )
260
 
261
  with gr.Row():
@@ -268,19 +244,16 @@ with gr.Blocks(title="Small Object Detection") as app:
268
  height=IMG_HEIGHT
269
  )
270
 
271
- classifier_radio = gr.Radio(
272
- choices=list(CLASSIFIER_MAP.keys()),
273
- value="Jina-CLIP-v2 (few-shot)",
274
- label="Classifier",
275
- )
276
-
277
- dfine_model_radio = gr.Radio(
278
- choices=["Medium", "Large"],
279
- value="Large",
280
  label="D-FINE model",
281
  )
282
 
283
- # Default threshold: Large=0.2, Medium=0.15 (slider updates when model changes)
284
  dfine_threshold_slider = gr.Slider(
285
  minimum=0.05,
286
  maximum=0.5,
@@ -290,7 +263,11 @@ with gr.Blocks(title="Small Object Detection") as app:
290
  )
291
 
292
  def update_dfine_threshold_default(choice):
293
- return gr.update(value=0.2 if (choice and choice.strip().lower() == "large") else 0.15)
 
 
 
 
294
 
295
  dfine_model_radio.change(
296
  fn=update_dfine_threshold_default,
@@ -298,6 +275,14 @@ with gr.Blocks(title="Small Object Detection") as app:
298
  outputs=[dfine_threshold_slider],
299
  )
300
 
 
 
 
 
 
 
 
 
301
  refs_path = gr.Textbox(
302
  label="Refs folder path",
303
  value=REFS_DIR,
@@ -311,33 +296,6 @@ with gr.Blocks(title="Small Object Detection") as app:
311
 
312
  with gr.Column(scale=1):
313
 
314
- # --- Jina thresholds (visible when Jina selected) ---
315
- threshold_slider = gr.Slider(
316
- minimum=0.0,
317
- maximum=1.0,
318
- value=0.703,
319
- step=0.005,
320
- label="Jina: min display confidence",
321
- )
322
-
323
- gap_slider = gr.Slider(
324
- minimum=0.0,
325
- maximum=0.02,
326
- value=0.005,
327
- step=0.001,
328
- label="Jina: gap (top class must beat runner-up by this much)",
329
- )
330
-
331
- # --- SigLIP threshold (visible when SigLIP selected) ---
332
- siglip_threshold_slider = gr.Slider(
333
- minimum=0.0,
334
- maximum=1.0,
335
- value=0.05,
336
- step=0.01,
337
- label="SigLIP: min confidence threshold",
338
- visible=False,
339
- )
340
-
341
  out_gallery_dfine = gr.Gallery(
342
  label="Person/car crops (all D-FINE objects inside drawn with label + score)",
343
  height=IMG_HEIGHT,
@@ -358,25 +316,9 @@ with gr.Blocks(title="Small Object Detection") as app:
358
  interactive=False,
359
  )
360
 
361
- # Show/hide threshold sliders based on classifier choice
362
- def update_threshold_visibility(choice):
363
- is_jina = (choice == "Jina-CLIP-v2 (few-shot)")
364
- return (
365
- gr.update(visible=is_jina), # threshold_slider
366
- gr.update(visible=is_jina), # gap_slider
367
- gr.update(visible=not is_jina), # siglip_threshold_slider
368
- )
369
-
370
- classifier_radio.change(
371
- fn=update_threshold_visibility,
372
- inputs=[classifier_radio],
373
- outputs=[threshold_slider, gap_slider, siglip_threshold_slider],
374
- )
375
-
376
  btn_dfine.click(
377
  fn=run_dfine_classify,
378
- inputs=[inp_dfine, refs_path, dfine_threshold_slider, dfine_model_radio,
379
- threshold_slider, gap_slider, siglip_threshold_slider, classifier_radio],
380
  outputs=[out_gallery_dfine, out_gallery_known, out_status_dfine],
381
  concurrency_limit=1,
382
  )
 
1
+ """ Gradio app: Tab 1 = Object Detection (YOLO models/v1), Tab 2 = D-FINE + SigLIP Classify. """
2
 
3
  import os
4
  os.environ["YOLO_CONFIG_DIR"] = os.environ.get("YOLO_CONFIG_DIR", "/tmp")
 
9
  from ultralytics import YOLO
10
  from pathlib import Path
11
 
12
+ # Tab 2: D-FINE runs first, then SigLIP for crop classification
13
  from dfine_jina_pipeline import run_single_image
14
 
15
 
 
108
  return out_img, det_json
109
 
110
 
111
+ def run_dfine_classify(image, refs_path, dfine_threshold, dfine_model_choice, siglip_threshold):
112
+ """Tab 2: D-FINE first, then classify crops with SigLIP.
 
 
 
 
 
 
 
 
 
113
  Returns (group_crop_gallery, known_crop_gallery, status_message).
114
  """
115
  if image is None:
 
120
  if not refs.is_dir():
121
  return [], [], f"Refs folder not found: {refs}"
122
 
123
+ dfine_model = dfine_model_choice.strip().lower() if dfine_model_choice else "large-obj365"
124
+ conf_thresh = float(siglip_threshold)
 
 
 
 
 
 
 
 
 
 
125
 
126
  group_crops, known_crops, status = run_single_image(
127
  image,
 
129
  dfine_model=dfine_model,
130
  det_threshold=float(dfine_threshold),
131
  conf_threshold=conf_thresh,
132
+ gap_threshold=0.0,
133
  min_side=24,
134
  crop_dedup_iou=0.4,
135
+ min_display_conf=conf_thresh,
136
+ classifier="siglip",
137
  )
138
 
139
  return [(g, None) for g in (group_crops or [])], [(k, None) for k in (known_crops or [])], status or ""
 
229
  with gr.TabItem("D-FINE + Classify"):
230
 
231
  gr.Markdown(
232
+ "**D-FINE** runs first (person/car grouping), then small-object crops are classified with **SigLIP** (zero-shot). "
233
+ "Choose a D-FINE model (obj365, coco, or obj2coco variants in small/medium/large). "
234
+ "Uses the **refs** folder names as class labels (e.g. refs/phone/, refs/cigarette/)."
 
 
 
 
 
235
  )
236
 
237
  with gr.Row():
 
244
  height=IMG_HEIGHT
245
  )
246
 
247
+ dfine_model_radio = gr.Dropdown(
248
+ choices=[
249
+ "small-obj365", "medium-obj365", "large-obj365",
250
+ "small-coco", "medium-coco", "large-coco",
251
+ "small-obj2coco", "medium-obj2coco", "large-obj2coco",
252
+ ],
253
+ value="large-obj365",
 
 
254
  label="D-FINE model",
255
  )
256
 
 
257
  dfine_threshold_slider = gr.Slider(
258
  minimum=0.05,
259
  maximum=0.5,
 
263
  )
264
 
265
  def update_dfine_threshold_default(choice):
266
+ if not choice:
267
+ return gr.update(value=0.15)
268
+ size = choice.strip().lower().split("-")[0]
269
+ defaults = {"large": 0.2, "medium": 0.15, "small": 0.1}
270
+ return gr.update(value=defaults.get(size, 0.15))
271
 
272
  dfine_model_radio.change(
273
  fn=update_dfine_threshold_default,
 
275
  outputs=[dfine_threshold_slider],
276
  )
277
 
278
+ siglip_threshold_slider = gr.Slider(
279
+ minimum=0.001,
280
+ maximum=0.1,
281
+ value=0.01,
282
+ step=0.001,
283
+ label="SigLIP: min confidence threshold",
284
+ )
285
+
286
  refs_path = gr.Textbox(
287
  label="Refs folder path",
288
  value=REFS_DIR,
 
296
 
297
  with gr.Column(scale=1):
298
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  out_gallery_dfine = gr.Gallery(
300
  label="Person/car crops (all D-FINE objects inside drawn with label + score)",
301
  height=IMG_HEIGHT,
 
316
  interactive=False,
317
  )
318
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
  btn_dfine.click(
320
  fn=run_dfine_classify,
321
+ inputs=[inp_dfine, refs_path, dfine_threshold_slider, dfine_model_radio, siglip_threshold_slider],
 
322
  outputs=[out_gallery_dfine, out_gallery_known, out_status_dfine],
323
  concurrency_limit=1,
324
  )
dfine_jina_pipeline.py CHANGED
@@ -211,7 +211,7 @@ def parse_args():
211
  p.add_argument("--text-weight", type=float, default=0.3)
212
  p.add_argument("--max-images", type=int, default=None)
213
  p.add_argument("--device", default=None)
214
- p.add_argument("--dfine-model", choices=["medium", "large"], default="large", help="D-FINE model size")
215
  return p.parse_args()
216
 
217
 
@@ -303,7 +303,7 @@ def main():
303
  raise SystemExit(f"No images in {input_dir}")
304
 
305
  # Load D-FINE
306
- dfine_model_id = DFINE_MODEL_IDS.get(args.dfine_model, DFINE_MODEL_IDS["large"])
307
  print(f"[*] Loading D-FINE ({dfine_model_id})...")
308
  t0 = time.perf_counter()
309
  image_processor = AutoImageProcessor.from_pretrained(dfine_model_id)
@@ -501,7 +501,20 @@ def main():
501
  _APP_DFINE = None # (model_id, image_processor, dfine_model, person_car_ids)
502
  _APP_CLASSIFIERS = {} # {classifier_name: (classifier_instance, refs_dir_str)}
503
 
504
- DFINE_MODEL_IDS = {"medium": "ustc-community/dfine-medium-obj365", "large": "ustc-community/dfine-large-obj365"}
 
 
 
 
 
 
 
 
 
 
 
 
 
505
 
506
  CLASSIFIER_CHOICES = ["jina", "siglip", "siglip2_onnx"]
507
 
@@ -574,9 +587,9 @@ def run_single_image(
574
  if not refs_dir.is_dir():
575
  return [], [], f"Refs folder not found: {refs_dir}"
576
 
577
- dfine_model = (dfine_model or "large").strip().lower()
578
  if dfine_model not in DFINE_MODEL_IDS:
579
- dfine_model = "large"
580
  model_id = DFINE_MODEL_IDS[dfine_model]
581
 
582
  device = device or ("cuda" if torch.cuda.is_available() else "cpu")
 
211
  p.add_argument("--text-weight", type=float, default=0.3)
212
  p.add_argument("--max-images", type=int, default=None)
213
  p.add_argument("--device", default=None)
214
+ p.add_argument("--dfine-model", choices=list(DFINE_MODEL_IDS.keys()), default="large-obj365", help="D-FINE model")
215
  return p.parse_args()
216
 
217
 
 
303
  raise SystemExit(f"No images in {input_dir}")
304
 
305
  # Load D-FINE
306
+ dfine_model_id = DFINE_MODEL_IDS.get(args.dfine_model, DFINE_MODEL_IDS["large-obj365"])
307
  print(f"[*] Loading D-FINE ({dfine_model_id})...")
308
  t0 = time.perf_counter()
309
  image_processor = AutoImageProcessor.from_pretrained(dfine_model_id)
 
501
  _APP_DFINE = None # (model_id, image_processor, dfine_model, person_car_ids)
502
  _APP_CLASSIFIERS = {} # {classifier_name: (classifier_instance, refs_dir_str)}
503
 
504
+ DFINE_MODEL_IDS = {
505
+ # obj365
506
+ "small-obj365": "ustc-community/dfine-small-obj365",
507
+ "medium-obj365": "ustc-community/dfine-medium-obj365",
508
+ "large-obj365": "ustc-community/dfine-large-obj365",
509
+ # coco
510
+ "small-coco": "ustc-community/dfine-small-coco",
511
+ "medium-coco": "ustc-community/dfine-medium-coco",
512
+ "large-coco": "ustc-community/dfine-large-coco",
513
+ # obj2coco
514
+ "small-obj2coco": "ustc-community/dfine-small-obj2coco",
515
+ "medium-obj2coco": "ustc-community/dfine-medium-obj2coco",
516
+ "large-obj2coco": "ustc-community/dfine-large-obj2coco-e25",
517
+ }
518
 
519
  CLASSIFIER_CHOICES = ["jina", "siglip", "siglip2_onnx"]
520
 
 
587
  if not refs_dir.is_dir():
588
  return [], [], f"Refs folder not found: {refs_dir}"
589
 
590
+ dfine_model = (dfine_model or "large-obj365").strip().lower()
591
  if dfine_model not in DFINE_MODEL_IDS:
592
+ dfine_model = "large-obj365"
593
  model_id = DFINE_MODEL_IDS[dfine_model]
594
 
595
  device = device or ("cuda" if torch.cuda.is_available() else "cpu")