Afsha001 committed on
Commit
ebc8d8e
·
verified ·
1 Parent(s): 380cdcf

remove dino

Browse files
Files changed (1) hide show
  1. app.py +49 -107
app.py CHANGED
@@ -26,24 +26,16 @@ JINA_HEADERS = {
26
  "Content-Type": "application/json"
27
  }
28
 
29
- DETECT_PROMPT = (
30
- "person . man . woman . boy . girl . child . baby . a group of people . "
31
- "sitting on a chair . riding a bicycle . holding an object . walking on the road . "
32
- "wooden surface . shiny metal . smooth glass . brick wall . leather bag . denim clothing . "
33
- "shirt . jacket . dress . coat . hat . glasses . backpack . shoes . tie . "
34
- "table . chair . bench . sofa . desk . laptop . phone . book . umbrella . "
35
- "cup . glass . bottle . plate . bowl . fork . spoon . knife . "
36
- "in the foreground . in the background . tree . grass . flower . sky . "
37
- "water . river . mountain . road . building . wall . door . window . floor . "
38
- "dark shadow . bright light . sunny day . indoor lamp . reflection . colorful texture . "
39
- "dog . cat . bird . horse . animal . pizza . cake . bread . fruit . "
40
- "car . bicycle . motorcycle . bus . truck . street . kitchen . restaurant . cafe"
41
- )
42
-
43
  if not JINA_KEY:
44
  st.error("JINA_KEY missing. Go to Space Settings → Secrets and add it.")
45
  st.stop()
46
 
 
 
 
 
 
 
47
  @st.cache_resource
48
  def load_local_models():
49
  from transformers import (
@@ -51,8 +43,7 @@ def load_local_models():
51
  AutoModelForCausalLM,
52
  AutoTokenizer,
53
  BlipProcessor,
54
- BlipForImageTextRetrieval,
55
- AutoModelForZeroShotObjectDetection
56
  )
57
  gc.collect()
58
 
@@ -76,15 +67,6 @@ def load_local_models():
76
  )
77
  blip_itm_model.eval()
78
 
79
- dino_processor = AutoProcessor.from_pretrained(
80
- "IDEA-Research/grounding-dino-base"
81
- )
82
- dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(
83
- "IDEA-Research/grounding-dino-base",
84
- torch_dtype=torch.float32
85
- )
86
- dino_model.eval()
87
-
88
  qwen_tokenizer = AutoTokenizer.from_pretrained(
89
  "Qwen/Qwen2.5-1.5B-Instruct"
90
  )
@@ -97,7 +79,6 @@ def load_local_models():
97
  return (
98
  florence_processor, florence_model,
99
  blip_processor, blip_itm_model,
100
- dino_processor, dino_model,
101
  qwen_tokenizer, qwen_model
102
  )
103
 
@@ -112,17 +93,7 @@ def image_to_data_uri(image: Image.Image) -> str:
112
  return f"data:image/jpeg;base64,{b64}"
113
 
114
  # ============================================================================
115
- # generate_captions_florence — speed optimized + diversity fixed
116
- #
117
- # Problem: num_beams=1 greedy produces near-identical captions across tasks
118
- # Fix: Task 1 stays greedy (baseline), Tasks 2 and 3 use sampling
119
- # with increasing temperature — each task explores different word paths
120
- #
121
- # Task 1: greedy → deterministic, short, factual baseline
122
- # Task 2: temp=0.7 → slightly varied, focuses on detail
123
- # Task 3: temp=1.1 → more varied phrasing, different sentence structure
124
- #
125
- # Speed: sampling is as fast or faster than beam search — no regression
126
  # ============================================================================
127
  def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
128
 
@@ -170,7 +141,6 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
170
  st.warning(f"Florence {task_prompt} error: {str(e)[:80]}")
171
  captions.append("a scene shown in the image")
172
 
173
- # Deduplicate while keeping order
174
  seen, unique = set(), []
175
  for c in captions:
176
  if c not in seen:
@@ -185,6 +155,9 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
185
 
186
  return unique[:5]
187
 
 
 
 
188
  def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
189
  scores = []
190
  for cap in captions:
@@ -204,6 +177,9 @@ def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
204
  scores.append(0.0)
205
  return scores
206
 
 
 
 
207
  def compute_jina_scores(image: Image.Image, captions: list) -> list:
208
  img_data_uri = image_to_data_uri(image)
209
  scores = []
@@ -234,6 +210,9 @@ def compute_jina_scores(image: Image.Image, captions: list) -> list:
234
  scores.append(0.0)
235
  return scores
236
 
 
 
 
237
  def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
238
  try:
239
  img_inp = blip_proc(images=image, return_tensors="pt")
@@ -260,6 +239,9 @@ def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
260
  st.warning(f"Cosine error: {str(e)[:60]}")
261
  return [0.0] * len(captions)
262
 
 
 
 
263
  def majority_voting(captions, itm, jina, cosine) -> tuple:
264
  itm_r = np.argsort(itm)[::-1]
265
  jina_r = np.argsort(jina)[::-1]
@@ -277,55 +259,21 @@ def majority_voting(captions, itm, jina, cosine) -> tuple:
277
 
278
  return captions[top2[0]], captions[top2[1]], top2, dict(counts)
279
 
280
- def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
281
- try:
282
- inputs = dino_proc(
283
- images=image, text=DETECT_PROMPT, return_tensors="pt"
284
- )
285
- with torch.no_grad():
286
- outputs = dino_mod(**inputs)
287
-
288
- target_sizes = torch.tensor([image.size[::-1]])
289
- results = dino_proc.post_process_grounded_object_detection(
290
- outputs, inputs.input_ids, target_sizes=target_sizes
291
- )[0]
292
-
293
- scores = results["scores"]
294
- labels = results.get("text_labels", results["labels"])
295
-
296
- keep = scores >= threshold
297
- kept_sc = scores[keep].tolist()
298
- kept_lbl = [labels[i] for i in range(len(labels)) if keep[i]]
299
-
300
- if not kept_lbl:
301
- return "No objects detected", []
302
-
303
- label_dict = {}
304
- for lbl, sc in zip(kept_lbl, kept_sc):
305
- lbl = lbl.strip().lower()
306
- if lbl not in label_dict or label_dict[lbl] < sc:
307
- label_dict[lbl] = sc
308
-
309
- sorted_labels = [
310
- l for l, _ in
311
- sorted(label_dict.items(), key=lambda x: x[1], reverse=True)
312
- ]
313
- formatted = "Detected objects: [" + ", ".join(sorted_labels) + "]"
314
- return formatted, sorted_labels
315
- except Exception as e:
316
- st.warning(f"DINO error: {str(e)[:80]}")
317
- return "Object detection unavailable", []
318
-
319
- def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
320
 
321
  system_prompt = (
322
  "You write image captions. "
323
- "You will receive two captions and a list of detected objects. "
324
  "Your job is to combine them into one detailed caption. "
325
  "Include ALL specific details you find: "
326
  "the clothing colors and style of each person, "
327
  "what each person looks like and what they are doing, "
328
- "the objects and plants visible around them, "
329
  "and the setting or background of the scene. "
330
  "Write 5 to 6 sentences. Use simple, clear, everyday words. "
331
  "Do NOT summarize or shorten — keep every specific detail. "
@@ -335,8 +283,7 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
335
 
336
  user_prompt = (
337
  f"Caption A: {cap1}\n"
338
- f"Caption B: {cap2}\n"
339
- f"{objects}\n\n"
340
  "Write a detailed caption that includes all the clothing, "
341
  "people, objects and background in details:"
342
  )
@@ -373,6 +320,9 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
373
  st.warning(f"Qwen fusion error: {str(e)[:80]}")
374
  return cap1
375
 
 
 
 
376
  with st.sidebar:
377
  st.title("Image Caption Fusion")
378
  st.markdown("---")
@@ -393,16 +343,16 @@ Embedding similarity
393
  **5. Majority Voting**
394
  Best 2 captions selected
395
 
396
- **6. Grounding DINO** (Local)
397
- Object detection
398
-
399
- **7. Qwen2.5-1.5B** (Local)
400
  Caption fusion
401
  """)
402
  st.markdown("---")
403
- st.markdown("**Local:** Florence-2, BLIP ITM, DINO, Qwen2.5")
404
  st.markdown("**API:** Jina")
405
 
 
 
 
406
  st.title("Image Caption Fusion System")
407
  st.markdown("Upload an image to generate a refined, grounded caption.")
408
  st.markdown("---")
@@ -427,32 +377,31 @@ if uploaded_file is not None:
427
  (
428
  florence_proc, florence_mod,
429
  blip_proc, blip_itm,
430
- dino_proc, dino_mod,
431
  qwen_tok, qwen_mod
432
  ) = load_local_models()
433
 
434
  progress = st.progress(0)
435
  status = st.empty()
436
 
437
- status.info("Step 1/7: Generating captions with Florence-2-Large...")
438
  captions = generate_captions_florence(input_image, florence_proc, florence_mod)
439
- progress.progress(14)
440
 
441
  with st.expander("5 Generated Captions", expanded=True):
442
  for i, cap in enumerate(captions):
443
  st.write(f"**{i+1}.** {cap}")
444
 
445
- status.info("Step 2/7: Computing BLIP ITM scores...")
446
  itm_scores = compute_itm_scores(input_image, captions, blip_proc, blip_itm)
447
- progress.progress(28)
448
 
449
- status.info("Step 3/7: Computing Jina Reranker scores...")
450
  jina_scores = compute_jina_scores(input_image, captions)
451
- progress.progress(42)
452
 
453
- status.info("Step 4/7: Computing Cosine Similarity scores...")
454
  cosine_scores = compute_cosine_scores(input_image, captions, blip_proc, blip_itm)
455
- progress.progress(57)
456
 
457
  scores_df = pd.DataFrame({
458
  "Caption": [f"Cap {i+1}: {c[:50]}" for i, c in enumerate(captions)],
@@ -463,11 +412,11 @@ if uploaded_file is not None:
463
  with st.expander("All Scores", expanded=False):
464
  st.dataframe(scores_df, use_container_width=True, hide_index=True)
465
 
466
- status.info("Step 5/7: Running majority voting...")
467
  best_1, best_2, _, _ = majority_voting(
468
  captions, itm_scores, jina_scores, cosine_scores
469
  )
470
- progress.progress(71)
471
 
472
  st.markdown("### Majority Voted Captions")
473
  c1, c2 = st.columns(2)
@@ -476,15 +425,8 @@ if uploaded_file is not None:
476
  with c2:
477
  st.info(f"2. {best_2}")
478
 
479
- status.info("Step 6/7: Detecting objects with DINO...")
480
- obj_str, obj_list = detect_objects(input_image, dino_proc, dino_mod)
481
- progress.progress(85)
482
-
483
- st.markdown("### Detected Objects")
484
- st.write(" | ".join(obj_list) if obj_list else obj_str)
485
-
486
- status.info("Step 7/7: Fusing captions with Qwen2.5-1.5B...")
487
- final = fuse_captions(best_1, best_2, obj_str, qwen_tok, qwen_mod)
488
  progress.progress(100)
489
  status.success("Pipeline complete!")
490
 
 
26
  "Content-Type": "application/json"
27
  }
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  if not JINA_KEY:
30
  st.error("JINA_KEY missing. Go to Space Settings → Secrets and add it.")
31
  st.stop()
32
 
33
+ # ============================================================================
34
+ # LOAD LOCAL MODELS
35
+ # DINO removed — was adding hallucinated labels that hurt fusion accuracy
36
+ # Local: Florence-2, BLIP ITM, Qwen2.5
37
+ # API: Jina Reranker
38
+ # ============================================================================
39
  @st.cache_resource
40
  def load_local_models():
41
  from transformers import (
 
43
  AutoModelForCausalLM,
44
  AutoTokenizer,
45
  BlipProcessor,
46
+ BlipForImageTextRetrieval
 
47
  )
48
  gc.collect()
49
 
 
67
  )
68
  blip_itm_model.eval()
69
 
 
 
 
 
 
 
 
 
 
70
  qwen_tokenizer = AutoTokenizer.from_pretrained(
71
  "Qwen/Qwen2.5-1.5B-Instruct"
72
  )
 
79
  return (
80
  florence_processor, florence_model,
81
  blip_processor, blip_itm_model,
 
82
  qwen_tokenizer, qwen_model
83
  )
84
 
 
93
  return f"data:image/jpeg;base64,{b64}"
94
 
95
  # ============================================================================
96
+ # STEP 1 — FLORENCE-2-LARGE: GENERATE 5 DIVERSE CAPTIONS
 
 
 
 
 
 
 
 
 
 
97
  # ============================================================================
98
  def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
99
 
 
141
  st.warning(f"Florence {task_prompt} error: {str(e)[:80]}")
142
  captions.append("a scene shown in the image")
143
 
 
144
  seen, unique = set(), []
145
  for c in captions:
146
  if c not in seen:
 
155
 
156
  return unique[:5]
157
 
158
+ # ============================================================================
159
+ # STEP 2 — BLIP ITM: IMAGE-TEXT MATCHING SCORES
160
+ # ============================================================================
161
  def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
162
  scores = []
163
  for cap in captions:
 
177
  scores.append(0.0)
178
  return scores
179
 
180
+ # ============================================================================
181
+ # STEP 3 — JINA RERANKER M0: SEMANTIC SCORES
182
+ # ============================================================================
183
  def compute_jina_scores(image: Image.Image, captions: list) -> list:
184
  img_data_uri = image_to_data_uri(image)
185
  scores = []
 
210
  scores.append(0.0)
211
  return scores
212
 
213
+ # ============================================================================
214
+ # STEP 4 — COSINE SIMILARITY: EMBEDDING SCORES
215
+ # ============================================================================
216
  def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
217
  try:
218
  img_inp = blip_proc(images=image, return_tensors="pt")
 
239
  st.warning(f"Cosine error: {str(e)[:60]}")
240
  return [0.0] * len(captions)
241
 
242
+ # ============================================================================
243
+ # STEP 5 — MAJORITY VOTING: SELECT TOP 2 CAPTIONS
244
+ # ============================================================================
245
  def majority_voting(captions, itm, jina, cosine) -> tuple:
246
  itm_r = np.argsort(itm)[::-1]
247
  jina_r = np.argsort(jina)[::-1]
 
259
 
260
  return captions[top2[0]], captions[top2[1]], top2, dict(counts)
261
 
262
+ # ============================================================================
263
+ # STEP 6 — QWEN2.5-1.5B: CAPTION FUSION
264
+ # DINO objects removed from input — was causing hallucinations in fused output
265
+ # Qwen now fuses only the two verified majority-voted captions
266
+ # ============================================================================
267
+ def fuse_captions(cap1: str, cap2: str, qwen_tok, qwen_mod) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
 
269
  system_prompt = (
270
  "You write image captions. "
271
+ "You will receive two captions of the same image. "
272
  "Your job is to combine them into one detailed caption. "
273
  "Include ALL specific details you find: "
274
  "the clothing colors and style of each person, "
275
  "what each person looks like and what they are doing, "
276
+ "the objects and surroundings visible around them, "
277
  "and the setting or background of the scene. "
278
  "Write 5 to 6 sentences. Use simple, clear, everyday words. "
279
  "Do NOT summarize or shorten — keep every specific detail. "
 
283
 
284
  user_prompt = (
285
  f"Caption A: {cap1}\n"
286
+ f"Caption B: {cap2}\n\n"
 
287
  "Write a detailed caption that includes all the clothing, "
288
  "people, objects and background in details:"
289
  )
 
320
  st.warning(f"Qwen fusion error: {str(e)[:80]}")
321
  return cap1
322
 
323
+ # ============================================================================
324
+ # SIDEBAR
325
+ # ============================================================================
326
  with st.sidebar:
327
  st.title("Image Caption Fusion")
328
  st.markdown("---")
 
343
  **5. Majority Voting**
344
  Best 2 captions selected
345
 
346
+ **6. Qwen2.5-1.5B** (Local)
 
 
 
347
  Caption fusion
348
  """)
349
  st.markdown("---")
350
+ st.markdown("**Local:** Florence-2, BLIP ITM, Qwen2.5")
351
  st.markdown("**API:** Jina")
352
 
353
+ # ============================================================================
354
+ # MAIN UI
355
+ # ============================================================================
356
  st.title("Image Caption Fusion System")
357
  st.markdown("Upload an image to generate a refined, grounded caption.")
358
  st.markdown("---")
 
377
  (
378
  florence_proc, florence_mod,
379
  blip_proc, blip_itm,
 
380
  qwen_tok, qwen_mod
381
  ) = load_local_models()
382
 
383
  progress = st.progress(0)
384
  status = st.empty()
385
 
386
+ status.info("Step 1/6: Generating captions with Florence-2-Large...")
387
  captions = generate_captions_florence(input_image, florence_proc, florence_mod)
388
+ progress.progress(16)
389
 
390
  with st.expander("5 Generated Captions", expanded=True):
391
  for i, cap in enumerate(captions):
392
  st.write(f"**{i+1}.** {cap}")
393
 
394
+ status.info("Step 2/6: Computing BLIP ITM scores...")
395
  itm_scores = compute_itm_scores(input_image, captions, blip_proc, blip_itm)
396
+ progress.progress(32)
397
 
398
+ status.info("Step 3/6: Computing Jina Reranker scores...")
399
  jina_scores = compute_jina_scores(input_image, captions)
400
+ progress.progress(50)
401
 
402
+ status.info("Step 4/6: Computing Cosine Similarity scores...")
403
  cosine_scores = compute_cosine_scores(input_image, captions, blip_proc, blip_itm)
404
+ progress.progress(66)
405
 
406
  scores_df = pd.DataFrame({
407
  "Caption": [f"Cap {i+1}: {c[:50]}" for i, c in enumerate(captions)],
 
412
  with st.expander("All Scores", expanded=False):
413
  st.dataframe(scores_df, use_container_width=True, hide_index=True)
414
 
415
+ status.info("Step 5/6: Running majority voting...")
416
  best_1, best_2, _, _ = majority_voting(
417
  captions, itm_scores, jina_scores, cosine_scores
418
  )
419
+ progress.progress(83)
420
 
421
  st.markdown("### Majority Voted Captions")
422
  c1, c2 = st.columns(2)
 
425
  with c2:
426
  st.info(f"2. {best_2}")
427
 
428
+ status.info("Step 6/6: Fusing captions with Qwen2.5-1.5B...")
429
+ final = fuse_captions(best_1, best_2, qwen_tok, qwen_mod)
 
 
 
 
 
 
 
430
  progress.progress(100)
431
  status.success("Pipeline complete!")
432