Afsha001 committed on
Commit
8161819
·
verified ·
1 Parent(s): 209f651
Files changed (1) hide show
  1. app.py +96 -31
app.py CHANGED
@@ -88,8 +88,18 @@ def image_to_data_uri(image: Image.Image) -> str:
88
  return f"data:image/jpeg;base64,{b64}"
89
 
90
  # ============================================================================
91
- # STEP 1 — FLORENCE-2-LARGE: 5 DIVERSE CAPTIONS
92
- # 3 simple + 2 detailed — no padding, no duplicates
 
 
 
 
 
 
 
 
 
 
93
  # ============================================================================
94
  def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
95
 
@@ -97,11 +107,11 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
97
  image_size = (image.width, image.height)
98
 
99
  tasks = [
100
- ("<CAPTION>", 30, {"num_beams": 1}),
101
- ("<CAPTION>", 35, {"do_sample": True, "temperature": 0.9, "top_p": 0.90}),
102
- ("<CAPTION>", 35, {"do_sample": True, "temperature": 1.2, "top_p": 0.95}),
103
  ("<DETAILED_CAPTION>", 80, {"do_sample": True, "temperature": 0.7, "top_p": 0.90}),
104
- ("<MORE_DETAILED_CAPTION>", 120, {"do_sample": True, "temperature": 0.9, "top_p": 0.95}),
 
105
  ]
106
 
107
  for task_prompt, max_tokens, gen_params in tasks:
@@ -289,9 +299,6 @@ def fuse_captions(cap1: str, cap2: str, qwen_tok, qwen_mod) -> str:
289
  st.warning(f"Qwen fusion error: {str(e)[:80]}")
290
  return cap1
291
 
292
- # ============================================================================
293
- # CAPTION QUALITY — BLIP ITM + COSINE ON FINAL CAPTION
294
- # ============================================================================
295
  def compute_caption_quality(image, final_caption, blip_proc, blip_itm) -> tuple:
296
 
297
  try:
@@ -334,56 +341,73 @@ def compute_caption_quality(image, final_caption, blip_proc, blip_itm) -> tuple:
334
  return avg_score, round(itm_score, 4), round(cosine_score, 4)
335
 
336
  # ============================================================================
337
- # GAUGE CHART — 4 COLOR ZONES BELOW IMAGE
 
338
  # ============================================================================
339
  def render_gauge(score, itm, cosine, placeholder):
340
 
341
  if score >= 0.75:
342
- label, bar_color = "Good", "#22c55e"
343
  elif score >= 0.50:
344
- label, bar_color = "Moderate", "#f97316"
345
  elif score >= 0.25:
346
- label, bar_color = "Low", "#eab308"
347
  else:
348
- label, bar_color = "Poor", "#ef4444"
349
 
350
  fig = go.Figure(go.Indicator(
351
  mode = "gauge+number",
352
  value = score,
353
- number = {"font": {"size": 32, "color": bar_color}},
354
- gauge = {
355
- "axis": {"range": [0, 1], "tickwidth": 1, "tickcolor": "#6b7280"},
356
- "bar": {"color": bar_color, "thickness": 0.3},
 
 
 
 
 
 
 
 
 
 
 
 
 
357
  "steps": [
358
- {"range": [0.00, 0.25], "color": "#fee2e2"},
359
- {"range": [0.25, 0.50], "color": "#fef9c3"},
360
- {"range": [0.50, 0.75], "color": "#ffedd5"},
361
- {"range": [0.75, 1.00], "color": "#dcfce7"},
362
  ],
363
  "threshold": {
364
- "line": {"color": bar_color, "width": 4},
365
- "thickness": 0.75,
366
  "value": score
367
  }
368
  },
369
  title = {
370
- "text": f"Caption Quality Score<br><b style='color:{bar_color}'>{label}</b>",
371
- "font": {"size": 13}
372
  }
373
  ))
374
 
375
  fig.update_layout(
376
- height = 230,
377
- margin = dict(l=20, r=20, t=50, b=10),
378
  paper_bgcolor = "rgba(0,0,0,0)",
379
- font = {"color": "#374151", "family": "sans-serif"}
 
380
  )
381
 
382
  with placeholder:
383
  st.markdown("<br>", unsafe_allow_html=True)
384
  g_col, s_col = st.columns([3, 2])
 
385
  with g_col:
386
  st.plotly_chart(fig, use_container_width=True)
 
387
  with s_col:
388
  st.markdown("<br><br>", unsafe_allow_html=True)
389
  st.markdown("**Score Breakdown**")
@@ -392,11 +416,14 @@ def render_gauge(score, itm, cosine, placeholder):
392
  st.markdown(f"Overall Score: **{score} / 1.00**")
393
  st.markdown(
394
  f"<span style='background:{bar_color};color:white;"
395
- f"padding:3px 10px;border-radius:12px;"
396
- f"font-weight:600;font-size:13px;'>{label}</span>",
397
  unsafe_allow_html=True
398
  )
399
 
 
 
 
400
  with st.sidebar:
401
  st.title("Image Caption Fusion")
402
  st.markdown("---")
@@ -424,6 +451,38 @@ Caption fusion
424
  st.markdown("**Local:** Florence-2, BLIP ITM, Qwen2.5")
425
  st.markdown("**API:** Jina")
426
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
427
  st.title("Image Caption Fusion System")
428
  st.markdown("Upload an image to generate a refined, grounded caption.")
429
  st.markdown("---")
@@ -516,4 +575,10 @@ if uploaded_file is not None:
516
  avg_score, itm_q, cosine_q = compute_caption_quality(
517
  input_image, final, blip_proc, blip_itm
518
  )
 
 
 
 
 
 
519
  render_gauge(avg_score, itm_q, cosine_q, gauge_placeholder)
 
88
  return f"data:image/jpeg;base64,{b64}"
89
 
90
  # ============================================================================
91
+ # STEP 1 — FLORENCE-2-LARGE: 5 DISTINCT CAPTION APPROACHES
92
+ #
93
+ # Cap 1: <CAPTION> greedy
94
+ # → single concise sentence, primary subject only
95
+ # Cap 2: <CAPTION> sampling temp=1.0
96
+ # → alt-text accessibility style, concise but different phrasing
97
+ # Cap 3: <DETAILED_CAPTION> temp=0.7
98
+ # → paragraph describing foreground, background, colors
99
+ # Cap 4: <DETAILED_CAPTION> temp=1.1
100
+ # → focuses on mood, atmosphere, implied action
101
+ # Cap 5: <MORE_DETAILED_CAPTION> temp=0.8
102
+ # → exhaustive breakdown of every visible element
103
  # ============================================================================
104
  def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
105
 
 
107
  image_size = (image.width, image.height)
108
 
109
  tasks = [
110
+ ("<CAPTION>", 30, {"num_beams": 1}),
111
+ ("<CAPTION>", 35, {"do_sample": True, "temperature": 1.0, "top_p": 0.92}),
 
112
  ("<DETAILED_CAPTION>", 80, {"do_sample": True, "temperature": 0.7, "top_p": 0.90}),
113
+ ("<DETAILED_CAPTION>", 90, {"do_sample": True, "temperature": 1.1, "top_p": 0.95}),
114
+ ("<MORE_DETAILED_CAPTION>", 120, {"do_sample": True, "temperature": 0.8, "top_p": 0.92}),
115
  ]
116
 
117
  for task_prompt, max_tokens, gen_params in tasks:
 
299
  st.warning(f"Qwen fusion error: {str(e)[:80]}")
300
  return cap1
301
 
 
 
 
302
  def compute_caption_quality(image, final_caption, blip_proc, blip_itm) -> tuple:
303
 
304
  try:
 
341
  return avg_score, round(itm_score, 4), round(cosine_score, 4)
342
 
343
  # ============================================================================
344
+ # GAUGE — updated to match reference style
345
+ # Bright saturated zone colors, sharp black needle, clean arc, no dark shades
346
  # ============================================================================
347
  def render_gauge(score, itm, cosine, placeholder):
348
 
349
  if score >= 0.75:
350
+ label, bar_color = "Good", "#16a34a"
351
  elif score >= 0.50:
352
+ label, bar_color = "Moderate", "#d97706"
353
  elif score >= 0.25:
354
+ label, bar_color = "Low", "#ca8a04"
355
  else:
356
+ label, bar_color = "Poor", "#dc2626"
357
 
358
  fig = go.Figure(go.Indicator(
359
  mode = "gauge+number",
360
  value = score,
361
+ number = {
362
+ "font": {"size": 36, "color": bar_color, "family": "Arial Black"},
363
+ "suffix": ""
364
+ },
365
+ gauge = {
366
+ "axis": {
367
+ "range": [0, 1],
368
+ "tickwidth": 2,
369
+ "tickcolor": "#111827",
370
+ "tickfont": {"size": 11, "color": "#374151"}
371
+ },
372
+ "bar": {
373
+ "color": "#111827",
374
+ "thickness": 0.06
375
+ },
376
+ "bgcolor": "white",
377
+ "borderwidth": 0,
378
  "steps": [
379
+ {"range": [0.00, 0.25], "color": "#ef4444"},
380
+ {"range": [0.25, 0.50], "color": "#f59e0b"},
381
+ {"range": [0.50, 0.75], "color": "#84cc16"},
382
+ {"range": [0.75, 1.00], "color": "#22c55e"},
383
  ],
384
  "threshold": {
385
+ "line": {"color": "#111827", "width": 5},
386
+ "thickness": 0.85,
387
  "value": score
388
  }
389
  },
390
  title = {
391
+ "text": f"Caption Quality Score<br><b style='color:{bar_color};font-size:15px'>{label}</b>",
392
+ "font": {"size": 13, "color": "#374151"}
393
  }
394
  ))
395
 
396
  fig.update_layout(
397
+ height = 240,
398
+ margin = dict(l=15, r=15, t=55, b=5),
399
  paper_bgcolor = "rgba(0,0,0,0)",
400
+ plot_bgcolor = "rgba(0,0,0,0)",
401
+ font = {"color": "#374151", "family": "Arial"}
402
  )
403
 
404
  with placeholder:
405
  st.markdown("<br>", unsafe_allow_html=True)
406
  g_col, s_col = st.columns([3, 2])
407
+
408
  with g_col:
409
  st.plotly_chart(fig, use_container_width=True)
410
+
411
  with s_col:
412
  st.markdown("<br><br>", unsafe_allow_html=True)
413
  st.markdown("**Score Breakdown**")
 
416
  st.markdown(f"Overall Score: **{score} / 1.00**")
417
  st.markdown(
418
  f"<span style='background:{bar_color};color:white;"
419
+ f"padding:4px 12px;border-radius:12px;"
420
+ f"font-weight:700;font-size:13px;'>{label}</span>",
421
  unsafe_allow_html=True
422
  )
423
 
424
+ # ============================================================================
425
+ # SIDEBAR — pipeline steps + live accuracy section (session_state)
426
+ # ============================================================================
427
  with st.sidebar:
428
  st.title("Image Caption Fusion")
429
  st.markdown("---")
 
451
  st.markdown("**Local:** Florence-2, BLIP ITM, Qwen2.5")
452
  st.markdown("**API:** Jina")
453
 
454
+ # ── Live accuracy section — populated after pipeline runs ──────────────
455
+ st.markdown("---")
456
+ st.markdown("### Caption Quality")
457
+
458
+ if "avg_score" in st.session_state:
459
+ score = st.session_state.avg_score
460
+ itm = st.session_state.itm_q
461
+ cos = st.session_state.cosine_q
462
+
463
+ if score >= 0.75:
464
+ label, color = "Good", "#16a34a"
465
+ elif score >= 0.50:
466
+ label, color = "Moderate", "#d97706"
467
+ elif score >= 0.25:
468
+ label, color = "Low", "#ca8a04"
469
+ else:
470
+ label, color = "Poor", "#dc2626"
471
+
472
+ st.markdown(
473
+ f"<span style='background:{color};color:white;padding:3px 10px;"
474
+ f"border-radius:10px;font-weight:700;font-size:13px;'>{label}</span>",
475
+ unsafe_allow_html=True
476
+ )
477
+ st.markdown(f"**Overall:** {score} / 1.00")
478
+ st.markdown(f"BLIP ITM: **{itm}**")
479
+ st.markdown(f"Cosine Similarity: **{cos}**")
480
+ else:
481
+ st.caption("Run the pipeline to see scores.")
482
+
483
+ # ============================================================================
484
+ # MAIN UI
485
+ # ============================================================================
486
  st.title("Image Caption Fusion System")
487
  st.markdown("Upload an image to generate a refined, grounded caption.")
488
  st.markdown("---")
 
575
  avg_score, itm_q, cosine_q = compute_caption_quality(
576
  input_image, final, blip_proc, blip_itm
577
  )
578
+
579
+ # Store in session_state so sidebar updates on rerender
580
+ st.session_state.avg_score = avg_score
581
+ st.session_state.itm_q = itm_q
582
+ st.session_state.cosine_q = cosine_q
583
+
584
  render_gauge(avg_score, itm_q, cosine_q, gauge_placeholder)