Afsha001 committed on
Commit
278547a
·
verified ·
1 Parent(s): ebc8d8e

add accuracy score

Browse files
Files changed (1) hide show
  1. app.py +121 -44
app.py CHANGED
@@ -6,6 +6,7 @@ import pandas as pd
6
  import requests
7
  import base64
8
  import streamlit as st
 
9
  from PIL import Image
10
  from io import BytesIO
11
  from collections import Counter
@@ -30,12 +31,6 @@ if not JINA_KEY:
30
  st.error("JINA_KEY missing. Go to Space Settings → Secrets and add it.")
31
  st.stop()
32
 
33
- # ============================================================================
34
- # LOAD LOCAL MODELS
35
- # DINO removed — was adding hallucinated labels that hurt fusion accuracy
36
- # Local: Florence-2, BLIP ITM, Qwen2.5
37
- # API: Jina Reranker
38
- # ============================================================================
39
  @st.cache_resource
40
  def load_local_models():
41
  from transformers import (
@@ -93,7 +88,8 @@ def image_to_data_uri(image: Image.Image) -> str:
93
  return f"data:image/jpeg;base64,{b64}"
94
 
95
  # ============================================================================
96
- # STEP 1 — FLORENCE-2-LARGE: GENERATE 5 DIVERSE CAPTIONS
 
97
  # ============================================================================
98
  def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
99
 
@@ -101,21 +97,11 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
101
  image_size = (image.width, image.height)
102
 
103
  tasks = [
104
- (
105
- "<CAPTION>",
106
- 30,
107
- {"num_beams": 1}
108
- ),
109
- (
110
- "<DETAILED_CAPTION>",
111
- 80,
112
- {"do_sample": True, "temperature": 0.7, "top_p": 0.9}
113
- ),
114
- (
115
- "<MORE_DETAILED_CAPTION>",
116
- 120,
117
- {"do_sample": True, "temperature": 1.1, "top_p": 0.95}
118
- ),
119
  ]
120
 
121
  for task_prompt, max_tokens, gen_params in tasks:
@@ -155,9 +141,6 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
155
 
156
  return unique[:5]
157
 
158
- # ============================================================================
159
- # STEP 2 — BLIP ITM: IMAGE-TEXT MATCHING SCORES
160
- # ============================================================================
161
  def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
162
  scores = []
163
  for cap in captions:
@@ -177,9 +160,6 @@ def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
177
  scores.append(0.0)
178
  return scores
179
 
180
- # ============================================================================
181
- # STEP 3 — JINA RERANKER M0: SEMANTIC SCORES
182
- # ============================================================================
183
  def compute_jina_scores(image: Image.Image, captions: list) -> list:
184
  img_data_uri = image_to_data_uri(image)
185
  scores = []
@@ -210,9 +190,6 @@ def compute_jina_scores(image: Image.Image, captions: list) -> list:
210
  scores.append(0.0)
211
  return scores
212
 
213
- # ============================================================================
214
- # STEP 4 — COSINE SIMILARITY: EMBEDDING SCORES
215
- # ============================================================================
216
  def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
217
  try:
218
  img_inp = blip_proc(images=image, return_tensors="pt")
@@ -239,9 +216,6 @@ def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
239
  st.warning(f"Cosine error: {str(e)[:60]}")
240
  return [0.0] * len(captions)
241
 
242
- # ============================================================================
243
- # STEP 5 — MAJORITY VOTING: SELECT TOP 2 CAPTIONS
244
- # ============================================================================
245
  def majority_voting(captions, itm, jina, cosine) -> tuple:
246
  itm_r = np.argsort(itm)[::-1]
247
  jina_r = np.argsort(jina)[::-1]
@@ -259,11 +233,6 @@ def majority_voting(captions, itm, jina, cosine) -> tuple:
259
 
260
  return captions[top2[0]], captions[top2[1]], top2, dict(counts)
261
 
262
- # ============================================================================
263
- # STEP 6 — QWEN2.5-1.5B: CAPTION FUSION
264
- # DINO objects removed from input — was causing hallucinations in fused output
265
- # Qwen now fuses only the two verified majority-voted captions
266
- # ============================================================================
267
  def fuse_captions(cap1: str, cap2: str, qwen_tok, qwen_mod) -> str:
268
 
269
  system_prompt = (
@@ -321,8 +290,113 @@ def fuse_captions(cap1: str, cap2: str, qwen_tok, qwen_mod) -> str:
321
  return cap1
322
 
323
  # ============================================================================
324
- # SIDEBAR
325
  # ============================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
  with st.sidebar:
327
  st.title("Image Caption Fusion")
328
  st.markdown("---")
@@ -350,9 +424,6 @@ Caption fusion
350
  st.markdown("**Local:** Florence-2, BLIP ITM, Qwen2.5")
351
  st.markdown("**API:** Jina")
352
 
353
- # ============================================================================
354
- # MAIN UI
355
- # ============================================================================
356
  st.title("Image Caption Fusion System")
357
  st.markdown("Upload an image to generate a refined, grounded caption.")
358
  st.markdown("---")
@@ -369,6 +440,7 @@ if uploaded_file is not None:
369
 
370
  with col_img:
371
  st.image(input_image, caption="Uploaded Image", use_container_width=True)
 
372
 
373
  with col_run:
374
  if st.button("Generate Caption", type="primary", use_container_width=True):
@@ -439,4 +511,9 @@ if uploaded_file is not None:
439
  f"font-size:18px;font-weight:500;text-align:center;"
440
  f"line-height:1.6;'>{final}</div>",
441
  unsafe_allow_html=True
442
- )
 
 
 
 
 
 
6
  import requests
7
  import base64
8
  import streamlit as st
9
+ import plotly.graph_objects as go
10
  from PIL import Image
11
  from io import BytesIO
12
  from collections import Counter
 
31
  st.error("JINA_KEY missing. Go to Space Settings → Secrets and add it.")
32
  st.stop()
33
 
 
 
 
 
 
 
34
  @st.cache_resource
35
  def load_local_models():
36
  from transformers import (
 
88
  return f"data:image/jpeg;base64,{b64}"
89
 
90
  # ============================================================================
91
+ # STEP 1 — FLORENCE-2-LARGE: 5 DIVERSE CAPTIONS
92
+ # 3 simple + 2 detailed — no padding, no duplicates
93
  # ============================================================================
94
  def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
95
 
 
97
  image_size = (image.width, image.height)
98
 
99
  tasks = [
100
+ ("<CAPTION>", 30, {"num_beams": 1}),
101
+ ("<CAPTION>", 35, {"do_sample": True, "temperature": 0.9, "top_p": 0.90}),
102
+ ("<CAPTION>", 35, {"do_sample": True, "temperature": 1.2, "top_p": 0.95}),
103
+ ("<DETAILED_CAPTION>", 80, {"do_sample": True, "temperature": 0.7, "top_p": 0.90}),
104
+ ("<MORE_DETAILED_CAPTION>", 120, {"do_sample": True, "temperature": 0.9, "top_p": 0.95}),
 
 
 
 
 
 
 
 
 
 
105
  ]
106
 
107
  for task_prompt, max_tokens, gen_params in tasks:
 
141
 
142
  return unique[:5]
143
 
 
 
 
144
  def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
145
  scores = []
146
  for cap in captions:
 
160
  scores.append(0.0)
161
  return scores
162
 
 
 
 
163
  def compute_jina_scores(image: Image.Image, captions: list) -> list:
164
  img_data_uri = image_to_data_uri(image)
165
  scores = []
 
190
  scores.append(0.0)
191
  return scores
192
 
 
 
 
193
  def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
194
  try:
195
  img_inp = blip_proc(images=image, return_tensors="pt")
 
216
  st.warning(f"Cosine error: {str(e)[:60]}")
217
  return [0.0] * len(captions)
218
 
 
 
 
219
  def majority_voting(captions, itm, jina, cosine) -> tuple:
220
  itm_r = np.argsort(itm)[::-1]
221
  jina_r = np.argsort(jina)[::-1]
 
233
 
234
  return captions[top2[0]], captions[top2[1]], top2, dict(counts)
235
 
 
 
 
 
 
236
  def fuse_captions(cap1: str, cap2: str, qwen_tok, qwen_mod) -> str:
237
 
238
  system_prompt = (
 
290
  return cap1
291
 
292
  # ============================================================================
293
+ # CAPTION QUALITY — BLIP ITM + COSINE ON FINAL CAPTION
294
  # ============================================================================
295
def compute_caption_quality(image, final_caption, blip_proc, blip_itm) -> tuple:
    """Score how well ``final_caption`` describes ``image`` using BLIP.

    Two signals are computed independently, each inside its own ``try`` so a
    failure in one does not prevent the other from being reported:

    * ITM score: softmax over ``out.itm_score`` along dim 1, taking column 1
      (presumably the "match" class of the BLIP ITM head; confirm against
      the loaded model).
    * Cosine score: cosine similarity between the projected first-token
      image feature and the projected first-token text feature.

    Args:
        image: Image passed to ``blip_proc`` as ``images=``.
        final_caption: Caption text to evaluate.
        blip_proc: Processor callable producing model inputs.
        blip_itm: Model exposing ``vision_model``, ``vision_proj``,
            ``text_encoder`` and ``text_proj``, and callable on the
            processor output.

    Returns:
        ``(avg_score, itm_score, cosine_score)``, each rounded to 4 decimals.
        A signal that fails to compute contributes ``0.0``.
    """
    # Function-scope import so this block does not depend on a file-level
    # ``import logging`` being present.
    import logging

    log = logging.getLogger(__name__)

    # --- Signal 1: image-text matching probability -------------------------
    try:
        inputs = blip_proc(
            images=image, text=final_caption,
            return_tensors="pt", padding=True
        )
        with torch.no_grad():
            out = blip_itm(**inputs)
        itm_score = torch.nn.functional.softmax(
            out.itm_score, dim=1
        )[:, 1].item()
    except Exception:
        # Best-effort: keep the UI alive and fall back to 0.0, but leave a
        # traceback in the logs. ``Exception`` (not a bare ``except``) lets
        # KeyboardInterrupt / SystemExit and other BaseException-derived
        # control-flow signals propagate.
        log.exception("Caption quality: ITM scoring failed")
        itm_score = 0.0

    # --- Signal 2: cosine similarity of projected embeddings ---------------
    try:
        img_inp = blip_proc(images=image, return_tensors="pt")
        with torch.no_grad():
            vis = blip_itm.vision_model(pixel_values=img_inp["pixel_values"])
            # First token of the last hidden state, projected.
            img_feat = blip_itm.vision_proj(vis.last_hidden_state[:, 0, :]).numpy()
        img_feat = normalize(img_feat, norm="l2")

        cap_inp = blip_proc(
            text=[final_caption], return_tensors="pt",
            padding=True, truncation=True, max_length=512
        )
        with torch.no_grad():
            txt = blip_itm.text_encoder(
                input_ids=cap_inp["input_ids"],
                attention_mask=cap_inp["attention_mask"]
            )
            cap_feat = blip_itm.text_proj(txt.last_hidden_state[:, 0, :]).numpy()
        cap_feat = normalize(cap_feat, norm="l2")

        cosine_score = float(cosine_similarity(img_feat, cap_feat)[0][0])
    except Exception:
        log.exception("Caption quality: cosine scoring failed")
        cosine_score = 0.0

    # Unweighted mean of the two signals.
    avg_score = round((itm_score + cosine_score) / 2, 4)
    return avg_score, round(itm_score, 4), round(cosine_score, 4)
335
+
336
+ # ============================================================================
337
+ # GAUGE CHART — 4 COLOR ZONES BELOW IMAGE
338
+ # ============================================================================
339
def render_gauge(score, itm, cosine, placeholder):
    """Render the caption-quality gauge and score breakdown into ``placeholder``.

    The overall score is bucketed into four bands (thresholds 0.75 / 0.50 /
    0.25), each with its own label and bar colour. A Plotly indicator gauge is
    drawn next to a textual breakdown of the individual scores.

    Args:
        score: Overall score, plotted on a 0-1 gauge axis.
        itm: Image-text matching score shown in the breakdown.
        cosine: Embedding similarity score shown in the breakdown.
        placeholder: Streamlit element to draw into (the visible caller
            passes an ``st.empty()`` slot).
    """
    # Pick the band label and accent colour for this score.
    if score >= 0.75:
        label, bar_color = "Good", "#22c55e"
    elif score >= 0.50:
        label, bar_color = "Moderate", "#f97316"
    elif score >= 0.25:
        label, bar_color = "Low", "#eab308"
    else:
        label, bar_color = "Poor", "#ef4444"

    fig = go.Figure(go.Indicator(
        mode="gauge+number",
        value=score,
        number={"font": {"size": 32, "color": bar_color}},
        gauge={
            "axis": {"range": [0, 1], "tickwidth": 1, "tickcolor": "#6b7280"},
            "bar": {"color": bar_color, "thickness": 0.3},
            # Pale background zones, one per quality band.
            "steps": [
                {"range": [0.00, 0.25], "color": "#fee2e2"},
                {"range": [0.25, 0.50], "color": "#fef9c3"},
                {"range": [0.50, 0.75], "color": "#ffedd5"},
                {"range": [0.75, 1.00], "color": "#dcfce7"},
            ],
            # Marker line drawn at the score itself.
            "threshold": {
                "line": {"color": bar_color, "width": 4},
                "thickness": 0.75,
                "value": score
            }
        },
        title={
            "text": f"Caption Quality Score<br><b style='color:{bar_color}'>{label}</b>",
            "font": {"size": 13}
        }
    ))

    fig.update_layout(
        height=230,
        margin=dict(l=20, r=20, t=50, b=10),
        paper_bgcolor="rgba(0,0,0,0)",
        font={"color": "#374151", "family": "sans-serif"}
    )

    # An ``st.empty()`` slot holds a single element: writing the spacer and
    # then the columns directly into it makes the columns replace the spacer.
    # Wrapping in ``.container()`` lets both elements render together.
    with placeholder.container():
        st.markdown("<br>", unsafe_allow_html=True)
        g_col, s_col = st.columns([3, 2])
        with g_col:
            st.plotly_chart(fig, use_container_width=True)
        with s_col:
            st.markdown("<br><br>", unsafe_allow_html=True)
            st.markdown("**Score Breakdown**")
            st.markdown(f"Image-Text Match: **{itm}**")
            st.markdown(f"Embedding Similarity: **{cosine}**")
            st.markdown(f"Overall Score: **{score} / 1.00**")
            st.markdown(
                f"<span style='background:{bar_color};color:white;"
                f"padding:3px 10px;border-radius:12px;"
                f"font-weight:600;font-size:13px;'>{label}</span>",
                unsafe_allow_html=True
            )
399
+
400
  with st.sidebar:
401
  st.title("Image Caption Fusion")
402
  st.markdown("---")
 
424
  st.markdown("**Local:** Florence-2, BLIP ITM, Qwen2.5")
425
  st.markdown("**API:** Jina")
426
 
 
 
 
427
  st.title("Image Caption Fusion System")
428
  st.markdown("Upload an image to generate a refined, grounded caption.")
429
  st.markdown("---")
 
440
 
441
  with col_img:
442
  st.image(input_image, caption="Uploaded Image", use_container_width=True)
443
+ gauge_placeholder = st.empty()
444
 
445
  with col_run:
446
  if st.button("Generate Caption", type="primary", use_container_width=True):
 
511
  f"font-size:18px;font-weight:500;text-align:center;"
512
  f"line-height:1.6;'>{final}</div>",
513
  unsafe_allow_html=True
514
+ )
515
+
516
+ avg_score, itm_q, cosine_q = compute_caption_quality(
517
+ input_image, final, blip_proc, blip_itm
518
+ )
519
+ render_gauge(avg_score, itm_q, cosine_q, gauge_placeholder)