Spaces:

Afsha001
/

Image_captioning

Running

App Files Files Community

Afsha001 committed 13 days ago

Commit

278547a

verified ·

1 Parent(s): ebc8d8e

add accuracy score

Browse files

Files changed (1) hide show

app.py +121 -44

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ import pandas as pd
 import requests
 import base64
 import streamlit as st
 from PIL import Image
 from io import BytesIO
 from collections import Counter
@@ -30,12 +31,6 @@ if not JINA_KEY:
     st.error("JINA_KEY missing. Go to Space Settings → Secrets and add it.")
     st.stop()
-# ============================================================================
-# LOAD LOCAL MODELS
-# DINO removed — was adding hallucinated labels that hurt fusion accuracy
-# Local: Florence-2, BLIP ITM, Qwen2.5
-# API:   Jina Reranker
-# ============================================================================
 @st.cache_resource
 def load_local_models():
     from transformers import (
@@ -93,7 +88,8 @@ def image_to_data_uri(image: Image.Image) -> str:
     return f"data:image/jpeg;base64,{b64}"
 # ============================================================================
-# STEP 1 — FLORENCE-2-LARGE: GENERATE 5 DIVERSE CAPTIONS
 # ============================================================================
 def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
@@ -101,21 +97,11 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
     image_size = (image.width, image.height)
     tasks = [
-        (
-            "<CAPTION>",
-            30,
-            {"num_beams": 1}
-        ),
-        (
-            "<DETAILED_CAPTION>",
-            80,
-            {"do_sample": True, "temperature": 0.7, "top_p": 0.9}
-        ),
-        (
-            "<MORE_DETAILED_CAPTION>",
-            120,
-            {"do_sample": True, "temperature": 1.1, "top_p": 0.95}
-        ),
     ]
     for task_prompt, max_tokens, gen_params in tasks:
@@ -155,9 +141,6 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
     return unique[:5]
-# ============================================================================
-# STEP 2 — BLIP ITM: IMAGE-TEXT MATCHING SCORES
-# ============================================================================
 def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
     scores = []
     for cap in captions:
@@ -177,9 +160,6 @@ def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
             scores.append(0.0)
     return scores
-# ============================================================================
-# STEP 3 — JINA RERANKER M0: SEMANTIC SCORES
-# ============================================================================
 def compute_jina_scores(image: Image.Image, captions: list) -> list:
     img_data_uri = image_to_data_uri(image)
     scores       = []
@@ -210,9 +190,6 @@ def compute_jina_scores(image: Image.Image, captions: list) -> list:
             scores.append(0.0)
     return scores
-# ============================================================================
-# STEP 4 — COSINE SIMILARITY: EMBEDDING SCORES
-# ============================================================================
 def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
     try:
         img_inp = blip_proc(images=image, return_tensors="pt")
@@ -239,9 +216,6 @@ def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
         st.warning(f"Cosine error: {str(e)[:60]}")
         return [0.0] * len(captions)
-# ============================================================================
-# STEP 5 — MAJORITY VOTING: SELECT TOP 2 CAPTIONS
-# ============================================================================
 def majority_voting(captions, itm, jina, cosine) -> tuple:
     itm_r    = np.argsort(itm)[::-1]
     jina_r   = np.argsort(jina)[::-1]
@@ -259,11 +233,6 @@ def majority_voting(captions, itm, jina, cosine) -> tuple:
     return captions[top2[0]], captions[top2[1]], top2, dict(counts)
-# ============================================================================
-# STEP 6 — QWEN2.5-1.5B: CAPTION FUSION
-# DINO objects removed from input — was causing hallucinations in fused output
-# Qwen now fuses only the two verified majority-voted captions
-# ============================================================================
 def fuse_captions(cap1: str, cap2: str, qwen_tok, qwen_mod) -> str:
     system_prompt = (
@@ -321,8 +290,113 @@ def fuse_captions(cap1: str, cap2: str, qwen_tok, qwen_mod) -> str:
         return cap1
 # ============================================================================
-# SIDEBAR
 # ============================================================================
 with st.sidebar:
     st.title("Image Caption Fusion")
     st.markdown("---")
@@ -350,9 +424,6 @@ Caption fusion
     st.markdown("**Local:** Florence-2, BLIP ITM, Qwen2.5")
     st.markdown("**API:** Jina")
-# ============================================================================
-# MAIN UI
-# ============================================================================
 st.title("Image Caption Fusion System")
 st.markdown("Upload an image to generate a refined, grounded caption.")
 st.markdown("---")
@@ -369,6 +440,7 @@ if uploaded_file is not None:
     with col_img:
         st.image(input_image, caption="Uploaded Image", use_container_width=True)
     with col_run:
         if st.button("Generate Caption", type="primary", use_container_width=True):
@@ -439,4 +511,9 @@ if uploaded_file is not None:
                 f"font-size:18px;font-weight:500;text-align:center;"
                 f"line-height:1.6;'>{final}</div>",
                 unsafe_allow_html=True
-            )

 import requests
 import base64
 import streamlit as st
+import plotly.graph_objects as go
 from PIL import Image
 from io import BytesIO
 from collections import Counter
     st.error("JINA_KEY missing. Go to Space Settings → Secrets and add it.")
     st.stop()
 @st.cache_resource
 def load_local_models():
     from transformers import (
     return f"data:image/jpeg;base64,{b64}"
 # ============================================================================
+# STEP 1 — FLORENCE-2-LARGE: 5 DIVERSE CAPTIONS
+# 3 simple + 2 detailed — no padding, no duplicates
 # ============================================================================
 def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
     image_size = (image.width, image.height)
     tasks = [
+        ("<CAPTION>", 30, {"num_beams": 1}),
+        ("<CAPTION>", 35, {"do_sample": True, "temperature": 0.9, "top_p": 0.90}),
+        ("<CAPTION>", 35, {"do_sample": True, "temperature": 1.2, "top_p": 0.95}),
+        ("<DETAILED_CAPTION>",      80,  {"do_sample": True, "temperature": 0.7, "top_p": 0.90}),
+        ("<MORE_DETAILED_CAPTION>", 120, {"do_sample": True, "temperature": 0.9, "top_p": 0.95}),
     ]
     for task_prompt, max_tokens, gen_params in tasks:
     return unique[:5]
 def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
     scores = []
     for cap in captions:
             scores.append(0.0)
     return scores
 def compute_jina_scores(image: Image.Image, captions: list) -> list:
     img_data_uri = image_to_data_uri(image)
     scores       = []
             scores.append(0.0)
     return scores
 def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
     try:
         img_inp = blip_proc(images=image, return_tensors="pt")
         st.warning(f"Cosine error: {str(e)[:60]}")
         return [0.0] * len(captions)
 def majority_voting(captions, itm, jina, cosine) -> tuple:
     itm_r    = np.argsort(itm)[::-1]
     jina_r   = np.argsort(jina)[::-1]
     return captions[top2[0]], captions[top2[1]], top2, dict(counts)
 def fuse_captions(cap1: str, cap2: str, qwen_tok, qwen_mod) -> str:
     system_prompt = (
         return cap1
 # ============================================================================
+# CAPTION QUALITY — BLIP ITM + COSINE ON FINAL CAPTION
 # ============================================================================
+def compute_caption_quality(image, final_caption, blip_proc, blip_itm) -> tuple:
+    try:
+        inputs = blip_proc(
+            images=image, text=final_caption,
+            return_tensors="pt", padding=True
+        )
+        with torch.no_grad():
+            out       = blip_itm(**inputs)
+            itm_score = torch.nn.functional.softmax(
+                out.itm_score, dim=1
+            )[:, 1].item()
+    except:
+        itm_score = 0.0
+    try:
+        img_inp = blip_proc(images=image, return_tensors="pt")
+        with torch.no_grad():
+            vis      = blip_itm.vision_model(pixel_values=img_inp["pixel_values"])
+            img_feat = blip_itm.vision_proj(vis.last_hidden_state[:, 0, :]).numpy()
+            img_feat = normalize(img_feat, norm="l2")
+        cap_inp = blip_proc(
+            text=[final_caption], return_tensors="pt",
+            padding=True, truncation=True, max_length=512
+        )
+        with torch.no_grad():
+            txt      = blip_itm.text_encoder(
+                input_ids=cap_inp["input_ids"],
+                attention_mask=cap_inp["attention_mask"]
+            )
+            cap_feat = blip_itm.text_proj(txt.last_hidden_state[:, 0, :]).numpy()
+            cap_feat = normalize(cap_feat, norm="l2")
+        cosine_score = float(cosine_similarity(img_feat, cap_feat)[0][0])
+    except:
+        cosine_score = 0.0
+    avg_score = round((itm_score + cosine_score) / 2, 4)
+    return avg_score, round(itm_score, 4), round(cosine_score, 4)
+# ============================================================================
+# GAUGE CHART — 4 COLOR ZONES BELOW IMAGE
+# ============================================================================
+def render_gauge(score, itm, cosine, placeholder):
+    if score >= 0.75:
+        label, bar_color = "Good",     "#22c55e"
+    elif score >= 0.50:
+        label, bar_color = "Moderate", "#f97316"
+    elif score >= 0.25:
+        label, bar_color = "Low",      "#eab308"
+    else:
+        label, bar_color = "Poor",     "#ef4444"
+    fig = go.Figure(go.Indicator(
+        mode  = "gauge+number",
+        value = score,
+        number = {"font": {"size": 32, "color": bar_color}},
+        gauge  = {
+            "axis":  {"range": [0, 1], "tickwidth": 1, "tickcolor": "#6b7280"},
+            "bar":   {"color": bar_color, "thickness": 0.3},
+            "steps": [
+                {"range": [0.00, 0.25], "color": "#fee2e2"},
+                {"range": [0.25, 0.50], "color": "#fef9c3"},
+                {"range": [0.50, 0.75], "color": "#ffedd5"},
+                {"range": [0.75, 1.00], "color": "#dcfce7"},
+            ],
+            "threshold": {
+                "line":      {"color": bar_color, "width": 4},
+                "thickness": 0.75,
+                "value":     score
+            }
+        },
+        title = {
+            "text": f"Caption Quality Score<br><b style='color:{bar_color}'>{label}</b>",
+            "font": {"size": 13}
+        }
+    ))
+    fig.update_layout(
+        height        = 230,
+        margin        = dict(l=20, r=20, t=50, b=10),
+        paper_bgcolor = "rgba(0,0,0,0)",
+        font          = {"color": "#374151", "family": "sans-serif"}
+    )
+    with placeholder:
+        st.markdown("<br>", unsafe_allow_html=True)
+        g_col, s_col = st.columns([3, 2])
+        with g_col:
+            st.plotly_chart(fig, use_container_width=True)
+        with s_col:
+            st.markdown("<br><br>", unsafe_allow_html=True)
+            st.markdown("**Score Breakdown**")
+            st.markdown(f"Image-Text Match: **{itm}**")
+            st.markdown(f"Embedding Similarity: **{cosine}**")
+            st.markdown(f"Overall Score: **{score} / 1.00**")
+            st.markdown(
+                f"<span style='background:{bar_color};color:white;"
+                f"padding:3px 10px;border-radius:12px;"
+                f"font-weight:600;font-size:13px;'>{label}</span>",
+                unsafe_allow_html=True
+            )
 with st.sidebar:
     st.title("Image Caption Fusion")
     st.markdown("---")
     st.markdown("**Local:** Florence-2, BLIP ITM, Qwen2.5")
     st.markdown("**API:** Jina")
 st.title("Image Caption Fusion System")
 st.markdown("Upload an image to generate a refined, grounded caption.")
 st.markdown("---")
     with col_img:
         st.image(input_image, caption="Uploaded Image", use_container_width=True)
+        gauge_placeholder = st.empty()
     with col_run:
         if st.button("Generate Caption", type="primary", use_container_width=True):
                 f"font-size:18px;font-weight:500;text-align:center;"
                 f"line-height:1.6;'>{final}</div>",
                 unsafe_allow_html=True
+            )
+            avg_score, itm_q, cosine_q = compute_caption_quality(
+                input_image, final, blip_proc, blip_itm
+            )
+            render_gauge(avg_score, itm_q, cosine_q, gauge_placeholder)