Spaces:

Afsha001
/

Image_captioning

Sleeping

App Files Files Community

Afsha001 committed 18 days ago

Commit

8161819

verified ·

1 Parent(s): 209f651

update

Browse files

Files changed (1) hide show

app.py +96 -31

app.py CHANGED Viewed

@@ -88,8 +88,18 @@ def image_to_data_uri(image: Image.Image) -> str:
     return f"data:image/jpeg;base64,{b64}"
 # ============================================================================
-# STEP 1 — FLORENCE-2-LARGE: 5 DIVERSE CAPTIONS
-# 3 simple + 2 detailed — no padding, no duplicates
 # ============================================================================
 def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
@@ -97,11 +107,11 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
     image_size = (image.width, image.height)
     tasks = [
-        ("<CAPTION>", 30, {"num_beams": 1}),
-        ("<CAPTION>", 35, {"do_sample": True, "temperature": 0.9, "top_p": 0.90}),
-        ("<CAPTION>", 35, {"do_sample": True, "temperature": 1.2, "top_p": 0.95}),
         ("<DETAILED_CAPTION>",      80,  {"do_sample": True, "temperature": 0.7, "top_p": 0.90}),
-        ("<MORE_DETAILED_CAPTION>", 120, {"do_sample": True, "temperature": 0.9, "top_p": 0.95}),
     ]
     for task_prompt, max_tokens, gen_params in tasks:
@@ -289,9 +299,6 @@ def fuse_captions(cap1: str, cap2: str, qwen_tok, qwen_mod) -> str:
         st.warning(f"Qwen fusion error: {str(e)[:80]}")
         return cap1
-# ============================================================================
-# CAPTION QUALITY — BLIP ITM + COSINE ON FINAL CAPTION
-# ============================================================================
 def compute_caption_quality(image, final_caption, blip_proc, blip_itm) -> tuple:
     try:
@@ -334,56 +341,73 @@ def compute_caption_quality(image, final_caption, blip_proc, blip_itm) -> tuple:
     return avg_score, round(itm_score, 4), round(cosine_score, 4)
 # ============================================================================
-# GAUGE CHART — 4 COLOR ZONES BELOW IMAGE
 # ============================================================================
 def render_gauge(score, itm, cosine, placeholder):
     if score >= 0.75:
-        label, bar_color = "Good",     "#22c55e"
     elif score >= 0.50:
-        label, bar_color = "Moderate", "#f97316"
     elif score >= 0.25:
-        label, bar_color = "Low",      "#eab308"
     else:
-        label, bar_color = "Poor",     "#ef4444"
     fig = go.Figure(go.Indicator(
         mode  = "gauge+number",
         value = score,
-        number = {"font": {"size": 32, "color": bar_color}},
-        gauge  = {
-            "axis":  {"range": [0, 1], "tickwidth": 1, "tickcolor": "#6b7280"},
-            "bar":   {"color": bar_color, "thickness": 0.3},
             "steps": [
-                {"range": [0.00, 0.25], "color": "#fee2e2"},
-                {"range": [0.25, 0.50], "color": "#fef9c3"},
-                {"range": [0.50, 0.75], "color": "#ffedd5"},
-                {"range": [0.75, 1.00], "color": "#dcfce7"},
             ],
             "threshold": {
-                "line":      {"color": bar_color, "width": 4},
-                "thickness": 0.75,
                 "value":     score
             }
         },
         title = {
-            "text": f"Caption Quality Score<br><b style='color:{bar_color}'>{label}</b>",
-            "font": {"size": 13}
         }
     ))
     fig.update_layout(
-        height        = 230,
-        margin        = dict(l=20, r=20, t=50, b=10),
         paper_bgcolor = "rgba(0,0,0,0)",
-        font          = {"color": "#374151", "family": "sans-serif"}
     )
     with placeholder:
         st.markdown("<br>", unsafe_allow_html=True)
         g_col, s_col = st.columns([3, 2])
         with g_col:
             st.plotly_chart(fig, use_container_width=True)
         with s_col:
             st.markdown("<br><br>", unsafe_allow_html=True)
             st.markdown("**Score Breakdown**")
@@ -392,11 +416,14 @@ def render_gauge(score, itm, cosine, placeholder):
             st.markdown(f"Overall Score: **{score} / 1.00**")
             st.markdown(
                 f"<span style='background:{bar_color};color:white;"
-                f"padding:3px 10px;border-radius:12px;"
-                f"font-weight:600;font-size:13px;'>{label}</span>",
                 unsafe_allow_html=True
             )
 with st.sidebar:
     st.title("Image Caption Fusion")
     st.markdown("---")
@@ -424,6 +451,38 @@ Caption fusion
     st.markdown("**Local:** Florence-2, BLIP ITM, Qwen2.5")
     st.markdown("**API:** Jina")
 st.title("Image Caption Fusion System")
 st.markdown("Upload an image to generate a refined, grounded caption.")
 st.markdown("---")
@@ -516,4 +575,10 @@ if uploaded_file is not None:
             avg_score, itm_q, cosine_q = compute_caption_quality(
                 input_image, final, blip_proc, blip_itm
             )
             render_gauge(avg_score, itm_q, cosine_q, gauge_placeholder)

     return f"data:image/jpeg;base64,{b64}"
 # ============================================================================
+# STEP 1 — FLORENCE-2-LARGE: 5 DISTINCT CAPTION APPROACHES
+#
+# Cap 1: <CAPTION> greedy
+#        → single concise sentence, primary subject only
+# Cap 2: <CAPTION> sampling temp=1.0
+#        → alt-text accessibility style, concise but different phrasing
+# Cap 3: <DETAILED_CAPTION> temp=0.7
+#        → paragraph describing foreground, background, colors
+# Cap 4: <DETAILED_CAPTION> temp=1.1
+#        → focuses on mood, atmosphere, implied action
+# Cap 5: <MORE_DETAILED_CAPTION> temp=0.8
+#        → exhaustive breakdown of every visible element
 # ============================================================================
 def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
     image_size = (image.width, image.height)
     tasks = [
+        ("<CAPTION>",               30,  {"num_beams": 1}),
+        ("<CAPTION>",               35,  {"do_sample": True, "temperature": 1.0, "top_p": 0.92}),
         ("<DETAILED_CAPTION>",      80,  {"do_sample": True, "temperature": 0.7, "top_p": 0.90}),
+        ("<DETAILED_CAPTION>",      90,  {"do_sample": True, "temperature": 1.1, "top_p": 0.95}),
+        ("<MORE_DETAILED_CAPTION>", 120, {"do_sample": True, "temperature": 0.8, "top_p": 0.92}),
     ]
     for task_prompt, max_tokens, gen_params in tasks:
         st.warning(f"Qwen fusion error: {str(e)[:80]}")
         return cap1
 def compute_caption_quality(image, final_caption, blip_proc, blip_itm) -> tuple:
     try:
     return avg_score, round(itm_score, 4), round(cosine_score, 4)
 # ============================================================================
+# GAUGE — updated to match reference style
+# Bright saturated zone colors, sharp black needle, clean arc, no dark shades
 # ============================================================================
 def render_gauge(score, itm, cosine, placeholder):
     if score >= 0.75:
+        label, bar_color = "Good",     "#16a34a"
     elif score >= 0.50:
+        label, bar_color = "Moderate", "#d97706"
     elif score >= 0.25:
+        label, bar_color = "Low",      "#ca8a04"
     else:
+        label, bar_color = "Poor",     "#dc2626"
     fig = go.Figure(go.Indicator(
         mode  = "gauge+number",
         value = score,
+        number = {
+            "font":   {"size": 36, "color": bar_color, "family": "Arial Black"},
+            "suffix": ""
+        },
+        gauge = {
+            "axis": {
+                "range":     [0, 1],
+                "tickwidth": 2,
+                "tickcolor": "#111827",
+                "tickfont":  {"size": 11, "color": "#374151"}
+            },
+            "bar": {
+                "color":     "#111827",
+                "thickness": 0.06
+            },
+            "bgcolor":      "white",
+            "borderwidth":  0,
             "steps": [
+                {"range": [0.00, 0.25], "color": "#ef4444"},
+                {"range": [0.25, 0.50], "color": "#f59e0b"},
+                {"range": [0.50, 0.75], "color": "#84cc16"},
+                {"range": [0.75, 1.00], "color": "#22c55e"},
             ],
             "threshold": {
+                "line":      {"color": "#111827", "width": 5},
+                "thickness": 0.85,
                 "value":     score
             }
         },
         title = {
+            "text": f"Caption Quality Score<br><b style='color:{bar_color};font-size:15px'>{label}</b>",
+            "font": {"size": 13, "color": "#374151"}
         }
     ))
     fig.update_layout(
+        height        = 240,
+        margin        = dict(l=15, r=15, t=55, b=5),
         paper_bgcolor = "rgba(0,0,0,0)",
+        plot_bgcolor  = "rgba(0,0,0,0)",
+        font          = {"color": "#374151", "family": "Arial"}
     )
     with placeholder:
         st.markdown("<br>", unsafe_allow_html=True)
         g_col, s_col = st.columns([3, 2])
         with g_col:
             st.plotly_chart(fig, use_container_width=True)
         with s_col:
             st.markdown("<br><br>", unsafe_allow_html=True)
             st.markdown("**Score Breakdown**")
             st.markdown(f"Overall Score: **{score} / 1.00**")
             st.markdown(
                 f"<span style='background:{bar_color};color:white;"
+                f"padding:4px 12px;border-radius:12px;"
+                f"font-weight:700;font-size:13px;'>{label}</span>",
                 unsafe_allow_html=True
             )
+# ============================================================================
+# SIDEBAR — pipeline steps + live accuracy section (session_state)
+# ============================================================================
 with st.sidebar:
     st.title("Image Caption Fusion")
     st.markdown("---")
     st.markdown("**Local:** Florence-2, BLIP ITM, Qwen2.5")
     st.markdown("**API:** Jina")
+    # ── Live accuracy section — populated after pipeline runs ──────────────
+    st.markdown("---")
+    st.markdown("### Caption Quality")
+    if "avg_score" in st.session_state:
+        score = st.session_state.avg_score
+        itm   = st.session_state.itm_q
+        cos   = st.session_state.cosine_q
+        if score >= 0.75:
+            label, color = "Good",     "#16a34a"
+        elif score >= 0.50:
+            label, color = "Moderate", "#d97706"
+        elif score >= 0.25:
+            label, color = "Low",      "#ca8a04"
+        else:
+            label, color = "Poor",     "#dc2626"
+        st.markdown(
+            f"<span style='background:{color};color:white;padding:3px 10px;"
+            f"border-radius:10px;font-weight:700;font-size:13px;'>{label}</span>",
+            unsafe_allow_html=True
+        )
+        st.markdown(f"**Overall:** {score} / 1.00")
+        st.markdown(f"BLIP ITM: **{itm}**")
+        st.markdown(f"Cosine Similarity: **{cos}**")
+    else:
+        st.caption("Run the pipeline to see scores.")
+# ============================================================================
+# MAIN UI
+# ============================================================================
 st.title("Image Caption Fusion System")
 st.markdown("Upload an image to generate a refined, grounded caption.")
 st.markdown("---")
             avg_score, itm_q, cosine_q = compute_caption_quality(
                 input_image, final, blip_proc, blip_itm
             )
+            # Store in session_state so the sidebar (rendered earlier in the script) shows these scores on the next rerun
+            st.session_state.avg_score = avg_score
+            st.session_state.itm_q     = itm_q
+            st.session_state.cosine_q  = cosine_q
             render_gauge(avg_score, itm_q, cosine_q, gauge_placeholder)