Spaces:
Sleeping
Sleeping
update
Browse files
app.py
CHANGED
|
@@ -88,8 +88,18 @@ def image_to_data_uri(image: Image.Image) -> str:
|
|
| 88 |
return f"data:image/jpeg;base64,{b64}"
|
| 89 |
|
| 90 |
# ============================================================================
|
| 91 |
-
# STEP 1 — FLORENCE-2-LARGE: 5
|
| 92 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
# ============================================================================
|
| 94 |
def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
|
| 95 |
|
|
@@ -97,11 +107,11 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
|
|
| 97 |
image_size = (image.width, image.height)
|
| 98 |
|
| 99 |
tasks = [
|
| 100 |
-
("<CAPTION>",
|
| 101 |
-
("<CAPTION>",
|
| 102 |
-
("<CAPTION>", 35, {"do_sample": True, "temperature": 1.2, "top_p": 0.95}),
|
| 103 |
("<DETAILED_CAPTION>", 80, {"do_sample": True, "temperature": 0.7, "top_p": 0.90}),
|
| 104 |
-
("<
|
|
|
|
| 105 |
]
|
| 106 |
|
| 107 |
for task_prompt, max_tokens, gen_params in tasks:
|
|
@@ -289,9 +299,6 @@ def fuse_captions(cap1: str, cap2: str, qwen_tok, qwen_mod) -> str:
|
|
| 289 |
st.warning(f"Qwen fusion error: {str(e)[:80]}")
|
| 290 |
return cap1
|
| 291 |
|
| 292 |
-
# ============================================================================
|
| 293 |
-
# CAPTION QUALITY — BLIP ITM + COSINE ON FINAL CAPTION
|
| 294 |
-
# ============================================================================
|
| 295 |
def compute_caption_quality(image, final_caption, blip_proc, blip_itm) -> tuple:
|
| 296 |
|
| 297 |
try:
|
|
@@ -334,56 +341,73 @@ def compute_caption_quality(image, final_caption, blip_proc, blip_itm) -> tuple:
|
|
| 334 |
return avg_score, round(itm_score, 4), round(cosine_score, 4)
|
| 335 |
|
| 336 |
# ============================================================================
|
| 337 |
-
# GAUGE
|
|
|
|
| 338 |
# ============================================================================
|
| 339 |
def render_gauge(score, itm, cosine, placeholder):
|
| 340 |
|
| 341 |
if score >= 0.75:
|
| 342 |
-
label, bar_color = "Good", "#
|
| 343 |
elif score >= 0.50:
|
| 344 |
-
label, bar_color = "Moderate", "#
|
| 345 |
elif score >= 0.25:
|
| 346 |
-
label, bar_color = "Low", "#
|
| 347 |
else:
|
| 348 |
-
label, bar_color = "Poor", "#
|
| 349 |
|
| 350 |
fig = go.Figure(go.Indicator(
|
| 351 |
mode = "gauge+number",
|
| 352 |
value = score,
|
| 353 |
-
number = {
|
| 354 |
-
|
| 355 |
-
"
|
| 356 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 357 |
"steps": [
|
| 358 |
-
{"range": [0.00, 0.25], "color": "#
|
| 359 |
-
{"range": [0.25, 0.50], "color": "#
|
| 360 |
-
{"range": [0.50, 0.75], "color": "#
|
| 361 |
-
{"range": [0.75, 1.00], "color": "#
|
| 362 |
],
|
| 363 |
"threshold": {
|
| 364 |
-
"line": {"color":
|
| 365 |
-
"thickness": 0.
|
| 366 |
"value": score
|
| 367 |
}
|
| 368 |
},
|
| 369 |
title = {
|
| 370 |
-
"text": f"Caption Quality Score<br><b style='color:{bar_color}'>{label}</b>",
|
| 371 |
-
"font": {"size": 13}
|
| 372 |
}
|
| 373 |
))
|
| 374 |
|
| 375 |
fig.update_layout(
|
| 376 |
-
height =
|
| 377 |
-
margin = dict(l=
|
| 378 |
paper_bgcolor = "rgba(0,0,0,0)",
|
| 379 |
-
|
|
|
|
| 380 |
)
|
| 381 |
|
| 382 |
with placeholder:
|
| 383 |
st.markdown("<br>", unsafe_allow_html=True)
|
| 384 |
g_col, s_col = st.columns([3, 2])
|
|
|
|
| 385 |
with g_col:
|
| 386 |
st.plotly_chart(fig, use_container_width=True)
|
|
|
|
| 387 |
with s_col:
|
| 388 |
st.markdown("<br><br>", unsafe_allow_html=True)
|
| 389 |
st.markdown("**Score Breakdown**")
|
|
@@ -392,11 +416,14 @@ def render_gauge(score, itm, cosine, placeholder):
|
|
| 392 |
st.markdown(f"Overall Score: **{score} / 1.00**")
|
| 393 |
st.markdown(
|
| 394 |
f"<span style='background:{bar_color};color:white;"
|
| 395 |
-
f"padding:
|
| 396 |
-
f"font-weight:
|
| 397 |
unsafe_allow_html=True
|
| 398 |
)
|
| 399 |
|
|
|
|
|
|
|
|
|
|
| 400 |
with st.sidebar:
|
| 401 |
st.title("Image Caption Fusion")
|
| 402 |
st.markdown("---")
|
|
@@ -424,6 +451,38 @@ Caption fusion
|
|
| 424 |
st.markdown("**Local:** Florence-2, BLIP ITM, Qwen2.5")
|
| 425 |
st.markdown("**API:** Jina")
|
| 426 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 427 |
st.title("Image Caption Fusion System")
|
| 428 |
st.markdown("Upload an image to generate a refined, grounded caption.")
|
| 429 |
st.markdown("---")
|
|
@@ -516,4 +575,10 @@ if uploaded_file is not None:
|
|
| 516 |
avg_score, itm_q, cosine_q = compute_caption_quality(
|
| 517 |
input_image, final, blip_proc, blip_itm
|
| 518 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 519 |
render_gauge(avg_score, itm_q, cosine_q, gauge_placeholder)
|
|
|
|
| 88 |
return f"data:image/jpeg;base64,{b64}"
|
| 89 |
|
| 90 |
# ============================================================================
|
| 91 |
+
# STEP 1 — FLORENCE-2-LARGE: 5 DISTINCT CAPTION APPROACHES
|
| 92 |
+
#
|
| 93 |
+
# Cap 1: <CAPTION> greedy
|
| 94 |
+
# → single concise sentence, primary subject only
|
| 95 |
+
# Cap 2: <CAPTION> sampling temp=1.0
|
| 96 |
+
# → alt-text accessibility style, concise but different phrasing
|
| 97 |
+
# Cap 3: <DETAILED_CAPTION> temp=0.7
|
| 98 |
+
# → paragraph describing foreground, background, colors
|
| 99 |
+
# Cap 4: <DETAILED_CAPTION> temp=1.1
|
| 100 |
+
# → focuses on mood, atmosphere, implied action
|
| 101 |
+
# Cap 5: <MORE_DETAILED_CAPTION> temp=0.8
|
| 102 |
+
# → exhaustive breakdown of every visible element
|
| 103 |
# ============================================================================
|
| 104 |
def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
|
| 105 |
|
|
|
|
| 107 |
image_size = (image.width, image.height)
|
| 108 |
|
| 109 |
tasks = [
|
| 110 |
+
("<CAPTION>", 30, {"num_beams": 1}),
|
| 111 |
+
("<CAPTION>", 35, {"do_sample": True, "temperature": 1.0, "top_p": 0.92}),
|
|
|
|
| 112 |
("<DETAILED_CAPTION>", 80, {"do_sample": True, "temperature": 0.7, "top_p": 0.90}),
|
| 113 |
+
("<DETAILED_CAPTION>", 90, {"do_sample": True, "temperature": 1.1, "top_p": 0.95}),
|
| 114 |
+
("<MORE_DETAILED_CAPTION>", 120, {"do_sample": True, "temperature": 0.8, "top_p": 0.92}),
|
| 115 |
]
|
| 116 |
|
| 117 |
for task_prompt, max_tokens, gen_params in tasks:
|
|
|
|
| 299 |
st.warning(f"Qwen fusion error: {str(e)[:80]}")
|
| 300 |
return cap1
|
| 301 |
|
|
|
|
|
|
|
|
|
|
| 302 |
def compute_caption_quality(image, final_caption, blip_proc, blip_itm) -> tuple:
|
| 303 |
|
| 304 |
try:
|
|
|
|
| 341 |
return avg_score, round(itm_score, 4), round(cosine_score, 4)
|
| 342 |
|
| 343 |
# ============================================================================
|
| 344 |
+
# GAUGE — updated to match reference style
|
| 345 |
+
# Bright saturated zone colors, sharp black needle, clean arc, no dark shades
|
| 346 |
# ============================================================================
|
| 347 |
def render_gauge(score, itm, cosine, placeholder):
|
| 348 |
|
| 349 |
if score >= 0.75:
|
| 350 |
+
label, bar_color = "Good", "#16a34a"
|
| 351 |
elif score >= 0.50:
|
| 352 |
+
label, bar_color = "Moderate", "#d97706"
|
| 353 |
elif score >= 0.25:
|
| 354 |
+
label, bar_color = "Low", "#ca8a04"
|
| 355 |
else:
|
| 356 |
+
label, bar_color = "Poor", "#dc2626"
|
| 357 |
|
| 358 |
fig = go.Figure(go.Indicator(
|
| 359 |
mode = "gauge+number",
|
| 360 |
value = score,
|
| 361 |
+
number = {
|
| 362 |
+
"font": {"size": 36, "color": bar_color, "family": "Arial Black"},
|
| 363 |
+
"suffix": ""
|
| 364 |
+
},
|
| 365 |
+
gauge = {
|
| 366 |
+
"axis": {
|
| 367 |
+
"range": [0, 1],
|
| 368 |
+
"tickwidth": 2,
|
| 369 |
+
"tickcolor": "#111827",
|
| 370 |
+
"tickfont": {"size": 11, "color": "#374151"}
|
| 371 |
+
},
|
| 372 |
+
"bar": {
|
| 373 |
+
"color": "#111827",
|
| 374 |
+
"thickness": 0.06
|
| 375 |
+
},
|
| 376 |
+
"bgcolor": "white",
|
| 377 |
+
"borderwidth": 0,
|
| 378 |
"steps": [
|
| 379 |
+
{"range": [0.00, 0.25], "color": "#ef4444"},
|
| 380 |
+
{"range": [0.25, 0.50], "color": "#f59e0b"},
|
| 381 |
+
{"range": [0.50, 0.75], "color": "#84cc16"},
|
| 382 |
+
{"range": [0.75, 1.00], "color": "#22c55e"},
|
| 383 |
],
|
| 384 |
"threshold": {
|
| 385 |
+
"line": {"color": "#111827", "width": 5},
|
| 386 |
+
"thickness": 0.85,
|
| 387 |
"value": score
|
| 388 |
}
|
| 389 |
},
|
| 390 |
title = {
|
| 391 |
+
"text": f"Caption Quality Score<br><b style='color:{bar_color};font-size:15px'>{label}</b>",
|
| 392 |
+
"font": {"size": 13, "color": "#374151"}
|
| 393 |
}
|
| 394 |
))
|
| 395 |
|
| 396 |
fig.update_layout(
|
| 397 |
+
height = 240,
|
| 398 |
+
margin = dict(l=15, r=15, t=55, b=5),
|
| 399 |
paper_bgcolor = "rgba(0,0,0,0)",
|
| 400 |
+
plot_bgcolor = "rgba(0,0,0,0)",
|
| 401 |
+
font = {"color": "#374151", "family": "Arial"}
|
| 402 |
)
|
| 403 |
|
| 404 |
with placeholder:
|
| 405 |
st.markdown("<br>", unsafe_allow_html=True)
|
| 406 |
g_col, s_col = st.columns([3, 2])
|
| 407 |
+
|
| 408 |
with g_col:
|
| 409 |
st.plotly_chart(fig, use_container_width=True)
|
| 410 |
+
|
| 411 |
with s_col:
|
| 412 |
st.markdown("<br><br>", unsafe_allow_html=True)
|
| 413 |
st.markdown("**Score Breakdown**")
|
|
|
|
| 416 |
st.markdown(f"Overall Score: **{score} / 1.00**")
|
| 417 |
st.markdown(
|
| 418 |
f"<span style='background:{bar_color};color:white;"
|
| 419 |
+
f"padding:4px 12px;border-radius:12px;"
|
| 420 |
+
f"font-weight:700;font-size:13px;'>{label}</span>",
|
| 421 |
unsafe_allow_html=True
|
| 422 |
)
|
| 423 |
|
| 424 |
+
# ============================================================================
|
| 425 |
+
# SIDEBAR — pipeline steps + live accuracy section (session_state)
|
| 426 |
+
# ============================================================================
|
| 427 |
with st.sidebar:
|
| 428 |
st.title("Image Caption Fusion")
|
| 429 |
st.markdown("---")
|
|
|
|
| 451 |
st.markdown("**Local:** Florence-2, BLIP ITM, Qwen2.5")
|
| 452 |
st.markdown("**API:** Jina")
|
| 453 |
|
| 454 |
+
# ── Live accuracy section — populated after pipeline runs ──────────────
|
| 455 |
+
st.markdown("---")
|
| 456 |
+
st.markdown("### Caption Quality")
|
| 457 |
+
|
| 458 |
+
if "avg_score" in st.session_state:
|
| 459 |
+
score = st.session_state.avg_score
|
| 460 |
+
itm = st.session_state.itm_q
|
| 461 |
+
cos = st.session_state.cosine_q
|
| 462 |
+
|
| 463 |
+
if score >= 0.75:
|
| 464 |
+
label, color = "Good", "#16a34a"
|
| 465 |
+
elif score >= 0.50:
|
| 466 |
+
label, color = "Moderate", "#d97706"
|
| 467 |
+
elif score >= 0.25:
|
| 468 |
+
label, color = "Low", "#ca8a04"
|
| 469 |
+
else:
|
| 470 |
+
label, color = "Poor", "#dc2626"
|
| 471 |
+
|
| 472 |
+
st.markdown(
|
| 473 |
+
f"<span style='background:{color};color:white;padding:3px 10px;"
|
| 474 |
+
f"border-radius:10px;font-weight:700;font-size:13px;'>{label}</span>",
|
| 475 |
+
unsafe_allow_html=True
|
| 476 |
+
)
|
| 477 |
+
st.markdown(f"**Overall:** {score} / 1.00")
|
| 478 |
+
st.markdown(f"BLIP ITM: **{itm}**")
|
| 479 |
+
st.markdown(f"Cosine Similarity: **{cos}**")
|
| 480 |
+
else:
|
| 481 |
+
st.caption("Run the pipeline to see scores.")
|
| 482 |
+
|
| 483 |
+
# ============================================================================
|
| 484 |
+
# MAIN UI
|
| 485 |
+
# ============================================================================
|
| 486 |
st.title("Image Caption Fusion System")
|
| 487 |
st.markdown("Upload an image to generate a refined, grounded caption.")
|
| 488 |
st.markdown("---")
|
|
|
|
| 575 |
avg_score, itm_q, cosine_q = compute_caption_quality(
|
| 576 |
input_image, final, blip_proc, blip_itm
|
| 577 |
)
|
| 578 |
+
|
| 579 |
+
# Store in session_state so sidebar updates on rerender
|
| 580 |
+
st.session_state.avg_score = avg_score
|
| 581 |
+
st.session_state.itm_q = itm_q
|
| 582 |
+
st.session_state.cosine_q = cosine_q
|
| 583 |
+
|
| 584 |
render_gauge(avg_score, itm_q, cosine_q, gauge_placeholder)
|