Spaces:

Afsha001
/

Image_captioning

Running

App Files Files Community

Afsha001 committed 20 days ago

Commit

b7d8863

1 Parent(s): a0d9361

fix all f-string syntax errors

Browse files

Files changed (1) hide show

app.py +82 -202

app.py CHANGED Viewed

@@ -1,7 +1,5 @@
 import os
-import re
-import time
 import torch
 import numpy as np
 import requests
@@ -11,27 +9,20 @@ from io import BytesIO
 from collections import Counter
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.preprocessing import normalize
-# ── Page config ──
-st.set_page_config(
-    page_title = "Image Caption Fusion",
-    page_icon  = "🖼️",
-    layout     = "wide"
-)
-# ── API Keys from HF Secrets ──
 HF_TOKEN = os.environ.get("HF_TOKEN", "")
 JINA_KEY = os.environ.get("JINA_KEY", "")
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-# ── API endpoints ──
 QWEN_VL_URL  = "https://api-inference.huggingface.co/models/Qwen/Qwen2-VL-2B-Instruct"
 QWEN_LM_URL  = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct"
 JINA_URL     = "https://api.jina.ai/v1/rerank"
-HF_HEADERS   = {"Authorization": f"Bearer {HF_TOKEN}"}
-JINA_HEADERS = {"Authorization": f"Bearer {JINA_KEY}", "Content-Type": "application/json"}
 DETECT_PROMPT = (
     "person . child . man . woman . boy . girl . "
@@ -42,38 +33,25 @@ DETECT_PROMPT = (
     "jacket . dress . shirt . hat . bag ."
 )
-# ── Load local models once at startup ──
 @st.cache_resource
 def load_local_models():
     from transformers import (
         BlipProcessor, BlipForImageTextRetrieval,
         AutoProcessor, AutoModelForZeroShotObjectDetection
     )
-    st.write("⏳ Loading BLIP ITM model (CPU)...")
-    blip_processor = BlipProcessor.from_pretrained(
-        "Salesforce/blip-image-captioning-large"
-    )
     itm_model = BlipForImageTextRetrieval.from_pretrained(
-        "Salesforce/blip-itm-large-coco",
-        torch_dtype = torch.float32
     )
     itm_model.eval()
-    st.write(" Loading DINO model (CPU)...")
-    dino_processor = AutoProcessor.from_pretrained(
-        "IDEA-Research/grounding-dino-base"
-    )
     dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(
-        "IDEA-Research/grounding-dino-base",
-        torch_dtype = torch.float32
     )
     dino_model.eval()
     return blip_processor, itm_model, dino_processor, dino_model
-# ── Step 1: Generate 5 captions via Qwen2-VL API ──
-def generate_captions_api(image: Image.Image) -> list:
     buffered = BytesIO()
     image.save(buffered, format="JPEG")
     img_bytes = buffered.getvalue()
@@ -88,14 +66,12 @@ def generate_captions_api(image: Image.Image) -> list:
     captions = []
     for prompt in PROMPTS:
-        payload  = {"inputs": prompt, "image": img_bytes.hex()}
         try:
             response = requests.post(
                 QWEN_VL_URL,
-                headers = HF_HEADERS,
-                json    = {"inputs": prompt},
-                files   = {"image": img_bytes},
-                timeout = 30
             )
             if response.status_code == 200:
                 result = response.json()
@@ -106,10 +82,9 @@ def generate_captions_api(image: Image.Image) -> list:
                 captions.append(cap if cap else "a scene with various objects and people")
             else:
                 captions.append("a detailed scene with people and objects")
-        except Exception as e:
             captions.append("a scene captured in the image")
-    # Deduplicate
     seen, unique = set(), []
     for c in captions:
         if c not in seen:
@@ -117,172 +92,125 @@ def generate_captions_api(image: Image.Image) -> list:
             unique.append(c)
     while len(unique) < 5:
         unique.append(unique[0])
     return unique[:5]
-# ── Step 2: BLIP ITM Scoring (local CPU) ──
 def compute_itm_scores(image, captions, blip_processor, itm_model):
     scores = []
     for cap in captions:
-        inp = blip_processor(
-            images=image, text=cap,
-            return_tensors="pt", padding=True
-        )
         with torch.no_grad():
             out   = itm_model(**inp)
-            score = torch.nn.functional.softmax(
-                out.itm_score, dim=1
-            )[:, 1].item()
         scores.append(round(score, 4))
     return scores
-# ── Step 3: Jina Reranker Scoring (API) ──
 def compute_jina_scores(image, captions):
     buffered = BytesIO()
     image.save(buffered, format="JPEG")
-    img_b64  = __import__("base64").b64encode(buffered.getvalue()).decode()
-    scores = []
     for cap in captions:
         try:
             payload  = {
                 "model"     : "jina-reranker-m0",
                 "query"     : cap,
                 "documents" : [{"type": "image_url",
-                                "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}}]
             }
-            response = requests.post(
-                JINA_URL,
-                headers = JINA_HEADERS,
-                json    = payload,
-                timeout = 30
-            )
             if response.status_code == 200:
                 result = response.json()
                 score  = result["results"][0]["relevance_score"]
                 scores.append(round(float(score), 4))
             else:
                 scores.append(0.5)
-        except:
             scores.append(0.5)
     return scores
-# ── Step 4: Cosine Similarity Scoring (local numpy) ──
 def compute_cosine_scores(image, captions, blip_processor, itm_model):
-    # Get image embedding
     img_inp = blip_processor(images=image, return_tensors="pt")
     with torch.no_grad():
-        vis_out = itm_model.vision_model(
-            pixel_values=img_inp["pixel_values"]
-        )
-        img_feat = itm_model.vision_proj(
-            vis_out.last_hidden_state[:, 0, :]
-        ).numpy()
         img_feat = normalize(img_feat, norm="l2")
-    # Get caption embeddings
     cap_inp = blip_processor(
         text=captions, return_tensors="pt",
         padding=True, truncation=True, max_length=512
     )
     with torch.no_grad():
         txt_out  = itm_model.text_encoder(
-            input_ids      = cap_inp["input_ids"],
-            attention_mask = cap_inp["attention_mask"]
         )
-        cap_feat = itm_model.text_proj(
-            txt_out.last_hidden_state[:, 0, :]
-        ).numpy()
         cap_feat = normalize(cap_feat, norm="l2")
     scores = cosine_similarity(img_feat, cap_feat)[0]
     return [round(float(s), 4) for s in scores]
-# ── Step 5: Majority Voting ──
 def majority_voting(captions, itm_scores, jina_scores, cosine_scores):
     itm_ranked  = np.argsort(itm_scores)[::-1]
     jina_ranked = np.argsort(jina_scores)[::-1]
     cos_ranked  = np.argsort(cosine_scores)[::-1]
     votes = [
         int(itm_ranked[0]),  int(itm_ranked[1]),
         int(jina_ranked[0]), int(jina_ranked[1]),
         int(cos_ranked[0]),  int(cos_ranked[1]),
     ]
     vote_counts  = Counter(votes)
     top2_indices = [idx for idx, _ in vote_counts.most_common(2)]
     if len(top2_indices) < 2:
         top2_indices = [int(itm_ranked[0]), int(jina_ranked[0])]
-    return (
-        captions[top2_indices[0]],
-        captions[top2_indices[1]],
-        top2_indices,
-        dict(vote_counts)
-    )
-# ── Step 6: DINO Object Detection (local CPU) ──
 def detect_objects(image, dino_processor, dino_model, threshold=0.3):
-    inp = dino_processor(
-        images=image, text=DETECT_PROMPT,
-        return_tensors="pt"
-    )
     with torch.no_grad():
         outputs = dino_model(**inp)
     target_sizes = torch.tensor([image.size[::-1]])
-    results      = dino_processor.post_process_grounded_object_detection(
-        outputs, inp.input_ids,
-        target_sizes=target_sizes
     )[0]
     scores = results["scores"]
     labels = results["labels"]
     keep   = scores >= threshold
     labels = [labels[i] for i in range(len(labels)) if keep[i]]
     sc_list= scores[keep].tolist()
     if not labels:
         return "No objects detected", []
     seen = {}
     for lbl, sc in zip(labels, sc_list):
         lbl = lbl.strip().lower()
         if lbl not in seen or seen[lbl] < sc:
             seen[lbl] = sc
     sorted_labels = [l for l, _ in sorted(seen.items(), key=lambda x: x[1], reverse=True)]
-    label_str     = "Detected: [" + ", ".join(sorted_labels) + "]"
     return label_str, sorted_labels
-# ── Step 7: Qwen2.5-1.5B Caption Fusion (API) ──
 def fuse_captions_api(cap1, cap2, dino_labels):
-    prompt = f"""You are given two captions and detected objects for the same image.
-Write ONE fluent, natural, descriptive caption combining the best details.
-Return ONLY the caption, no explanation, no prefix.
-Caption 1        : {cap1}
-Caption 2        : {cap2}
-Detected objects : {dino_labels}
-Fused caption :"""
     try:
         response = requests.post(
             QWEN_LM_URL,
-            headers = HF_HEADERS,
-            json    = {
-                "inputs"     : prompt,
-                "parameters" : {
-                    "max_new_tokens"     : 80,
-                    "do_sample"          : False,
-                    "repetition_penalty" : 1.1,
-                    "return_full_text"   : False
                 }
             },
-            timeout = 40
         )
         if response.status_code == 200:
             result = response.json()
@@ -290,112 +218,80 @@ Fused caption :"""
                 fused = result[0].get("generated_text", "").strip()
             else:
                 fused = str(result).strip()
-            # Clean any prefix Qwen adds
-            for prefix in ["Fused caption :", "Fused caption:", "Caption:"]:
                 if fused.lower().startswith(prefix.lower()):
                     fused = fused[len(prefix):].strip()
             return fused if fused else cap1
         else:
             return cap1
-    except Exception as e:
         return cap1
-# ════════════════════════════════════════
-#  STREAMLIT UI
-# ════════════════════════════════════════
-# ── Sidebar ──
 with st.sidebar:
     st.title(" Image Caption Fusion")
     st.markdown("---")
-    st.markdown("###  Pipeline")
-    st.markdown("""
-    1.  **Qwen2-VL-2B** — Generate 5 captions
-    2.  **BLIP ITM** — Image-text matching score
-    3.  **Jina Reranker M0** — Semantic reranking
-    4.  **Cosine Similarity** — Embedding similarity
-    5.  **Majority Voting** — Best 2 captions
-    6.  **Grounding DINO** — Object detection
-    7.  **Qwen2.5-1.5B** — Caption fusion
-    """)
-    st.markdown("---")
-    st.markdown("###  About")
-    st.markdown("""
-    This system generates a rich, humanized caption
-    for any image using a multi-model ensemble pipeline.
-    """)
     st.markdown("---")
-    st.markdown("**Local models:** BLIP ITM, DINO")
-    st.markdown("**API models:** Qwen2-VL, Jina, Qwen2.5")
-# ── Main area ──
 st.title(" Image Caption Fusion System")
 st.markdown("Upload any image and get a detailed, humanized caption.")
 st.markdown("---")
-uploaded = st.file_uploader(
-    " Upload an image",
-    type=["jpg", "jpeg", "png"],
-    help="Upload any image to generate a fused caption"
-)
 if uploaded:
     image = Image.open(uploaded).convert("RGB")
     col1, col2 = st.columns([1, 1])
     with col1:
         st.image(image, caption="Uploaded Image", use_column_width=True)
     with col2:
         if st.button(" Generate Caption", type="primary", use_container_width=True):
-            # Load local models
-            with st.spinner("Loading local models (first time takes ~2 min)..."):
                 blip_processor, itm_model, dino_processor, dino_model = load_local_models()
             progress = st.progress(0)
             status   = st.empty()
-            # Step 1 — Generate captions
             status.info(" Step 1/7 — Generating 5 captions with Qwen2-VL...")
             captions = generate_captions_api(image)
             progress.progress(14)
-            with st.expander(" 5 Generated Captions", expanded=False):
                 for i, c in enumerate(captions):
-                    st.write(f"**{i+1}.** {c}")
-            # Step 2 — ITM scores
             status.info(" Step 2/7 — Computing BLIP ITM scores...")
             itm_scores = compute_itm_scores(image, captions, blip_processor, itm_model)
             progress.progress(28)
-            # Step 3 — Jina scores
             status.info(" Step 3/7 — Computing Jina Reranker scores...")
             jina_scores = compute_jina_scores(image, captions)
             progress.progress(42)
-            # Step 4 — Cosine scores
-            status.info(" Step 4/7 — Computing Cosine Similarity scores...")
             cosine_scores = compute_cosine_scores(image, captions, blip_processor, itm_model)
             progress.progress(57)
-            # Show score table
-            import pandas as pd
             score_df = pd.DataFrame({
-                "Caption"   : [f"Cap {i+1}: {c[:50]}..." for i, c in enumerate(captions)],
-                "ITM"       : itm_scores,
-                "Jina"      : jina_scores,
-                "Cosine"    : cosine_scores
             })
-            with st.expander(" All Scores", expanded=False):
                 st.dataframe(score_df, use_container_width=True)
-            # Step 5 — Majority voting
-            status.info(" Step 5/7 — Running Majority Voting...")
             voted_cap1, voted_cap2, top2_idx, vote_counts = majority_voting(
                 captions, itm_scores, jina_scores, cosine_scores
             )
@@ -404,46 +300,30 @@ if uploaded:
             st.markdown("###  Majority Voted Captions")
             col_a, col_b = st.columns(2)
             with col_a:
-                st.success(f" **Caption 1:**
-{voted_cap1}")
             with col_b:
-                st.info(f" **Caption 2:**
-{voted_cap2}")
-            # Step 6 — DINO
             status.info(" Step 6/7 — Detecting objects with DINO...")
             label_str, label_list = detect_objects(image, dino_processor, dino_model)
             progress.progress(85)
             st.markdown("###  Detected Objects")
             if label_list:
-                cols = st.columns(min(len(label_list), 6))
-                for i, lbl in enumerate(label_list[:6]):
-                    cols[i].markdown(
-                        f"<span style='background:#e8f4fd;padding:4px 8px;"
-                        f"border-radius:12px;font-size:13px'> {lbl}</span>",
-                        unsafe_allow_html=True
-                    )
             else:
                 st.write(label_str)
-            # Step 7 — Qwen fusion
-            status.info("Step 7/7 — Fusing captions with Qwen2.5-1.5B...")
             fused = fuse_captions_api(voted_cap1, voted_cap2, label_str)
             progress.progress(100)
             status.success(" Pipeline complete!")
-            # Final output
             st.markdown("---")
             st.markdown("###  Final Fused Caption")
             st.markdown(
-                f"<div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);"
-                f"padding: 20px; border-radius: 12px; color: white; font-size: 18px;"
-                f"font-weight: 500; text-align: center;'>"
-                f" {fused}"
-                f"</div>",
                 unsafe_allow_html=True
             )
-            st.markdown("---")

 import os
 import torch
 import numpy as np
 import requests
 from collections import Counter
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.preprocessing import normalize
+import base64
+import pandas as pd
+st.set_page_config(page_title="Image Caption Fusion", page_icon="🖼️", layout="wide")
 HF_TOKEN = os.environ.get("HF_TOKEN", "")
 JINA_KEY = os.environ.get("JINA_KEY", "")
+DEVICE   = "cpu"
 QWEN_VL_URL  = "https://api-inference.huggingface.co/models/Qwen/Qwen2-VL-2B-Instruct"
 QWEN_LM_URL  = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct"
 JINA_URL     = "https://api.jina.ai/v1/rerank"
+HF_HEADERS   = {"Authorization": "Bearer " + HF_TOKEN}
+JINA_HEADERS = {"Authorization": "Bearer " + JINA_KEY, "Content-Type": "application/json"}
 DETECT_PROMPT = (
     "person . child . man . woman . boy . girl . "
     "jacket . dress . shirt . hat . bag ."
 )
 @st.cache_resource
 def load_local_models():
     from transformers import (
         BlipProcessor, BlipForImageTextRetrieval,
         AutoProcessor, AutoModelForZeroShotObjectDetection
     )
+    blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
     itm_model = BlipForImageTextRetrieval.from_pretrained(
+        "Salesforce/blip-itm-large-coco", torch_dtype=torch.float32
     )
     itm_model.eval()
+    dino_processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-base")
     dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(
+        "IDEA-Research/grounding-dino-base", torch_dtype=torch.float32
     )
     dino_model.eval()
     return blip_processor, itm_model, dino_processor, dino_model
+def generate_captions_api(image):
     buffered = BytesIO()
     image.save(buffered, format="JPEG")
     img_bytes = buffered.getvalue()
     captions = []
     for prompt in PROMPTS:
         try:
             response = requests.post(
                 QWEN_VL_URL,
+                headers=HF_HEADERS,
+                json={"inputs": prompt},
+                timeout=30
             )
             if response.status_code == 200:
                 result = response.json()
                 captions.append(cap if cap else "a scene with various objects and people")
             else:
                 captions.append("a detailed scene with people and objects")
+        except Exception:
             captions.append("a scene captured in the image")
     seen, unique = set(), []
     for c in captions:
         if c not in seen:
             unique.append(c)
     while len(unique) < 5:
         unique.append(unique[0])
     return unique[:5]
 def compute_itm_scores(image, captions, blip_processor, itm_model):
     scores = []
     for cap in captions:
+        inp = blip_processor(images=image, text=cap, return_tensors="pt", padding=True)
         with torch.no_grad():
             out   = itm_model(**inp)
+            score = torch.nn.functional.softmax(out.itm_score, dim=1)[:, 1].item()
         scores.append(round(score, 4))
     return scores
 def compute_jina_scores(image, captions):
     buffered = BytesIO()
     image.save(buffered, format="JPEG")
+    img_b64  = base64.b64encode(buffered.getvalue()).decode()
+    scores   = []
     for cap in captions:
         try:
             payload  = {
                 "model"     : "jina-reranker-m0",
                 "query"     : cap,
                 "documents" : [{"type": "image_url",
+                                "image_url": {"url": "data:image/jpeg;base64," + img_b64}}]
             }
+            response = requests.post(JINA_URL, headers=JINA_HEADERS, json=payload, timeout=30)
             if response.status_code == 200:
                 result = response.json()
                 score  = result["results"][0]["relevance_score"]
                 scores.append(round(float(score), 4))
             else:
                 scores.append(0.5)
+        except Exception:
             scores.append(0.5)
     return scores
 def compute_cosine_scores(image, captions, blip_processor, itm_model):
     img_inp = blip_processor(images=image, return_tensors="pt")
     with torch.no_grad():
+        vis_out  = itm_model.vision_model(pixel_values=img_inp["pixel_values"])
+        img_feat = itm_model.vision_proj(vis_out.last_hidden_state[:, 0, :]).numpy()
         img_feat = normalize(img_feat, norm="l2")
     cap_inp = blip_processor(
         text=captions, return_tensors="pt",
         padding=True, truncation=True, max_length=512
     )
     with torch.no_grad():
         txt_out  = itm_model.text_encoder(
+            input_ids=cap_inp["input_ids"],
+            attention_mask=cap_inp["attention_mask"]
         )
+        cap_feat = itm_model.text_proj(txt_out.last_hidden_state[:, 0, :]).numpy()
         cap_feat = normalize(cap_feat, norm="l2")
     scores = cosine_similarity(img_feat, cap_feat)[0]
     return [round(float(s), 4) for s in scores]
 def majority_voting(captions, itm_scores, jina_scores, cosine_scores):
     itm_ranked  = np.argsort(itm_scores)[::-1]
     jina_ranked = np.argsort(jina_scores)[::-1]
     cos_ranked  = np.argsort(cosine_scores)[::-1]
     votes = [
         int(itm_ranked[0]),  int(itm_ranked[1]),
         int(jina_ranked[0]), int(jina_ranked[1]),
         int(cos_ranked[0]),  int(cos_ranked[1]),
     ]
     vote_counts  = Counter(votes)
     top2_indices = [idx for idx, _ in vote_counts.most_common(2)]
     if len(top2_indices) < 2:
         top2_indices = [int(itm_ranked[0]), int(jina_ranked[0])]
+    return captions[top2_indices[0]], captions[top2_indices[1]], top2_indices, dict(vote_counts)
 def detect_objects(image, dino_processor, dino_model, threshold=0.3):
+    inp = dino_processor(images=image, text=DETECT_PROMPT, return_tensors="pt")
     with torch.no_grad():
         outputs = dino_model(**inp)
     target_sizes = torch.tensor([image.size[::-1]])
+    results = dino_processor.post_process_grounded_object_detection(
+        outputs, inp.input_ids, target_sizes=target_sizes
     )[0]
     scores = results["scores"]
     labels = results["labels"]
     keep   = scores >= threshold
     labels = [labels[i] for i in range(len(labels)) if keep[i]]
     sc_list= scores[keep].tolist()
     if not labels:
         return "No objects detected", []
     seen = {}
     for lbl, sc in zip(labels, sc_list):
         lbl = lbl.strip().lower()
         if lbl not in seen or seen[lbl] < sc:
             seen[lbl] = sc
     sorted_labels = [l for l, _ in sorted(seen.items(), key=lambda x: x[1], reverse=True)]
+    label_str = "Detected: [" + ", ".join(sorted_labels) + "]"
     return label_str, sorted_labels
 def fuse_captions_api(cap1, cap2, dino_labels):
+    prompt = (
+        "You are given two captions and detected objects for the same image. "
+        "Write ONE fluent, natural, descriptive caption combining the best details. "
+        "Return ONLY the caption, no explanation, no prefix. "
+        "Caption 1: " + cap1 + " "
+        "Caption 2: " + cap2 + " "
+        "Detected objects: " + dino_labels + " "
+        "Fused caption:"
+    )
     try:
         response = requests.post(
             QWEN_LM_URL,
+            headers=HF_HEADERS,
+            json={
+                "inputs": prompt,
+                "parameters": {
+                    "max_new_tokens"    : 80,
+                    "do_sample"         : False,
+                    "repetition_penalty": 1.1,
+                    "return_full_text"  : False
                 }
             },
+            timeout=40
         )
         if response.status_code == 200:
             result = response.json()
                 fused = result[0].get("generated_text", "").strip()
             else:
                 fused = str(result).strip()
+            for prefix in ["Fused caption:", "Caption:"]:
                 if fused.lower().startswith(prefix.lower()):
                     fused = fused[len(prefix):].strip()
             return fused if fused else cap1
         else:
             return cap1
+    except Exception:
         return cap1
+# ── SIDEBAR ──
 with st.sidebar:
     st.title(" Image Caption Fusion")
     st.markdown("---")
+    st.markdown("###  Pipeline Steps")
+    st.markdown("1.  Qwen2-VL-2B — Generate 5 captions")
+    st.markdown("2.  BLIP ITM — Image-text matching")
+    st.markdown("3.  Jina Reranker M0 — Semantic reranking")
+    st.markdown("4.  Cosine Similarity — Embedding similarity")
+    st.markdown("5.  Majority Voting — Best 2 captions")
+    st.markdown("6.  Grounding DINO — Object detection")
+    st.markdown("7.  Qwen2.5-1.5B — Caption fusion")
     st.markdown("---")
+    st.markdown("**Local:** BLIP ITM, DINO")
+    st.markdown("**API:** Qwen2-VL, Jina, Qwen2.5")
+# ── MAIN UI ──
 st.title(" Image Caption Fusion System")
 st.markdown("Upload any image and get a detailed, humanized caption.")
 st.markdown("---")
+uploaded = st.file_uploader(" Upload an image", type=["jpg","jpeg","png"])
 if uploaded:
     image = Image.open(uploaded).convert("RGB")
     col1, col2 = st.columns([1, 1])
     with col1:
         st.image(image, caption="Uploaded Image", use_column_width=True)
     with col2:
         if st.button(" Generate Caption", type="primary", use_container_width=True):
+            with st.spinner("Loading local models..."):
                 blip_processor, itm_model, dino_processor, dino_model = load_local_models()
             progress = st.progress(0)
             status   = st.empty()
             status.info(" Step 1/7 — Generating 5 captions with Qwen2-VL...")
             captions = generate_captions_api(image)
             progress.progress(14)
+            with st.expander(" 5 Generated Captions"):
                 for i, c in enumerate(captions):
+                    st.write(str(i+1) + ". " + c)
             status.info(" Step 2/7 — Computing BLIP ITM scores...")
             itm_scores = compute_itm_scores(image, captions, blip_processor, itm_model)
             progress.progress(28)
             status.info(" Step 3/7 — Computing Jina Reranker scores...")
             jina_scores = compute_jina_scores(image, captions)
             progress.progress(42)
+            status.info(" Step 4/7 — Computing Cosine Similarity...")
             cosine_scores = compute_cosine_scores(image, captions, blip_processor, itm_model)
             progress.progress(57)
             score_df = pd.DataFrame({
+                "Caption" : ["Cap " + str(i+1) + ": " + c[:50] for i, c in enumerate(captions)],
+                "ITM"     : itm_scores,
+                "Jina"    : jina_scores,
+                "Cosine"  : cosine_scores
             })
+            with st.expander(" All Scores"):
                 st.dataframe(score_df, use_container_width=True)
+            status.info(" Step 5/7 — Majority Voting...")
             voted_cap1, voted_cap2, top2_idx, vote_counts = majority_voting(
                 captions, itm_scores, jina_scores, cosine_scores
             )
             st.markdown("###  Majority Voted Captions")
             col_a, col_b = st.columns(2)
             with col_a:
+                st.success(" Caption 1: " + voted_cap1)
             with col_b:
+                st.info(" Caption 2: " + voted_cap2)
             status.info(" Step 6/7 — Detecting objects with DINO...")
             label_str, label_list = detect_objects(image, dino_processor, dino_model)
             progress.progress(85)
             st.markdown("###  Detected Objects")
             if label_list:
+                st.write(" | ".join(["🔍 " + l for l in label_list]))
             else:
                 st.write(label_str)
+            status.info(" Step 7/7 — Fusing with Qwen2.5-1.5B...")
             fused = fuse_captions_api(voted_cap1, voted_cap2, label_str)
             progress.progress(100)
             status.success(" Pipeline complete!")
             st.markdown("---")
             st.markdown("###  Final Fused Caption")
             st.markdown(
+                "<div style='background:linear-gradient(135deg,#667eea,#764ba2);"
+                "padding:20px;border-radius:12px;color:white;font-size:18px;"
+                "font-weight:500;text-align:center;'> " + fused + "</div>",
                 unsafe_allow_html=True
             )