Spaces:

Afsha001
/

Image_captioning

Running

App Files Files Community

Afsha001 committed 7 days ago

Commit

8dffcbd

verified ·

1 Parent(s): 05ab8bc

update app.py

Browse files

Files changed (1) hide show

app.py +489 -2

app.py CHANGED Viewed

@@ -1,3 +1,490 @@
-# PASTE YOUR COMPLETE app.py CONTENT HERE
-# (the one from /mnt/user-data/outputs/app.py)

import base64
import gc
import html
import os
from collections import Counter
from io import BytesIO

import numpy as np
import pandas as pd
import requests
import streamlit as st
import torch
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
# ============================================================================
# PAGE CONFIG
# ============================================================================
# Browser-tab title and layout, applied before any other Streamlit call in
# this script.
_PAGE_SETTINGS = {
    "page_title": "Image Caption Fusion System",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)
# ============================================================================
# CREDENTIALS
# ============================================================================
# Secrets come from the environment. Surrounding whitespace is stripped: a
# secret pasted with a trailing newline would otherwise end up inside the
# Authorization header, and a whitespace-only value would pass the
# "is it set?" check that follows.
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
JINA_KEY = os.environ.get("JINA_KEY", "").strip()
# ============================================================================
# API ENDPOINTS
# Florence-2: raw bytes, no Content-Type
# Qwen2.5:   model-specific endpoint (not generic /v1/chat/completions)
# Jina:      query=plain string, documents=list of data URI strings
# ============================================================================
FLORENCE_URL = (
    "https://api-inference.huggingface.co/models/microsoft/Florence-2-large"
)
# Florence receives the image as the raw request body, so only the bearer
# token is sent.
FLORENCE_HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}
QWEN_URL = (
    "https://api-inference.huggingface.co/models/"
    "Qwen/Qwen2.5-1.5B-Instruct/v1/chat/completions"
)
# Same bearer token as Florence, plus the JSON content type for chat requests.
HF_HEADERS = {**FLORENCE_HEADERS, "Content-Type": "application/json"}
JINA_URL = "https://api.jina.ai/v1/rerank"
JINA_HEADERS = {
    "Authorization": f"Bearer {JINA_KEY}",
    "Content-Type": "application/json",
}
# Candidate labels handed to Grounding DINO as its text query; each label is
# a lowercase word followed by " . ".
DETECT_PROMPT = (
    "person . child . man . woman . boy . girl . "
    "dog . cat . horse . bird . animal . "
    "ball . toy . bicycle . car . bench . "
    "tree . grass . water . sky . mountain . "
    "building . stairs . door . fence . floor . "
    "jacket . dress . shirt . hat . bag ."
)
# ============================================================================
# CREDENTIAL CHECK
# ============================================================================
# Every missing secret is reported in one pass, so fixing the first one does
# not merely reveal the next. The script is halted only after all problems
# have been shown.
_missing_secrets = [
    secret_name
    for secret_name, secret_value in (
        ("HF_TOKEN", HF_TOKEN),
        ("JINA_KEY", JINA_KEY),
    )
    if not secret_value
]
for _secret_name in _missing_secrets:
    st.error(
        f"{_secret_name} missing. Go to Space Settings → Secrets and add it."
    )
if _missing_secrets:
    st.stop()
# ============================================================================
# LOAD LOCAL MODELS — BLIP ITM + GROUNDING DINO
# Wrapped in st.cache_resource so the loaded objects are reused by later
# reruns of the script instead of being loaded again
# ============================================================================
@st.cache_resource
def load_local_models():
    """Load the BLIP image-text-matching model and Grounding DINO.

    Both models are loaded in float32 and switched to eval mode.

    Returns:
        tuple: ``(blip_processor, blip_itm_model, dino_processor, dino_model)``.
    """
    # Imported lazily so the cost of importing transformers is only paid when
    # the models are actually requested.
    from transformers import (
        AutoModelForZeroShotObjectDetection,
        AutoProcessor,
        BlipForImageTextRetrieval,
        BlipProcessor,
    )

    blip_checkpoint = "Salesforce/blip-itm-large-coco"
    dino_checkpoint = "IDEA-Research/grounding-dino-base"

    gc.collect()

    # The processor comes from the same checkpoint as the ITM model so that
    # preprocessing and tokenisation match the weights that consume them
    # (previously it was taken from the separate captioning checkpoint).
    blip_processor = BlipProcessor.from_pretrained(blip_checkpoint)
    blip_itm_model = BlipForImageTextRetrieval.from_pretrained(
        blip_checkpoint,
        torch_dtype=torch.float32,
    )
    blip_itm_model.eval()

    dino_processor = AutoProcessor.from_pretrained(dino_checkpoint)
    dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(
        dino_checkpoint,
        torch_dtype=torch.float32,
    )
    dino_model.eval()

    return blip_processor, blip_itm_model, dino_processor, dino_model
# ============================================================================
# HELPERS
# ============================================================================
def image_to_bytes(image: "Image.Image") -> bytes:
    """Encode *image* as JPEG (quality 85) and return the encoded bytes.

    Images that are not already in RGB mode are converted first, because
    JPEG cannot store an alpha channel or a palette and saving such an image
    directly raises an error.

    Args:
        image: The image to encode.

    Returns:
        The JPEG file contents.
    """
    if image.mode != "RGB":
        image = image.convert("RGB")
    buf = BytesIO()
    image.save(buf, format="JPEG", quality=85)
    return buf.getvalue()
def image_to_data_uri(image: Image.Image) -> str:
    """Return *image* as a ``data:image/jpeg;base64,...`` URI string.

    The image is JPEG-encoded via ``image_to_bytes`` and the result is
    base64-encoded into the URI payload.
    """
    encoded = base64.b64encode(image_to_bytes(image)).decode()
    return "data:image/jpeg;base64," + encoded
# ============================================================================
# STEP 1 — FLORENCE-2-LARGE: GENERATE 5 CAPTIONS
# Fix applied: data=raw_bytes instead of json={"inputs": base64}
# ============================================================================
_FALLBACK_CAPTION = "a scene shown in the image"


def _extract_florence_caption(result) -> str:
    """Return the lower-cased ``generated_text`` from a decoded response.

    Accepts either a dict or a list whose first element is a dict; any other
    shape (including an empty list) yields an empty string.
    """
    if isinstance(result, list):
        result = result[0] if result else None
    if isinstance(result, dict):
        text = result.get("generated_text") or ""
        return str(text).strip().lower()
    return ""


def generate_captions_florence(image: Image.Image) -> list:
    """Request five captions for *image* from the Florence-2 endpoint.

    The image is sent as raw JPEG bytes in five separate requests. A failed
    request, or one that yields no text, contributes a fixed placeholder
    caption and a Streamlit warning. Duplicate captions are then removed
    (keeping first occurrences) and the list is padded back to five entries
    with copies of its first caption.

    Args:
        image: The image to caption.

    Returns:
        A list of exactly five caption strings (duplicates are possible
        because of the padding).
    """
    img_bytes = image_to_bytes(image)
    # With a binary body the request options travel as headers, not as query
    # parameters. "x-wait-for-model" asks the API to hold the request while
    # the model loads; "x-use-cache: false" asks for a fresh generation so
    # that five identical requests are not all answered from the cache.
    headers = {
        **FLORENCE_HEADERS,
        "x-wait-for-model": "true",
        "x-use-cache": "false",
    }

    captions = []
    for _ in range(5):
        caption = ""
        try:
            response = requests.post(
                FLORENCE_URL,
                headers=headers,
                data=img_bytes,
                timeout=60,
            )
            if response.status_code == 200:
                caption = _extract_florence_caption(response.json())
            else:
                st.warning(f"Florence API error {response.status_code}")
        except Exception as e:
            # Best effort: a network or decoding failure must not abort the
            # pipeline, it only costs this one caption.
            st.warning(f"Florence exception: {str(e)[:80]}")
        captions.append(caption or _FALLBACK_CAPTION)

    # dict.fromkeys drops repeats while preserving first-seen order.
    unique = list(dict.fromkeys(captions))
    # Downstream steps expect five entries; pad with the first caption.
    unique += [unique[0]] * (5 - len(unique))
    return unique[:5]
# ============================================================================
# STEP 2 — BLIP ITM: IMAGE-TEXT MATCHING SCORES
# Local model, no API call needed
# ============================================================================
def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
    """Score each caption against *image* with the BLIP ITM head.

    For every caption the ITM logits are passed through a softmax over
    dimension 1 and the value in column 1 is kept, rounded to four decimals.
    A caption whose scoring raises gets 0.0 and a Streamlit warning.

    Returns:
        One float per caption, in the same order as *captions*.
    """

    def _match_probability(text):
        encoded = blip_proc(
            images=image,
            text=text,
            return_tensors="pt",
            padding=True,
        )
        with torch.no_grad():
            logits = blip_itm(**encoded).itm_score
            probability = torch.nn.functional.softmax(logits, dim=1)[:, 1].item()
        return round(float(probability), 4)

    results = []
    for text in captions:
        try:
            value = _match_probability(text)
        except Exception as err:
            st.warning(f"ITM error: {str(err)[:60]}")
            value = 0.0
        results.append(value)
    return results
# ============================================================================
# STEP 3 — JINA RERANKER M0: SEMANTIC SCORES
# Fix applied: query=plain string, documents=[data_uri_string]
# ============================================================================
def compute_jina_scores(image: Image.Image, captions: list) -> list:
    """Score each caption against *image* with the Jina reranker API.

    One request is sent per caption, with the caption as the query and the
    image (as a data URI) as the single document. The first result's
    ``relevance_score`` is kept, rounded to four decimals. Any failure
    (non-200 status, missing results, exception) yields 0.0 for that caption;
    HTTP errors and exceptions also emit a Streamlit warning.

    Returns:
        One float per caption, in the same order as *captions*.
    """
    documents = [image_to_data_uri(image)]
    collected = []
    for text in captions:
        value = 0.0
        try:
            reply = requests.post(
                JINA_URL,
                headers=JINA_HEADERS,
                json={
                    "model":     "jina-reranker-m0",
                    "query":     text,
                    "documents": documents,
                    "top_n":     1
                },
                timeout=30
            )
            if reply.status_code != 200:
                st.warning(f"Jina API error {reply.status_code}: {reply.text[:100]}")
            else:
                body = reply.json()
                if "results" in body and body["results"]:
                    relevance = body["results"][0].get("relevance_score", 0.0)
                    value = round(float(relevance), 4)
        except Exception as err:
            st.warning(f"Jina exception: {str(err)[:60]}")
            value = 0.0
        collected.append(value)
    return collected
# ============================================================================
# STEP 4 — COSINE SIMILARITY: EMBEDDING SCORES
# Local model, reuses BLIP encoders
# ============================================================================
def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
    """Cosine similarity between the image embedding and each caption embedding.

    The image goes through the BLIP vision model and ``vision_proj``; the
    captions go through the text encoder and ``text_proj``. In both cases the
    first token's hidden state is projected and L2-normalised. Similarities
    are rounded to four decimals. If anything raises, a Streamlit warning is
    shown and every caption gets 0.0.

    Returns:
        One float per caption, in the same order as *captions*.
    """

    def _image_embedding():
        pixels = blip_proc(images=image, return_tensors="pt")["pixel_values"]
        with torch.no_grad():
            hidden = blip_itm.vision_model(pixel_values=pixels).last_hidden_state
            projected = blip_itm.vision_proj(hidden[:, 0, :]).numpy()
        return normalize(projected, norm="l2")

    def _caption_embeddings():
        tokens = blip_proc(
            text=captions,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        with torch.no_grad():
            hidden = blip_itm.text_encoder(
                input_ids=tokens["input_ids"],
                attention_mask=tokens["attention_mask"],
            ).last_hidden_state
            projected = blip_itm.text_proj(hidden[:, 0, :]).numpy()
        return normalize(projected, norm="l2")

    try:
        image_vec = _image_embedding()
        caption_vecs = _caption_embeddings()
        # Row 0 is the single image compared against every caption.
        similarities = cosine_similarity(image_vec, caption_vecs)[0]
        return [round(float(value), 4) for value in similarities]
    except Exception as err:
        st.warning(f"Cosine error: {str(err)[:60]}")
        return [0.0] * len(captions)
# ============================================================================
# STEP 5 — MAJORITY VOTING: SELECT TOP 2 CAPTIONS
# Each of 3 methods votes for its top 2 distinct captions — up to 6 votes
# ============================================================================
def majority_voting(captions, itm, jina, cosine) -> tuple:
    """Pick the two captions that the three scorers agree on most.

    Each scorer votes for its two highest-scoring *distinct* captions.
    Repeated caption texts are collapsed onto the index of their first
    occurrence, so a padded caption list cannot produce the same text twice
    while a different caption is available. A scorer whose scores are all
    equal (for example all 0.0 after a failed API call) carries no
    information and casts no votes. Ties within one scorer go to the lower
    index; ties in the vote count go to the caption that received a vote
    first (ITM votes first, then Jina, then cosine).

    Args:
        captions: Candidate caption strings.
        itm: ITM score per caption.
        jina: Jina reranker score per caption.
        cosine: Cosine-similarity score per caption.

    Returns:
        ``(best_caption, second_caption, [best_idx, second_idx], vote_counts)``
        where ``vote_counts`` maps caption index to number of votes. When
        only one distinct caption exists it is returned in both positions.

    Raises:
        ValueError: If *captions* is empty or a score list has a different
            length than *captions*.
    """
    total = len(captions)
    if total == 0:
        raise ValueError("majority_voting needs at least one caption")
    score_lists = (itm, jina, cosine)
    if any(len(scores) != total for scores in score_lists):
        raise ValueError("each score list must have one entry per caption")

    # Index of the first occurrence of every distinct caption text.
    first_index = {}
    for idx, text in enumerate(captions):
        first_index.setdefault(text, idx)
    candidates = list(first_index.values())

    counts = Counter()
    for scores in score_lists:
        values = {idx: float(scores[idx]) for idx in candidates}
        if len(candidates) > 1 and max(values.values()) == min(values.values()):
            # No ranking can be derived from identical scores.
            continue
        ranked = sorted(candidates, key=lambda idx: (-values[idx], idx))
        counts.update(ranked[:2])

    top2 = [idx for idx, _ in counts.most_common(2)]
    # Fewer than two voted captions: fall back to the remaining candidates in
    # their original order.
    for idx in candidates:
        if len(top2) >= 2:
            break
        if idx not in top2:
            top2.append(idx)
    if len(top2) < 2:
        # Only one distinct caption exists at all.
        top2.append(top2[0])

    return captions[top2[0]], captions[top2[1]], top2, dict(counts)
# ============================================================================
# STEP 6 — GROUNDING DINO: OBJECT DETECTION
# Local model, provides factual grounding for LLM fusion
# ============================================================================
def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
    """Detect the DETECT_PROMPT objects in *image* with Grounding DINO.

    Detections scoring below *threshold* are dropped, as are detections whose
    label text is empty. Labels are lower-cased and de-duplicated, keeping
    the best score per label, and returned in descending score order.

    Args:
        image: Image to analyse; its ``size`` is read to build the target size.
        dino_proc: Grounding DINO processor.
        dino_mod: Grounding DINO model.
        threshold: Minimum detection score to keep.

    Returns:
        ``(summary, labels)``. ``summary`` is
        ``"Detected objects: [a, b, ...]"``, ``"No objects detected"`` when
        nothing survives filtering, or ``"Object detection unavailable"``
        when detection raised. ``labels`` is the list of kept label strings
        (empty in the last two cases).
    """
    try:
        inputs = dino_proc(
            images=image, text=DETECT_PROMPT, return_tensors="pt"
        )
        with torch.no_grad():
            outputs = dino_mod(**inputs)
        # image.size is reversed before being passed as the target size.
        target_sizes = torch.tensor([image.size[::-1]])
        results = dino_proc.post_process_grounded_object_detection(
            outputs,
            inputs.input_ids,
            target_sizes=target_sizes
        )[0]
        scores = results["scores"]
        # Prefer "text_labels" when the result provides it, else "labels".
        labels = results.get("text_labels", results["labels"])
        keep = (scores >= threshold).tolist()

        best_score = {}
        for label, score, kept in zip(labels, scores.tolist(), keep):
            if not kept:
                continue
            name = label.strip().lower()
            if not name:
                # A blank label would otherwise show up as an empty entry in
                # the object list and in the fusion prompt.
                continue
            if name not in best_score or best_score[name] < score:
                best_score[name] = score

        if not best_score:
            return "No objects detected", []

        sorted_labels = [
            name for name, _ in
            sorted(best_score.items(), key=lambda item: item[1], reverse=True)
        ]
        formatted = "Detected objects: [" + ", ".join(sorted_labels) + "]"
        return formatted, sorted_labels
    except Exception as e:
        st.warning(f"DINO error: {str(e)[:80]}")
        return "Object detection unavailable", []
# ============================================================================
# STEP 7 — QWEN2.5-1.5B: CAPTION FUSION
# Fix applied: model-specific endpoint URL
# ============================================================================
def fuse_captions(cap1: str, cap2: str, objects: str) -> str:
    """Ask the Qwen chat endpoint to merge two captions into one.

    The two captions and the object summary are sent as a single user
    message. Known label prefixes ("Fused caption:", "Caption:", "Result:")
    are stripped from the reply, case-insensitively.

    Returns:
        The fused caption, or *cap1* unchanged when the request fails, the
        status is not 200, or the reply is empty. Failures also emit a
        Streamlit warning.
    """
    system_prompt = (
        "You are an expert image captioning assistant. "
        "Write ONE natural, fluent, descriptive caption combining the best details. "
        "Return ONLY the caption, no explanation or prefix."
    )
    user_prompt = (
        f"Caption A: {cap1}\n"
        f"Caption B: {cap2}\n"
        f"{objects}\n\n"
        "Fused caption:"
    )
    request_body = {
        "model": "Qwen/Qwen2.5-1.5B-Instruct",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user",   "content": user_prompt}
        ],
        "max_tokens": 100,
        "temperature": 0.3,
        "top_p": 0.9
    }
    try:
        reply = requests.post(
            QWEN_URL,
            headers=HF_HEADERS,
            json=request_body,
            timeout=40
        )
        if reply.status_code != 200:
            st.warning(f"Qwen API error {reply.status_code}")
            return cap1
        text = reply.json()["choices"][0]["message"]["content"].strip()
        # Prefixes are checked in order, so stacked labels are all removed.
        for label in ("Fused caption:", "Caption:", "Result:"):
            if text.lower().startswith(label.lower()):
                text = text[len(label):].strip()
        return text or cap1
    except Exception as err:
        st.warning(f"Qwen exception: {str(err)[:60]}")
        return cap1
# ============================================================================
# SIDEBAR
# ============================================================================
# (step title, where it runs or None, one-line description)
_PIPELINE_STEPS = (
    ("1. Florence-2-Large", "API", "Generate 5 captions"),
    ("2. BLIP ITM", "Local", "Image-text matching"),
    ("3. Jina Reranker M0", "API", "Semantic reranking"),
    ("4. Cosine Similarity", "Local", "Embedding similarity"),
    ("5. Majority Voting", None, "Best 2 captions selected"),
    ("6. Grounding DINO", "Local", "Object detection"),
    ("7. Qwen2.5-1.5B", "API", "Caption fusion"),
)


def _format_pipeline_step(title, runs_on, description) -> str:
    """Render one step as a bold title line followed by its description."""
    suffix = f" ({runs_on})" if runs_on else ""
    # Two trailing spaces force a markdown line break after the title.
    return f"**{title}**{suffix}  \n{description}"


with st.sidebar:
    st.title("Image Caption Fusion")
    st.markdown("---")
    st.markdown("### Pipeline Steps")
    # Steps are separated by blank lines so each renders as its own
    # paragraph instead of all of them running together in one.
    st.markdown(
        "\n\n".join(_format_pipeline_step(*step) for step in _PIPELINE_STEPS)
    )
    st.markdown("---")
    st.markdown("**Local:** BLIP ITM, DINO")
    st.markdown("**API:** Florence-2, Jina, Qwen2.5")
# ============================================================================
# MAIN UI
# ============================================================================
# Upload an image, then run the seven pipeline steps in order when the
# button is pressed, showing intermediate results as they become available.
st.title("Image Caption Fusion System")
st.markdown("Upload an image to generate a refined, grounded caption.")
st.markdown("---")
uploaded_file = st.file_uploader(
    "Select an image",
    type=["jpg", "jpeg", "png"]
)
if uploaded_file is not None:
    # Normalise to RGB so every later step sees the same pixel format.
    input_image = Image.open(uploaded_file).convert("RGB")
    col_img, col_run = st.columns([1, 1])
    with col_img:
        # NOTE(review): newer Streamlit releases deprecate use_column_width
        # in favour of use_container_width — confirm the pinned version
        # before switching.
        st.image(input_image, caption="Uploaded Image", use_column_width=True)
    with col_run:
        if st.button("Run Pipeline", type="primary", use_container_width=True):
            with st.spinner("Loading local models (first run takes 1-2 min)..."):
                blip_proc, blip_itm, dino_proc, dino_mod = load_local_models()
            progress = st.progress(0)
            status = st.empty()

            status.info("Step 1/7: Generating captions with Florence-2-Large...")
            captions = generate_captions_florence(input_image)
            progress.progress(14)
            with st.expander("5 Generated Captions", expanded=True):
                for number, cap in enumerate(captions, start=1):
                    st.write(f"**{number}.** {cap}")

            status.info("Step 2/7: Computing BLIP ITM scores...")
            itm_scores = compute_itm_scores(input_image, captions, blip_proc, blip_itm)
            progress.progress(28)

            status.info("Step 3/7: Computing Jina Reranker scores...")
            jina_scores = compute_jina_scores(input_image, captions)
            progress.progress(42)

            status.info("Step 4/7: Computing Cosine Similarity scores...")
            cosine_scores = compute_cosine_scores(input_image, captions, blip_proc, blip_itm)
            progress.progress(57)

            scores_df = pd.DataFrame({
                "Caption": [
                    f"Cap {number}: {cap[:50]}"
                    for number, cap in enumerate(captions, start=1)
                ],
                "ITM":     itm_scores,
                "Jina":    jina_scores,
                "Cosine":  cosine_scores
            })
            with st.expander("All Scores", expanded=False):
                st.dataframe(scores_df, use_container_width=True, hide_index=True)

            status.info("Step 5/7: Running majority voting...")
            best_1, best_2, _, _ = majority_voting(
                captions, itm_scores, jina_scores, cosine_scores
            )
            progress.progress(71)
            st.markdown("### Majority Voted Captions")
            c1, c2 = st.columns(2)
            with c1:
                st.success(f"1. {best_1}")
            with c2:
                st.info(f"2. {best_2}")

            status.info("Step 6/7: Detecting objects with DINO...")
            obj_str, obj_list = detect_objects(input_image, dino_proc, dino_mod)
            progress.progress(85)
            st.markdown("### Detected Objects")
            st.write(" | ".join(obj_list) if obj_list else obj_str)

            status.info("Step 7/7: Fusing captions with Qwen2.5-1.5B...")
            final = fuse_captions(best_1, best_2, obj_str)
            progress.progress(100)
            status.success("Pipeline complete!")

            st.markdown("---")
            st.markdown("### Final Fused Caption")
            card_style = (
                "background:linear-gradient(135deg,#667eea,#764ba2);"
                "padding:24px;border-radius:12px;color:white;"
                "font-size:18px;font-weight:500;text-align:center;"
                "line-height:1.6;"
            )
            # The caption is model output rendered as raw HTML, so it is
            # escaped to keep "<", ">" and "&" from being read as markup.
            st.markdown(
                f"<div style='{card_style}'>{html.escape(final)}</div>",
                unsafe_allow_html=True
            )