Spaces:

Afsha001
/

Image_captioning

Running

App Files Files Community

Afsha001 committed 24 days ago

Commit

25245f2

1 Parent(s): c1b05fd

add app.py and requirements.txt

Browse files

Files changed (2) hide show

app.py +390 -143
requirements.txt +10 -4

app.py CHANGED Viewed

@@ -1,154 +1,401 @@
-import gradio as gr
-import numpy as np
-import random
-# import spaces #[uncomment to use ZeroGPU]
-from diffusers import DiffusionPipeline
 import torch
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model_repo_id = "stabilityai/sdxl-turbo"  # Replace to the model you would like to use
-if torch.cuda.is_available():
-    torch_dtype = torch.float16
-else:
-    torch_dtype = torch.float32
-pipe = DiffusionPipeline.from_pretrained(model_repo_id, torch_dtype=torch_dtype)
-pipe = pipe.to(device)
-MAX_SEED = np.iinfo(np.int32).max
-MAX_IMAGE_SIZE = 1024
-# @spaces.GPU #[uncomment to use ZeroGPU]
-def infer(
-    prompt,
-    negative_prompt,
-    seed,
-    randomize_seed,
-    width,
-    height,
-    guidance_scale,
-    num_inference_steps,
-    progress=gr.Progress(track_tqdm=True),
-):
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    generator = torch.Generator().manual_seed(seed)
-    image = pipe(
-        prompt=prompt,
-        negative_prompt=negative_prompt,
-        guidance_scale=guidance_scale,
-        num_inference_steps=num_inference_steps,
-        width=width,
-        height=height,
-        generator=generator,
-    ).images[0]
-    return image, seed
-examples = [
-    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
-    "An astronaut riding a green horse",
-    "A delicious ceviche cheesecake slice",
-]
-css = """
-#col-container {
-    margin: 0 auto;
-    max-width: 640px;
-}
-"""
-with gr.Blocks(css=css) as demo:
-    with gr.Column(elem_id="col-container"):
-        gr.Markdown(" # Text-to-Image Gradio Template")
-        with gr.Row():
-            prompt = gr.Text(
-                label="Prompt",
-                show_label=False,
-                max_lines=1,
-                placeholder="Enter your prompt",
-                container=False,
-            )
-            run_button = gr.Button("Run", scale=0, variant="primary")
-        result = gr.Image(label="Result", show_label=False)
-        with gr.Accordion("Advanced Settings", open=False):
-            negative_prompt = gr.Text(
-                label="Negative prompt",
-                max_lines=1,
-                placeholder="Enter a negative prompt",
-                visible=False,
-            )
-            seed = gr.Slider(
-                label="Seed",
-                minimum=0,
-                maximum=MAX_SEED,
-                step=1,
-                value=0,
             )
-            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
-            with gr.Row():
-                width = gr.Slider(
-                    label="Width",
-                    minimum=256,
-                    maximum=MAX_IMAGE_SIZE,
-                    step=32,
-                    value=1024,  # Replace with defaults that work for your model
-                )
-                height = gr.Slider(
-                    label="Height",
-                    minimum=256,
-                    maximum=MAX_IMAGE_SIZE,
-                    step=32,
-                    value=1024,  # Replace with defaults that work for your model
-                )
-            with gr.Row():
-                guidance_scale = gr.Slider(
-                    label="Guidance scale",
-                    minimum=0.0,
-                    maximum=10.0,
-                    step=0.1,
-                    value=0.0,  # Replace with defaults that work for your model
-                )
-                num_inference_steps = gr.Slider(
-                    label="Number of inference steps",
-                    minimum=1,
-                    maximum=50,
-                    step=1,
-                    value=2,  # Replace with defaults that work for your model
-                )
-        gr.Examples(examples=examples, inputs=[prompt])
-    gr.on(
-        triggers=[run_button.click, prompt.submit],
-        fn=infer,
-        inputs=[
-            prompt,
-            negative_prompt,
-            seed,
-            randomize_seed,
-            width,
-            height,
-            guidance_scale,
-            num_inference_steps,
-        ],
-        outputs=[result, seed],
     )
-if __name__ == "__main__":
-    demo.launch()

+import os
+import re
+import time
 import torch
+import numpy as np
+import requests
+import streamlit as st
+from PIL import Image
+from io import BytesIO
+from collections import Counter
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.preprocessing import normalize
+# ── Page config ──
+# Configure the Streamlit page: page title, icon and wide layout.
+st.set_page_config(
+    page_title = "Image Caption Fusion",
+    page_icon  = "🖼️",
+    layout     = "wide"
+)
+# ── API Keys from HF Secrets ──
+# Both keys fall back to "" when the environment variable is unset, so the
+# Authorization headers below are still built, just with an empty bearer token.
+HF_TOKEN = os.environ.get("HF_TOKEN", "")
+JINA_KEY = os.environ.get("JINA_KEY", "")
+# NOTE(review): DEVICE is not referenced by any function visible in this diff —
+# confirm it is still needed.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# ── API endpoints ──
+# NOTE(review): QWEN_VL_URL is not used by any code visible in this diff; it is
+# presumably consumed by the caption-generation step — confirm in the full file.
+QWEN_VL_URL  = "https://api-inference.huggingface.co/models/Qwen/Qwen2-VL-2B-Instruct"
+QWEN_LM_URL  = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct"
+JINA_URL     = "https://api.jina.ai/v1/rerank"
+HF_HEADERS   = {"Authorization": f"Bearer {HF_TOKEN}"}
+JINA_HEADERS = {"Authorization": f"Bearer {JINA_KEY}", "Content-Type": "application/json"}
+# Text query handed to the Grounding DINO processor by detect_objects():
+# lower-case category names separated by " . ".
+DETECT_PROMPT = (
+    "person . child . man . woman . boy . girl . "
+    "dog . cat . horse . bird . animal . "
+    "ball . toy . bicycle . car . bench . "
+    "tree . grass . water . sky . mountain . "
+    "building . stairs . door . fence . floor . "
+    "jacket . dress . shirt . hat . bag ."
+)
+# ── Load local models once at startup ──
+@st.cache_resource
+def load_local_models():
+    """Load the BLIP ITM and Grounding DINO models with their processors.
+
+    The function is wrapped in ``st.cache_resource``. Both models are loaded
+    in float32 and switched to eval mode before being returned.
+
+    Returns:
+        Tuple ``(blip_processor, itm_model, dino_processor, dino_model)``.
+    """
+    from transformers import AutoModelForZeroShotObjectDetection
+    from transformers import AutoProcessor
+    from transformers import BlipForImageTextRetrieval
+    from transformers import BlipProcessor
+
+    # NOTE(review): the BLIP processor comes from the captioning checkpoint
+    # while the model comes from the ITM checkpoint — confirm this is intended.
+    blip_processor_ckpt = "Salesforce/blip-image-captioning-large"
+    itm_ckpt = "Salesforce/blip-itm-large-coco"
+    dino_ckpt = "IDEA-Research/grounding-dino-base"
+
+    st.write("⏳ Loading BLIP ITM model (CPU)...")
+    blip_processor = BlipProcessor.from_pretrained(blip_processor_ckpt)
+    itm_model = BlipForImageTextRetrieval.from_pretrained(
+        itm_ckpt, torch_dtype=torch.float32
+    )
+    itm_model.eval()
+
+    st.write(" Loading DINO model (CPU)...")
+    dino_processor = AutoProcessor.from_pretrained(dino_ckpt)
+    dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(
+        dino_ckpt, torch_dtype=torch.float32
+    )
+    dino_model.eval()
+
+    return blip_processor, itm_model, dino_processor, dino_model
+# ── Step 2: BLIP ITM Scoring (local CPU) ──
+def compute_itm_scores(image, captions, blip_processor, itm_model):
+    """Score every caption against the image with the BLIP ITM head.
+
+    Args:
+        image: Image forwarded to ``blip_processor``.
+        captions: Iterable of caption strings, scored one at a time.
+        blip_processor: Callable producing the model inputs.
+        itm_model: Model whose output exposes ``itm_score`` logits.
+
+    Returns:
+        One float per caption, rounded to 4 decimals: the softmax probability
+        of class index 1 (presumably the "match" class — confirm against the
+        BLIP ITM documentation).
+    """
+    def _match_probability(caption):
+        encoded = blip_processor(
+            images=image,
+            text=caption,
+            return_tensors="pt",
+            padding=True,
+        )
+        with torch.no_grad():
+            logits = itm_model(**encoded).itm_score
+            probability = torch.nn.functional.softmax(logits, dim=1)[:, 1].item()
+        return round(probability, 4)
+
+    return [_match_probability(caption) for caption in captions]
+# ── Step 3: Jina Reranker Scoring (API) ──
+def compute_jina_scores(image, captions):
+    """Score each caption against the image through the Jina rerank endpoint.
+
+    The image is JPEG-encoded once, base64-wrapped into a data URL and sent as
+    the single document of every request; the caption is the query.
+
+    Args:
+        image: Image object exposing ``save(buffer, format="JPEG")``.
+        captions: Iterable of caption strings; one POST is issued per caption.
+
+    Returns:
+        One float per caption, rounded to 4 decimals. A neutral 0.5 is used
+        whenever a request fails, returns a non-200 status, or the response
+        body does not have the expected ``results[0].relevance_score`` shape.
+    """
+    import base64
+
+    # Neutral score so a failed request neither promotes nor demotes a caption.
+    fallback_score = 0.5
+
+    buffered = BytesIO()
+    image.save(buffered, format="JPEG")
+    img_b64 = base64.b64encode(buffered.getvalue()).decode()
+    # The document is identical for every caption, so build it once.
+    document = {"type": "image_url",
+                "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}}
+
+    scores = []
+    for cap in captions:
+        try:
+            response = requests.post(
+                JINA_URL,
+                headers = JINA_HEADERS,
+                json    = {
+                    "model"     : "jina-reranker-m0",
+                    "query"     : cap,
+                    "documents" : [document]
+                },
+                timeout = 30
             )
+            if response.status_code != 200:
+                scores.append(fallback_score)
+                continue
+            score = response.json()["results"][0]["relevance_score"]
+            scores.append(round(float(score), 4))
+        # Only transport errors and malformed payloads are treated as
+        # best-effort failures; KeyboardInterrupt/SystemExit now propagate.
+        except (requests.RequestException, KeyError, IndexError,
+                TypeError, ValueError):
+            scores.append(fallback_score)
+    return scores
+# ── Step 4: Cosine Similarity Scoring (local numpy) ──
+def compute_cosine_scores(image, captions, blip_processor, itm_model):
+    """Cosine similarity between the projected image and caption embeddings.
+
+    The image goes through ``itm_model.vision_model`` + ``vision_proj`` and the
+    captions through ``itm_model.text_encoder`` + ``text_proj``; in both cases
+    the first token of ``last_hidden_state`` is projected and L2-normalised.
+
+    Args:
+        image: Image forwarded to ``blip_processor``.
+        captions: List of caption strings, encoded as one padded batch
+            (truncated to 512 tokens).
+        blip_processor: Callable producing ``pixel_values`` / ``input_ids`` /
+            ``attention_mask``.
+        itm_model: Model exposing the four sub-modules named above.
+
+    Returns:
+        One float per caption, rounded to 4 decimals; ``[]`` when ``captions``
+        is empty.
+    """
+    # Return [] for an empty caption list, as compute_itm_scores does, instead
+    # of handing an empty batch to the tokenizer.
+    if not captions:
+        return []
+
+    # Get image embedding
+    pixel_values = blip_processor(images=image, return_tensors="pt")["pixel_values"]
+    with torch.no_grad():
+        vis_out  = itm_model.vision_model(pixel_values=pixel_values)
+        img_feat = itm_model.vision_proj(vis_out.last_hidden_state[:, 0, :]).numpy()
+    img_feat = normalize(img_feat, norm="l2")
+
+    # Get caption embeddings
+    cap_inp = blip_processor(
+        text=captions, return_tensors="pt",
+        padding=True, truncation=True, max_length=512
+    )
+    with torch.no_grad():
+        txt_out  = itm_model.text_encoder(
+            input_ids      = cap_inp["input_ids"],
+            attention_mask = cap_inp["attention_mask"]
+        )
+        cap_feat = itm_model.text_proj(txt_out.last_hidden_state[:, 0, :]).numpy()
+    cap_feat = normalize(cap_feat, norm="l2")
+
+    # Row 0 is the single image; columns follow the caption order.
+    scores = cosine_similarity(img_feat, cap_feat)[0]
+    return [round(float(s), 4) for s in scores]
+# ── Step 5: Majority Voting ──
+def majority_voting(captions, itm_scores, jina_scores, cosine_scores):
+    """Pick the two captions that the three scorers vote for most often.
+
+    Each scorer votes for its (up to) two highest-scoring caption indices.
+    The two indices with the most votes win; ties keep first-voted order
+    (ITM votes first, then Jina, then cosine).
+
+    Args:
+        captions: Sequence of caption strings.
+        itm_scores: Per-caption scores from the ITM scorer.
+        jina_scores: Per-caption scores from the Jina scorer.
+        cosine_scores: Per-caption cosine similarities.
+
+    Returns:
+        Tuple ``(caption_a, caption_b, [index_a, index_b], vote_counts)`` where
+        ``vote_counts`` maps caption index -> number of votes. With a single
+        caption, that caption is returned in both slots.
+
+    Raises:
+        ValueError: If ``captions`` is empty.
+    """
+    if not captions:
+        raise ValueError("majority_voting requires at least one caption")
+
+    # Descending rank of caption indices for each scorer, in voting order.
+    rankings = [
+        np.argsort(scores)[::-1]
+        for scores in (itm_scores, jina_scores, cosine_scores)
+    ]
+    # Slicing (rather than indexing [0] and [1]) keeps this valid when only
+    # one caption is available.
+    votes = [int(idx) for ranked in rankings for idx in ranked[:2]]
+
+    vote_counts  = Counter(votes)
+    top2_indices = [idx for idx, _ in vote_counts.most_common(2)]
+    # Fewer than two distinct winners only happens with a single caption;
+    # fall back to the top pick of the first two scorers.
+    if len(top2_indices) < 2:
+        top2_indices = [int(rankings[0][0]), int(rankings[1][0])]
+    return (
+        captions[top2_indices[0]],
+        captions[top2_indices[1]],
+        top2_indices,
+        dict(vote_counts)
+    )
+# ── Step 6: DINO Object Detection (local CPU) ──
+def detect_objects(image, dino_processor, dino_model, threshold=0.3):
+    """Detect DETECT_PROMPT categories in the image with Grounding DINO.
+
+    Args:
+        image: Image forwarded to ``dino_processor``; its ``size`` attribute is
+            reversed to build ``target_sizes``.
+        dino_processor: Processor used for both pre- and post-processing.
+        dino_model: Detection model called on the processor output.
+        threshold: Minimum score a detection needs to be kept. It is applied
+            after the processor's own post-processing.
+
+    Returns:
+        ``(label_str, labels)``: ``labels`` holds the unique, lower-cased label
+        names ordered by their best score (highest first) and ``label_str`` is
+        ``"Detected: [a, b, ...]"``. When nothing passes the threshold the
+        result is ``("No objects detected", [])``.
+    """
+    encoded = dino_processor(
+        images=image,
+        text=DETECT_PROMPT,
+        return_tensors="pt",
     )
+    with torch.no_grad():
+        raw_outputs = dino_model(**encoded)
+
+    # For a PIL image, reversing ``size`` turns (width, height) into (height, width).
+    height_width = image.size[::-1]
+    detections = dino_processor.post_process_grounded_object_detection(
+        raw_outputs,
+        encoded.input_ids,
+        target_sizes=torch.tensor([height_width]),
+    )[0]
+
+    all_scores = detections["scores"]
+    all_labels = detections["labels"]
+    keep = all_scores >= threshold
+    kept_labels = [name for name, flag in zip(all_labels, keep) if flag]
+    kept_scores = all_scores[keep].tolist()
+    if not kept_labels:
+        return "No objects detected", []
+
+    # Keep only the best score seen for each normalised label name.
+    best_score = {}
+    for raw_name, sc in zip(kept_labels, kept_scores):
+        name = raw_name.strip().lower()
+        if name in best_score and not (best_score[name] < sc):
+            continue
+        best_score[name] = sc
+
+    sorted_labels = sorted(best_score, key=best_score.get, reverse=True)
+    label_str = "Detected: [" + ", ".join(sorted_labels) + "]"
+    return label_str, sorted_labels
+# ── Step 7: Qwen2.5-1.5B Caption Fusion (API) ──
+def fuse_captions_api(cap1, cap2, dino_labels):
+    """Ask the Qwen2.5 text endpoint to merge two captions into one.
+
+    Args:
+        cap1: First caption; also the fallback return value.
+        cap2: Second caption.
+        dino_labels: Detected-object description interpolated into the prompt.
+
+    Returns:
+        The generated caption with any echoed "Fused caption"/"Caption" prefix
+        removed. ``cap1`` is returned when the request fails, the status is
+        not 200, the body is not the expected JSON shape, or the generated
+        text is empty.
+    """
+    prompt = f"""You are given two captions and detected objects for the same image.
+Write ONE fluent, natural, descriptive caption combining the best details.
+Return ONLY the caption, no explanation, no prefix.
+Caption 1        : {cap1}
+Caption 2        : {cap2}
+Detected objects : {dino_labels}
+Fused caption :"""
+    try:
+        response = requests.post(
+            QWEN_LM_URL,
+            headers = HF_HEADERS,
+            json    = {
+                "inputs"     : prompt,
+                "parameters" : {
+                    "max_new_tokens"     : 80,
+                    "do_sample"          : False,
+                    "repetition_penalty" : 1.1,
+                    "return_full_text"   : False
+                }
+            },
+            timeout = 40
+        )
+        if response.status_code != 200:
+            return cap1
+        result = response.json()
+    # Transport failures and undecodable bodies fall back to cap1; anything
+    # else is a programming error and is allowed to surface.
+    except (requests.RequestException, ValueError):
+        return cap1
+
+    # Accept a list whose first item is a generation dict, or a bare dict.
+    # Any other shape (including an empty list) is not shown to the user as a
+    # stringified payload; cap1 is used instead.
+    first = result[0] if isinstance(result, list) and result else result
+    if not isinstance(first, dict):
+        return cap1
+    fused = str(first.get("generated_text", "")).strip()
+
+    # Clean any prefix Qwen adds
+    for prefix in ("Fused caption :", "Fused caption:", "Caption:"):
+        if fused.lower().startswith(prefix.lower()):
+            fused = fused[len(prefix):].strip()
+    return fused if fused else cap1
+# ════════════════════════════════════════
+#  STREAMLIT UI
+# ════════════════════════════════════════
+# ── Sidebar ──
+# Static sidebar content: the seven pipeline stages, a short description, and
+# which models are labelled as local versus API-backed.
+with st.sidebar:
+    st.title(" Image Caption Fusion")
+    st.markdown("---")
+    st.markdown("###  Pipeline")
+    st.markdown("""
+    1.  **Qwen2-VL-2B** — Generate 5 captions
+    2.  **BLIP ITM** — Image-text matching score
+    3.  **Jina Reranker M0** — Semantic reranking
+    4.  **Cosine Similarity** — Embedding similarity
+    5.  **Majority Voting** — Best 2 captions
+    6.  **Grounding DINO** — Object detection
+    7.  **Qwen2.5-1.5B** — Caption fusion
+    """)
+    st.markdown("---")
+    st.markdown("###  About")
+    st.markdown("""
+    This system generates a rich, humanized caption
+    for any image using a multi-model ensemble pipeline.
+    """)
+    st.markdown("---")
+    st.markdown("**Local models:** BLIP ITM, DINO")
+    st.markdown("**API models:** Qwen2-VL, Jina, Qwen2.5")
+# ── Main area ──
+st.title(" Image Caption Fusion System")
+st.markdown("Upload any image and get a detailed, humanized caption.")
+st.markdown("---")
+# Upload widget restricted to jpg/jpeg/png; its result is tested for
+# truthiness before the captioning pipeline runs.
+uploaded = st.file_uploader(
+    " Upload an image",
+    type=["jpg", "jpeg", "png"],
+    help="Upload any image to generate a fused caption"
+)
+# Runs the seven-step captioning pipeline once an image has been uploaded and
+# the "Generate Caption" button is pressed, reporting progress after each step.
+if uploaded:
+    # Label and caption text is interpolated into raw HTML further down
+    # (unsafe_allow_html=True), so it is escaped first.
+    import html
+
+    image = Image.open(uploaded).convert("RGB")
+    col1, col2 = st.columns([1, 1])
+    with col1:
+        st.image(image, caption="Uploaded Image", use_column_width=True)
+    with col2:
+        if st.button(" Generate Caption", type="primary", use_container_width=True):
+            # Load local models
+            with st.spinner("Loading local models (first time takes ~2 min)..."):
+                blip_processor, itm_model, dino_processor, dino_model = load_local_models()
+            progress = st.progress(0)
+            status   = st.empty()
+            # Step 1 — Generate captions
+            # NOTE(review): generate_captions_api is not defined in the part of
+            # app.py visible in this diff view — confirm it exists in the full file.
+            status.info(" Step 1/7 — Generating 5 captions with Qwen2-VL...")
+            captions = generate_captions_api(image)
+            progress.progress(14)
+            with st.expander(" 5 Generated Captions", expanded=False):
+                for i, c in enumerate(captions):
+                    st.write(f"**{i+1}.** {c}")
+            # Step 2 — ITM scores
+            status.info(" Step 2/7 — Computing BLIP ITM scores...")
+            itm_scores = compute_itm_scores(image, captions, blip_processor, itm_model)
+            progress.progress(28)
+            # Step 3 — Jina scores
+            status.info(" Step 3/7 — Computing Jina Reranker scores...")
+            jina_scores = compute_jina_scores(image, captions)
+            progress.progress(42)
+            # Step 4 — Cosine scores
+            status.info(" Step 4/7 — Computing Cosine Similarity scores...")
+            cosine_scores = compute_cosine_scores(image, captions, blip_processor, itm_model)
+            progress.progress(57)
+            # Show score table
+            import pandas as pd
+            score_df = pd.DataFrame({
+                "Caption"   : [f"Cap {i+1}: {c[:50]}..." for i, c in enumerate(captions)],
+                "ITM"       : itm_scores,
+                "Jina"      : jina_scores,
+                "Cosine"    : cosine_scores
+            })
+            with st.expander(" All Scores", expanded=False):
+                st.dataframe(score_df, use_container_width=True)
+            # Step 5 — Majority voting
+            status.info(" Step 5/7 — Running Majority Voting...")
+            voted_cap1, voted_cap2, top2_idx, vote_counts = majority_voting(
+                captions, itm_scores, jina_scores, cosine_scores
+            )
+            progress.progress(71)
+            st.markdown("###  Majority Voted Captions")
+            col_a, col_b = st.columns(2)
+            # The line break is written as an escape sequence: a single-quoted
+            # f-string cannot span two physical lines.
+            with col_a:
+                st.success(f" **Caption 1:**\n{voted_cap1}")
+            with col_b:
+                st.info(f" **Caption 2:**\n{voted_cap2}")
+            # Step 6 — DINO
+            status.info(" Step 6/7 — Detecting objects with DINO...")
+            label_str, label_list = detect_objects(image, dino_processor, dino_model)
+            progress.progress(85)
+            st.markdown("###  Detected Objects")
+            if label_list:
+                # At most six label chips are shown, one per column.
+                cols = st.columns(min(len(label_list), 6))
+                for i, lbl in enumerate(label_list[:6]):
+                    cols[i].markdown(
+                        f"<span style='background:#e8f4fd;padding:4px 8px;"
+                        f"border-radius:12px;font-size:13px'> {html.escape(lbl)}</span>",
+                        unsafe_allow_html=True
+                    )
+            else:
+                st.write(label_str)
+            # Step 7 — Qwen fusion
+            status.info("Step 7/7 — Fusing captions with Qwen2.5-1.5B...")
+            fused = fuse_captions_api(voted_cap1, voted_cap2, label_str)
+            progress.progress(100)
+            status.success(" Pipeline complete!")
+            # Final output
+            st.markdown("---")
+            st.markdown("###  Final Fused Caption")
+            # The fused caption is model output, so it is HTML-escaped before
+            # being placed inside the styled <div>.
+            st.markdown(
+                f"<div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);"
+                f"padding: 20px; border-radius: 12px; color: white; font-size: 18px;"
+                f"font-weight: 500; text-align: center;'>"
+                f" {html.escape(fused)}"
+                f"</div>",
+                unsafe_allow_html=True
+            )
+            st.markdown("---")

requirements.txt CHANGED Viewed

@@ -1,6 +1,12 @@
-accelerate
-diffusers
-invisible_watermark
 torch
 transformers
-xformers

+streamlit
+Pillow
+numpy
+scikit-learn
+requests
 torch
 transformers
+accelerate
+einops
+timm
+supervision
+huggingface_hub