Spaces:

Afsha001
/

Image_captioning

Running

App Files Files Community

Afsha001 committed 18 days ago

Commit

1fc1228

verified ·

1 Parent(s): dcfb164

delete app.py

Browse files

Files changed (1) hide show

app.py +0 -359

app.py DELETED Viewed

@@ -1,359 +0,0 @@
-# HTTP client and in-memory buffer used by the API helpers in this script.
-# FLORENCE_URL and HF_HEADERS are assigned once, later in the file, after
-# HF_TOKEN has been read from the environment; the copies that used to sit
-# here ran before HF_TOKEN existed and were overwritten before any use.
-import requests
-from io import BytesIO
-import os
-import torch
-import numpy as np
-import requests
-import streamlit as st
-from PIL import Image
-from io import BytesIO
-from collections import Counter
-from sklearn.metrics.pairwise import cosine_similarity
-from sklearn.preprocessing import normalize
-import base64
-import pandas as pd
-# Page setup; this is the first Streamlit call made by the script.
-st.set_page_config(page_title="Image Caption Fusion", page_icon="🖼️", layout="wide")
-# Credentials are read from the environment; an unset variable yields "".
-HF_TOKEN = os.getenv("HF_TOKEN", "")
-JINA_KEY = os.getenv("JINA_KEY", "")
-DEVICE = "cpu"
-# ── Remote API endpoints ──
-FLORENCE_URL = "https://api-inference.huggingface.co/models/microsoft/Florence-2-large"
-QWEN_LM_URL = "https://api-inference.huggingface.co/v1/chat/completions"
-JINA_URL = "https://api.jina.ai/v1/rerank"
-# JSON request headers carrying the bearer token for each provider.
-HF_HEADERS = {"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"}
-JINA_HEADERS = {"Authorization": f"Bearer {JINA_KEY}", "Content-Type": "application/json"}
-# Candidate labels passed as the text prompt to the object detector.
-# The prompt joins them with " . " and ends with a final " .".
-_DETECT_LABELS = (
-    "person", "child", "man", "woman", "boy", "girl",
-    "dog", "cat", "horse", "bird", "animal",
-    "ball", "toy", "bicycle", "car", "bench",
-    "tree", "grass", "water", "sky", "mountain",
-    "building", "stairs", "door", "fence", "floor",
-    "jacket", "dress", "shirt", "hat", "bag",
-)
-DETECT_PROMPT = " . ".join(_DETECT_LABELS) + " ."
-@st.cache_resource
-def load_local_models():
-    """Load the BLIP processor, BLIP ITM model and Grounding DINO pair.
-    Both models are loaded in float32 and switched to eval mode. The
-    function is wrapped in ``st.cache_resource``.
-    Returns:
-        Tuple ``(blip_processor, itm_model, dino_processor, dino_model)``.
-    """
-    # transformers is imported here rather than at module top.
-    from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor
-    from transformers import BlipForImageTextRetrieval, BlipProcessor
-    itm_checkpoint = "Salesforce/blip-itm-large-coco"
-    dino_checkpoint = "IDEA-Research/grounding-dino-base"
-    caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
-    matcher = BlipForImageTextRetrieval.from_pretrained(itm_checkpoint, torch_dtype=torch.float32)
-    matcher.eval()
-    detector_processor = AutoProcessor.from_pretrained(dino_checkpoint)
-    detector = AutoModelForZeroShotObjectDetection.from_pretrained(
-        dino_checkpoint, torch_dtype=torch.float32
-    )
-    detector.eval()
-    return caption_processor, matcher, detector_processor, detector
-def image_to_base64(image):
-    """Return *image* encoded as JPEG and then as a base64 text string.
-    *image* must provide ``save(fp, format=...)``; the caller in this
-    script passes a PIL image.
-    """
-    with BytesIO() as jpeg_buffer:
-        image.save(jpeg_buffer, format="JPEG")
-        jpeg_bytes = jpeg_buffer.getvalue()
-    encoded = base64.b64encode(jpeg_bytes)
-    return encoded.decode()
-# ── Qwen2-VL captions via the chat completions API ──
-def generate_captions_api(image):
-    """Request one caption per prompt from Qwen2-VL and return five captions.
-    The image is sent inline as a base64 JPEG data URL. Each prompt is one
-    chat-completions request; a non-200 response or any exception is reported
-    with ``st.warning`` and replaced by a generic placeholder caption, so the
-    function never raises for a failed request.
-    Args:
-        image: Image accepted by ``image_to_base64``.
-    Returns:
-        List of exactly five lower-cased captions. Duplicates are removed
-        (first occurrence kept) and the list is padded back to five by
-        repeating the first caption.
-    """
-    img_url = "data:image/jpeg;base64," + image_to_base64(image)
-    prompts = [
-        "Describe this image in one detailed sentence.",
-        "What is happening in this image? Write one descriptive sentence.",
-        "Describe the main subjects, actions and setting in one sentence.",
-        "Write a detailed caption focusing on people, animals and objects visible.",
-        "Describe this scene including background details and activities shown.",
-    ]
-    captions = []
-    for prompt in prompts:
-        payload = {
-            "model": "Qwen/Qwen2-VL-2B-Instruct",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "image_url", "image_url": {"url": img_url}},
-                        {"type": "text", "text": prompt},
-                    ],
-                }
-            ],
-            "max_tokens": 80,
-        }
-        try:
-            # The payload and the parsing below use the chat-completions schema,
-            # so the request must go to the chat-completions endpoint rather
-            # than the Florence-2 model endpoint.
-            response = requests.post(QWEN_LM_URL, headers=HF_HEADERS, json=payload, timeout=40)
-            if response.status_code == 200:
-                cap = response.json()["choices"][0]["message"]["content"].strip().lower()
-                captions.append(cap if cap else "a scene with various objects")
-            else:
-                st.warning("Qwen2-VL API error: " + str(response.status_code) + " " + response.text[:100])
-                captions.append("a scene with various objects and people")
-        except Exception as e:
-            # Best-effort: one failed prompt must not abort the whole pipeline.
-            st.warning("Qwen2-VL exception: " + str(e))
-            captions.append("a scene captured in the image")
-    # dict.fromkeys removes duplicates while keeping first-seen order.
-    unique = list(dict.fromkeys(captions))
-    # Later pipeline stages expect one caption per prompt, so pad if needed.
-    unique.extend([unique[0]] * (len(prompts) - len(unique)))
-    return unique[:len(prompts)]
-def compute_itm_scores(image, captions, blip_processor, itm_model):
-    """Score every caption against *image* with the ITM model.
-    For each caption the model's ``itm_score`` is softmaxed over dim 1 and
-    the value at index 1 is kept, rounded to 4 decimals.
-    Returns:
-        List of floats, one per caption, in caption order.
-    """
-    def _match_probability(caption):
-        # One forward pass per (image, caption) pair, without gradients.
-        encoded = blip_processor(images=image, text=caption, return_tensors="pt", padding=True)
-        with torch.no_grad():
-            logits = itm_model(**encoded).itm_score
-            probabilities = torch.nn.functional.softmax(logits, dim=1)
-        return round(probabilities[:, 1].item(), 4)
-    return [_match_probability(caption) for caption in captions]
-# ── Jina Reranker M0 API ──
-def compute_jina_scores(image, captions):
-    """Score each caption against *image* with the Jina rerank endpoint.
-    One request is sent per caption, with the caption as the query and the
-    image (base64 JPEG data URL) as the single document. A non-200 response
-    or any exception is reported with ``st.warning`` and scored as ``0.0``.
-    Args:
-        image: Image accepted by ``image_to_base64``.
-        captions: Iterable of caption strings.
-    Returns:
-        List of floats rounded to 4 decimals, one per caption, in order.
-    """
-    document = "data:image/jpeg;base64," + image_to_base64(image)
-    scores = []
-    for cap in captions:
-        payload = {
-            "model": "jina-reranker-m0",
-            "query": cap,
-            "documents": [document],
-            "top_n": 1,
-        }
-        try:
-            # The first positional argument of requests.post is the URL and the
-            # second is `data`; only the Jina URL belongs here, otherwise the
-            # request goes to the wrong host.
-            response = requests.post(JINA_URL, headers=JINA_HEADERS, json=payload, timeout=30)
-            if response.status_code == 200:
-                score = response.json()["results"][0]["relevance_score"]
-                scores.append(round(float(score), 4))
-            else:
-                st.warning("Jina API error: " + str(response.status_code) + " " + response.text[:100])
-                scores.append(0.0)
-        except Exception as e:
-            # Best-effort: a failed request contributes a neutral score.
-            st.warning("Jina exception: " + str(e))
-            scores.append(0.0)
-    return scores
-def compute_cosine_scores(image, captions, blip_processor, itm_model):
-    """Cosine similarity between the projected image and caption features.
-    The first token's hidden state from the vision model and from the text
-    encoder is passed through ``vision_proj`` / ``text_proj``, L2-normalised,
-    and compared with ``cosine_similarity``.
-    Returns:
-        List of floats rounded to 4 decimals, one per caption, in order.
-    """
-    image_inputs = blip_processor(images=image, return_tensors="pt")
-    text_inputs = blip_processor(
-        text=captions, return_tensors="pt",
-        padding=True, truncation=True, max_length=512
-    )
-    with torch.no_grad():
-        vision_hidden = itm_model.vision_model(
-            pixel_values=image_inputs["pixel_values"]
-        ).last_hidden_state
-        image_vec = itm_model.vision_proj(vision_hidden[:, 0, :]).numpy()
-        text_hidden = itm_model.text_encoder(
-            input_ids=text_inputs["input_ids"],
-            attention_mask=text_inputs["attention_mask"]
-        ).last_hidden_state
-        text_vec = itm_model.text_proj(text_hidden[:, 0, :]).numpy()
-    image_vec = normalize(image_vec, norm="l2")
-    text_vec = normalize(text_vec, norm="l2")
-    # Row 0 is the single image; its columns are the captions.
-    similarities = cosine_similarity(image_vec, text_vec)[0]
-    return [round(float(value), 4) for value in similarities]
-def majority_voting(captions, itm_scores, jina_scores, cosine_scores):
-    """Pick two captions by letting each scorer vote for its top two.
-    Each score list is ranked in descending order and its two best caption
-    indices receive one vote each; the two most-voted indices win.
-    Returns:
-        Tuple ``(caption_a, caption_b, [index_a, index_b], vote_counts)``
-        where ``vote_counts`` maps caption index to number of votes.
-    """
-    rankings = [np.argsort(scores)[::-1] for scores in (itm_scores, jina_scores, cosine_scores)]
-    ballot = Counter()
-    # Votes are cast scorer by scorer (ITM, Jina, cosine), best index first;
-    # this insertion order is what breaks ties in most_common().
-    for order in rankings:
-        for place in (0, 1):
-            ballot[int(order[place])] += 1
-    winners = [index for index, _ in ballot.most_common(2)]
-    if len(winners) < 2:
-        winners = [int(rankings[0][0]), int(rankings[1][0])]
-    first, second = winners[0], winners[1]
-    return captions[first], captions[second], winners, dict(ballot)
-def detect_objects(image, dino_processor, dino_model, threshold=0.3):
-    """Detect objects named in ``DETECT_PROMPT`` and list their labels.
-    Detections scoring below *threshold* are dropped; for a label detected
-    several times only its highest score is kept, and labels are ordered by
-    that score, highest first.
-    Returns:
-        ``(label_str, labels)``: ``("Detected: [a, b]", ["a", "b"])``, or
-        ``("No objects detected", [])`` when nothing passes the threshold.
-    """
-    encoded = dino_processor(images=image, text=DETECT_PROMPT, return_tensors="pt")
-    with torch.no_grad():
-        raw_outputs = dino_model(**encoded)
-    # image.size is reversed before being passed as the target size.
-    sizes = torch.tensor([image.size[::-1]])
-    detection = dino_processor.post_process_grounded_object_detection(
-        raw_outputs, encoded.input_ids, target_sizes=sizes
-    )[0]
-    all_scores = detection["scores"]
-    passes = (all_scores >= threshold).tolist()
-    kept = [
-        (name, value)
-        for name, value, ok in zip(detection["labels"], all_scores.tolist(), passes)
-        if ok
-    ]
-    if not kept:
-        return "No objects detected", []
-    best = {}
-    for name, value in kept:
-        key = name.strip().lower()
-        if key not in best or best[key] < value:
-            best[key] = value
-    ordered = sorted(best, key=best.get, reverse=True)
-    return "Detected: [" + ", ".join(ordered) + "]", ordered
-# ── Qwen2.5-1.5B caption fusion via chat completions ──
-def fuse_captions_api(cap1, cap2, dino_labels):
-    """Ask Qwen2.5-1.5B to merge two captions and the detected objects.
-    Args:
-        cap1: First caption; also the fallback result.
-        cap2: Second caption.
-        dino_labels: String describing the detected objects.
-    Returns:
-        The fused caption, or *cap1* when the request fails, returns a
-        non-200 status, or yields an empty completion. Failures are reported
-        with ``st.warning`` instead of being raised.
-    """
-    prompt = (
-        "You are given two captions and detected objects for the same image. "
-        "Write ONE fluent, natural, descriptive caption combining the best details. "
-        "Return ONLY the fused caption, nothing else. "
-        "Caption 1: " + cap1 + ". "
-        "Caption 2: " + cap2 + ". "
-        "Detected objects: " + dino_labels + "."
-    )
-    payload = {
-        "model": "Qwen/Qwen2.5-1.5B-Instruct",
-        "messages": [
-            {"role": "system", "content": "You write accurate image captions. Return only the caption."},
-            {"role": "user", "content": prompt},
-        ],
-        "max_tokens": 80,
-        "temperature": 0.1,
-        "repetition_penalty": 1.1,
-    }
-    try:
-        # Only the chat-completions URL is passed positionally: the second
-        # positional argument of requests.post is `data`, not a URL.
-        response = requests.post(QWEN_LM_URL, headers=HF_HEADERS, json=payload, timeout=40)
-        if response.status_code != 200:
-            st.warning("Qwen fusion API error: " + str(response.status_code))
-            return cap1
-        fused = response.json()["choices"][0]["message"]["content"].strip()
-        return fused if fused else cap1
-    except Exception as e:
-        # Best-effort: fall back to the first voted caption on any failure.
-        st.warning("Qwen fusion exception: " + str(e))
-        return cap1
-# ── SIDEBAR: static description of the pipeline ──
-with st.sidebar:
-    st.title(" Image Caption Fusion")
-    st.markdown("---")
-    st.markdown("###  Pipeline Steps")
-    # One markdown call per pipeline step, in display order.
-    for _step_line in (
-        "1.  Florence-2 — Generate 4 captions + BLIP local",
-        "2.  BLIP ITM — Image-text matching",
-        "3.  Jina Reranker M0 — Semantic reranking",
-        "4.  Cosine Similarity — Embedding similarity",
-        "5.  Majority Voting — Best 2 captions",
-        "6.  Grounding DINO — Object detection",
-        "7.  Qwen2.5-1.5B — Caption fusion",
-    ):
-        st.markdown(_step_line)
-    st.markdown("---")
-    for _where_line in ("**Local:** BLIP ITM, DINO", "**API:** Florence-2, Jina, Qwen2.5"):
-        st.markdown(_where_line)
-# ── MAIN UI: upload an image, run the seven pipeline steps, show the result ──
-st.title(" Image Caption Fusion System")
-st.markdown("Upload any image and get a detailed, humanized caption.")
-st.markdown("---")
-uploaded = st.file_uploader("Upload an image", type=["jpg","jpeg","png"])
-if uploaded:
-    # Converted to RGB before the helpers encode it as JPEG.
-    image = Image.open(uploaded).convert("RGB")
-    col1, col2 = st.columns([1, 1])
-    with col1:
-        st.image(image, caption="Uploaded Image", width=400)
-    with col2:
-        if st.button(" Generate Caption", type="primary", use_container_width=True):
-            # Local import: used only to escape model output for the HTML block below.
-            import html
-            with st.spinner("Loading local models (first time ~2 min)..."):
-                blip_processor, itm_model, dino_processor, dino_model = load_local_models()
-            progress = st.progress(0)
-            status = st.empty()
-            # Step 1: candidate captions from the remote vision-language model.
-            status.info(" Step 1/7 — Generating captions with Florence-2 + BLIP...")
-            captions = generate_captions_api(image)
-            progress.progress(14)
-            with st.expander(" 5 Generated Captions"):
-                for number, caption in enumerate(captions, start=1):
-                    st.write(f"{number}. {caption}")
-            # Steps 2-4: three independent relevance scores per caption.
-            status.info(" Step 2/7 — Computing BLIP ITM scores...")
-            itm_scores = compute_itm_scores(image, captions, blip_processor, itm_model)
-            progress.progress(28)
-            status.info(" Step 3/7 — Computing Jina Reranker scores...")
-            jina_scores = compute_jina_scores(image, captions)
-            progress.progress(42)
-            status.info(" Step 4/7 — Computing Cosine Similarity...")
-            cosine_scores = compute_cosine_scores(image, captions, blip_processor, itm_model)
-            progress.progress(57)
-            score_df = pd.DataFrame({
-                # Captions are truncated to 50 characters for the table.
-                "Caption": [f"Cap {number}: {caption[:50]}" for number, caption in enumerate(captions, start=1)],
-                "ITM"    : itm_scores,
-                "Jina"   : jina_scores,
-                "Cosine" : cosine_scores
-            })
-            with st.expander(" All Scores"):
-                st.dataframe(score_df, use_container_width=True)
-            # Step 5: pick the two captions with the most top-two votes.
-            status.info(" Step 5/7 — Majority Voting...")
-            voted_cap1, voted_cap2, top2_idx, vote_counts = majority_voting(
-                captions, itm_scores, jina_scores, cosine_scores
-            )
-            progress.progress(71)
-            st.markdown("###  Majority Voted Captions")
-            col_a, col_b = st.columns(2)
-            with col_a:
-                st.success(" Caption 1: " + voted_cap1)
-            with col_b:
-                st.info(" Caption 2: " + voted_cap2)
-            # Step 6: object labels from the local detector.
-            status.info(" Step 6/7 — Detecting objects with DINO...")
-            label_str, label_list = detect_objects(image, dino_processor, dino_model)
-            progress.progress(85)
-            st.markdown("###  Detected Objects")
-            if label_list:
-                st.write(" | ".join([" " + l for l in label_list]))
-            else:
-                st.write(label_str)
-            # Step 7: fuse the two voted captions with the detected labels.
-            status.info(" Step 7/7 — Fusing with Qwen2.5-1.5B...")
-            fused = fuse_captions_api(voted_cap1, voted_cap2, label_str)
-            progress.progress(100)
-            status.success(" Pipeline complete!")
-            st.markdown("---")
-            st.markdown("### Final Fused Caption")
-            # The caption is model output rendered with unsafe_allow_html=True,
-            # so it is escaped to keep any markup in it from being interpreted.
-            st.markdown(
-                "<div style='background:linear-gradient(135deg,#667eea,#764ba2);"
-                "padding:20px;border-radius:12px;color:white;font-size:18px;"
-                "font-weight:500;text-align:center;'> " + html.escape(fused) + "</div>",
-                unsafe_allow_html=True
-            )