Spaces:

Afsha001
/

Image_captioning

Running

App Files Files Community

Afsha001 committed 7 days ago

Commit

1ae65de

verified ·

1 Parent(s): 58f42b0

return florence

Browse files

Files changed (1) hide show

app.py +144 -122

app.py CHANGED Viewed

@@ -6,7 +6,6 @@ import pandas as pd
 import requests
 import base64
 import streamlit as st
-import google.generativeai as genai
 from PIL import Image
 from io import BytesIO
 from collections import Counter
@@ -19,11 +18,8 @@ st.set_page_config(
     initial_sidebar_state="expanded"
 )
-# ============================================================================
-# CREDENTIALS
-# ============================================================================
-JINA_KEY       = os.environ.get("JINA_KEY", "")
-GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
 JINA_URL     = "https://api.jina.ai/v1/rerank"
 JINA_HEADERS = {
@@ -45,38 +41,33 @@ DETECT_PROMPT = (
     "car . bicycle . motorcycle . bus . truck . street . kitchen . restaurant . cafe"
 )
-# ============================================================================
-# CREDENTIAL CHECK
-# ============================================================================
 if not JINA_KEY:
     st.error("JINA_KEY missing. Go to Space Settings → Secrets and add it.")
     st.stop()
-if not GOOGLE_API_KEY:
-    st.error("GOOGLE_API_KEY missing. Go to Space Settings → Secrets and add it.")
-    st.stop()
-# Configure Gemini after credentials are defined
-genai.configure(api_key=GOOGLE_API_KEY)
-# ============================================================================
-# LOAD LOCAL MODELS
-# Local: BLIP ITM, DINO, Qwen2.5
-# API:   Gemini 2.0 Flash, Jina Reranker
-# ============================================================================
 @st.cache_resource
 def load_local_models():
     from transformers import (
         AutoModelForCausalLM,
         AutoTokenizer,
         BlipProcessor,
         BlipForImageTextRetrieval,
-        AutoProcessor,
         AutoModelForZeroShotObjectDetection
     )
     gc.collect()
-    # BLIP — ITM scoring and cosine similarity
     blip_processor = BlipProcessor.from_pretrained(
         "Salesforce/blip-image-captioning-large"
     )
@@ -86,7 +77,6 @@ def load_local_models():
     )
     blip_itm_model.eval()
-    # DINO — object detection
     dino_processor = AutoProcessor.from_pretrained(
         "IDEA-Research/grounding-dino-base"
     )
@@ -96,7 +86,6 @@ def load_local_models():
     )
     dino_model.eval()
-    # Qwen2.5-1.5B — caption fusion
     qwen_tokenizer = AutoTokenizer.from_pretrained(
         "Qwen/Qwen2.5-1.5B-Instruct"
     )
@@ -107,14 +96,12 @@ def load_local_models():
     qwen_model.eval()
     return (
         blip_processor, blip_itm_model,
         dino_processor, dino_model,
         qwen_tokenizer, qwen_model
     )
-# ============================================================================
-# HELPERS
-# ============================================================================
 def image_to_bytes(image: Image.Image) -> bytes:
     buf = BytesIO()
     image.save(buf, format="JPEG", quality=85)
@@ -125,74 +112,125 @@ def image_to_data_uri(image: Image.Image) -> str:
     b64 = base64.b64encode(raw).decode()
     return f"data:image/jpeg;base64,{b64}"
-# ============================================================================
-# STEP 1 — GEMINI 2.0 FLASH: GENERATE 5 DIVERSE CAPTIONS
-# Single API call — all 5 captions in one request
-# Retry logic: tries gemini-2.0-flash first, falls back to gemini-1.5-flash-8b
-# gemini-1.5-flash-8b has separate quota pool from gemini-2.0-flash
-# ============================================================================
-def generate_captions_gemini(image: Image.Image) -> list:
-    prompt = """Look at this image carefully and write 5 different captions from different perspectives.
-1. Overall scene: One sentence describing the general scene.
-2. People: Describe the people, their clothing colors, style, and what they are doing in detail.
-3. Background: Describe the background, setting, and surroundings.
-4. Objects: Describe the objects, plants, and items visible in the image.
-5. Full description: A complete description covering who is in the image, what they are doing, their appearance, and where the scene takes place.
-Reply in this exact format:
-CAPTION_1: [your caption here]
-CAPTION_2: [your caption here]
-CAPTION_3: [your caption here]
-CAPTION_4: [your caption here]
-CAPTION_5: [your caption here]"""
-    # Try primary model first, fallback to secondary if quota exceeded
-    models_to_try = [
-        "gemini-2.0-flash",
-        "gemini-1.5-flash-8b",
-        "gemini-1.5-flash"
-    ]
-    raw_text = None
-    for model_name in models_to_try:
-        try:
-            model    = genai.GenerativeModel(model_name)
-            response = model.generate_content([prompt, image])
-            raw_text = response.text.strip()
-            break
-        except Exception as e:
-            error_msg = str(e)
-            if "429" in error_msg:
-                st.warning(f"{model_name} quota exceeded, trying next model...")
-                continue
-            else:
-                st.warning(f"Gemini error ({model_name}): {error_msg[:80]}")
-                continue
-    if raw_text is None:
-        st.error(
-            "All Gemini models hit quota limit. "
-            "Quota resets at midnight (Pacific Time). "
-            "Using fallback captions for now."
         )
-        return ["a scene shown in the image"] * 5
-    # Parse the 5 captions from structured response
-    captions = []
-    for i in range(1, 6):
-        marker      = f"CAPTION_{i}:"
-        next_marker = f"CAPTION_{i+1}:" if i < 5 else None
-        if marker in raw_text:
-            start = raw_text.index(marker) + len(marker)
-            end   = raw_text.index(next_marker) if next_marker and next_marker in raw_text else len(raw_text)
-            cap   = raw_text[start:end].strip().lower()
-            captions.append(cap if cap else "a scene shown in the image")
         else:
-            captions.append("a scene shown in the image")
     seen, unique = set(), []
     for c in captions:
@@ -208,9 +246,6 @@ CAPTION_5: [your caption here]"""
     return unique[:5]
-# ============================================================================
-# STEP 2 — BLIP ITM: IMAGE-TEXT MATCHING SCORES
-# ============================================================================
 def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
     scores = []
     for cap in captions:
@@ -230,9 +265,6 @@ def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
             scores.append(0.0)
     return scores
-# ============================================================================
-# STEP 3 — JINA RERANKER M0: SEMANTIC SCORES
-# ============================================================================
 def compute_jina_scores(image: Image.Image, captions: list) -> list:
     img_data_uri = image_to_data_uri(image)
     scores       = []
@@ -263,9 +295,6 @@ def compute_jina_scores(image: Image.Image, captions: list) -> list:
             scores.append(0.0)
     return scores
-# ============================================================================
-# STEP 4 — COSINE SIMILARITY: EMBEDDING SCORES
-# ============================================================================
 def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
     try:
         img_inp = blip_proc(images=image, return_tensors="pt")
@@ -292,9 +321,6 @@ def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
         st.warning(f"Cosine error: {str(e)[:60]}")
         return [0.0] * len(captions)
-# ============================================================================
-# STEP 5 — MAJORITY VOTING: SELECT TOP 2 CAPTIONS
-# ============================================================================
 def majority_voting(captions, itm, jina, cosine) -> tuple:
     itm_r    = np.argsort(itm)[::-1]
     jina_r   = np.argsort(jina)[::-1]
@@ -312,9 +338,6 @@ def majority_voting(captions, itm, jina, cosine) -> tuple:
     return captions[top2[0]], captions[top2[1]], top2, dict(counts)
-# ============================================================================
-# STEP 6 — GROUNDING DINO: OBJECT DETECTION
-# ============================================================================
 def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
     try:
         inputs = dino_proc(
@@ -355,7 +378,11 @@ def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
         return "Object detection unavailable", []
 # ============================================================================
-# STEP 7 — QWEN2.5-1.5B (LOCAL): CAPTION FUSION
 # ============================================================================
 def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
@@ -368,7 +395,7 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
         "what each person looks like and what they are doing, "
         "the objects and plants visible around them, "
         "and the setting or background of the scene. "
-        "Write 3 to 4 sentences. Use simple, clear, everyday words. "
         "Do NOT summarize or shorten — keep every specific detail. "
         "Only include what is clearly visible. "
         "Return ONLY the caption, nothing else."
@@ -379,7 +406,7 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
         f"Caption B: {cap2}\n"
         f"{objects}\n\n"
         "Write a detailed caption that includes all the clothing, "
-        "people, objects and background details:"
     )
     try:
@@ -416,15 +443,12 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
         st.warning(f"Qwen fusion error: {str(e)[:80]}")
         return cap1
-# ============================================================================
-# SIDEBAR
-# ============================================================================
 with st.sidebar:
     st.title("Image Caption Fusion")
     st.markdown("---")
     st.markdown("### Pipeline Steps")
     st.markdown("""
-**1. Gemini 2.0 Flash** (API)
 Generate 5 captions
 **2. BLIP ITM** (Local)
@@ -446,12 +470,9 @@ Object detection
 Caption fusion
     """)
     st.markdown("---")
-    st.markdown("**Local:** BLIP ITM, DINO, Qwen2.5")
-    st.markdown("**API:** Gemini 2.0 Flash, Jina")
-# ============================================================================
-# MAIN UI
-# ============================================================================
 st.title("Image Caption Fusion System")
 st.markdown("Upload an image to generate a refined, grounded caption.")
 st.markdown("---")
@@ -472,8 +493,9 @@ if uploaded_file is not None:
     with col_run:
         if st.button("Generate Caption", type="primary", use_container_width=True):
-            with st.spinner("Loading local models (first run takes 2-3 min)..."):
                 (
                     blip_proc, blip_itm,
                     dino_proc, dino_mod,
                     qwen_tok, qwen_mod
@@ -482,8 +504,8 @@ if uploaded_file is not None:
             progress = st.progress(0)
             status   = st.empty()
-            status.info("Step 1/7: Generating captions with Gemini 2.0 Flash...")
-            captions = generate_captions_gemini(input_image)
             progress.progress(14)
             with st.expander("5 Generated Captions", expanded=True):

 import requests
 import base64
 import streamlit as st
 from PIL import Image
 from io import BytesIO
 from collections import Counter
     initial_sidebar_state="expanded"
 )
+HF_TOKEN = os.environ.get("HF_TOKEN", "")
+JINA_KEY = os.environ.get("JINA_KEY", "")
 JINA_URL     = "https://api.jina.ai/v1/rerank"
 JINA_HEADERS = {
     "car . bicycle . motorcycle . bus . truck . street . kitchen . restaurant . cafe"
 )
 if not JINA_KEY:
     st.error("JINA_KEY missing. Go to Space Settings → Secrets and add it.")
     st.stop()
 @st.cache_resource
 def load_local_models():
     from transformers import (
+        AutoProcessor,
         AutoModelForCausalLM,
         AutoTokenizer,
         BlipProcessor,
         BlipForImageTextRetrieval,
         AutoModelForZeroShotObjectDetection
     )
     gc.collect()
+    florence_processor = AutoProcessor.from_pretrained(
+        "microsoft/Florence-2-large",
+        trust_remote_code=True
+    )
+    florence_model = AutoModelForCausalLM.from_pretrained(
+        "microsoft/Florence-2-large",
+        trust_remote_code=True,
+        torch_dtype=torch.float32
+    )
+    florence_model.eval()
     blip_processor = BlipProcessor.from_pretrained(
         "Salesforce/blip-image-captioning-large"
     )
     )
     blip_itm_model.eval()
     dino_processor = AutoProcessor.from_pretrained(
         "IDEA-Research/grounding-dino-base"
     )
     )
     dino_model.eval()
     qwen_tokenizer = AutoTokenizer.from_pretrained(
         "Qwen/Qwen2.5-1.5B-Instruct"
     )
     qwen_model.eval()
     return (
+        florence_processor, florence_model,
         blip_processor, blip_itm_model,
         dino_processor, dino_model,
         qwen_tokenizer, qwen_model
     )
 def image_to_bytes(image: Image.Image) -> bytes:
     buf = BytesIO()
     image.save(buf, format="JPEG", quality=85)
     b64 = base64.b64encode(raw).decode()
     return f"data:image/jpeg;base64,{b64}"
+def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
+    captions   = []
+    image_size = (image.width, image.height)
+    # Task 1: Short caption
+    try:
+        inputs = florence_proc(
+            text="<CAPTION>", images=image, return_tensors="pt"
+        )
+        with torch.no_grad():
+            ids = florence_mod.generate(
+                input_ids=inputs["input_ids"],
+                pixel_values=inputs["pixel_values"],
+                max_new_tokens=50, num_beams=3
+            )
+        raw    = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
+        parsed = florence_proc.post_process_generation(raw, task="<CAPTION>", image_size=image_size)
+        cap    = parsed.get("<CAPTION>", "").strip().lower()
+        captions.append(cap if cap else "a scene shown in the image")
+    except Exception as e:
+        st.warning(f"Florence CAPTION error: {str(e)[:80]}")
+        captions.append("a scene shown in the image")
+    # Task 2: Detailed caption
+    try:
+        inputs = florence_proc(
+            text="<DETAILED_CAPTION>", images=image, return_tensors="pt"
+        )
+        with torch.no_grad():
+            ids = florence_mod.generate(
+                input_ids=inputs["input_ids"],
+                pixel_values=inputs["pixel_values"],
+                max_new_tokens=100, num_beams=3
+            )
+        raw    = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
+        parsed = florence_proc.post_process_generation(raw, task="<DETAILED_CAPTION>", image_size=image_size)
+        cap    = parsed.get("<DETAILED_CAPTION>", "").strip().lower()
+        captions.append(cap if cap else "a scene shown in the image")
+    except Exception as e:
+        st.warning(f"Florence DETAILED_CAPTION error: {str(e)[:80]}")
+        captions.append("a scene shown in the image")
+    # Task 3: More detailed caption
+    try:
+        inputs = florence_proc(
+            text="<MORE_DETAILED_CAPTION>", images=image, return_tensors="pt"
+        )
+        with torch.no_grad():
+            ids = florence_mod.generate(
+                input_ids=inputs["input_ids"],
+                pixel_values=inputs["pixel_values"],
+                max_new_tokens=150, num_beams=3
+            )
+        raw    = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
+        parsed = florence_proc.post_process_generation(raw, task="<MORE_DETAILED_CAPTION>", image_size=image_size)
+        cap    = parsed.get("<MORE_DETAILED_CAPTION>", "").strip().lower()
+        captions.append(cap if cap else "a scene shown in the image")
+    except Exception as e:
+        st.warning(f"Florence MORE_DETAILED_CAPTION error: {str(e)[:80]}")
+        captions.append("a scene shown in the image")
+    # Task 4: Dense region caption
+    try:
+        inputs = florence_proc(
+            text="<DENSE_REGION_CAPTION>", images=image, return_tensors="pt"
         )
+        with torch.no_grad():
+            ids = florence_mod.generate(
+                input_ids=inputs["input_ids"],
+                pixel_values=inputs["pixel_values"],
+                max_new_tokens=200, num_beams=3
+            )
+        raw    = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
+        parsed = florence_proc.post_process_generation(raw, task="<DENSE_REGION_CAPTION>", image_size=image_size)
+        labels = parsed.get("<DENSE_REGION_CAPTION>", {}).get("labels", [])
+        if labels:
+            seen_r, unique_r = set(), []
+            for l in labels:
+                if l.lower() not in seen_r:
+                    seen_r.add(l.lower())
+                    unique_r.append(l.lower())
+            cap = ", ".join(unique_r[:6]) + " visible in the scene"
         else:
+            cap = "a scene shown in the image"
+        captions.append(cap)
+    except Exception as e:
+        st.warning(f"Florence DENSE_REGION error: {str(e)[:80]}")
+        captions.append("a scene shown in the image")
+    # Task 5: Object detection
+    try:
+        inputs = florence_proc(
+            text="<OD>", images=image, return_tensors="pt"
+        )
+        with torch.no_grad():
+            ids = florence_mod.generate(
+                input_ids=inputs["input_ids"],
+                pixel_values=inputs["pixel_values"],
+                max_new_tokens=200, num_beams=3
+            )
+        raw    = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
+        parsed = florence_proc.post_process_generation(raw, task="<OD>", image_size=image_size)
+        labels = parsed.get("<OD>", {}).get("labels", [])
+        if labels:
+            seen_o, unique_o = set(), []
+            for l in labels:
+                if l.lower() not in seen_o:
+                    seen_o.add(l.lower())
+                    unique_o.append(l.lower())
+            cap = "a scene containing " + ", ".join(unique_o[:6])
+        else:
+            cap = "a scene shown in the image"
+        captions.append(cap)
+    except Exception as e:
+        st.warning(f"Florence OD error: {str(e)[:80]}")
+        captions.append("a scene shown in the image")
     seen, unique = set(), []
     for c in captions:
     return unique[:5]
 def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
     scores = []
     for cap in captions:
             scores.append(0.0)
     return scores
 def compute_jina_scores(image: Image.Image, captions: list) -> list:
     img_data_uri = image_to_data_uri(image)
     scores       = []
             scores.append(0.0)
     return scores
 def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
     try:
         img_inp = blip_proc(images=image, return_tensors="pt")
         st.warning(f"Cosine error: {str(e)[:60]}")
         return [0.0] * len(captions)
 def majority_voting(captions, itm, jina, cosine) -> tuple:
     itm_r    = np.argsort(itm)[::-1]
     jina_r   = np.argsort(jina)[::-1]
     return captions[top2[0]], captions[top2[1]], top2, dict(counts)
 def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
     try:
         inputs = dino_proc(
         return "Object detection unavailable", []
 # ============================================================================
+# fuse_captions — CHANGED
+# system_prompt: explicitly covers clothing, colors, people, objects, setting
+# user_prompt: asks for all specific details including clothing and background
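+# max_new_tokens: 100 → 180 (originally sized for 3-4 full sentences; NOTE(review): the system prompt now asks for 5 to 6 sentences — confirm 180 tokens is still enough)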
+# temperature: 0.2 → 0.4 (more expressive while staying factual)
 # ============================================================================
 def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
         "what each person looks like and what they are doing, "
         "the objects and plants visible around them, "
         "and the setting or background of the scene. "
+        "Write 5 to 6 sentences. Use simple, clear, everyday words. "
         "Do NOT summarize or shorten — keep every specific detail. "
         "Only include what is clearly visible. "
         "Return ONLY the caption, nothing else."
         f"Caption B: {cap2}\n"
         f"{objects}\n\n"
         "Write a detailed caption that includes all the clothing, "
+        "people, objects and background in details:"
     )
     try:
         st.warning(f"Qwen fusion error: {str(e)[:80]}")
         return cap1
 with st.sidebar:
     st.title("Image Caption Fusion")
     st.markdown("---")
     st.markdown("### Pipeline Steps")
     st.markdown("""
+**1. Florence-2-Large** (Local)
 Generate 5 captions
 **2. BLIP ITM** (Local)
 Caption fusion
     """)
     st.markdown("---")
+    st.markdown("**Local:** Florence-2, BLIP ITM, DINO, Qwen2.5")
+    st.markdown("**API:** Jina")
 st.title("Image Caption Fusion System")
 st.markdown("Upload an image to generate a refined, grounded caption.")
 st.markdown("---")
     with col_run:
         if st.button("Generate Caption", type="primary", use_container_width=True):
+            with st.spinner("Loading local models (first run takes 3-4 min)..."):
                 (
+                    florence_proc, florence_mod,
                     blip_proc, blip_itm,
                     dino_proc, dino_mod,
                     qwen_tok, qwen_mod
             progress = st.progress(0)
             status   = st.empty()
+            status.info("Step 1/7: Generating captions with Florence-2-Large...")
+            captions = generate_captions_florence(input_image, florence_proc, florence_mod)
             progress.progress(14)
             with st.expander("5 Generated Captions", expanded=True):