Spaces:

Afsha001
/

Image_captioning

Running

App Files Files Community

Afsha001 committed 16 days ago

Commit

00a1160

verified ·

1 Parent(s): 7f1f360

update gemini

Browse files

Files changed (1) hide show

app.py +82 -142

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ import pandas as pd
 import requests
 import base64
 import streamlit as st
 from PIL import Image
 from io import BytesIO
 from collections import Counter
@@ -18,8 +19,11 @@ st.set_page_config(
     initial_sidebar_state="expanded"
 )
-HF_TOKEN = os.environ.get("HF_TOKEN", "")
-JINA_KEY = os.environ.get("JINA_KEY", "")
 JINA_URL     = "https://api.jina.ai/v1/rerank"
 JINA_HEADERS = {
@@ -41,33 +45,39 @@ DETECT_PROMPT = (
     "car . bicycle . motorcycle . bus . truck . street . kitchen . restaurant . cafe"
 )
 if not JINA_KEY:
     st.error("JINA_KEY missing. Go to Space Settings → Secrets and add it.")
     st.stop()
 @st.cache_resource
 def load_local_models():
     from transformers import (
-        AutoProcessor,
         AutoModelForCausalLM,
         AutoTokenizer,
         BlipProcessor,
         BlipForImageTextRetrieval,
         AutoModelForZeroShotObjectDetection
     )
     gc.collect()
-    florence_processor = AutoProcessor.from_pretrained(
-        "microsoft/Florence-2-large",
-        trust_remote_code=True
-    )
-    florence_model = AutoModelForCausalLM.from_pretrained(
-        "microsoft/Florence-2-large",
-        trust_remote_code=True,
-        torch_dtype=torch.float32
-    )
-    florence_model.eval()
     blip_processor = BlipProcessor.from_pretrained(
         "Salesforce/blip-image-captioning-large"
     )
@@ -77,6 +87,7 @@ def load_local_models():
     )
     blip_itm_model.eval()
     dino_processor = AutoProcessor.from_pretrained(
         "IDEA-Research/grounding-dino-base"
     )
@@ -86,6 +97,7 @@ def load_local_models():
     )
     dino_model.eval()
     qwen_tokenizer = AutoTokenizer.from_pretrained(
         "Qwen/Qwen2.5-1.5B-Instruct"
     )
@@ -96,12 +108,14 @@ def load_local_models():
     qwen_model.eval()
     return (
-        florence_processor, florence_model,
         blip_processor, blip_itm_model,
         dino_processor, dino_model,
         qwen_tokenizer, qwen_model
     )
 def image_to_bytes(image: Image.Image) -> bytes:
     buf = BytesIO()
     image.save(buf, format="JPEG", quality=85)
@@ -112,126 +126,36 @@ def image_to_data_uri(image: Image.Image) -> str:
     b64 = base64.b64encode(raw).decode()
     return f"data:image/jpeg;base64,{b64}"
-def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
-    captions   = []
-    image_size = (image.width, image.height)
-    # Task 1: Short caption
-    try:
-        inputs = florence_proc(
-            text="<CAPTION>", images=image, return_tensors="pt"
-        )
-        with torch.no_grad():
-            ids = florence_mod.generate(
-                input_ids=inputs["input_ids"],
-                pixel_values=inputs["pixel_values"],
-                max_new_tokens=50, num_beams=3
-            )
-        raw    = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
-        parsed = florence_proc.post_process_generation(raw, task="<CAPTION>", image_size=image_size)
-        cap    = parsed.get("<CAPTION>", "").strip().lower()
-        captions.append(cap if cap else "a scene shown in the image")
-    except Exception as e:
-        st.warning(f"Florence CAPTION error: {str(e)[:80]}")
-        captions.append("a scene shown in the image")
-    # Task 2: Detailed caption
-    try:
-        inputs = florence_proc(
-            text="<DETAILED_CAPTION>", images=image, return_tensors="pt"
-        )
-        with torch.no_grad():
-            ids = florence_mod.generate(
-                input_ids=inputs["input_ids"],
-                pixel_values=inputs["pixel_values"],
-                max_new_tokens=100, num_beams=3
-            )
-        raw    = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
-        parsed = florence_proc.post_process_generation(raw, task="<DETAILED_CAPTION>", image_size=image_size)
-        cap    = parsed.get("<DETAILED_CAPTION>", "").strip().lower()
-        captions.append(cap if cap else "a scene shown in the image")
-    except Exception as e:
-        st.warning(f"Florence DETAILED_CAPTION error: {str(e)[:80]}")
-        captions.append("a scene shown in the image")
-    # Task 3: More detailed caption
-    try:
-        inputs = florence_proc(
-            text="<MORE_DETAILED_CAPTION>", images=image, return_tensors="pt"
-        )
-        with torch.no_grad():
-            ids = florence_mod.generate(
-                input_ids=inputs["input_ids"],
-                pixel_values=inputs["pixel_values"],
-                max_new_tokens=150, num_beams=3
-            )
-        raw    = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
-        parsed = florence_proc.post_process_generation(raw, task="<MORE_DETAILED_CAPTION>", image_size=image_size)
-        cap    = parsed.get("<MORE_DETAILED_CAPTION>", "").strip().lower()
-        captions.append(cap if cap else "a scene shown in the image")
-    except Exception as e:
-        st.warning(f"Florence MORE_DETAILED_CAPTION error: {str(e)[:80]}")
-        captions.append("a scene shown in the image")
-    # Task 4: Dense region caption
-    try:
-        inputs = florence_proc(
-            text="<DENSE_REGION_CAPTION>", images=image, return_tensors="pt"
-        )
-        with torch.no_grad():
-            ids = florence_mod.generate(
-                input_ids=inputs["input_ids"],
-                pixel_values=inputs["pixel_values"],
-                max_new_tokens=200, num_beams=3
-            )
-        raw    = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
-        parsed = florence_proc.post_process_generation(raw, task="<DENSE_REGION_CAPTION>", image_size=image_size)
-        labels = parsed.get("<DENSE_REGION_CAPTION>", {}).get("labels", [])
-        if labels:
-            seen_r, unique_r = set(), []
-            for l in labels:
-                if l.lower() not in seen_r:
-                    seen_r.add(l.lower())
-                    unique_r.append(l.lower())
-            cap = ", ".join(unique_r[:6]) + " visible in the scene"
-        else:
-            cap = "a scene shown in the image"
-        captions.append(cap)
-    except Exception as e:
-        st.warning(f"Florence DENSE_REGION error: {str(e)[:80]}")
-        captions.append("a scene shown in the image")
-    # Task 5: Object detection
-    try:
-        inputs = florence_proc(
-            text="<OD>", images=image, return_tensors="pt"
-        )
-        with torch.no_grad():
-            ids = florence_mod.generate(
-                input_ids=inputs["input_ids"],
-                pixel_values=inputs["pixel_values"],
-                max_new_tokens=200, num_beams=3
-            )
-        raw    = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
-        parsed = florence_proc.post_process_generation(raw, task="<OD>", image_size=image_size)
-        labels = parsed.get("<OD>", {}).get("labels", [])
-        if labels:
-            seen_o, unique_o = set(), []
-            for l in labels:
-                if l.lower() not in seen_o:
-                    seen_o.add(l.lower())
-                    unique_o.append(l.lower())
-            cap = "a scene containing " + ", ".join(unique_o[:6])
-        else:
-            cap = "a scene shown in the image"
-        captions.append(cap)
-    except Exception as e:
-        st.warning(f"Florence OD error: {str(e)[:80]}")
-        captions.append("a scene shown in the image")
     seen, unique = set(), []
     for c in captions:
         if c not in seen:
@@ -246,6 +170,9 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
     return unique[:5]
 def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
     scores = []
     for cap in captions:
@@ -265,6 +192,9 @@ def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
             scores.append(0.0)
     return scores
 def compute_jina_scores(image: Image.Image, captions: list) -> list:
     img_data_uri = image_to_data_uri(image)
     scores       = []
@@ -295,6 +225,9 @@ def compute_jina_scores(image: Image.Image, captions: list) -> list:
             scores.append(0.0)
     return scores
 def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
     try:
         img_inp = blip_proc(images=image, return_tensors="pt")
@@ -321,6 +254,9 @@ def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
         st.warning(f"Cosine error: {str(e)[:60]}")
         return [0.0] * len(captions)
 def majority_voting(captions, itm, jina, cosine) -> tuple:
     itm_r    = np.argsort(itm)[::-1]
     jina_r   = np.argsort(jina)[::-1]
@@ -338,6 +274,9 @@ def majority_voting(captions, itm, jina, cosine) -> tuple:
     return captions[top2[0]], captions[top2[1]], top2, dict(counts)
 def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
     try:
         inputs = dino_proc(
@@ -378,11 +317,7 @@ def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
         return "Object detection unavailable", []
 # ============================================================================
-# fuse_captions — CHANGED
-# system_prompt: explicitly covers clothing, colors, people, objects, setting
-# user_prompt: asks for all specific details including clothing and background
-# max_new_tokens: 100 → 180 (room for 3-4 full sentences)
-# temperature: 0.2 → 0.4 (more expressive while staying factual)
 # ============================================================================
 def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
@@ -443,12 +378,15 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
         st.warning(f"Qwen fusion error: {str(e)[:80]}")
         return cap1
 with st.sidebar:
     st.title("Image Caption Fusion")
     st.markdown("---")
     st.markdown("### Pipeline Steps")
     st.markdown("""
-**1. Florence-2-Large** (Local)
 Generate 5 captions
 **2. BLIP ITM** (Local)
@@ -470,9 +408,12 @@ Object detection
 Caption fusion
     """)
     st.markdown("---")
-    st.markdown("**Local:** Florence-2, BLIP ITM, DINO, Qwen2.5")
-    st.markdown("**API:** Jina")
 st.title("Image Caption Fusion System")
 st.markdown("Upload an image to generate a refined, grounded caption.")
 st.markdown("---")
@@ -493,9 +434,8 @@ if uploaded_file is not None:
     with col_run:
         if st.button("Generate Caption", type="primary", use_container_width=True):
-            with st.spinner("Loading local models (first run takes 3-4 min)..."):
                 (
-                    florence_proc, florence_mod,
                     blip_proc, blip_itm,
                     dino_proc, dino_mod,
                     qwen_tok, qwen_mod
@@ -504,8 +444,8 @@ if uploaded_file is not None:
             progress = st.progress(0)
             status   = st.empty()
-            status.info("Step 1/7: Generating captions with Florence-2-Large...")
-            captions = generate_captions_florence(input_image, florence_proc, florence_mod)
             progress.progress(14)
             with st.expander("5 Generated Captions", expanded=True):

 import requests
 import base64
 import streamlit as st
+import google.generativeai as genai
 from PIL import Image
 from io import BytesIO
 from collections import Counter
     initial_sidebar_state="expanded"
 )
+# ============================================================================
+# CREDENTIALS
+# ============================================================================
+JINA_KEY        = os.environ.get("JINA_KEY", "")
+GOOGLE_API_KEY  = os.environ.get("GOOGLE_API_KEY", "")
 JINA_URL     = "https://api.jina.ai/v1/rerank"
 JINA_HEADERS = {
     "car . bicycle . motorcycle . bus . truck . street . kitchen . restaurant . cafe"
 )
+# ============================================================================
+# CREDENTIAL CHECK
+# ============================================================================
 if not JINA_KEY:
     st.error("JINA_KEY missing. Go to Space Settings → Secrets and add it.")
     st.stop()
+if not GOOGLE_API_KEY:
+    st.error("GOOGLE_API_KEY missing. Go to Space Settings → Secrets and add it.")
+    st.stop()
+# Configure Gemini API
+genai.configure(api_key=GOOGLE_API_KEY)
+# ============================================================================
+# LOAD LOCAL MODELS
+# Florence-2-Large removed — replaced by Gemini 1.5 Flash API
+# Saves 1.6GB RAM and 2-3 min startup time
+# Local: BLIP ITM, DINO, Qwen2.5
+# ============================================================================
 @st.cache_resource
 def load_local_models():
     from transformers import (
         AutoModelForCausalLM,
         AutoTokenizer,
         BlipProcessor,
         BlipForImageTextRetrieval,
+        AutoProcessor,
         AutoModelForZeroShotObjectDetection
     )
     gc.collect()
+    # BLIP — ITM scoring and cosine similarity
     blip_processor = BlipProcessor.from_pretrained(
         "Salesforce/blip-image-captioning-large"
     )
     )
     blip_itm_model.eval()
+    # DINO — object detection
     dino_processor = AutoProcessor.from_pretrained(
         "IDEA-Research/grounding-dino-base"
     )
     )
     dino_model.eval()
+    # Qwen2.5-1.5B — caption fusion
     qwen_tokenizer = AutoTokenizer.from_pretrained(
         "Qwen/Qwen2.5-1.5B-Instruct"
     )
     qwen_model.eval()
     return (
         blip_processor, blip_itm_model,
         dino_processor, dino_model,
         qwen_tokenizer, qwen_model
     )
+# ============================================================================
+# HELPERS
+# ============================================================================
 def image_to_bytes(image: Image.Image) -> bytes:
     buf = BytesIO()
     image.save(buf, format="JPEG", quality=85)
     b64 = base64.b64encode(raw).decode()
     return f"data:image/jpeg;base64,{b64}"
+# ============================================================================
+# STEP 1 — GEMINI 1.5 FLASH (API): GENERATE 5 DIVERSE CAPTIONS
+# 5 different prompts — each focuses on a different aspect of the image
+# Gemini sees the image directly as a VLM and is prompted in plain language instead of task tokens (intended to reduce hallucinated captions)
+# API response ~2-4 sec per caption — 5 sequential captions in ~10-20 sec total
+# ============================================================================
+def generate_captions_gemini(image: Image.Image) -> list:
+    model = genai.GenerativeModel("gemini-1.5-flash")
+    prompts = [
+        "Describe this image in detail covering the overall scene.",
+        "Describe the people in this image — their clothing colors, style, and what they are doing.",
+        "Describe the background, setting, and surroundings visible in this image.",
+        "Describe all the objects, plants, and items visible around the people in this image.",
+        "Write a full description of this image covering who is in it, what is happening, their appearance, and where it takes place."
+    ]
+    captions = []
+    for prompt in prompts:
+        try:
+            response = model.generate_content([prompt, image])
+            cap = response.text.strip().lower()
+            captions.append(cap if cap else "a scene shown in the image")
+        except Exception as e:
+            st.warning(f"Gemini error: {str(e)[:80]}")
+            captions.append("a scene shown in the image")
+    # Deduplicate while keeping order
     seen, unique = set(), []
     for c in captions:
         if c not in seen:
     return unique[:5]
+# ============================================================================
+# STEP 2 — BLIP ITM: IMAGE-TEXT MATCHING SCORES
+# ============================================================================
 def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
     scores = []
     for cap in captions:
             scores.append(0.0)
     return scores
+# ============================================================================
+# STEP 3 — JINA RERANKER M0: SEMANTIC SCORES
+# ============================================================================
 def compute_jina_scores(image: Image.Image, captions: list) -> list:
     img_data_uri = image_to_data_uri(image)
     scores       = []
             scores.append(0.0)
     return scores
+# ============================================================================
+# STEP 4 — COSINE SIMILARITY: EMBEDDING SCORES
+# ============================================================================
 def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
     try:
         img_inp = blip_proc(images=image, return_tensors="pt")
         st.warning(f"Cosine error: {str(e)[:60]}")
         return [0.0] * len(captions)
+# ============================================================================
+# STEP 5 — MAJORITY VOTING
+# ============================================================================
 def majority_voting(captions, itm, jina, cosine) -> tuple:
     itm_r    = np.argsort(itm)[::-1]
     jina_r   = np.argsort(jina)[::-1]
     return captions[top2[0]], captions[top2[1]], top2, dict(counts)
+# ============================================================================
+# STEP 6 — GROUNDING DINO: OBJECT DETECTION
+# ============================================================================
 def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
     try:
         inputs = dino_proc(
         return "Object detection unavailable", []
 # ============================================================================
+# STEP 7 — QWEN2.5-1.5B (LOCAL): CAPTION FUSION
 # ============================================================================
 def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
         st.warning(f"Qwen fusion error: {str(e)[:80]}")
         return cap1
+# ============================================================================
+# SIDEBAR
+# ============================================================================
 with st.sidebar:
     st.title("Image Caption Fusion")
     st.markdown("---")
     st.markdown("### Pipeline Steps")
     st.markdown("""
+**1. Gemini 1.5 Flash** (API)
 Generate 5 captions
 **2. BLIP ITM** (Local)
 Caption fusion
     """)
     st.markdown("---")
+    st.markdown("**Local:** BLIP ITM, DINO, Qwen2.5")
+    st.markdown("**API:** Gemini 1.5 Flash, Jina")
+# ============================================================================
+# MAIN UI
+# ============================================================================
 st.title("Image Caption Fusion System")
 st.markdown("Upload an image to generate a refined, grounded caption.")
 st.markdown("---")
     with col_run:
         if st.button("Generate Caption", type="primary", use_container_width=True):
+            with st.spinner("Loading local models (first run takes 2-3 min)..."):
                 (
                     blip_proc, blip_itm,
                     dino_proc, dino_mod,
                     qwen_tok, qwen_mod
             progress = st.progress(0)
             status   = st.empty()
+            status.info("Step 1/7: Generating captions with Gemini 1.5 Flash...")
+            captions = generate_captions_gemini(input_image)
             progress.progress(14)
             with st.expander("5 Generated Captions", expanded=True):