Spaces:

Afsha001
/

Image_captioning

Running

App Files Files Community

Afsha001 committed 7 days ago

Commit

0ae3fd7

verified ·

1 Parent(s): 4b24723

update app.py

Browse files

Files changed (1) hide show

app.py +43 -19

app.py CHANGED Viewed

@@ -56,11 +56,13 @@ if not GOOGLE_API_KEY:
     st.error("GOOGLE_API_KEY missing. Go to Space Settings → Secrets and add it.")
     st.stop()
-# Configure Gemini — after GOOGLE_API_KEY is defined
 genai.configure(api_key=GOOGLE_API_KEY)
 # ============================================================================
 # LOAD LOCAL MODELS
 # ============================================================================
 @st.cache_resource
 def load_local_models():
@@ -74,6 +76,7 @@ def load_local_models():
     )
     gc.collect()
     blip_processor = BlipProcessor.from_pretrained(
         "Salesforce/blip-image-captioning-large"
     )
@@ -83,6 +86,7 @@ def load_local_models():
     )
     blip_itm_model.eval()
     dino_processor = AutoProcessor.from_pretrained(
         "IDEA-Research/grounding-dino-base"
     )
@@ -92,6 +96,7 @@ def load_local_models():
     )
     dino_model.eval()
     qwen_tokenizer = AutoTokenizer.from_pretrained(
         "Qwen/Qwen2.5-1.5B-Instruct"
     )
@@ -121,30 +126,49 @@ def image_to_data_uri(image: Image.Image) -> str:
     return f"data:image/jpeg;base64,{b64}"
 # ============================================================================
-# STEP 1 — GEMINI 2.0 FLASH (API): GENERATE 5 DIVERSE CAPTIONS
 # ============================================================================
 def generate_captions_gemini(image: Image.Image) -> list:
     model = genai.GenerativeModel("gemini-2.0-flash")
-    prompts = [
-        "Describe this image in detail covering the overall scene with every possible detail in simple language.",
-        "Describe the people in this image — their clothing colors, style, and what they are doing.",
-        "Describe the background, setting, and surroundings visible in this image.",
-        "Describe all the objects, plants, and items visible around the people in this image.",
-        "Write a full description of this image covering who is in it, what is happening, their appearance, and where it takes place."
-    ]
-    captions = []
-    for prompt in prompts:
-        try:
-            response = model.generate_content([prompt, image])
-            cap = response.text.strip().lower()
-            captions.append(cap if cap else "a scene shown in the image")
-        except Exception as e:
-            st.warning(f"Gemini error: {str(e)[:80]}")
-            captions.append("a scene shown in the image")
     seen, unique = set(), []
     for c in captions:
@@ -245,7 +269,7 @@ def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
         return [0.0] * len(captions)
 # ============================================================================
-# STEP 5 — MAJORITY VOTING
 # ============================================================================
 def majority_voting(captions, itm, jina, cosine) -> tuple:
     itm_r    = np.argsort(itm)[::-1]

     st.error("GOOGLE_API_KEY missing. Go to Space Settings → Secrets and add it.")
     st.stop()
+# Configure Gemini after credentials are defined
 genai.configure(api_key=GOOGLE_API_KEY)
 # ============================================================================
 # LOAD LOCAL MODELS
+# Local: BLIP ITM, DINO, Qwen2.5
+# API:   Gemini 2.0 Flash, Jina Reranker
 # ============================================================================
 @st.cache_resource
 def load_local_models():
     )
     gc.collect()
+    # BLIP — ITM scoring and cosine similarity
     blip_processor = BlipProcessor.from_pretrained(
         "Salesforce/blip-image-captioning-large"
     )
     )
     blip_itm_model.eval()
+    # DINO — object detection
     dino_processor = AutoProcessor.from_pretrained(
         "IDEA-Research/grounding-dino-base"
     )
     )
     dino_model.eval()
+    # Qwen2.5-1.5B — caption fusion
     qwen_tokenizer = AutoTokenizer.from_pretrained(
         "Qwen/Qwen2.5-1.5B-Instruct"
     )
     return f"data:image/jpeg;base64,{b64}"
 # ============================================================================
+# STEP 1 — GEMINI 2.0 FLASH: GENERATE 5 DIVERSE CAPTIONS
+# Single API call — all 5 captions in one request
+# Avoids 429 rate limit that occurred with 5 separate calls
 # ============================================================================
 def generate_captions_gemini(image: Image.Image) -> list:
     model = genai.GenerativeModel("gemini-2.0-flash")
+    prompt = """Look at this image carefully and write 5 different captions from different perspectives.
+1. Overall scene:  describing the image in every possible detail in simple language.
+2. People: Describe the people, their clothing colors, style, and what they are doing.
+3. Background: Describe the background, setting, and surroundings.
+4. Objects: Describe the objects, plants, and items visible in the image.
+5. Full description: A complete description covering who is in the image, what they are doing, their appearance, and where the scene takes place.
+Reply in this exact format:
+CAPTION_1: [your caption here]
+CAPTION_2: [your caption here]
+CAPTION_3: [your caption here]
+CAPTION_4: [your caption here]
+CAPTION_5: [your caption here]"""
+    try:
+        response = model.generate_content([prompt, image])
+        raw_text = response.text.strip()
+        captions = []
+        for i in range(1, 6):
+            marker      = f"CAPTION_{i}:"
+            next_marker = f"CAPTION_{i+1}:" if i < 5 else None
+            if marker in raw_text:
+                start = raw_text.index(marker) + len(marker)
+                end   = raw_text.index(next_marker) if next_marker and next_marker in raw_text else len(raw_text)
+                cap   = raw_text[start:end].strip().lower()
+                captions.append(cap if cap else "a scene shown in the image")
+            else:
+                captions.append("a scene shown in the image")
+    except Exception as e:
+        st.warning(f"Gemini error: {str(e)[:80]}")
+        captions = ["a scene shown in the image"] * 5
     seen, unique = set(), []
     for c in captions:
         return [0.0] * len(captions)
 # ============================================================================
+# STEP 5 — MAJORITY VOTING: SELECT TOP 2 CAPTIONS
 # ============================================================================
 def majority_voting(captions, itm, jina, cosine) -> tuple:
     itm_r    = np.argsort(itm)[::-1]