Spaces:

Afsha001
/

Image_captioning

Running

App Files Community

Afsha001 committed 7 days ago

Commit

4b24723

verified ·

1 Parent(s): c2bd5ca

update app.py

Browse files

Files changed (1) hide show

app.py +5 -15

app.py CHANGED Viewed

@@ -22,8 +22,8 @@ st.set_page_config(
 # ============================================================================
 # CREDENTIALS
 # ============================================================================
-JINA_KEY        = os.environ.get("JINA_KEY", "")
-GOOGLE_API_KEY  = os.environ.get("GOOGLE_API_KEY", "")
 JINA_URL     = "https://api.jina.ai/v1/rerank"
 JINA_HEADERS = {
@@ -56,14 +56,11 @@ if not GOOGLE_API_KEY:
     st.error("GOOGLE_API_KEY missing. Go to Space Settings → Secrets and add it.")
     st.stop()
-# Configure Gemini API
 genai.configure(api_key=GOOGLE_API_KEY)
 # ============================================================================
 # LOAD LOCAL MODELS
-# Florence-2-Large removed — replaced by Gemini 1.5 Flash API
-# Saves 1.6GB RAM and 2-3 min startup time
-# Local: BLIP ITM, DINO, Qwen2.5
 # ============================================================================
 @st.cache_resource
 def load_local_models():
@@ -77,7 +74,6 @@ def load_local_models():
     )
     gc.collect()
-    # BLIP — ITM scoring and cosine similarity
     blip_processor = BlipProcessor.from_pretrained(
         "Salesforce/blip-image-captioning-large"
     )
@@ -87,7 +83,6 @@ def load_local_models():
     )
     blip_itm_model.eval()
-    # DINO — object detection
     dino_processor = AutoProcessor.from_pretrained(
         "IDEA-Research/grounding-dino-base"
     )
@@ -97,7 +92,6 @@ def load_local_models():
     )
     dino_model.eval()
-    # Qwen2.5-1.5B — caption fusion
     qwen_tokenizer = AutoTokenizer.from_pretrained(
         "Qwen/Qwen2.5-1.5B-Instruct"
     )
@@ -127,17 +121,14 @@ def image_to_data_uri(image: Image.Image) -> str:
     return f"data:image/jpeg;base64,{b64}"
 # ============================================================================
-# STEP 1 — GEMINI 1.5 FLASH (API): GENERATE 5 DIVERSE CAPTIONS
-# 5 different prompts — each focuses on a different aspect of the image
-# Gemini sees the image directly as a VLM — no hallucination from task tokens
-# API response ~2-4 sec per caption — 5 captions in ~15-20 sec total
 # ============================================================================
 def generate_captions_gemini(image: Image.Image) -> list:
     model = genai.GenerativeModel("gemini-2.0-flash")
     prompts = [
-        "Describe this image in detail covering the overall scene.",
         "Describe the people in this image — their clothing colors, style, and what they are doing.",
         "Describe the background, setting, and surroundings visible in this image.",
         "Describe all the objects, plants, and items visible around the people in this image.",
@@ -155,7 +146,6 @@ def generate_captions_gemini(image: Image.Image) -> list:
             st.warning(f"Gemini error: {str(e)[:80]}")
             captions.append("a scene shown in the image")
-    # Deduplicate while keeping order
     seen, unique = set(), []
     for c in captions:
         if c not in seen:

 # ============================================================================
 # CREDENTIALS
 # ============================================================================
+JINA_KEY       = os.environ.get("JINA_KEY", "")
+GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
 JINA_URL     = "https://api.jina.ai/v1/rerank"
 JINA_HEADERS = {
     st.error("GOOGLE_API_KEY missing. Go to Space Settings → Secrets and add it.")
     st.stop()
+# Configure Gemini — after GOOGLE_API_KEY is defined
 genai.configure(api_key=GOOGLE_API_KEY)
 # ============================================================================
 # LOAD LOCAL MODELS
 # ============================================================================
 @st.cache_resource
 def load_local_models():
     )
     gc.collect()
     blip_processor = BlipProcessor.from_pretrained(
         "Salesforce/blip-image-captioning-large"
     )
     )
     blip_itm_model.eval()
     dino_processor = AutoProcessor.from_pretrained(
         "IDEA-Research/grounding-dino-base"
     )
     )
     dino_model.eval()
     qwen_tokenizer = AutoTokenizer.from_pretrained(
         "Qwen/Qwen2.5-1.5B-Instruct"
     )
     return f"data:image/jpeg;base64,{b64}"
 # ============================================================================
+# STEP 1 — GEMINI 2.0 FLASH (API): GENERATE 5 DIVERSE CAPTIONS
 # ============================================================================
 def generate_captions_gemini(image: Image.Image) -> list:
     model = genai.GenerativeModel("gemini-2.0-flash")
     prompts = [
+        "Describe this image in detail covering the overall scene with every possible detail in simple language.",
         "Describe the people in this image — their clothing colors, style, and what they are doing.",
         "Describe the background, setting, and surroundings visible in this image.",
         "Describe all the objects, plants, and items visible around the people in this image.",
             st.warning(f"Gemini error: {str(e)[:80]}")
             captions.append("a scene shown in the image")
     seen, unique = set(), []
     for c in captions:
         if c not in seen: