Spaces:

Afsha001
/

Image_captioning

Running

App Files Community

Afsha001 committed 11 days ago

Commit

4f06c78

verified ·

1 Parent(s): c1c04ba

update app.py

Browse files

Files changed (1) hide show

app.py +48 -49

app.py CHANGED Viewed

@@ -29,13 +29,9 @@ JINA_KEY = os.environ.get("JINA_KEY", "")
 # ============================================================================
 # API ENDPOINTS
-# GIT-Large-COCO: raw bytes, no Content-Type (replaces Florence-2-Large)
-# Qwen2.5: model-specific endpoint
 # Jina: query=plain string, documents=list of data URI strings
 # ============================================================================
-GIT_URL     = "https://api-inference.huggingface.co/models/microsoft/git-large-coco"
-GIT_HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}
 QWEN_URL   = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct/v1/chat/completions"
 HF_HEADERS = {
     "Authorization": f"Bearer {HF_TOKEN}",
@@ -69,11 +65,16 @@ if not JINA_KEY:
     st.stop()
 # ============================================================================
-# LOAD LOCAL MODELS — BLIP ITM + GROUNDING DINO
 # ============================================================================
 @st.cache_resource
 def load_local_models():
     from transformers import (
         BlipProcessor,
         BlipForImageTextRetrieval,
         AutoProcessor,
@@ -81,6 +82,19 @@ def load_local_models():
     )
     gc.collect()
     blip_processor = BlipProcessor.from_pretrained(
         "Salesforce/blip-image-captioning-large"
     )
@@ -90,6 +104,7 @@ def load_local_models():
     )
     blip_itm_model.eval()
     dino_processor = AutoProcessor.from_pretrained(
         "IDEA-Research/grounding-dino-base"
     )
@@ -99,7 +114,7 @@ def load_local_models():
     )
     dino_model.eval()
-    return blip_processor, blip_itm_model, dino_processor, dino_model
 # ============================================================================
 # HELPERS
@@ -115,47 +130,31 @@ def image_to_data_uri(image: Image.Image) -> str:
     return f"data:image/jpeg;base64,{b64}"
 # ============================================================================
-# STEP 1 — GIT-LARGE-COCO: GENERATE 5 CAPTIONS
-# Replaces Florence-2-Large (not available on HF serverless API)
-# microsoft/git-large-coco gives detailed captions, confirmed on HF API
-# Called 5 times with different sampling params for caption diversity
 # ============================================================================
-def generate_captions_git(image: Image.Image) -> list:
-    img_bytes = image_to_bytes(image)
-    parameter_sets = [
-        {"max_new_tokens": 50},
-        {"max_new_tokens": 80},
-        {"max_new_tokens": 60, "temperature": 1.2, "do_sample": True},
-        {"max_new_tokens": 70, "temperature": 1.5, "do_sample": True},
-        {"max_new_tokens": 40, "temperature": 0.8, "do_sample": True},
     ]
     captions = []
-    for i, params in enumerate(parameter_sets):
         try:
-            response = requests.post(
-                GIT_URL,
-                headers=GIT_HEADERS,
-                data=img_bytes,
-                params={"wait_for_model": True},
-                timeout=60
-            )
-            if response.status_code == 200:
-                result = response.json()
-                if isinstance(result, list):
-                    cap = result[0].get("generated_text", "").strip().lower()
-                elif isinstance(result, dict):
-                    cap = result.get("generated_text", "").strip().lower()
-                else:
-                    cap = ""
-                captions.append(cap if cap else "a scene shown in the image")
-            else:
-                st.warning(f"GIT API error {response.status_code}")
-                captions.append("a scene shown in the image")
         except Exception as e:
-            st.warning(f"GIT exception: {str(e)[:80]}")
             captions.append("a scene shown in the image")
     seen, unique = set(), []
@@ -165,6 +164,7 @@ def generate_captions_git(image: Image.Image) -> list:
             unique.append(c)
     while len(unique) < 5:
         unique.append(unique[0])
     return unique[:5]
 # ============================================================================
@@ -373,7 +373,7 @@ with st.sidebar:
     st.markdown("---")
     st.markdown("### Pipeline Steps")
     st.markdown("""
-**1. GIT-Large-COCO** (API)
 Generate 5 captions
 **2. BLIP ITM** (Local)
@@ -395,8 +395,8 @@ Object detection
 Caption fusion
     """)
     st.markdown("---")
-    st.markdown("**Local:** BLIP ITM, DINO")
-    st.markdown("**API:** GIT-Large, Jina, Qwen2.5")
 # ============================================================================
 # MAIN UI
@@ -421,14 +421,14 @@ if uploaded_file is not None:
     with col_run:
         if st.button("Generate Caption", type="primary", use_container_width=True):
-            with st.spinner("Loading local models (first run takes 1-2 min)..."):
-                blip_proc, blip_itm, dino_proc, dino_mod = load_local_models()
             progress = st.progress(0)
             status   = st.empty()
-            status.info("Step 1/7: Generating captions with GIT-Large-COCO...")
-            captions = generate_captions_git(input_image)
             progress.progress(14)
             with st.expander("5 Generated Captions", expanded=True):
@@ -491,4 +491,3 @@ if uploaded_file is not None:
                 f"line-height:1.6;'>{final}</div>",
                 unsafe_allow_html=True
             )

 # ============================================================================
 # API ENDPOINTS
+# Qwen2.5: model-specific endpoint for caption fusion
 # Jina: query=plain string, documents=list of data URI strings
 # ============================================================================
 QWEN_URL   = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct/v1/chat/completions"
 HF_HEADERS = {
     "Authorization": f"Bearer {HF_TOKEN}",
     st.stop()
 # ============================================================================
+# LOAD LOCAL MODELS
+# Moondream2: caption generation
+# BLIP ITM:   image-text matching + cosine similarity
+# DINO:       object detection
 # ============================================================================
 @st.cache_resource
 def load_local_models():
     from transformers import (
+        AutoModelForCausalLM,
+        AutoTokenizer,
         BlipProcessor,
         BlipForImageTextRetrieval,
         AutoProcessor,
     )
     gc.collect()
+    # Moondream2 — Vision Language Model for caption generation
+    moon_tokenizer = AutoTokenizer.from_pretrained(
+        "vikhyatk/moondream2",
+        trust_remote_code=True
+    )
+    moon_model = AutoModelForCausalLM.from_pretrained(
+        "vikhyatk/moondream2",
+        trust_remote_code=True,
+        torch_dtype=torch.float32
+    )
+    moon_model.eval()
+    # BLIP — for ITM scoring and cosine similarity
     blip_processor = BlipProcessor.from_pretrained(
         "Salesforce/blip-image-captioning-large"
     )
     )
     blip_itm_model.eval()
+    # DINO — for object detection
     dino_processor = AutoProcessor.from_pretrained(
         "IDEA-Research/grounding-dino-base"
     )
     )
     dino_model.eval()
+    return moon_tokenizer, moon_model, blip_processor, blip_itm_model, dino_processor, dino_model
 # ============================================================================
 # HELPERS
     return f"data:image/jpeg;base64,{b64}"
 # ============================================================================
+# STEP 1 — MOONDREAM2 (LOCAL): GENERATE 5 DIVERSE CAPTIONS
+# vikhyatk/moondream2 — small VLM (~2GB), runs on CPU
+# 5 different prompts produce diverse caption perspectives
+# No API needed — fully local and reliable
 # ============================================================================
+def generate_captions_moondream(image: Image.Image, moon_tok, moon_mod) -> list:
+    prompts = [
+        "Describe this image in detail.",
+        "What is happening in this image?",
+        "Describe the people, objects, and setting in this image.",
+        "What do you see in this photograph?",
+        "Describe the scene including background and foreground in detail."
     ]
     captions = []
+    for prompt in prompts:
         try:
+            enc_image = moon_mod.encode_image(image)
+            cap = moon_mod.answer_question(enc_image, prompt, moon_tok)
+            cap = cap.strip().lower()
+            captions.append(cap if cap else "a scene shown in the image")
         except Exception as e:
+            st.warning(f"Moondream error: {str(e)[:80]}")
             captions.append("a scene shown in the image")
     seen, unique = set(), []
             unique.append(c)
     while len(unique) < 5:
         unique.append(unique[0])
     return unique[:5]
 # ============================================================================
     st.markdown("---")
     st.markdown("### Pipeline Steps")
     st.markdown("""
+**1. Moondream2** (Local)
 Generate 5 captions
 **2. BLIP ITM** (Local)
 Caption fusion
     """)
     st.markdown("---")
+    st.markdown("**Local:** Moondream2, BLIP ITM, DINO")
+    st.markdown("**API:** Jina, Qwen2.5")
 # ============================================================================
 # MAIN UI
     with col_run:
         if st.button("Generate Caption", type="primary", use_container_width=True):
+            with st.spinner("Loading local models (first run takes 2-3 min)..."):
+                moon_tok, moon_mod, blip_proc, blip_itm, dino_proc, dino_mod = load_local_models()
             progress = st.progress(0)
             status   = st.empty()
+            status.info("Step 1/7: Generating captions with Moondream2...")
+            captions = generate_captions_moondream(input_image, moon_tok, moon_mod)
             progress.progress(14)
             with st.expander("5 Generated Captions", expanded=True):
                 f"line-height:1.6;'>{final}</div>",
                 unsafe_allow_html=True
             )