Spaces:

Afsha001
/

Image_captioning

Running

App Files Files Community

Afsha001 committed 7 days ago

Commit

cfb72d8

verified ·

1 Parent(s): e67f5ad

update app.py

Browse files

Files changed (1) hide show

app.py +49 -28

app.py CHANGED Viewed

@@ -50,9 +50,6 @@ if not JINA_KEY:
     st.error("JINA_KEY missing. Go to Space Settings → Secrets and add it.")
     st.stop()
-# ============================================================================
-# CHANGE 1: load_local_models — replaced moondream with GIT-Large-COCO
-# ============================================================================
 @st.cache_resource
 def load_local_models():
     from transformers import (
@@ -64,7 +61,6 @@ def load_local_models():
     )
     gc.collect()
-    # GIT-Large-COCO — local caption generation, no API, no auth needed
     git_processor = AutoProcessor.from_pretrained("microsoft/git-large-coco")
     git_model     = AutoModelForCausalLM.from_pretrained(
         "microsoft/git-large-coco",
@@ -72,7 +68,6 @@ def load_local_models():
     )
     git_model.eval()
-    # BLIP — for ITM scoring and cosine similarity
     blip_processor = BlipProcessor.from_pretrained(
         "Salesforce/blip-image-captioning-large"
     )
@@ -82,7 +77,6 @@ def load_local_models():
     )
     blip_itm_model.eval()
-    # DINO — for object detection
     dino_processor = AutoProcessor.from_pretrained(
         "IDEA-Research/grounding-dino-base"
     )
@@ -105,23 +99,55 @@ def image_to_data_uri(image: Image.Image) -> str:
     return f"data:image/jpeg;base64,{b64}"
 # ============================================================================
-# CHANGE 2: generate_captions_git — replaced moondream caption function
 # ============================================================================
 def generate_captions_git(image: Image.Image, git_proc, git_mod) -> list:
-    length_params = [30, 50, 60, 70, 40]
-    captions      = []
-    for max_tokens in length_params:
-        try:
-            pixel_values = git_proc(
-                images=image,
-                return_tensors="pt"
-            ).pixel_values
             with torch.no_grad():
                 generated_ids = git_mod.generate(
                     pixel_values=pixel_values,
-                    max_new_tokens=max_tokens
                 )
             cap = git_proc.batch_decode(
@@ -135,17 +161,22 @@ def generate_captions_git(image: Image.Image, git_proc, git_mod) -> list:
             st.warning(f"GIT error: {str(e)[:80]}")
             captions.append("a scene shown in the image")
     seen, unique = set(), []
     for c in captions:
         if c not in seen:
             seen.add(c)
             unique.append(c)
     while len(unique) < 5:
         unique.append(unique[0])
     return unique[:5]
-# unchanged
 def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
     scores = []
     for cap in captions:
@@ -165,7 +196,6 @@ def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
             scores.append(0.0)
     return scores
-# unchanged
 def compute_jina_scores(image: Image.Image, captions: list) -> list:
     img_data_uri = image_to_data_uri(image)
     scores       = []
@@ -199,7 +229,6 @@ def compute_jina_scores(image: Image.Image, captions: list) -> list:
             scores.append(0.0)
     return scores
-# unchanged
 def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
     try:
         img_inp = blip_proc(images=image, return_tensors="pt")
@@ -227,7 +256,6 @@ def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
         st.warning(f"Cosine error: {str(e)[:60]}")
         return [0.0] * len(captions)
-# unchanged
 def majority_voting(captions, itm, jina, cosine) -> tuple:
     itm_r    = np.argsort(itm)[::-1]
     jina_r   = np.argsort(jina)[::-1]
@@ -245,7 +273,6 @@ def majority_voting(captions, itm, jina, cosine) -> tuple:
     return captions[top2[0]], captions[top2[1]], top2, dict(counts)
-# unchanged
 def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
     try:
         inputs = dino_proc(
@@ -288,7 +315,6 @@ def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
         st.warning(f"DINO error: {str(e)[:80]}")
         return "Object detection unavailable", []
-# unchanged
 def fuse_captions(cap1: str, cap2: str, objects: str) -> str:
     system_prompt = (
         "You are an expert image captioning assistant. "
@@ -331,9 +357,6 @@ def fuse_captions(cap1: str, cap2: str, objects: str) -> str:
         st.warning(f"Qwen exception: {str(e)[:60]}")
         return cap1
-# ============================================================================
-# CHANGE 3: sidebar — updated step 1 label
-# ============================================================================
 with st.sidebar:
     st.title("Image Caption Fusion")
     st.markdown("---")
@@ -385,14 +408,12 @@ if uploaded_file is not None:
         if st.button("Generate Caption", type="primary", use_container_width=True):
             with st.spinner("Loading local models (first run takes 2-3 min)..."):
-                # CHANGE 4: updated unpacking — git_proc, git_mod instead of moon_mod
                 git_proc, git_mod, blip_proc, blip_itm, dino_proc, dino_mod = load_local_models()
             progress = st.progress(0)
             status   = st.empty()
             status.info("Step 1/7: Generating captions with GIT-Large-COCO...")
-            # CHANGE 4: updated function call
             captions = generate_captions_git(input_image, git_proc, git_mod)
             progress.progress(14)
@@ -455,4 +476,4 @@ if uploaded_file is not None:
                 f"font-size:18px;font-weight:500;text-align:center;"
                 f"line-height:1.6;'>{final}</div>",
                 unsafe_allow_html=True
-            )

     st.error("JINA_KEY missing. Go to Space Settings → Secrets and add it.")
     st.stop()
 @st.cache_resource
 def load_local_models():
     from transformers import (
     )
     gc.collect()
     git_processor = AutoProcessor.from_pretrained("microsoft/git-large-coco")
     git_model     = AutoModelForCausalLM.from_pretrained(
         "microsoft/git-large-coco",
     )
     git_model.eval()
     blip_processor = BlipProcessor.from_pretrained(
         "Salesforce/blip-image-captioning-large"
     )
     )
     blip_itm_model.eval()
     dino_processor = AutoProcessor.from_pretrained(
         "IDEA-Research/grounding-dino-base"
     )
     return f"data:image/jpeg;base64,{b64}"
 # ============================================================================
+# ONLY CHANGE: generate_captions_git
+# Fix: use 5 different generation strategies instead of varying only max_new_tokens
+# (greedy, beam search, and sampling with different temperature / top-k / top-p settings)
 # ============================================================================
 def generate_captions_git(image: Image.Image, git_proc, git_mod) -> list:
+    strategies = [
+        # Greedy — short deterministic baseline
+        {
+            "max_new_tokens": 30
+        },
+        # Beam search — explores multiple decode paths
+        {
+            "max_new_tokens": 50,
+            "num_beams": 5,
+            "early_stopping": True
+        },
+        # Sampling — low temperature, focused output
+        {
+            "max_new_tokens": 60,
+            "do_sample": True,
+            "temperature": 0.7,
+            "top_k": 50
+        },
+        # Sampling — high temperature, creative output
+        {
+            "max_new_tokens": 70,
+            "do_sample": True,
+            "temperature": 1.3,
+            "top_k": 100
+        },
+        # Nucleus sampling — top-p based
+        {
+            "max_new_tokens": 55,
+            "do_sample": True,
+            "top_p": 0.9,
+            "temperature": 1.0
+        },
+    ]
+    captions     = []
+    pixel_values = git_proc(images=image, return_tensors="pt").pixel_values
+    for strategy in strategies:
+        try:
             with torch.no_grad():
                 generated_ids = git_mod.generate(
                     pixel_values=pixel_values,
+                    **strategy
                 )
             cap = git_proc.batch_decode(
             st.warning(f"GIT error: {str(e)[:80]}")
             captions.append("a scene shown in the image")
+    # Deduplicate while keeping order
     seen, unique = set(), []
     for c in captions:
         if c not in seen:
             seen.add(c)
             unique.append(c)
+    # If fewer than two distinct captions remain, fall back to the full caption list so voting still has input
+    if len(unique) < 2:
+        unique = captions
     while len(unique) < 5:
         unique.append(unique[0])
     return unique[:5]
 def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
     scores = []
     for cap in captions:
             scores.append(0.0)
     return scores
 def compute_jina_scores(image: Image.Image, captions: list) -> list:
     img_data_uri = image_to_data_uri(image)
     scores       = []
             scores.append(0.0)
     return scores
 def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
     try:
         img_inp = blip_proc(images=image, return_tensors="pt")
         st.warning(f"Cosine error: {str(e)[:60]}")
         return [0.0] * len(captions)
 def majority_voting(captions, itm, jina, cosine) -> tuple:
     itm_r    = np.argsort(itm)[::-1]
     jina_r   = np.argsort(jina)[::-1]
     return captions[top2[0]], captions[top2[1]], top2, dict(counts)
 def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
     try:
         inputs = dino_proc(
         st.warning(f"DINO error: {str(e)[:80]}")
         return "Object detection unavailable", []
 def fuse_captions(cap1: str, cap2: str, objects: str) -> str:
     system_prompt = (
         "You are an expert image captioning assistant. "
         st.warning(f"Qwen exception: {str(e)[:60]}")
         return cap1
 with st.sidebar:
     st.title("Image Caption Fusion")
     st.markdown("---")
         if st.button("Generate Caption", type="primary", use_container_width=True):
             with st.spinner("Loading local models (first run takes 2-3 min)..."):
                 git_proc, git_mod, blip_proc, blip_itm, dino_proc, dino_mod = load_local_models()
             progress = st.progress(0)
             status   = st.empty()
             status.info("Step 1/7: Generating captions with GIT-Large-COCO...")
             captions = generate_captions_git(input_image, git_proc, git_mod)
             progress.progress(14)
                 f"font-size:18px;font-weight:500;text-align:center;"
                 f"line-height:1.6;'>{final}</div>",
                 unsafe_allow_html=True
+            )