Spaces:

Afsha001
/

Image_captioning

Running

App Files Files Community

Afsha001 committed on 7 days ago

Commit

f35b1cd

verified ·

1 Parent(s): 9ef58b8

update

Browse files

Files changed (1) hide show

app.py +170 -74

app.py CHANGED Viewed

@@ -27,36 +27,39 @@ JINA_HEADERS = {
     "Content-Type":  "application/json"
 }
-# ============================================================================
-# CHANGE 1: DETECT_PROMPT — expanded with colours, furniture, objects
-# More labels = richer grounding for Qwen fusion
-# ============================================================================
 DETECT_PROMPT = (
-    "person . man . woman . boy . girl . child . baby . "
-    "red . blue . green . yellow . black . white . orange . purple . brown . "
-    "shirt . jacket . dress . coat . hat . glasses . bag . shoes . "
-    "table . chair . bench . sofa . desk . stool . wooden chair . dining table . "
     "cup . glass . bottle . plate . bowl . fork . spoon . knife . "
-    "car . bicycle . motorcycle . bus . truck . "
-    "tree . grass . flower . sky . water . river . mountain . road . "
-    "building . wall . door . window . floor . ceiling . stairs . "
-    "lamp . light . candle . fire . smoke . "
-    "phone . laptop . book . bag . umbrella . "
-    "dog . cat . bird . horse . animal . "
-    "food . pizza . cake . bread . fruit . "
-    "bar . restaurant . pub . cafe . kitchen . "
-    "wood . metal . glass . brick . "
-    "dark . bright . colorful ."
 )
 if not JINA_KEY:
     st.error("JINA_KEY missing. Go to Space Settings → Secrets and add it.")
     st.stop()
-# ============================================================================
-# CHANGE 2: load_local_models — replaced GIT with Florence-2-Large
-# Florence-2 has 3 built-in task tokens — accurate, grounded, no hallucination
-# ============================================================================
 @st.cache_resource
 def load_local_models():
     from transformers import (
@@ -69,7 +72,6 @@ def load_local_models():
     )
     gc.collect()
-    # Florence-2-Large — accurate caption generation with task tokens
     florence_processor = AutoProcessor.from_pretrained(
         "microsoft/Florence-2-large",
         trust_remote_code=True
@@ -81,7 +83,6 @@ def load_local_models():
     )
     florence_model.eval()
-    # BLIP — ITM scoring and cosine similarity
     blip_processor = BlipProcessor.from_pretrained(
         "Salesforce/blip-image-captioning-large"
     )
@@ -91,7 +92,6 @@ def load_local_models():
     )
     blip_itm_model.eval()
-    # DINO — object detection
     dino_processor = AutoProcessor.from_pretrained(
         "IDEA-Research/grounding-dino-base"
     )
@@ -101,7 +101,6 @@ def load_local_models():
     )
     dino_model.eval()
-    # Qwen2.5-1.5B — caption fusion (local)
     qwen_tokenizer = AutoTokenizer.from_pretrained(
         "Qwen/Qwen2.5-1.5B-Instruct"
     )
@@ -129,54 +128,146 @@ def image_to_data_uri(image: Image.Image) -> str:
     return f"data:image/jpeg;base64,{b64}"
 # ============================================================================
-# CHANGE 3: generate_captions_florence — replaces generate_captions_git
-# Uses Florence-2 task tokens for naturally diverse and accurate captions
-# <CAPTION> / <DETAILED_CAPTION> / <MORE_DETAILED_CAPTION>
 # ============================================================================
 def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
-    tasks = [
-        ("<CAPTION>",               {"max_new_tokens": 50,  "num_beams": 3}),
-        ("<DETAILED_CAPTION>",      {"max_new_tokens": 100, "num_beams": 3}),
-        ("<MORE_DETAILED_CAPTION>", {"max_new_tokens": 150, "num_beams": 3}),
-        ("<DETAILED_CAPTION>",      {"max_new_tokens": 100, "num_beams": 5}),
-        ("<CAPTION>",               {"max_new_tokens": 80,  "num_beams": 5}),
-    ]
-    captions = []
-    for task_prompt, gen_kwargs in tasks:
-        try:
-            inputs = florence_proc(
-                text=task_prompt,
-                images=image,
-                return_tensors="pt"
             )
-            with torch.no_grad():
-                generated_ids = florence_mod.generate(
-                    input_ids=inputs["input_ids"],
-                    pixel_values=inputs["pixel_values"],
-                    **gen_kwargs
-                )
-            generated_text = florence_proc.batch_decode(
-                generated_ids, skip_special_tokens=False
-            )[0]
-            parsed = florence_proc.post_process_generation(
-                generated_text,
-                task=task_prompt,
-                image_size=(image.width, image.height)
             )
-            cap = parsed.get(task_prompt, "").strip().lower()
-            captions.append(cap if cap else "a scene shown in the image")
-        except Exception as e:
-            st.warning(f"Florence error: {str(e)[:80]}")
-            captions.append("a scene shown in the image")
     seen, unique = set(), []
     for c in captions:
         if c not in seen:
@@ -322,18 +413,26 @@ def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
         st.warning(f"DINO error: {str(e)[:80]}")
         return "Object detection unavailable", []
 def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
     system_prompt = (
-        "You are an expert image captioning assistant. "
-        "Write ONE natural, fluent, detailed and descriptive caption. "
-        "Combine the best details from both captions and incorporate the detected objects. "
-        "Return ONLY the final caption, no explanation or prefix."
     )
     user_prompt = (
         f"Caption A: {cap1}\n"
         f"Caption B: {cap2}\n"
         f"{objects}\n\n"
-        "Write a detailed fused caption:"
     )
     try:
@@ -351,8 +450,8 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
         with torch.no_grad():
             generated_ids = qwen_mod.generate(
                 **model_inputs,
-                max_new_tokens=120,
-                temperature=0.3,
                 do_sample=True,
                 top_p=0.9
             )
@@ -360,7 +459,7 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
         output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
         fused = qwen_tok.decode(output_ids, skip_special_tokens=True).strip()
-        for prefix in ["Fused caption:", "Caption:", "Result:", "Answer:"]:
             if fused.lower().startswith(prefix.lower()):
                 fused = fused[len(prefix):].strip()
@@ -370,9 +469,6 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
         st.warning(f"Qwen fusion error: {str(e)[:80]}")
         return cap1
-# ============================================================================
-# CHANGE 4: sidebar — updated step 1 label to Florence-2-Large
-# ============================================================================
 with st.sidebar:
     st.title("Image Caption Fusion")
     st.markdown("---")

     "Content-Type":  "application/json"
 }
 DETECT_PROMPT = (
+    # Core Subjects & Actions
+    "person . man . woman . boy . girl . child . baby . a group of people . "
+    "sitting on a chair . riding a bicycle . holding an object . walking on the road . "
+    # Textures & Materials
+    "wooden surface . shiny metal . smooth glass . brick wall . leather bag . denim clothing . "
+    # Detailed Apparel & Wearables
+    "shirt . jacket . dress . coat . hat . glasses . backpack . shoes . tie . "
+    # Common Interior Objects
+    "table . chair . bench . sofa . desk . laptop . phone . book . umbrella . "
     "cup . glass . bottle . plate . bowl . fork . spoon . knife . "
+    # Environmental & Spatial Elements
+    "in the foreground . in the background . tree . grass . flower . sky . "
+    "water . river . mountain . road . building . wall . door . window . floor . "
+    # Lighting & Atmospheric Context
+    "dark shadow . bright light . sunny day . indoor lamp . reflection . colorful texture . "
+    # Animals & Food
+    "dog . cat . bird . horse . animal . pizza . cake . bread . fruit . "
+    # Transportation & Setting
+    "car . bicycle . motorcycle . bus . truck . street . kitchen . restaurant . cafe"
 )
 if not JINA_KEY:
     st.error("JINA_KEY missing. Go to Space Settings → Secrets and add it.")
     st.stop()
 @st.cache_resource
 def load_local_models():
     from transformers import (
     )
     gc.collect()
     florence_processor = AutoProcessor.from_pretrained(
         "microsoft/Florence-2-large",
         trust_remote_code=True
     )
     florence_model.eval()
     blip_processor = BlipProcessor.from_pretrained(
         "Salesforce/blip-image-captioning-large"
     )
     )
     blip_itm_model.eval()
     dino_processor = AutoProcessor.from_pretrained(
         "IDEA-Research/grounding-dino-base"
     )
     )
     dino_model.eval()
     qwen_tokenizer = AutoTokenizer.from_pretrained(
         "Qwen/Qwen2.5-1.5B-Instruct"
     )
     return f"data:image/jpeg;base64,{b64}"
 # ============================================================================
+# CHANGE 1: generate_captions_florence
+# 5 different Florence-2 task tokens — each gives a different perspective
+#
+# Task breakdown:
+# <CAPTION>               → short overall scene description
+# <DETAILED_CAPTION>      → longer overall scene description
+# <MORE_DETAILED_CAPTION> → most detailed overall description
+# <DENSE_REGION_CAPTION>  → describes individual regions of the image
+#                           (returns region labels → joined into a sentence)
+# <OD>                    → object detection labels
+#                           (returns detected objects → formatted as caption)
+#
+# OD and DENSE_REGION_CAPTION return structured data, not plain text,
+# so we extract their labels and convert them to readable captions manually.
 # ============================================================================
 def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
+    captions   = []
+    image_size = (image.width, image.height)
+    # Task 1: Short caption
+    try:
+        inputs = florence_proc(
+            text="<CAPTION>", images=image, return_tensors="pt"
+        )
+        with torch.no_grad():
+            ids = florence_mod.generate(
+                input_ids=inputs["input_ids"],
+                pixel_values=inputs["pixel_values"],
+                max_new_tokens=50, num_beams=3
+            )
+        raw    = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
+        parsed = florence_proc.post_process_generation(raw, task="<CAPTION>", image_size=image_size)
+        cap    = parsed.get("<CAPTION>", "").strip().lower()
+        captions.append(cap if cap else "a scene shown in the image")
+    except Exception as e:
+        st.warning(f"Florence CAPTION error: {str(e)[:80]}")
+        captions.append("a scene shown in the image")
+    # Task 2: Detailed caption
+    try:
+        inputs = florence_proc(
+            text="<DETAILED_CAPTION>", images=image, return_tensors="pt"
+        )
+        with torch.no_grad():
+            ids = florence_mod.generate(
+                input_ids=inputs["input_ids"],
+                pixel_values=inputs["pixel_values"],
+                max_new_tokens=100, num_beams=3
             )
+        raw    = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
+        parsed = florence_proc.post_process_generation(raw, task="<DETAILED_CAPTION>", image_size=image_size)
+        cap    = parsed.get("<DETAILED_CAPTION>", "").strip().lower()
+        captions.append(cap if cap else "a scene shown in the image")
+    except Exception as e:
+        st.warning(f"Florence DETAILED_CAPTION error: {str(e)[:80]}")
+        captions.append("a scene shown in the image")
+    # Task 3: More detailed caption
+    try:
+        inputs = florence_proc(
+            text="<MORE_DETAILED_CAPTION>", images=image, return_tensors="pt"
+        )
+        with torch.no_grad():
+            ids = florence_mod.generate(
+                input_ids=inputs["input_ids"],
+                pixel_values=inputs["pixel_values"],
+                max_new_tokens=150, num_beams=3
             )
+        raw    = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
+        parsed = florence_proc.post_process_generation(raw, task="<MORE_DETAILED_CAPTION>", image_size=image_size)
+        cap    = parsed.get("<MORE_DETAILED_CAPTION>", "").strip().lower()
+        captions.append(cap if cap else "a scene shown in the image")
+    except Exception as e:
+        st.warning(f"Florence MORE_DETAILED_CAPTION error: {str(e)[:80]}")
+        captions.append("a scene shown in the image")
+    # Task 4: Dense region caption
+    # Returns descriptions per image region — join them into one sentence
+    try:
+        inputs = florence_proc(
+            text="<DENSE_REGION_CAPTION>", images=image, return_tensors="pt"
+        )
+        with torch.no_grad():
+            ids = florence_proc.post_process_generation
+            ids = florence_mod.generate(
+                input_ids=inputs["input_ids"],
+                pixel_values=inputs["pixel_values"],
+                max_new_tokens=200, num_beams=3
+            )
+        raw    = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
+        parsed = florence_proc.post_process_generation(raw, task="<DENSE_REGION_CAPTION>", image_size=image_size)
+        labels = parsed.get("<DENSE_REGION_CAPTION>", {}).get("labels", [])
+        if labels:
+            # Remove duplicates while preserving order
+            seen_r, unique_r = set(), []
+            for l in labels:
+                if l.lower() not in seen_r:
+                    seen_r.add(l.lower())
+                    unique_r.append(l.lower())
+            cap = ", ".join(unique_r[:6]) + " visible in the scene"
+        else:
+            cap = "a scene shown in the image"
+        captions.append(cap)
+    except Exception as e:
+        st.warning(f"Florence DENSE_REGION error: {str(e)[:80]}")
+        captions.append("a scene shown in the image")
+    # Task 5: Object detection
+    # Returns detected object labels — format as descriptive caption
+    try:
+        inputs = florence_proc(
+            text="<OD>", images=image, return_tensors="pt"
+        )
+        with torch.no_grad():
+            ids = florence_mod.generate(
+                input_ids=inputs["input_ids"],
+                pixel_values=inputs["pixel_values"],
+                max_new_tokens=200, num_beams=3
+            )
+        raw    = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
+        parsed = florence_proc.post_process_generation(raw, task="<OD>", image_size=image_size)
+        labels = parsed.get("<OD>", {}).get("labels", [])
+        if labels:
+            seen_o, unique_o = set(), []
+            for l in labels:
+                if l.lower() not in seen_o:
+                    seen_o.add(l.lower())
+                    unique_o.append(l.lower())
+            cap = "a scene containing " + ", ".join(unique_o[:6])
+        else:
+            cap = "a scene shown in the image"
+        captions.append(cap)
+    except Exception as e:
+        st.warning(f"Florence OD error: {str(e)[:80]}")
+        captions.append("a scene shown in the image")
+    # Deduplicate while preserving order
     seen, unique = set(), []
     for c in captions:
         if c not in seen:
         st.warning(f"DINO error: {str(e)[:80]}")
         return "Object detection unavailable", []
+# ============================================================================
+# CHANGE 2: fuse_captions — simpler, natural prompt
+# Old prompt said "detailed and descriptive" → caused AI-sounding output
+# New prompt asks for simple, factual, human-like language
+# ============================================================================
 def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
     system_prompt = (
+        "You are helping write image captions. "
+        "Write ONE short, simple, factual caption exactly as a person would "
+        "naturally describe this photo. Use plain everyday language. "
+        "Do not add any details that are not clearly visible. "
+        "Do not use dramatic or poetic language. "
+        "Return ONLY the caption, nothing else."
     )
     user_prompt = (
         f"Caption A: {cap1}\n"
         f"Caption B: {cap2}\n"
         f"{objects}\n\n"
+        "Write a simple natural caption:"
     )
     try:
         with torch.no_grad():
             generated_ids = qwen_mod.generate(
                 **model_inputs,
+                max_new_tokens=60,
+                temperature=0.2,
                 do_sample=True,
                 top_p=0.9
             )
         output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
         fused = qwen_tok.decode(output_ids, skip_special_tokens=True).strip()
+        for prefix in ["Caption:", "Result:", "Answer:", "Fused caption:"]:
             if fused.lower().startswith(prefix.lower()):
                 fused = fused[len(prefix):].strip()
         st.warning(f"Qwen fusion error: {str(e)[:80]}")
         return cap1
 with st.sidebar:
     st.title("Image Caption Fusion")
     st.markdown("---")