Spaces:

Afsha001
/

Image_captioning

Running

App Files Community

Afsha001 committed 6 days ago

Commit

ebc8d8e

verified ·

1 Parent(s): 380cdcf

remove dino

Browse files

Files changed (1) hide show

app.py +49 -107

app.py CHANGED Viewed

@@ -26,24 +26,16 @@ JINA_HEADERS = {
     "Content-Type":  "application/json"
 }
-DETECT_PROMPT = (
-    "person . man . woman . boy . girl . child . baby . a group of people . "
-    "sitting on a chair . riding a bicycle . holding an object . walking on the road . "
-    "wooden surface . shiny metal . smooth glass . brick wall . leather bag . denim clothing . "
-    "shirt . jacket . dress . coat . hat . glasses . backpack . shoes . tie . "
-    "table . chair . bench . sofa . desk . laptop . phone . book . umbrella . "
-    "cup . glass . bottle . plate . bowl . fork . spoon . knife . "
-    "in the foreground . in the background . tree . grass . flower . sky . "
-    "water . river . mountain . road . building . wall . door . window . floor . "
-    "dark shadow . bright light . sunny day . indoor lamp . reflection . colorful texture . "
-    "dog . cat . bird . horse . animal . pizza . cake . bread . fruit . "
-    "car . bicycle . motorcycle . bus . truck . street . kitchen . restaurant . cafe"
-)
 if not JINA_KEY:
     st.error("JINA_KEY missing. Go to Space Settings → Secrets and add it.")
     st.stop()
 @st.cache_resource
 def load_local_models():
     from transformers import (
@@ -51,8 +43,7 @@ def load_local_models():
         AutoModelForCausalLM,
         AutoTokenizer,
         BlipProcessor,
-        BlipForImageTextRetrieval,
-        AutoModelForZeroShotObjectDetection
     )
     gc.collect()
@@ -76,15 +67,6 @@ def load_local_models():
     )
     blip_itm_model.eval()
-    dino_processor = AutoProcessor.from_pretrained(
-        "IDEA-Research/grounding-dino-base"
-    )
-    dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(
-        "IDEA-Research/grounding-dino-base",
-        torch_dtype=torch.float32
-    )
-    dino_model.eval()
     qwen_tokenizer = AutoTokenizer.from_pretrained(
         "Qwen/Qwen2.5-1.5B-Instruct"
     )
@@ -97,7 +79,6 @@ def load_local_models():
     return (
         florence_processor, florence_model,
         blip_processor, blip_itm_model,
-        dino_processor, dino_model,
         qwen_tokenizer, qwen_model
     )
@@ -112,17 +93,7 @@ def image_to_data_uri(image: Image.Image) -> str:
     return f"data:image/jpeg;base64,{b64}"
 # ============================================================================
-# generate_captions_florence — speed optimized + diversity fixed
-#
-# Problem: num_beams=1 greedy produces near-identical captions across tasks
-# Fix:     Task 1 stays greedy (baseline), Tasks 2 and 3 use sampling
-#          with increasing temperature — each task explores different word paths
-#
-# Task 1: greedy       → deterministic, short, factual baseline
-# Task 2: temp=0.7     → slightly varied, focuses on detail
-# Task 3: temp=1.1     → more varied phrasing, different sentence structure
-#
-# Speed: sampling is as fast or faster than beam search — no regression
 # ============================================================================
 def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
@@ -170,7 +141,6 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
             st.warning(f"Florence {task_prompt} error: {str(e)[:80]}")
             captions.append("a scene shown in the image")
-    # Deduplicate while keeping order
     seen, unique = set(), []
     for c in captions:
         if c not in seen:
@@ -185,6 +155,9 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
     return unique[:5]
 def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
     scores = []
     for cap in captions:
@@ -204,6 +177,9 @@ def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
             scores.append(0.0)
     return scores
 def compute_jina_scores(image: Image.Image, captions: list) -> list:
     img_data_uri = image_to_data_uri(image)
     scores       = []
@@ -234,6 +210,9 @@ def compute_jina_scores(image: Image.Image, captions: list) -> list:
             scores.append(0.0)
     return scores
 def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
     try:
         img_inp = blip_proc(images=image, return_tensors="pt")
@@ -260,6 +239,9 @@ def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
         st.warning(f"Cosine error: {str(e)[:60]}")
         return [0.0] * len(captions)
 def majority_voting(captions, itm, jina, cosine) -> tuple:
     itm_r    = np.argsort(itm)[::-1]
     jina_r   = np.argsort(jina)[::-1]
@@ -277,55 +259,21 @@ def majority_voting(captions, itm, jina, cosine) -> tuple:
     return captions[top2[0]], captions[top2[1]], top2, dict(counts)
-def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
-    try:
-        inputs = dino_proc(
-            images=image, text=DETECT_PROMPT, return_tensors="pt"
-        )
-        with torch.no_grad():
-            outputs = dino_mod(**inputs)
-        target_sizes = torch.tensor([image.size[::-1]])
-        results      = dino_proc.post_process_grounded_object_detection(
-            outputs, inputs.input_ids, target_sizes=target_sizes
-        )[0]
-        scores = results["scores"]
-        labels = results.get("text_labels", results["labels"])
-        keep     = scores >= threshold
-        kept_sc  = scores[keep].tolist()
-        kept_lbl = [labels[i] for i in range(len(labels)) if keep[i]]
-        if not kept_lbl:
-            return "No objects detected", []
-        label_dict = {}
-        for lbl, sc in zip(kept_lbl, kept_sc):
-            lbl = lbl.strip().lower()
-            if lbl not in label_dict or label_dict[lbl] < sc:
-                label_dict[lbl] = sc
-        sorted_labels = [
-            l for l, _ in
-            sorted(label_dict.items(), key=lambda x: x[1], reverse=True)
-        ]
-        formatted = "Detected objects: [" + ", ".join(sorted_labels) + "]"
-        return formatted, sorted_labels
-    except Exception as e:
-        st.warning(f"DINO error: {str(e)[:80]}")
-        return "Object detection unavailable", []
-def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
     system_prompt = (
         "You write image captions. "
-        "You will receive two captions and a list of detected objects. "
         "Your job is to combine them into one detailed caption. "
         "Include ALL specific details you find: "
         "the clothing colors and style of each person, "
         "what each person looks like and what they are doing, "
-        "the objects and plants visible around them, "
         "and the setting or background of the scene. "
         "Write 5 to 6 sentences. Use simple, clear, everyday words. "
         "Do NOT summarize or shorten — keep every specific detail. "
@@ -335,8 +283,7 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
     user_prompt = (
         f"Caption A: {cap1}\n"
-        f"Caption B: {cap2}\n"
-        f"{objects}\n\n"
         "Write a detailed caption that includes all the clothing, "
         "people, objects and background in details:"
     )
@@ -373,6 +320,9 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
         st.warning(f"Qwen fusion error: {str(e)[:80]}")
         return cap1
 with st.sidebar:
     st.title("Image Caption Fusion")
     st.markdown("---")
@@ -393,16 +343,16 @@ Embedding similarity
 **5. Majority Voting**
 Best 2 captions selected
-**6. Grounding DINO** (Local)
-Object detection
-**7. Qwen2.5-1.5B** (Local)
 Caption fusion
     """)
     st.markdown("---")
-    st.markdown("**Local:** Florence-2, BLIP ITM, DINO, Qwen2.5")
     st.markdown("**API:** Jina")
 st.title("Image Caption Fusion System")
 st.markdown("Upload an image to generate a refined, grounded caption.")
 st.markdown("---")
@@ -427,32 +377,31 @@ if uploaded_file is not None:
                 (
                     florence_proc, florence_mod,
                     blip_proc, blip_itm,
-                    dino_proc, dino_mod,
                     qwen_tok, qwen_mod
                 ) = load_local_models()
             progress = st.progress(0)
             status   = st.empty()
-            status.info("Step 1/7: Generating captions with Florence-2-Large...")
             captions = generate_captions_florence(input_image, florence_proc, florence_mod)
-            progress.progress(14)
             with st.expander("5 Generated Captions", expanded=True):
                 for i, cap in enumerate(captions):
                     st.write(f"**{i+1}.** {cap}")
-            status.info("Step 2/7: Computing BLIP ITM scores...")
             itm_scores = compute_itm_scores(input_image, captions, blip_proc, blip_itm)
-            progress.progress(28)
-            status.info("Step 3/7: Computing Jina Reranker scores...")
             jina_scores = compute_jina_scores(input_image, captions)
-            progress.progress(42)
-            status.info("Step 4/7: Computing Cosine Similarity scores...")
             cosine_scores = compute_cosine_scores(input_image, captions, blip_proc, blip_itm)
-            progress.progress(57)
             scores_df = pd.DataFrame({
                 "Caption": [f"Cap {i+1}: {c[:50]}" for i, c in enumerate(captions)],
@@ -463,11 +412,11 @@ if uploaded_file is not None:
             with st.expander("All Scores", expanded=False):
                 st.dataframe(scores_df, use_container_width=True, hide_index=True)
-            status.info("Step 5/7: Running majority voting...")
             best_1, best_2, _, _ = majority_voting(
                 captions, itm_scores, jina_scores, cosine_scores
             )
-            progress.progress(71)
             st.markdown("### Majority Voted Captions")
             c1, c2 = st.columns(2)
@@ -476,15 +425,8 @@ if uploaded_file is not None:
             with c2:
                 st.info(f"2. {best_2}")
-            status.info("Step 6/7: Detecting objects with DINO...")
-            obj_str, obj_list = detect_objects(input_image, dino_proc, dino_mod)
-            progress.progress(85)
-            st.markdown("### Detected Objects")
-            st.write(" | ".join(obj_list) if obj_list else obj_str)
-            status.info("Step 7/7: Fusing captions with Qwen2.5-1.5B...")
-            final = fuse_captions(best_1, best_2, obj_str, qwen_tok, qwen_mod)
             progress.progress(100)
             status.success("Pipeline complete!")

     "Content-Type":  "application/json"
 }
 if not JINA_KEY:
     st.error("JINA_KEY missing. Go to Space Settings → Secrets and add it.")
     st.stop()
+# ============================================================================
+# LOAD LOCAL MODELS
+# DINO removed — was adding hallucinated labels that hurt fusion accuracy
+# Local: Florence-2, BLIP ITM, Qwen2.5
+# API:   Jina Reranker
+# ============================================================================
 @st.cache_resource
 def load_local_models():
     from transformers import (
         AutoModelForCausalLM,
         AutoTokenizer,
         BlipProcessor,
+        BlipForImageTextRetrieval
     )
     gc.collect()
     )
     blip_itm_model.eval()
     qwen_tokenizer = AutoTokenizer.from_pretrained(
         "Qwen/Qwen2.5-1.5B-Instruct"
     )
     return (
         florence_processor, florence_model,
         blip_processor, blip_itm_model,
         qwen_tokenizer, qwen_model
     )
     return f"data:image/jpeg;base64,{b64}"
 # ============================================================================
+# STEP 1 — FLORENCE-2-LARGE: GENERATE 5 DIVERSE CAPTIONS
 # ============================================================================
 def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
             st.warning(f"Florence {task_prompt} error: {str(e)[:80]}")
             captions.append("a scene shown in the image")
     seen, unique = set(), []
     for c in captions:
         if c not in seen:
     return unique[:5]
+# ============================================================================
+# STEP 2 — BLIP ITM: IMAGE-TEXT MATCHING SCORES
+# ============================================================================
 def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
     scores = []
     for cap in captions:
             scores.append(0.0)
     return scores
+# ============================================================================
+# STEP 3 — JINA RERANKER M0: SEMANTIC SCORES
+# ============================================================================
 def compute_jina_scores(image: Image.Image, captions: list) -> list:
     img_data_uri = image_to_data_uri(image)
     scores       = []
             scores.append(0.0)
     return scores
+# ============================================================================
+# STEP 4 — COSINE SIMILARITY: EMBEDDING SCORES
+# ============================================================================
 def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
     try:
         img_inp = blip_proc(images=image, return_tensors="pt")
         st.warning(f"Cosine error: {str(e)[:60]}")
         return [0.0] * len(captions)
+# ============================================================================
+# STEP 5 — MAJORITY VOTING: SELECT TOP 2 CAPTIONS
+# ============================================================================
 def majority_voting(captions, itm, jina, cosine) -> tuple:
     itm_r    = np.argsort(itm)[::-1]
     jina_r   = np.argsort(jina)[::-1]
     return captions[top2[0]], captions[top2[1]], top2, dict(counts)
+# ============================================================================
+# STEP 6 — QWEN2.5-1.5B: CAPTION FUSION
+# DINO objects removed from input — was causing hallucinations in fused output
+# Qwen now fuses only the two verified majority-voted captions
+# ============================================================================
+def fuse_captions(cap1: str, cap2: str, qwen_tok, qwen_mod) -> str:
     system_prompt = (
         "You write image captions. "
+        "You will receive two captions of the same image. "
         "Your job is to combine them into one detailed caption. "
         "Include ALL specific details you find: "
         "the clothing colors and style of each person, "
         "what each person looks like and what they are doing, "
+        "the objects and surroundings visible around them, "
         "and the setting or background of the scene. "
         "Write 5 to 6 sentences. Use simple, clear, everyday words. "
         "Do NOT summarize or shorten — keep every specific detail. "
     user_prompt = (
         f"Caption A: {cap1}\n"
+        f"Caption B: {cap2}\n\n"
         "Write a detailed caption that includes all the clothing, "
         "people, objects and background in details:"
     )
         st.warning(f"Qwen fusion error: {str(e)[:80]}")
         return cap1
+# ============================================================================
+# SIDEBAR
+# ============================================================================
 with st.sidebar:
     st.title("Image Caption Fusion")
     st.markdown("---")
 **5. Majority Voting**
 Best 2 captions selected
+**6. Qwen2.5-1.5B** (Local)
 Caption fusion
     """)
     st.markdown("---")
+    st.markdown("**Local:** Florence-2, BLIP ITM, Qwen2.5")
     st.markdown("**API:** Jina")
+# ============================================================================
+# MAIN UI
+# ============================================================================
 st.title("Image Caption Fusion System")
 st.markdown("Upload an image to generate a refined, grounded caption.")
 st.markdown("---")
                 (
                     florence_proc, florence_mod,
                     blip_proc, blip_itm,
                     qwen_tok, qwen_mod
                 ) = load_local_models()
             progress = st.progress(0)
             status   = st.empty()
+            status.info("Step 1/6: Generating captions with Florence-2-Large...")
             captions = generate_captions_florence(input_image, florence_proc, florence_mod)
+            progress.progress(16)
             with st.expander("5 Generated Captions", expanded=True):
                 for i, cap in enumerate(captions):
                     st.write(f"**{i+1}.** {cap}")
+            status.info("Step 2/6: Computing BLIP ITM scores...")
             itm_scores = compute_itm_scores(input_image, captions, blip_proc, blip_itm)
+            progress.progress(32)
+            status.info("Step 3/6: Computing Jina Reranker scores...")
             jina_scores = compute_jina_scores(input_image, captions)
+            progress.progress(50)
+            status.info("Step 4/6: Computing Cosine Similarity scores...")
             cosine_scores = compute_cosine_scores(input_image, captions, blip_proc, blip_itm)
+            progress.progress(66)
             scores_df = pd.DataFrame({
                 "Caption": [f"Cap {i+1}: {c[:50]}" for i, c in enumerate(captions)],
             with st.expander("All Scores", expanded=False):
                 st.dataframe(scores_df, use_container_width=True, hide_index=True)
+            status.info("Step 5/6: Running majority voting...")
             best_1, best_2, _, _ = majority_voting(
                 captions, itm_scores, jina_scores, cosine_scores
             )
+            progress.progress(83)
             st.markdown("### Majority Voted Captions")
             c1, c2 = st.columns(2)
             with c2:
                 st.info(f"2. {best_2}")
+            status.info("Step 6/6: Fusing captions with Qwen2.5-1.5B...")
+            final = fuse_captions(best_1, best_2, qwen_tok, qwen_mod)
             progress.progress(100)
             status.success("Pipeline complete!")