Spaces:

Afsha001
/

Image_captioning

Running

App Files Community

Afsha001 committed 8 days ago

Commit

9ef58b8

verified ·

1 Parent(s): 4b0137c

update Florence2

Browse files

Files changed (1) hide show

app.py +80 -52

app.py CHANGED Viewed

@@ -27,13 +27,26 @@ JINA_HEADERS = {
     "Content-Type":  "application/json"
 }
 DETECT_PROMPT = (
-    "person . child . man . woman . boy . girl . "
-    "dog . cat . horse . bird . animal . "
-    "ball . toy . bicycle . car . bench . "
-    "tree . grass . water . sky . mountain . "
-    "building . stairs . door . fence . floor . "
-    "jacket . dress . shirt . hat . bag ."
 )
 if not JINA_KEY:
@@ -41,11 +54,8 @@ if not JINA_KEY:
     st.stop()
 # ============================================================================
-# LOAD LOCAL MODELS
-# GIT-Large-COCO: caption generation
-# BLIP ITM:       image-text matching + cosine similarity
-# DINO:           object detection
-# Qwen2.5-1.5B:  caption fusion (moved local — API was returning 404)
 # ============================================================================
 @st.cache_resource
 def load_local_models():
@@ -59,13 +69,17 @@ def load_local_models():
     )
     gc.collect()
-    # GIT-Large-COCO — caption generation
-    git_processor = AutoProcessor.from_pretrained("microsoft/git-large-coco")
-    git_model     = AutoModelForCausalLM.from_pretrained(
-        "microsoft/git-large-coco",
         torch_dtype=torch.float32
     )
-    git_model.eval()
     # BLIP — ITM scoring and cosine similarity
     blip_processor = BlipProcessor.from_pretrained(
@@ -87,7 +101,7 @@ def load_local_models():
     )
     dino_model.eval()
-    # Qwen2.5-1.5B — caption fusion (local, no API)
     qwen_tokenizer = AutoTokenizer.from_pretrained(
         "Qwen/Qwen2.5-1.5B-Instruct"
     )
@@ -98,7 +112,7 @@ def load_local_models():
     qwen_model.eval()
     return (
-        git_processor, git_model,
         blip_processor, blip_itm_model,
         dino_processor, dino_model,
         qwen_tokenizer, qwen_model
@@ -114,32 +128,53 @@ def image_to_data_uri(image: Image.Image) -> str:
     b64 = base64.b64encode(raw).decode()
     return f"data:image/jpeg;base64,{b64}"
-def generate_captions_git(image: Image.Image, git_proc, git_mod) -> list:
-    strategies = [
-        {"max_new_tokens": 30},
-        {"max_new_tokens": 50, "num_beams": 5, "early_stopping": True},
-        {"max_new_tokens": 60, "do_sample": True, "temperature": 0.7, "top_k": 50},
-        {"max_new_tokens": 70, "do_sample": True, "temperature": 1.3, "top_k": 100},
-        {"max_new_tokens": 55, "do_sample": True, "top_p": 0.9, "temperature": 1.0},
     ]
-    captions     = []
-    pixel_values = git_proc(images=image, return_tensors="pt").pixel_values
-    for strategy in strategies:
         try:
             with torch.no_grad():
-                generated_ids = git_mod.generate(
-                    pixel_values=pixel_values,
-                    **strategy
                 )
-            cap = git_proc.batch_decode(
-                generated_ids, skip_special_tokens=True
-            )[0].strip().lower()
             captions.append(cap if cap else "a scene shown in the image")
         except Exception as e:
-            st.warning(f"GIT error: {str(e)[:80]}")
             captions.append("a scene shown in the image")
     seen, unique = set(), []
@@ -287,14 +322,7 @@ def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
         st.warning(f"DINO error: {str(e)[:80]}")
         return "Object detection unavailable", []
-# ============================================================================
-# STEP 7 — QWEN2.5-1.5B (LOCAL): CAPTION FUSION
-# Moved from API to local — API was consistently returning 404
-# Uses chat template for proper instruct format
-# Prompt asks Qwen to enrich and add detail using detected objects
-# ============================================================================
 def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
     system_prompt = (
         "You are an expert image captioning assistant. "
         "Write ONE natural, fluent, detailed and descriptive caption. "
@@ -315,9 +343,7 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
         ]
         text = qwen_tok.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
         )
         model_inputs = qwen_tok([text], return_tensors="pt")
@@ -331,7 +357,6 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
                 top_p=0.9
             )
-        # Strip input tokens from output
         output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
         fused = qwen_tok.decode(output_ids, skip_special_tokens=True).strip()
@@ -345,12 +370,15 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
         st.warning(f"Qwen fusion error: {str(e)[:80]}")
         return cap1
 with st.sidebar:
     st.title("Image Caption Fusion")
     st.markdown("---")
     st.markdown("### Pipeline Steps")
     st.markdown("""
-**1. GIT-Large-COCO** (Local)
 Generate 5 captions
 **2. BLIP ITM** (Local)
@@ -372,7 +400,7 @@ Object detection
 Caption fusion
     """)
     st.markdown("---")
-    st.markdown("**Local:** GIT-Large, BLIP ITM, DINO, Qwen2.5")
     st.markdown("**API:** Jina")
 st.title("Image Caption Fusion System")
@@ -397,7 +425,7 @@ if uploaded_file is not None:
             with st.spinner("Loading local models (first run takes 3-4 min)..."):
                 (
-                    git_proc, git_mod,
                     blip_proc, blip_itm,
                     dino_proc, dino_mod,
                     qwen_tok, qwen_mod
@@ -406,8 +434,8 @@ if uploaded_file is not None:
             progress = st.progress(0)
             status   = st.empty()
-            status.info("Step 1/7: Generating captions with GIT-Large-COCO...")
-            captions = generate_captions_git(input_image, git_proc, git_mod)
             progress.progress(14)
             with st.expander("5 Generated Captions", expanded=True):

     "Content-Type":  "application/json"
 }
+# ============================================================================
+# CHANGE 1: DETECT_PROMPT — expanded with colours, furniture, objects
+# More labels = richer grounding for Qwen fusion
+# ============================================================================
 DETECT_PROMPT = (
+    "person . man . woman . boy . girl . child . baby . "
+    "red . blue . green . yellow . black . white . orange . purple . brown . "
+    "shirt . jacket . dress . coat . hat . glasses . bag . shoes . "
+    "table . chair . bench . sofa . desk . stool . wooden chair . dining table . "
+    "cup . glass . bottle . plate . bowl . fork . spoon . knife . "
+    "car . bicycle . motorcycle . bus . truck . "
+    "tree . grass . flower . sky . water . river . mountain . road . "
+    "building . wall . door . window . floor . ceiling . stairs . "
+    "lamp . light . candle . fire . smoke . "
+    "phone . laptop . book . bag . umbrella . "
+    "dog . cat . bird . horse . animal . "
+    "food . pizza . cake . bread . fruit . "
+    "bar . restaurant . pub . cafe . kitchen . "
+    "wood . metal . glass . brick . "
+    "dark . bright . colorful ."
 )
 if not JINA_KEY:
     st.stop()
 # ============================================================================
+# CHANGE 2: load_local_models — replaced GIT with Florence-2-Large
+# Florence-2 is prompted with its three captioning task tokens, intended to give better-grounded captions than GIT
 # ============================================================================
 @st.cache_resource
 def load_local_models():
     )
     gc.collect()
+    # Florence-2-Large — accurate caption generation with task tokens
+    florence_processor = AutoProcessor.from_pretrained(
+        "microsoft/Florence-2-large",
+        trust_remote_code=True
+    )
+    florence_model = AutoModelForCausalLM.from_pretrained(
+        "microsoft/Florence-2-large",
+        trust_remote_code=True,
         torch_dtype=torch.float32
     )
+    florence_model.eval()
     # BLIP — ITM scoring and cosine similarity
     blip_processor = BlipProcessor.from_pretrained(
     )
     dino_model.eval()
+    # Qwen2.5-1.5B — caption fusion (local)
     qwen_tokenizer = AutoTokenizer.from_pretrained(
         "Qwen/Qwen2.5-1.5B-Instruct"
     )
     qwen_model.eval()
     return (
+        florence_processor, florence_model,
         blip_processor, blip_itm_model,
         dino_processor, dino_model,
         qwen_tokenizer, qwen_model
     b64 = base64.b64encode(raw).decode()
     return f"data:image/jpeg;base64,{b64}"
+# ============================================================================
+# CHANGE 3: generate_captions_florence — replaces generate_captions_git
+# Uses Florence-2 task tokens for naturally diverse and accurate captions
+# <CAPTION> / <DETAILED_CAPTION> / <MORE_DETAILED_CAPTION>
+# ============================================================================
+def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
+    tasks = [
+        ("<CAPTION>",               {"max_new_tokens": 50,  "num_beams": 3}),
+        ("<DETAILED_CAPTION>",      {"max_new_tokens": 100, "num_beams": 3}),
+        ("<MORE_DETAILED_CAPTION>", {"max_new_tokens": 150, "num_beams": 3}),
+        ("<DETAILED_CAPTION>",      {"max_new_tokens": 100, "num_beams": 5}),
+        ("<CAPTION>",               {"max_new_tokens": 80,  "num_beams": 5}),
     ]
+    captions = []
+    for task_prompt, gen_kwargs in tasks:
         try:
+            inputs = florence_proc(
+                text=task_prompt,
+                images=image,
+                return_tensors="pt"
+            )
             with torch.no_grad():
+                generated_ids = florence_mod.generate(
+                    input_ids=inputs["input_ids"],
+                    pixel_values=inputs["pixel_values"],
+                    **gen_kwargs
                 )
+            generated_text = florence_proc.batch_decode(
+                generated_ids, skip_special_tokens=False
+            )[0]
+            parsed = florence_proc.post_process_generation(
+                generated_text,
+                task=task_prompt,
+                image_size=(image.width, image.height)
+            )
+            cap = parsed.get(task_prompt, "").strip().lower()
             captions.append(cap if cap else "a scene shown in the image")
         except Exception as e:
+            st.warning(f"Florence error: {str(e)[:80]}")
             captions.append("a scene shown in the image")
     seen, unique = set(), []
         st.warning(f"DINO error: {str(e)[:80]}")
         return "Object detection unavailable", []
 def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
     system_prompt = (
         "You are an expert image captioning assistant. "
         "Write ONE natural, fluent, detailed and descriptive caption. "
         ]
         text = qwen_tok.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
         )
         model_inputs = qwen_tok([text], return_tensors="pt")
                 top_p=0.9
             )
         output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
         fused = qwen_tok.decode(output_ids, skip_special_tokens=True).strip()
         st.warning(f"Qwen fusion error: {str(e)[:80]}")
         return cap1
+# ============================================================================
+# CHANGE 4: sidebar — updated step 1 label to Florence-2-Large
+# ============================================================================
 with st.sidebar:
     st.title("Image Caption Fusion")
     st.markdown("---")
     st.markdown("### Pipeline Steps")
     st.markdown("""
+**1. Florence-2-Large** (Local)
 Generate 5 captions
 **2. BLIP ITM** (Local)
 Caption fusion
     """)
     st.markdown("---")
+    st.markdown("**Local:** Florence-2, BLIP ITM, DINO, Qwen2.5")
     st.markdown("**API:** Jina")
 st.title("Image Caption Fusion System")
             with st.spinner("Loading local models (first run takes 3-4 min)..."):
                 (
+                    florence_proc, florence_mod,
                     blip_proc, blip_itm,
                     dino_proc, dino_mod,
                     qwen_tok, qwen_mod
             progress = st.progress(0)
             status   = st.empty()
+            status.info("Step 1/7: Generating captions with Florence-2-Large...")
+            captions = generate_captions_florence(input_image, florence_proc, florence_mod)
             progress.progress(14)
             with st.expander("5 Generated Captions", expanded=True):