Spaces:

Afsha001
/

Image_captioning

Running

App Files Files Community

Afsha001 committed 7 days ago

Commit

380cdcf

verified ·

1 Parent(s): 6abcc47

update

Browse files

Files changed (1) hide show

app.py +29 -17

app.py CHANGED Viewed

@@ -112,16 +112,17 @@ def image_to_data_uri(image: Image.Image) -> str:
     return f"data:image/jpeg;base64,{b64}"
 # ============================================================================
-# CHANGED: generate_captions_florence — speed optimized
 #
-# What changed:
-# 1. num_beams 3 → 1 (greedy decoding) — 3x faster, near-identical quality
-# 2. max_new_tokens reduced: 50→30, 100→80, 150→120 — only generate what is needed
-# 3. Removed DENSE_REGION_CAPTION and OD tasks — slowest tasks (200 tokens each)
-#    and they return structured bounding box data not natural captions anyway
 #
-# Speed result: ~2-3 min → ~25 sec
-# Quality result: no meaningful loss — 3 caption tasks still give full diversity
 # ============================================================================
 def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
@@ -129,12 +130,24 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
     image_size = (image.width, image.height)
     tasks = [
-        ("<CAPTION>",               30,  1),
-        ("<DETAILED_CAPTION>",      80,  1),
-        ("<MORE_DETAILED_CAPTION>", 120, 1),
     ]
-    for task_prompt, max_tokens, num_beams in tasks:
         try:
             inputs = florence_proc(
                 text=task_prompt, images=image, return_tensors="pt"
@@ -144,7 +157,7 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
                     input_ids=inputs["input_ids"],
                     pixel_values=inputs["pixel_values"],
                     max_new_tokens=max_tokens,
-                    num_beams=num_beams
                 )
             raw    = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
             parsed = florence_proc.post_process_generation(
@@ -157,6 +170,7 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
             st.warning(f"Florence {task_prompt} error: {str(e)[:80]}")
             captions.append("a scene shown in the image")
     seen, unique = set(), []
     for c in captions:
         if c not in seen:
@@ -342,10 +356,8 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
         with torch.no_grad():
             generated_ids = qwen_mod.generate(
                 **model_inputs,
-                max_new_tokens=180,
-                temperature=0.4,
-                do_sample=True,
-                top_p=0.9
             )
         output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]

     return f"data:image/jpeg;base64,{b64}"
 # ============================================================================
+# generate_captions_florence — speed optimized + diversity fixed
 #
+# Problem: num_beams=1 greedy produces near-identical captions across tasks
+# Fix:     Task 1 stays greedy (baseline), Tasks 2 and 3 use sampling
+#          with increasing temperature — each task explores different word paths
 #
+# Task 1: greedy       → deterministic, short, factual baseline
+# Task 2: temp=0.7     → slightly varied, focuses on detail
+# Task 3: temp=1.1     → more varied phrasing, different sentence structure
+#
+# Speed: sampling is as fast as or faster than beam search — no regression
 # ============================================================================
 def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
     image_size = (image.width, image.height)
     tasks = [
+        (
+            "<CAPTION>",
+            30,
+            {"num_beams": 1}
+        ),
+        (
+            "<DETAILED_CAPTION>",
+            80,
+            {"do_sample": True, "temperature": 0.7, "top_p": 0.9}
+        ),
+        (
+            "<MORE_DETAILED_CAPTION>",
+            120,
+            {"do_sample": True, "temperature": 1.1, "top_p": 0.95}
+        ),
     ]
+    for task_prompt, max_tokens, gen_params in tasks:
         try:
             inputs = florence_proc(
                 text=task_prompt, images=image, return_tensors="pt"
                     input_ids=inputs["input_ids"],
                     pixel_values=inputs["pixel_values"],
                     max_new_tokens=max_tokens,
+                    **gen_params
                 )
             raw    = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
             parsed = florence_proc.post_process_generation(
             st.warning(f"Florence {task_prompt} error: {str(e)[:80]}")
             captions.append("a scene shown in the image")
+    # Deduplicate while keeping order
     seen, unique = set(), []
     for c in captions:
         if c not in seen:
         with torch.no_grad():
             generated_ids = qwen_mod.generate(
                 **model_inputs,
+                max_new_tokens=120,
+                do_sample=False
             )
         output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]