Spaces:

Afsha001
/

Image_captioning

Running

App Files Files Community

Afsha001 committed 7 days ago

Commit

ec3e187

verified ·

1 Parent(s): eee50c1

update

Browse files

Files changed (1) hide show

app.py +20 -55

app.py CHANGED Viewed

@@ -28,31 +28,16 @@ JINA_HEADERS = {
 }
 DETECT_PROMPT = (
-    # Core Subjects & Actions
     "person . man . woman . boy . girl . child . baby . a group of people . "
     "sitting on a chair . riding a bicycle . holding an object . walking on the road . "
-    # Textures & Materials
     "wooden surface . shiny metal . smooth glass . brick wall . leather bag . denim clothing . "
-    # Detailed Apparel & Wearables
     "shirt . jacket . dress . coat . hat . glasses . backpack . shoes . tie . "
-    # Common Interior Objects
     "table . chair . bench . sofa . desk . laptop . phone . book . umbrella . "
     "cup . glass . bottle . plate . bowl . fork . spoon . knife . "
-    # Environmental & Spatial Elements
     "in the foreground . in the background . tree . grass . flower . sky . "
     "water . river . mountain . road . building . wall . door . window . floor . "
-    # Lighting & Atmospheric Context
     "dark shadow . bright light . sunny day . indoor lamp . reflection . colorful texture . "
-    # Animals & Food
     "dog . cat . bird . horse . animal . pizza . cake . bread . fruit . "
-    # Transportation & Setting
     "car . bicycle . motorcycle . bus . truck . street . kitchen . restaurant . cafe"
 )
@@ -127,22 +112,6 @@ def image_to_data_uri(image: Image.Image) -> str:
     b64 = base64.b64encode(raw).decode()
     return f"data:image/jpeg;base64,{b64}"
-# ============================================================================
-# CHANGE 1: generate_captions_florence
-# 5 different Florence-2 task tokens — each gives a different perspective
-#
-# Task breakdown:
-# <CAPTION>               → short overall scene description
-# <DETAILED_CAPTION>      → longer overall scene description
-# <MORE_DETAILED_CAPTION> → most detailed overall description
-# <DENSE_REGION_CAPTION>  → describes individual regions of the image
-#                           (returns region labels → joined into a sentence)
-# <OD>                    → object detection labels
-#                           (returns detected objects → formatted as caption)
-#
-# OD and DENSE_REGION_CAPTION return structured data not plain text,
-# so we extract their labels and convert to readable captions manually.
-# ============================================================================
 def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
     captions   = []
@@ -206,13 +175,11 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
         captions.append("a scene shown in the image")
     # Task 4: Dense region caption
-    # Returns descriptions per image region — join them into one sentence
     try:
         inputs = florence_proc(
             text="<DENSE_REGION_CAPTION>", images=image, return_tensors="pt"
         )
         with torch.no_grad():
-            ids = florence_proc.post_process_generation
             ids = florence_mod.generate(
                 input_ids=inputs["input_ids"],
                 pixel_values=inputs["pixel_values"],
@@ -223,7 +190,6 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
         labels = parsed.get("<DENSE_REGION_CAPTION>", {}).get("labels", [])
         if labels:
-            # Remove duplicates while preserving order
             seen_r, unique_r = set(), []
             for l in labels:
                 if l.lower() not in seen_r:
@@ -238,7 +204,6 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
         captions.append("a scene shown in the image")
     # Task 5: Object detection
-    # Returns detected object labels — format as descriptive caption
     try:
         inputs = florence_proc(
             text="<OD>", images=image, return_tensors="pt"
@@ -267,7 +232,6 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
         st.warning(f"Florence OD error: {str(e)[:80]}")
         captions.append("a scene shown in the image")
-    # Deduplicate while preserving order
     seen, unique = set(), []
     for c in captions:
         if c not in seen:
@@ -414,29 +378,30 @@ def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
         return "Object detection unavailable", []
 # ============================================================================
-# CHANGE 2: fuse_captions — simpler, natural prompt
-# Old prompt said "detailed and descriptive" → caused AI-sounding output
-# New prompt asks for simple, factual, human-like language
 # ============================================================================
 def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
     system_prompt = (
-    "You write image captions. "
-    "Look at the two captions and detected objects provided. "
-    "Write ONE caption that covers: who is in the image, what they are doing, "
-    "what objects are around them, and where the scene is taking place. "
-    "Use simple, everyday words. Write 2 to 3 sentences. "
-    "Only describe what is clearly visible. "
-    "Do not guess, invent, or add dramatic language. "
-    "Return ONLY the caption, nothing else."
-)
-user_prompt = (
-    f"Caption A: {cap1}\n"
-    f"Caption B: {cap2}\n"
-    f"{objects}\n\n"
-    "Write a clear, natural caption covering the person, action, objects and setting:"
-)
     try:
         messages = [
@@ -453,7 +418,7 @@ user_prompt = (
         with torch.no_grad():
             generated_ids = qwen_mod.generate(
                 **model_inputs,
-                max_new_tokens=60,
                 temperature=0.2,
                 do_sample=True,
                 top_p=0.9

 }
 DETECT_PROMPT = (
     "person . man . woman . boy . girl . child . baby . a group of people . "
     "sitting on a chair . riding a bicycle . holding an object . walking on the road . "
     "wooden surface . shiny metal . smooth glass . brick wall . leather bag . denim clothing . "
     "shirt . jacket . dress . coat . hat . glasses . backpack . shoes . tie . "
     "table . chair . bench . sofa . desk . laptop . phone . book . umbrella . "
     "cup . glass . bottle . plate . bowl . fork . spoon . knife . "
     "in the foreground . in the background . tree . grass . flower . sky . "
     "water . river . mountain . road . building . wall . door . window . floor . "
     "dark shadow . bright light . sunny day . indoor lamp . reflection . colorful texture . "
     "dog . cat . bird . horse . animal . pizza . cake . bread . fruit . "
     "car . bicycle . motorcycle . bus . truck . street . kitchen . restaurant . cafe"
 )
     b64 = base64.b64encode(raw).decode()
     return f"data:image/jpeg;base64,{b64}"
 def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
     captions   = []
         captions.append("a scene shown in the image")
     # Task 4: Dense region caption
     try:
         inputs = florence_proc(
             text="<DENSE_REGION_CAPTION>", images=image, return_tensors="pt"
         )
         with torch.no_grad():
             ids = florence_mod.generate(
                 input_ids=inputs["input_ids"],
                 pixel_values=inputs["pixel_values"],
         labels = parsed.get("<DENSE_REGION_CAPTION>", {}).get("labels", [])
         if labels:
             seen_r, unique_r = set(), []
             for l in labels:
                 if l.lower() not in seen_r:
         captions.append("a scene shown in the image")
     # Task 5: Object detection
     try:
         inputs = florence_proc(
             text="<OD>", images=image, return_tensors="pt"
         st.warning(f"Florence OD error: {str(e)[:80]}")
         captions.append("a scene shown in the image")
     seen, unique = set(), []
     for c in captions:
         if c not in seen:
         return "Object detection unavailable", []
 # ============================================================================
+# fuse_captions — updated prompt; fixes the indentation error in the prompt block
+# Covers: who is in the image, what they are doing, nearby objects, and where the scene is
+# Asks for 2-3 sentences in simple language, describing only visible facts
+# max_new_tokens raised from 60 to 100 so the full 2-3 sentence caption fits
 # ============================================================================
 def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
     system_prompt = (
+        "You write image captions. "
+        "Look at the two captions and detected objects provided. "
+        "Write ONE caption that covers: who is in the image, what they are doing, "
+        "what objects are around them, and where the scene is taking place. "
+        "Use simple, everyday words. Write 2 to 3 sentences. "
+        "Only describe what is clearly visible. "
+        "Do not guess, invent, or add dramatic language. "
+        "Return ONLY the caption, nothing else."
+    )
+    user_prompt = (
+        f"Caption A: {cap1}\n"
+        f"Caption B: {cap2}\n"
+        f"{objects}\n\n"
+        "Write a clear, natural caption covering the person, action, objects and setting:"
+    )
     try:
         messages = [
         with torch.no_grad():
             generated_ids = qwen_mod.generate(
                 **model_inputs,
+                max_new_tokens=100,
                 temperature=0.2,
                 do_sample=True,
                 top_p=0.9