Spaces:

Afsha001
/

Image_captioning

Running

App Files Community

Afsha001 committed 17 days ago

Commit

04d9b72

verified ·

1 Parent(s): ec3e187

update prompt

Browse files

Files changed (1) hide show

app.py +19 -13

app.py CHANGED Viewed

@@ -378,21 +378,26 @@ def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
         return "Object detection unavailable", []
 # ============================================================================
-# fuse_captions — updated prompt + fixed indentation error from document
-# Covers: who, what they are doing, objects around, where the scene is
-# 2-3 sentences, simple language, only visible facts
-# max_new_tokens increased to 100 for full 2-3 sentence output
 # ============================================================================
 def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
     system_prompt = (
         "You write image captions. "
-        "Look at the two captions and detected objects provided. "
-        "Write ONE caption that covers: who is in the image, what they are doing, "
-        "what objects are around them, and where the scene is taking place. "
-        "Use simple, everyday words. Write 2 to 3 sentences. "
-        "Only describe what is clearly visible. "
-        "Do not guess, invent, or add dramatic language. "
         "Return ONLY the caption, nothing else."
     )
@@ -400,7 +405,8 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
         f"Caption A: {cap1}\n"
         f"Caption B: {cap2}\n"
         f"{objects}\n\n"
-        "Write a clear, natural caption covering the person, action, objects and setting:"
     )
     try:
@@ -418,8 +424,8 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
         with torch.no_grad():
             generated_ids = qwen_mod.generate(
                 **model_inputs,
-                max_new_tokens=100,
-                temperature=0.2,
                 do_sample=True,
                 top_p=0.9
             )

         return "Object detection unavailable", []
 # ============================================================================
+# fuse_captions — CHANGED
+# system_prompt: explicitly covers clothing, colors, people, objects, setting
+# user_prompt: asks for all specific details including clothing and background
+# max_new_tokens: 100 → 180 (room for 3-4 full sentences)
+# temperature: 0.2 → 0.4 (more expressive while staying factual)
 # ============================================================================
 def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
     system_prompt = (
         "You write image captions. "
+        "You will receive two captions and a list of detected objects. "
+        "Your job is to combine them into one detailed caption. "
+        "Include ALL specific details you find: "
+        "the clothing colors and style of each person, "
+        "what each person looks like and what they are doing, "
+        "the objects and plants visible around them, "
+        "and the setting or background of the scene. "
+        "Write 3 to 4 sentences. Use simple, clear, everyday words. "
+        "Do NOT summarize or shorten — keep every specific detail. "
+        "Only include what is clearly visible. "
         "Return ONLY the caption, nothing else."
     )
         f"Caption A: {cap1}\n"
         f"Caption B: {cap2}\n"
         f"{objects}\n\n"
+        "Write a detailed caption that includes all the clothing, "
+        "people, objects and background details:"
     )
     try:
         with torch.no_grad():
             generated_ids = qwen_mod.generate(
                 **model_inputs,
+                max_new_tokens=180,
+                temperature=0.4,
                 do_sample=True,
                 top_p=0.9
             )