Afsha001 committed on
Commit
04d9b72
·
verified ·
1 Parent(s): ec3e187

update prompt

Browse files
Files changed (1) hide show
  1. app.py +19 -13
app.py CHANGED
@@ -378,21 +378,26 @@ def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
378
  return "Object detection unavailable", []
379
 
380
  # ============================================================================
381
- # fuse_captions — updated prompt + fixed indentation error from document
382
- # Covers: who, what they are doing, objects around, where the scene is
383
- # 2-3 sentences, simple language, only visible facts
384
- # max_new_tokens increased to 100 for full 2-3 sentence output
 
385
  # ============================================================================
386
  def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
387
 
388
  system_prompt = (
389
  "You write image captions. "
390
- "Look at the two captions and detected objects provided. "
391
- "Write ONE caption that covers: who is in the image, what they are doing, "
392
- "what objects are around them, and where the scene is taking place. "
393
- "Use simple, everyday words. Write 2 to 3 sentences. "
394
- "Only describe what is clearly visible. "
395
- "Do not guess, invent, or add dramatic language. "
 
 
 
 
396
  "Return ONLY the caption, nothing else."
397
  )
398
 
@@ -400,7 +405,8 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
400
  f"Caption A: {cap1}\n"
401
  f"Caption B: {cap2}\n"
402
  f"{objects}\n\n"
403
- "Write a clear, natural caption covering the person, action, objects and setting:"
 
404
  )
405
 
406
  try:
@@ -418,8 +424,8 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
418
  with torch.no_grad():
419
  generated_ids = qwen_mod.generate(
420
  **model_inputs,
421
- max_new_tokens=100,
422
- temperature=0.2,
423
  do_sample=True,
424
  top_p=0.9
425
  )
 
378
  return "Object detection unavailable", []
379
 
380
  # ============================================================================
381
+ # fuse_captions — CHANGED
382
+ # system_prompt: explicitly covers clothing, colors, people, objects, setting
383
+ # user_prompt: asks for all specific details including clothing and background
384
+ # max_new_tokens: 100 → 180 (room for 3-4 full sentences)
385
+ # temperature: 0.2 → 0.4 (more expressive while staying factual)
386
  # ============================================================================
387
  def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
388
 
389
  system_prompt = (
390
  "You write image captions. "
391
+ "You will receive two captions and a list of detected objects. "
392
+ "Your job is to combine them into one detailed caption. "
393
+ "Include ALL specific details you find: "
394
+ "the clothing colors and style of each person, "
395
+ "what each person looks like and what they are doing, "
396
+ "the objects and plants visible around them, "
397
+ "and the setting or background of the scene. "
398
+ "Write 3 to 4 sentences. Use simple, clear, everyday words. "
399
+ "Do NOT summarize or shorten — keep every specific detail. "
400
+ "Only include what is clearly visible. "
401
  "Return ONLY the caption, nothing else."
402
  )
403
 
 
405
  f"Caption A: {cap1}\n"
406
  f"Caption B: {cap2}\n"
407
  f"{objects}\n\n"
408
+ "Write a detailed caption that includes all the clothing, "
409
+ "people, objects and background details:"
410
  )
411
 
412
  try:
 
424
  with torch.no_grad():
425
  generated_ids = qwen_mod.generate(
426
  **model_inputs,
427
+ max_new_tokens=180,
428
+ temperature=0.4,
429
  do_sample=True,
430
  top_p=0.9
431
  )