Afsha001 committed on
Commit
ec3e187
·
verified ·
1 Parent(s): eee50c1
Files changed (1) hide show
  1. app.py +20 -55
app.py CHANGED
@@ -28,31 +28,16 @@ JINA_HEADERS = {
28
  }
29
 
30
  DETECT_PROMPT = (
31
- # Core Subjects & Actions
32
  "person . man . woman . boy . girl . child . baby . a group of people . "
33
  "sitting on a chair . riding a bicycle . holding an object . walking on the road . "
34
-
35
- # Textures & Materials
36
  "wooden surface . shiny metal . smooth glass . brick wall . leather bag . denim clothing . "
37
-
38
- # Detailed Apparel & Wearables
39
  "shirt . jacket . dress . coat . hat . glasses . backpack . shoes . tie . "
40
-
41
- # Common Interior Objects
42
  "table . chair . bench . sofa . desk . laptop . phone . book . umbrella . "
43
  "cup . glass . bottle . plate . bowl . fork . spoon . knife . "
44
-
45
- # Environmental & Spatial Elements
46
  "in the foreground . in the background . tree . grass . flower . sky . "
47
  "water . river . mountain . road . building . wall . door . window . floor . "
48
-
49
- # Lighting & Atmospheric Context
50
  "dark shadow . bright light . sunny day . indoor lamp . reflection . colorful texture . "
51
-
52
- # Animals & Food
53
  "dog . cat . bird . horse . animal . pizza . cake . bread . fruit . "
54
-
55
- # Transportation & Setting
56
  "car . bicycle . motorcycle . bus . truck . street . kitchen . restaurant . cafe"
57
  )
58
 
@@ -127,22 +112,6 @@ def image_to_data_uri(image: Image.Image) -> str:
127
  b64 = base64.b64encode(raw).decode()
128
  return f"data:image/jpeg;base64,{b64}"
129
 
130
- # ============================================================================
131
- # CHANGE 1: generate_captions_florence
132
- # 5 different Florence-2 task tokens — each gives a different perspective
133
- #
134
- # Task breakdown:
135
- # <CAPTION> → short overall scene description
136
- # <DETAILED_CAPTION> → longer overall scene description
137
- # <MORE_DETAILED_CAPTION> → most detailed overall description
138
- # <DENSE_REGION_CAPTION> → describes individual regions of the image
139
- # (returns region labels → joined into a sentence)
140
- # <OD> → object detection labels
141
- # (returns detected objects → formatted as caption)
142
- #
143
- # OD and DENSE_REGION_CAPTION return structured data not plain text,
144
- # so we extract their labels and convert to readable captions manually.
145
- # ============================================================================
146
  def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
147
 
148
  captions = []
@@ -206,13 +175,11 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
206
  captions.append("a scene shown in the image")
207
 
208
  # Task 4: Dense region caption
209
- # Returns descriptions per image region — join them into one sentence
210
  try:
211
  inputs = florence_proc(
212
  text="<DENSE_REGION_CAPTION>", images=image, return_tensors="pt"
213
  )
214
  with torch.no_grad():
215
- ids = florence_proc.post_process_generation
216
  ids = florence_mod.generate(
217
  input_ids=inputs["input_ids"],
218
  pixel_values=inputs["pixel_values"],
@@ -223,7 +190,6 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
223
  labels = parsed.get("<DENSE_REGION_CAPTION>", {}).get("labels", [])
224
 
225
  if labels:
226
- # Remove duplicates while preserving order
227
  seen_r, unique_r = set(), []
228
  for l in labels:
229
  if l.lower() not in seen_r:
@@ -238,7 +204,6 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
238
  captions.append("a scene shown in the image")
239
 
240
  # Task 5: Object detection
241
- # Returns detected object labels — format as descriptive caption
242
  try:
243
  inputs = florence_proc(
244
  text="<OD>", images=image, return_tensors="pt"
@@ -267,7 +232,6 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
267
  st.warning(f"Florence OD error: {str(e)[:80]}")
268
  captions.append("a scene shown in the image")
269
 
270
- # Deduplicate while preserving order
271
  seen, unique = set(), []
272
  for c in captions:
273
  if c not in seen:
@@ -414,29 +378,30 @@ def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
414
  return "Object detection unavailable", []
415
 
416
  # ============================================================================
417
- # CHANGE 2: fuse_captions — simpler, natural prompt
418
- # Old prompt said "detailed and descriptive" caused AI-sounding output
419
- # New prompt asks for simple, factual, human-like language
 
420
  # ============================================================================
421
  def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
422
 
423
  system_prompt = (
424
- "You write image captions. "
425
- "Look at the two captions and detected objects provided. "
426
- "Write ONE caption that covers: who is in the image, what they are doing, "
427
- "what objects are around them, and where the scene is taking place. "
428
- "Use simple, everyday words. Write 2 to 3 sentences. "
429
- "Only describe what is clearly visible. "
430
- "Do not guess, invent, or add dramatic language. "
431
- "Return ONLY the caption, nothing else."
432
- )
433
 
434
- user_prompt = (
435
- f"Caption A: {cap1}\n"
436
- f"Caption B: {cap2}\n"
437
- f"{objects}\n\n"
438
- "Write a clear, natural caption covering the person, action, objects and setting:"
439
- )
440
 
441
  try:
442
  messages = [
@@ -453,7 +418,7 @@ user_prompt = (
453
  with torch.no_grad():
454
  generated_ids = qwen_mod.generate(
455
  **model_inputs,
456
- max_new_tokens=60,
457
  temperature=0.2,
458
  do_sample=True,
459
  top_p=0.9
 
28
  }
29
 
30
  DETECT_PROMPT = (
 
31
  "person . man . woman . boy . girl . child . baby . a group of people . "
32
  "sitting on a chair . riding a bicycle . holding an object . walking on the road . "
 
 
33
  "wooden surface . shiny metal . smooth glass . brick wall . leather bag . denim clothing . "
 
 
34
  "shirt . jacket . dress . coat . hat . glasses . backpack . shoes . tie . "
 
 
35
  "table . chair . bench . sofa . desk . laptop . phone . book . umbrella . "
36
  "cup . glass . bottle . plate . bowl . fork . spoon . knife . "
 
 
37
  "in the foreground . in the background . tree . grass . flower . sky . "
38
  "water . river . mountain . road . building . wall . door . window . floor . "
 
 
39
  "dark shadow . bright light . sunny day . indoor lamp . reflection . colorful texture . "
 
 
40
  "dog . cat . bird . horse . animal . pizza . cake . bread . fruit . "
 
 
41
  "car . bicycle . motorcycle . bus . truck . street . kitchen . restaurant . cafe"
42
  )
43
 
 
112
  b64 = base64.b64encode(raw).decode()
113
  return f"data:image/jpeg;base64,{b64}"
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
116
 
117
  captions = []
 
175
  captions.append("a scene shown in the image")
176
 
177
  # Task 4: Dense region caption
 
178
  try:
179
  inputs = florence_proc(
180
  text="<DENSE_REGION_CAPTION>", images=image, return_tensors="pt"
181
  )
182
  with torch.no_grad():
 
183
  ids = florence_mod.generate(
184
  input_ids=inputs["input_ids"],
185
  pixel_values=inputs["pixel_values"],
 
190
  labels = parsed.get("<DENSE_REGION_CAPTION>", {}).get("labels", [])
191
 
192
  if labels:
 
193
  seen_r, unique_r = set(), []
194
  for l in labels:
195
  if l.lower() not in seen_r:
 
204
  captions.append("a scene shown in the image")
205
 
206
  # Task 5: Object detection
 
207
  try:
208
  inputs = florence_proc(
209
  text="<OD>", images=image, return_tensors="pt"
 
232
  st.warning(f"Florence OD error: {str(e)[:80]}")
233
  captions.append("a scene shown in the image")
234
 
 
235
  seen, unique = set(), []
236
  for c in captions:
237
  if c not in seen:
 
378
  return "Object detection unavailable", []
379
 
380
  # ============================================================================
381
+ # fuse_captions — updated prompt + fixed indentation error from document
382
+ # Covers: who, what they are doing, objects around, where the scene is
383
+ # 2-3 sentences, simple language, only visible facts
384
+ # max_new_tokens increased to 100 for full 2-3 sentence output
385
  # ============================================================================
386
  def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
387
 
388
  system_prompt = (
389
+ "You write image captions. "
390
+ "Look at the two captions and detected objects provided. "
391
+ "Write ONE caption that covers: who is in the image, what they are doing, "
392
+ "what objects are around them, and where the scene is taking place. "
393
+ "Use simple, everyday words. Write 2 to 3 sentences. "
394
+ "Only describe what is clearly visible. "
395
+ "Do not guess, invent, or add dramatic language. "
396
+ "Return ONLY the caption, nothing else."
397
+ )
398
 
399
+ user_prompt = (
400
+ f"Caption A: {cap1}\n"
401
+ f"Caption B: {cap2}\n"
402
+ f"{objects}\n\n"
403
+ "Write a clear, natural caption covering the person, action, objects and setting:"
404
+ )
405
 
406
  try:
407
  messages = [
 
418
  with torch.no_grad():
419
  generated_ids = qwen_mod.generate(
420
  **model_inputs,
421
+ max_new_tokens=100,
422
  temperature=0.2,
423
  do_sample=True,
424
  top_p=0.9