Afsha001 committed on
Commit
f35b1cd
·
verified ·
1 Parent(s): 9ef58b8
Files changed (1) hide show
  1. app.py +170 -74
app.py CHANGED
@@ -27,36 +27,39 @@ JINA_HEADERS = {
27
  "Content-Type": "application/json"
28
  }
29
 
30
- # ============================================================================
31
- # CHANGE 1: DETECT_PROMPT — expanded with colours, furniture, objects
32
- # More labels = richer grounding for Qwen fusion
33
- # ============================================================================
34
  DETECT_PROMPT = (
35
- "person . man . woman . boy . girl . child . baby . "
36
- "red . blue . green . yellow . black . white . orange . purple . brown . "
37
- "shirt . jacket . dress . coat . hat . glasses . bag . shoes . "
38
- "table . chair . bench . sofa . desk . stool . wooden chair . dining table . "
 
 
 
 
 
 
 
 
39
  "cup . glass . bottle . plate . bowl . fork . spoon . knife . "
40
- "car . bicycle . motorcycle . bus . truck . "
41
- "tree . grass . flower . sky . water . river . mountain . road . "
42
- "building . wall . door . window . floor . ceiling . stairs . "
43
- "lamp . light . candle . fire . smoke . "
44
- "phone . laptop . book . bag . umbrella . "
45
- "dog . cat . bird . horse . animal . "
46
- "food . pizza . cake . bread . fruit . "
47
- "bar . restaurant . pub . cafe . kitchen . "
48
- "wood . metal . glass . brick . "
49
- "dark . bright . colorful ."
 
 
 
50
  )
51
 
52
  if not JINA_KEY:
53
  st.error("JINA_KEY missing. Go to Space Settings → Secrets and add it.")
54
  st.stop()
55
 
56
- # ============================================================================
57
- # CHANGE 2: load_local_models — replaced GIT with Florence-2-Large
58
- # Florence-2 has 3 built-in task tokens — accurate, grounded, no hallucination
59
- # ============================================================================
60
  @st.cache_resource
61
  def load_local_models():
62
  from transformers import (
@@ -69,7 +72,6 @@ def load_local_models():
69
  )
70
  gc.collect()
71
 
72
- # Florence-2-Large — accurate caption generation with task tokens
73
  florence_processor = AutoProcessor.from_pretrained(
74
  "microsoft/Florence-2-large",
75
  trust_remote_code=True
@@ -81,7 +83,6 @@ def load_local_models():
81
  )
82
  florence_model.eval()
83
 
84
- # BLIP — ITM scoring and cosine similarity
85
  blip_processor = BlipProcessor.from_pretrained(
86
  "Salesforce/blip-image-captioning-large"
87
  )
@@ -91,7 +92,6 @@ def load_local_models():
91
  )
92
  blip_itm_model.eval()
93
 
94
- # DINO — object detection
95
  dino_processor = AutoProcessor.from_pretrained(
96
  "IDEA-Research/grounding-dino-base"
97
  )
@@ -101,7 +101,6 @@ def load_local_models():
101
  )
102
  dino_model.eval()
103
 
104
- # Qwen2.5-1.5B — caption fusion (local)
105
  qwen_tokenizer = AutoTokenizer.from_pretrained(
106
  "Qwen/Qwen2.5-1.5B-Instruct"
107
  )
@@ -129,54 +128,146 @@ def image_to_data_uri(image: Image.Image) -> str:
129
  return f"data:image/jpeg;base64,{b64}"
130
 
131
  # ============================================================================
132
- # CHANGE 3: generate_captions_florence — replaces generate_captions_git
133
- # Uses Florence-2 task tokens for naturally diverse and accurate captions
134
- # <CAPTION> / <DETAILED_CAPTION> / <MORE_DETAILED_CAPTION>
 
 
 
 
 
 
 
 
 
 
 
135
  # ============================================================================
136
  def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
137
 
138
- tasks = [
139
- ("<CAPTION>", {"max_new_tokens": 50, "num_beams": 3}),
140
- ("<DETAILED_CAPTION>", {"max_new_tokens": 100, "num_beams": 3}),
141
- ("<MORE_DETAILED_CAPTION>", {"max_new_tokens": 150, "num_beams": 3}),
142
- ("<DETAILED_CAPTION>", {"max_new_tokens": 100, "num_beams": 5}),
143
- ("<CAPTION>", {"max_new_tokens": 80, "num_beams": 5}),
144
- ]
145
 
146
- captions = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
- for task_prompt, gen_kwargs in tasks:
149
- try:
150
- inputs = florence_proc(
151
- text=task_prompt,
152
- images=image,
153
- return_tensors="pt"
 
 
 
 
154
  )
 
 
 
 
 
 
 
155
 
156
- with torch.no_grad():
157
- generated_ids = florence_mod.generate(
158
- input_ids=inputs["input_ids"],
159
- pixel_values=inputs["pixel_values"],
160
- **gen_kwargs
161
- )
162
-
163
- generated_text = florence_proc.batch_decode(
164
- generated_ids, skip_special_tokens=False
165
- )[0]
166
-
167
- parsed = florence_proc.post_process_generation(
168
- generated_text,
169
- task=task_prompt,
170
- image_size=(image.width, image.height)
171
  )
 
 
 
 
 
 
 
172
 
173
- cap = parsed.get(task_prompt, "").strip().lower()
174
- captions.append(cap if cap else "a scene shown in the image")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
- except Exception as e:
177
- st.warning(f"Florence error: {str(e)[:80]}")
178
- captions.append("a scene shown in the image")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
 
180
  seen, unique = set(), []
181
  for c in captions:
182
  if c not in seen:
@@ -322,18 +413,26 @@ def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
322
  st.warning(f"DINO error: {str(e)[:80]}")
323
  return "Object detection unavailable", []
324
 
 
 
 
 
 
325
  def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
 
326
  system_prompt = (
327
- "You are an expert image captioning assistant. "
328
- "Write ONE natural, fluent, detailed and descriptive caption. "
329
- "Combine the best details from both captions and incorporate the detected objects. "
330
- "Return ONLY the final caption, no explanation or prefix."
 
 
331
  )
332
  user_prompt = (
333
  f"Caption A: {cap1}\n"
334
  f"Caption B: {cap2}\n"
335
  f"{objects}\n\n"
336
- "Write a detailed fused caption:"
337
  )
338
 
339
  try:
@@ -351,8 +450,8 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
351
  with torch.no_grad():
352
  generated_ids = qwen_mod.generate(
353
  **model_inputs,
354
- max_new_tokens=120,
355
- temperature=0.3,
356
  do_sample=True,
357
  top_p=0.9
358
  )
@@ -360,7 +459,7 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
360
  output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
361
  fused = qwen_tok.decode(output_ids, skip_special_tokens=True).strip()
362
 
363
- for prefix in ["Fused caption:", "Caption:", "Result:", "Answer:"]:
364
  if fused.lower().startswith(prefix.lower()):
365
  fused = fused[len(prefix):].strip()
366
 
@@ -370,9 +469,6 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
370
  st.warning(f"Qwen fusion error: {str(e)[:80]}")
371
  return cap1
372
 
373
- # ============================================================================
374
- # CHANGE 4: sidebar — updated step 1 label to Florence-2-Large
375
- # ============================================================================
376
  with st.sidebar:
377
  st.title("Image Caption Fusion")
378
  st.markdown("---")
 
27
  "Content-Type": "application/json"
28
  }
29
 
 
 
 
 
30
  DETECT_PROMPT = (
31
+ # Core Subjects & Actions
32
+ "person . man . woman . boy . girl . child . baby . a group of people . "
33
+ "sitting on a chair . riding a bicycle . holding an object . walking on the road . "
34
+
35
+ # Textures & Materials
36
+ "wooden surface . shiny metal . smooth glass . brick wall . leather bag . denim clothing . "
37
+
38
+ # Detailed Apparel & Wearables
39
+ "shirt . jacket . dress . coat . hat . glasses . backpack . shoes . tie . "
40
+
41
+ # Common Interior Objects
42
+ "table . chair . bench . sofa . desk . laptop . phone . book . umbrella . "
43
  "cup . glass . bottle . plate . bowl . fork . spoon . knife . "
44
+
45
+ # Environmental & Spatial Elements
46
+ "in the foreground . in the background . tree . grass . flower . sky . "
47
+ "water . river . mountain . road . building . wall . door . window . floor . "
48
+
49
+ # Lighting & Atmospheric Context
50
+ "dark shadow . bright light . sunny day . indoor lamp . reflection . colorful texture . "
51
+
52
+ # Animals & Food
53
+ "dog . cat . bird . horse . animal . pizza . cake . bread . fruit . "
54
+
55
+ # Transportation & Setting
56
+ "car . bicycle . motorcycle . bus . truck . street . kitchen . restaurant . cafe"
57
  )
58
 
59
  if not JINA_KEY:
60
  st.error("JINA_KEY missing. Go to Space Settings → Secrets and add it.")
61
  st.stop()
62
 
 
 
 
 
63
  @st.cache_resource
64
  def load_local_models():
65
  from transformers import (
 
72
  )
73
  gc.collect()
74
 
 
75
  florence_processor = AutoProcessor.from_pretrained(
76
  "microsoft/Florence-2-large",
77
  trust_remote_code=True
 
83
  )
84
  florence_model.eval()
85
 
 
86
  blip_processor = BlipProcessor.from_pretrained(
87
  "Salesforce/blip-image-captioning-large"
88
  )
 
92
  )
93
  blip_itm_model.eval()
94
 
 
95
  dino_processor = AutoProcessor.from_pretrained(
96
  "IDEA-Research/grounding-dino-base"
97
  )
 
101
  )
102
  dino_model.eval()
103
 
 
104
  qwen_tokenizer = AutoTokenizer.from_pretrained(
105
  "Qwen/Qwen2.5-1.5B-Instruct"
106
  )
 
128
  return f"data:image/jpeg;base64,{b64}"
129
 
130
  # ============================================================================
131
+ # CHANGE 1: generate_captions_florence
132
+ # 5 different Florence-2 task tokens; each gives a different perspective
133
+ #
134
+ # Task breakdown:
135
+ # <CAPTION> → short overall scene description
136
+ # <DETAILED_CAPTION> → longer overall scene description
137
+ # <MORE_DETAILED_CAPTION> → most detailed overall description
138
+ # <DENSE_REGION_CAPTION> → describes individual regions of the image
139
+ # (returns region labels → joined into a sentence)
140
+ # <OD> → object detection labels
141
+ # (returns detected objects → formatted as caption)
142
+ #
143
+ # OD and DENSE_REGION_CAPTION return structured data, not plain text,
144
+ # so we extract their labels and convert to readable captions manually.
145
  # ============================================================================
146
  def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
147
 
148
+ captions = []
149
+ image_size = (image.width, image.height)
 
 
 
 
 
150
 
151
+ # Task 1: Short caption
152
+ try:
153
+ inputs = florence_proc(
154
+ text="<CAPTION>", images=image, return_tensors="pt"
155
+ )
156
+ with torch.no_grad():
157
+ ids = florence_mod.generate(
158
+ input_ids=inputs["input_ids"],
159
+ pixel_values=inputs["pixel_values"],
160
+ max_new_tokens=50, num_beams=3
161
+ )
162
+ raw = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
163
+ parsed = florence_proc.post_process_generation(raw, task="<CAPTION>", image_size=image_size)
164
+ cap = parsed.get("<CAPTION>", "").strip().lower()
165
+ captions.append(cap if cap else "a scene shown in the image")
166
+ except Exception as e:
167
+ st.warning(f"Florence CAPTION error: {str(e)[:80]}")
168
+ captions.append("a scene shown in the image")
169
 
170
+ # Task 2: Detailed caption
171
+ try:
172
+ inputs = florence_proc(
173
+ text="<DETAILED_CAPTION>", images=image, return_tensors="pt"
174
+ )
175
+ with torch.no_grad():
176
+ ids = florence_mod.generate(
177
+ input_ids=inputs["input_ids"],
178
+ pixel_values=inputs["pixel_values"],
179
+ max_new_tokens=100, num_beams=3
180
  )
181
+ raw = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
182
+ parsed = florence_proc.post_process_generation(raw, task="<DETAILED_CAPTION>", image_size=image_size)
183
+ cap = parsed.get("<DETAILED_CAPTION>", "").strip().lower()
184
+ captions.append(cap if cap else "a scene shown in the image")
185
+ except Exception as e:
186
+ st.warning(f"Florence DETAILED_CAPTION error: {str(e)[:80]}")
187
+ captions.append("a scene shown in the image")
188
 
189
+ # Task 3: More detailed caption
190
+ try:
191
+ inputs = florence_proc(
192
+ text="<MORE_DETAILED_CAPTION>", images=image, return_tensors="pt"
193
+ )
194
+ with torch.no_grad():
195
+ ids = florence_mod.generate(
196
+ input_ids=inputs["input_ids"],
197
+ pixel_values=inputs["pixel_values"],
198
+ max_new_tokens=150, num_beams=3
 
 
 
 
 
199
  )
200
+ raw = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
201
+ parsed = florence_proc.post_process_generation(raw, task="<MORE_DETAILED_CAPTION>", image_size=image_size)
202
+ cap = parsed.get("<MORE_DETAILED_CAPTION>", "").strip().lower()
203
+ captions.append(cap if cap else "a scene shown in the image")
204
+ except Exception as e:
205
+ st.warning(f"Florence MORE_DETAILED_CAPTION error: {str(e)[:80]}")
206
+ captions.append("a scene shown in the image")
207
 
208
+ # Task 4: Dense region caption
209
+ # Returns descriptions per image region; join them into one sentence
210
+ try:
211
+ inputs = florence_proc(
212
+ text="<DENSE_REGION_CAPTION>", images=image, return_tensors="pt"
213
+ )
214
+ with torch.no_grad():
215
+ ids = florence_proc.post_process_generation
216
+ ids = florence_mod.generate(
217
+ input_ids=inputs["input_ids"],
218
+ pixel_values=inputs["pixel_values"],
219
+ max_new_tokens=200, num_beams=3
220
+ )
221
+ raw = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
222
+ parsed = florence_proc.post_process_generation(raw, task="<DENSE_REGION_CAPTION>", image_size=image_size)
223
+ labels = parsed.get("<DENSE_REGION_CAPTION>", {}).get("labels", [])
224
+
225
+ if labels:
226
+ # Remove duplicates while preserving order
227
+ seen_r, unique_r = set(), []
228
+ for l in labels:
229
+ if l.lower() not in seen_r:
230
+ seen_r.add(l.lower())
231
+ unique_r.append(l.lower())
232
+ cap = ", ".join(unique_r[:6]) + " visible in the scene"
233
+ else:
234
+ cap = "a scene shown in the image"
235
+ captions.append(cap)
236
+ except Exception as e:
237
+ st.warning(f"Florence DENSE_REGION error: {str(e)[:80]}")
238
+ captions.append("a scene shown in the image")
239
 
240
+ # Task 5: Object detection
241
+ # Returns detected object labels — format as descriptive caption
242
+ try:
243
+ inputs = florence_proc(
244
+ text="<OD>", images=image, return_tensors="pt"
245
+ )
246
+ with torch.no_grad():
247
+ ids = florence_mod.generate(
248
+ input_ids=inputs["input_ids"],
249
+ pixel_values=inputs["pixel_values"],
250
+ max_new_tokens=200, num_beams=3
251
+ )
252
+ raw = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
253
+ parsed = florence_proc.post_process_generation(raw, task="<OD>", image_size=image_size)
254
+ labels = parsed.get("<OD>", {}).get("labels", [])
255
+
256
+ if labels:
257
+ seen_o, unique_o = set(), []
258
+ for l in labels:
259
+ if l.lower() not in seen_o:
260
+ seen_o.add(l.lower())
261
+ unique_o.append(l.lower())
262
+ cap = "a scene containing " + ", ".join(unique_o[:6])
263
+ else:
264
+ cap = "a scene shown in the image"
265
+ captions.append(cap)
266
+ except Exception as e:
267
+ st.warning(f"Florence OD error: {str(e)[:80]}")
268
+ captions.append("a scene shown in the image")
269
 
270
+ # Deduplicate while preserving order
271
  seen, unique = set(), []
272
  for c in captions:
273
  if c not in seen:
 
413
  st.warning(f"DINO error: {str(e)[:80]}")
414
  return "Object detection unavailable", []
415
 
416
+ # ============================================================================
417
+ # CHANGE 2: fuse_captions — simpler, natural prompt
418
+ # Old prompt said "detailed and descriptive" → caused AI-sounding output
419
+ # New prompt asks for simple, factual, human-like language
420
+ # ============================================================================
421
  def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
422
+
423
  system_prompt = (
424
+ "You are helping write image captions. "
425
+ "Write ONE short, simple, factual caption exactly as a person would "
426
+ "naturally describe this photo. Use plain everyday language. "
427
+ "Do not add any details that are not clearly visible. "
428
+ "Do not use dramatic or poetic language. "
429
+ "Return ONLY the caption, nothing else."
430
  )
431
  user_prompt = (
432
  f"Caption A: {cap1}\n"
433
  f"Caption B: {cap2}\n"
434
  f"{objects}\n\n"
435
+ "Write a simple natural caption:"
436
  )
437
 
438
  try:
 
450
  with torch.no_grad():
451
  generated_ids = qwen_mod.generate(
452
  **model_inputs,
453
+ max_new_tokens=60,
454
+ temperature=0.2,
455
  do_sample=True,
456
  top_p=0.9
457
  )
 
459
  output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
460
  fused = qwen_tok.decode(output_ids, skip_special_tokens=True).strip()
461
 
462
+ for prefix in ["Caption:", "Result:", "Answer:", "Fused caption:"]:
463
  if fused.lower().startswith(prefix.lower()):
464
  fused = fused[len(prefix):].strip()
465
 
 
469
  st.warning(f"Qwen fusion error: {str(e)[:80]}")
470
  return cap1
471
 
 
 
 
472
  with st.sidebar:
473
  st.title("Image Caption Fusion")
474
  st.markdown("---")