Afsha001 committed on
Commit
b7d8863
·
1 Parent(s): a0d9361

fix all f-string syntax errors

Browse files
Files changed (1) hide show
  1. app.py +82 -202
app.py CHANGED
@@ -1,7 +1,5 @@
1
 
2
  import os
3
- import re
4
- import time
5
  import torch
6
  import numpy as np
7
  import requests
@@ -11,27 +9,20 @@ from io import BytesIO
11
  from collections import Counter
12
  from sklearn.metrics.pairwise import cosine_similarity
13
  from sklearn.preprocessing import normalize
 
 
14
 
15
- # ── Page config ──
16
- st.set_page_config(
17
- page_title = "Image Caption Fusion",
18
- page_icon = "πŸ–ΌοΈ",
19
- layout = "wide"
20
- )
21
 
22
- # ── API Keys from HF Secrets ──
23
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
24
  JINA_KEY = os.environ.get("JINA_KEY", "")
 
25
 
26
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
27
-
28
- # ── API endpoints ──
29
  QWEN_VL_URL = "https://api-inference.huggingface.co/models/Qwen/Qwen2-VL-2B-Instruct"
30
  QWEN_LM_URL = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct"
31
  JINA_URL = "https://api.jina.ai/v1/rerank"
32
-
33
- HF_HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}
34
- JINA_HEADERS = {"Authorization": f"Bearer {JINA_KEY}", "Content-Type": "application/json"}
35
 
36
  DETECT_PROMPT = (
37
  "person . child . man . woman . boy . girl . "
@@ -42,38 +33,25 @@ DETECT_PROMPT = (
42
  "jacket . dress . shirt . hat . bag ."
43
  )
44
 
45
- # ── Load local models once at startup ──
46
  @st.cache_resource
47
  def load_local_models():
48
  from transformers import (
49
  BlipProcessor, BlipForImageTextRetrieval,
50
  AutoProcessor, AutoModelForZeroShotObjectDetection
51
  )
52
-
53
- st.write("⏳ Loading BLIP ITM model (CPU)...")
54
- blip_processor = BlipProcessor.from_pretrained(
55
- "Salesforce/blip-image-captioning-large"
56
- )
57
  itm_model = BlipForImageTextRetrieval.from_pretrained(
58
- "Salesforce/blip-itm-large-coco",
59
- torch_dtype = torch.float32
60
  )
61
  itm_model.eval()
62
-
63
- st.write(" Loading DINO model (CPU)...")
64
- dino_processor = AutoProcessor.from_pretrained(
65
- "IDEA-Research/grounding-dino-base"
66
- )
67
  dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(
68
- "IDEA-Research/grounding-dino-base",
69
- torch_dtype = torch.float32
70
  )
71
  dino_model.eval()
72
-
73
  return blip_processor, itm_model, dino_processor, dino_model
74
 
75
- # ── Step 1: Generate 5 captions via Qwen2-VL API ──
76
- def generate_captions_api(image: Image.Image) -> list:
77
  buffered = BytesIO()
78
  image.save(buffered, format="JPEG")
79
  img_bytes = buffered.getvalue()
@@ -88,14 +66,12 @@ def generate_captions_api(image: Image.Image) -> list:
88
 
89
  captions = []
90
  for prompt in PROMPTS:
91
- payload = {"inputs": prompt, "image": img_bytes.hex()}
92
  try:
93
  response = requests.post(
94
  QWEN_VL_URL,
95
- headers = HF_HEADERS,
96
- json = {"inputs": prompt},
97
- files = {"image": img_bytes},
98
- timeout = 30
99
  )
100
  if response.status_code == 200:
101
  result = response.json()
@@ -106,10 +82,9 @@ def generate_captions_api(image: Image.Image) -> list:
106
  captions.append(cap if cap else "a scene with various objects and people")
107
  else:
108
  captions.append("a detailed scene with people and objects")
109
- except Exception as e:
110
  captions.append("a scene captured in the image")
111
 
112
- # Deduplicate
113
  seen, unique = set(), []
114
  for c in captions:
115
  if c not in seen:
@@ -117,172 +92,125 @@ def generate_captions_api(image: Image.Image) -> list:
117
  unique.append(c)
118
  while len(unique) < 5:
119
  unique.append(unique[0])
120
-
121
  return unique[:5]
122
 
123
- # ── Step 2: BLIP ITM Scoring (local CPU) ──
124
  def compute_itm_scores(image, captions, blip_processor, itm_model):
125
  scores = []
126
  for cap in captions:
127
- inp = blip_processor(
128
- images=image, text=cap,
129
- return_tensors="pt", padding=True
130
- )
131
  with torch.no_grad():
132
  out = itm_model(**inp)
133
- score = torch.nn.functional.softmax(
134
- out.itm_score, dim=1
135
- )[:, 1].item()
136
  scores.append(round(score, 4))
137
  return scores
138
 
139
- # ── Step 3: Jina Reranker Scoring (API) ──
140
  def compute_jina_scores(image, captions):
141
  buffered = BytesIO()
142
  image.save(buffered, format="JPEG")
143
- img_b64 = __import__("base64").b64encode(buffered.getvalue()).decode()
144
-
145
- scores = []
146
  for cap in captions:
147
  try:
148
  payload = {
149
  "model" : "jina-reranker-m0",
150
  "query" : cap,
151
  "documents" : [{"type": "image_url",
152
- "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}}]
153
  }
154
- response = requests.post(
155
- JINA_URL,
156
- headers = JINA_HEADERS,
157
- json = payload,
158
- timeout = 30
159
- )
160
  if response.status_code == 200:
161
  result = response.json()
162
  score = result["results"][0]["relevance_score"]
163
  scores.append(round(float(score), 4))
164
  else:
165
  scores.append(0.5)
166
- except:
167
  scores.append(0.5)
168
  return scores
169
 
170
- # ── Step 4: Cosine Similarity Scoring (local numpy) ──
171
  def compute_cosine_scores(image, captions, blip_processor, itm_model):
172
- # Get image embedding
173
  img_inp = blip_processor(images=image, return_tensors="pt")
174
  with torch.no_grad():
175
- vis_out = itm_model.vision_model(
176
- pixel_values=img_inp["pixel_values"]
177
- )
178
- img_feat = itm_model.vision_proj(
179
- vis_out.last_hidden_state[:, 0, :]
180
- ).numpy()
181
  img_feat = normalize(img_feat, norm="l2")
182
-
183
- # Get caption embeddings
184
  cap_inp = blip_processor(
185
  text=captions, return_tensors="pt",
186
  padding=True, truncation=True, max_length=512
187
  )
188
  with torch.no_grad():
189
  txt_out = itm_model.text_encoder(
190
- input_ids = cap_inp["input_ids"],
191
- attention_mask = cap_inp["attention_mask"]
192
  )
193
- cap_feat = itm_model.text_proj(
194
- txt_out.last_hidden_state[:, 0, :]
195
- ).numpy()
196
  cap_feat = normalize(cap_feat, norm="l2")
197
-
198
  scores = cosine_similarity(img_feat, cap_feat)[0]
199
  return [round(float(s), 4) for s in scores]
200
 
201
- # ── Step 5: Majority Voting ──
202
  def majority_voting(captions, itm_scores, jina_scores, cosine_scores):
203
  itm_ranked = np.argsort(itm_scores)[::-1]
204
  jina_ranked = np.argsort(jina_scores)[::-1]
205
  cos_ranked = np.argsort(cosine_scores)[::-1]
206
-
207
  votes = [
208
  int(itm_ranked[0]), int(itm_ranked[1]),
209
  int(jina_ranked[0]), int(jina_ranked[1]),
210
  int(cos_ranked[0]), int(cos_ranked[1]),
211
  ]
212
-
213
  vote_counts = Counter(votes)
214
  top2_indices = [idx for idx, _ in vote_counts.most_common(2)]
215
-
216
  if len(top2_indices) < 2:
217
  top2_indices = [int(itm_ranked[0]), int(jina_ranked[0])]
 
218
 
219
- return (
220
- captions[top2_indices[0]],
221
- captions[top2_indices[1]],
222
- top2_indices,
223
- dict(vote_counts)
224
- )
225
-
226
- # ── Step 6: DINO Object Detection (local CPU) ──
227
  def detect_objects(image, dino_processor, dino_model, threshold=0.3):
228
- inp = dino_processor(
229
- images=image, text=DETECT_PROMPT,
230
- return_tensors="pt"
231
- )
232
  with torch.no_grad():
233
  outputs = dino_model(**inp)
234
-
235
  target_sizes = torch.tensor([image.size[::-1]])
236
- results = dino_processor.post_process_grounded_object_detection(
237
- outputs, inp.input_ids,
238
- target_sizes=target_sizes
239
  )[0]
240
-
241
  scores = results["scores"]
242
  labels = results["labels"]
243
  keep = scores >= threshold
244
  labels = [labels[i] for i in range(len(labels)) if keep[i]]
245
  sc_list= scores[keep].tolist()
246
-
247
  if not labels:
248
  return "No objects detected", []
249
-
250
  seen = {}
251
  for lbl, sc in zip(labels, sc_list):
252
  lbl = lbl.strip().lower()
253
  if lbl not in seen or seen[lbl] < sc:
254
  seen[lbl] = sc
255
-
256
  sorted_labels = [l for l, _ in sorted(seen.items(), key=lambda x: x[1], reverse=True)]
257
- label_str = "Detected: [" + ", ".join(sorted_labels) + "]"
258
  return label_str, sorted_labels
259
 
260
- # ── Step 7: Qwen2.5-1.5B Caption Fusion (API) ──
261
  def fuse_captions_api(cap1, cap2, dino_labels):
262
- prompt = f"""You are given two captions and detected objects for the same image.
263
- Write ONE fluent, natural, descriptive caption combining the best details.
264
- Return ONLY the caption, no explanation, no prefix.
265
-
266
- Caption 1 : {cap1}
267
- Caption 2 : {cap2}
268
- Detected objects : {dino_labels}
269
-
270
- Fused caption :"""
271
-
272
  try:
273
  response = requests.post(
274
  QWEN_LM_URL,
275
- headers = HF_HEADERS,
276
- json = {
277
- "inputs" : prompt,
278
- "parameters" : {
279
- "max_new_tokens" : 80,
280
- "do_sample" : False,
281
- "repetition_penalty" : 1.1,
282
- "return_full_text" : False
283
  }
284
  },
285
- timeout = 40
286
  )
287
  if response.status_code == 200:
288
  result = response.json()
@@ -290,112 +218,80 @@ Fused caption :"""
290
  fused = result[0].get("generated_text", "").strip()
291
  else:
292
  fused = str(result).strip()
293
-
294
- # Clean any prefix Qwen adds
295
- for prefix in ["Fused caption :", "Fused caption:", "Caption:"]:
296
  if fused.lower().startswith(prefix.lower()):
297
  fused = fused[len(prefix):].strip()
298
  return fused if fused else cap1
299
-
300
  else:
301
  return cap1
302
-
303
- except Exception as e:
304
  return cap1
305
 
306
- # ════════════════════════════════════════
307
- # STREAMLIT UI
308
- # ════════════════════════════════════════
309
-
310
- # ── Sidebar ──
311
  with st.sidebar:
312
  st.title(" Image Caption Fusion")
313
  st.markdown("---")
314
- st.markdown("### Pipeline")
315
- st.markdown("""
316
- 1. **Qwen2-VL-2B** β€” Generate 5 captions
317
- 2. **BLIP ITM** β€” Image-text matching score
318
- 3. **Jina Reranker M0** β€” Semantic reranking
319
- 4. **Cosine Similarity** β€” Embedding similarity
320
- 5. **Majority Voting** β€” Best 2 captions
321
- 6. **Grounding DINO** β€” Object detection
322
- 7. **Qwen2.5-1.5B** β€” Caption fusion
323
- """)
324
- st.markdown("---")
325
- st.markdown("### About")
326
- st.markdown("""
327
- This system generates a rich, humanized caption
328
- for any image using a multi-model ensemble pipeline.
329
- """)
330
  st.markdown("---")
331
- st.markdown("**Local models:** BLIP ITM, DINO")
332
- st.markdown("**API models:** Qwen2-VL, Jina, Qwen2.5")
333
 
334
- # ── Main area ──
335
  st.title(" Image Caption Fusion System")
336
  st.markdown("Upload any image and get a detailed, humanized caption.")
337
  st.markdown("---")
338
 
339
- uploaded = st.file_uploader(
340
- " Upload an image",
341
- type=["jpg", "jpeg", "png"],
342
- help="Upload any image to generate a fused caption"
343
- )
344
 
345
  if uploaded:
346
  image = Image.open(uploaded).convert("RGB")
347
-
348
  col1, col2 = st.columns([1, 1])
349
  with col1:
350
  st.image(image, caption="Uploaded Image", use_column_width=True)
351
-
352
  with col2:
353
  if st.button(" Generate Caption", type="primary", use_container_width=True):
354
-
355
- # Load local models
356
- with st.spinner("Loading local models (first time takes ~2 min)..."):
357
  blip_processor, itm_model, dino_processor, dino_model = load_local_models()
358
 
359
  progress = st.progress(0)
360
  status = st.empty()
361
 
362
- # Step 1 β€” Generate captions
363
  status.info(" Step 1/7 β€” Generating 5 captions with Qwen2-VL...")
364
  captions = generate_captions_api(image)
365
  progress.progress(14)
366
-
367
- with st.expander(" 5 Generated Captions", expanded=False):
368
  for i, c in enumerate(captions):
369
- st.write(f"**{i+1}.** {c}")
370
 
371
- # Step 2 β€” ITM scores
372
  status.info(" Step 2/7 β€” Computing BLIP ITM scores...")
373
  itm_scores = compute_itm_scores(image, captions, blip_processor, itm_model)
374
  progress.progress(28)
375
 
376
- # Step 3 β€” Jina scores
377
  status.info(" Step 3/7 β€” Computing Jina Reranker scores...")
378
  jina_scores = compute_jina_scores(image, captions)
379
  progress.progress(42)
380
 
381
- # Step 4 β€” Cosine scores
382
- status.info(" Step 4/7 β€” Computing Cosine Similarity scores...")
383
  cosine_scores = compute_cosine_scores(image, captions, blip_processor, itm_model)
384
  progress.progress(57)
385
 
386
- # Show score table
387
- import pandas as pd
388
  score_df = pd.DataFrame({
389
- "Caption" : [f"Cap {i+1}: {c[:50]}..." for i, c in enumerate(captions)],
390
- "ITM" : itm_scores,
391
- "Jina" : jina_scores,
392
- "Cosine" : cosine_scores
393
  })
394
- with st.expander(" All Scores", expanded=False):
395
  st.dataframe(score_df, use_container_width=True)
396
 
397
- # Step 5 β€” Majority voting
398
- status.info(" Step 5/7 β€” Running Majority Voting...")
399
  voted_cap1, voted_cap2, top2_idx, vote_counts = majority_voting(
400
  captions, itm_scores, jina_scores, cosine_scores
401
  )
@@ -404,46 +300,30 @@ if uploaded:
404
  st.markdown("### Majority Voted Captions")
405
  col_a, col_b = st.columns(2)
406
  with col_a:
407
- st.success(f" **Caption 1:**
408
-
409
- {voted_cap1}")
410
  with col_b:
411
- st.info(f" **Caption 2:**
412
 
413
- {voted_cap2}")
414
-
415
- # Step 6 β€” DINO
416
  status.info(" Step 6/7 β€” Detecting objects with DINO...")
417
  label_str, label_list = detect_objects(image, dino_processor, dino_model)
418
  progress.progress(85)
419
 
420
  st.markdown("### Detected Objects")
421
  if label_list:
422
- cols = st.columns(min(len(label_list), 6))
423
- for i, lbl in enumerate(label_list[:6]):
424
- cols[i].markdown(
425
- f"<span style='background:#e8f4fd;padding:4px 8px;"
426
- f"border-radius:12px;font-size:13px'> {lbl}</span>",
427
- unsafe_allow_html=True
428
- )
429
  else:
430
  st.write(label_str)
431
 
432
- # Step 7 β€” Qwen fusion
433
- status.info("Step 7/7 β€” Fusing captions with Qwen2.5-1.5B...")
434
  fused = fuse_captions_api(voted_cap1, voted_cap2, label_str)
435
  progress.progress(100)
436
  status.success(" Pipeline complete!")
437
 
438
- # Final output
439
  st.markdown("---")
440
  st.markdown("### Final Fused Caption")
441
  st.markdown(
442
- f"<div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);"
443
- f"padding: 20px; border-radius: 12px; color: white; font-size: 18px;"
444
- f"font-weight: 500; text-align: center;'>"
445
- f" {fused}"
446
- f"</div>",
447
  unsafe_allow_html=True
448
  )
449
- st.markdown("---")
 
1
 
2
  import os
 
 
3
  import torch
4
  import numpy as np
5
  import requests
 
9
  from collections import Counter
10
  from sklearn.metrics.pairwise import cosine_similarity
11
  from sklearn.preprocessing import normalize
12
+ import base64
13
+ import pandas as pd
14
 
# Page-level Streamlit settings (browser tab title, icon, wide layout).
# The icon literal was mis-encoded in this rendering; it is the framed-picture emoji.
st.set_page_config(page_title="Image Caption Fusion", page_icon="🖼️", layout="wide")

# API credentials are read from the environment; an empty string is used when a
# variable is not set, in which case the headers below carry an empty token.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
JINA_KEY = os.environ.get("JINA_KEY", "")
DEVICE = "cpu"

# Remote endpoints: Qwen2-VL and Qwen2.5 on the Hugging Face inference API,
# and the Jina rerank API.
QWEN_VL_URL = "https://api-inference.huggingface.co/models/Qwen/Qwen2-VL-2B-Instruct"
QWEN_LM_URL = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct"
JINA_URL = "https://api.jina.ai/v1/rerank"

# Request headers shared by every call to the respective service.
HF_HEADERS = {"Authorization": "Bearer " + HF_TOKEN}
JINA_HEADERS = {"Authorization": "Bearer " + JINA_KEY, "Content-Type": "application/json"}
 
26
 
27
  DETECT_PROMPT = (
28
  "person . child . man . woman . boy . girl . "
 
33
  "jacket . dress . shirt . hat . bag ."
34
  )
35
 
 
@st.cache_resource
def load_local_models():
    """Load the two locally-run models and their processors.

    The function is wrapped in ``st.cache_resource``. ``transformers`` is
    imported inside the function body rather than at module import time.

    Returns:
        tuple: ``(blip_processor, itm_model, dino_processor, dino_model)``.
        Both models are loaded with ``torch.float32`` weights and switched to
        eval mode before being returned.
    """
    from transformers import (
        AutoModelForZeroShotObjectDetection,
        AutoProcessor,
        BlipForImageTextRetrieval,
        BlipProcessor,
    )

    # NOTE(review): the processor comes from the captioning checkpoint while the
    # model comes from the ITM checkpoint - confirm this pairing is intended.
    blip_processor_checkpoint = "Salesforce/blip-image-captioning-large"
    itm_checkpoint = "Salesforce/blip-itm-large-coco"
    dino_checkpoint = "IDEA-Research/grounding-dino-base"

    # BLIP: processor first, then the image-text retrieval model.
    blip_processor = BlipProcessor.from_pretrained(blip_processor_checkpoint)
    itm_model = BlipForImageTextRetrieval.from_pretrained(
        itm_checkpoint, torch_dtype=torch.float32
    )
    itm_model.eval()

    # Grounding DINO: processor and zero-shot detection model share a checkpoint.
    dino_processor = AutoProcessor.from_pretrained(dino_checkpoint)
    dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(
        dino_checkpoint, torch_dtype=torch.float32
    )
    dino_model.eval()

    return blip_processor, itm_model, dino_processor, dino_model
53
 
54
+ def generate_captions_api(image):
 
55
  buffered = BytesIO()
56
  image.save(buffered, format="JPEG")
57
  img_bytes = buffered.getvalue()
 
66
 
67
  captions = []
68
  for prompt in PROMPTS:
 
69
  try:
70
  response = requests.post(
71
  QWEN_VL_URL,
72
+ headers=HF_HEADERS,
73
+ json={"inputs": prompt},
74
+ timeout=30
 
75
  )
76
  if response.status_code == 200:
77
  result = response.json()
 
82
  captions.append(cap if cap else "a scene with various objects and people")
83
  else:
84
  captions.append("a detailed scene with people and objects")
85
+ except Exception:
86
  captions.append("a scene captured in the image")
87
 
 
88
  seen, unique = set(), []
89
  for c in captions:
90
  if c not in seen:
 
92
  unique.append(c)
93
  while len(unique) < 5:
94
  unique.append(unique[0])
 
95
  return unique[:5]
96
 
 
def compute_itm_scores(image, captions, blip_processor, itm_model):
    """Score each caption against the image with the BLIP ITM head.

    Args:
        image: Image passed to ``blip_processor`` as ``images``.
        captions: Iterable of caption strings; each is scored on its own.
        blip_processor: Callable producing the keyword inputs for ``itm_model``.
        itm_model: Callable whose output exposes an ``itm_score`` tensor.

    Returns:
        list[float]: One value per caption, rounded to 4 decimals. The value is
        column 1 of the softmax over ``itm_score`` (presumably the "match"
        class - confirm against the BLIP ITM head).
    """

    def _match_probability(caption):
        encoded = blip_processor(
            images=image, text=caption, return_tensors="pt", padding=True
        )
        with torch.no_grad():
            result = itm_model(**encoded)
        probabilities = torch.nn.functional.softmax(result.itm_score, dim=1)
        return round(probabilities[:, 1].item(), 4)

    return [_match_probability(caption) for caption in captions]
106
 
 
def compute_jina_scores(image, captions):
    """Score each caption against the image through the Jina rerank endpoint.

    The image is JPEG-encoded once and embedded as a base64 data URL. One POST
    is sent per caption, with the caption as the query and the image as the
    only document.

    Args:
        image: Object exposing a PIL-style ``save(fp, format=...)`` method.
        captions: Iterable of caption strings.

    Returns:
        list[float]: One relevance score per caption, rounded to 4 decimals.
        The fallback value 0.5 is used when the request raises, the status is
        not 200, or the response cannot be read as expected.
    """
    jpeg_buffer = BytesIO()
    image.save(jpeg_buffer, format="JPEG")
    encoded_image = base64.b64encode(jpeg_buffer.getvalue()).decode()
    data_url = "data:image/jpeg;base64," + encoded_image

    def _relevance(caption):
        # Any failure for a single caption degrades to the 0.5 fallback so the
        # remaining captions are still scored.
        try:
            request_body = {
                "model": "jina-reranker-m0",
                "query": caption,
                "documents": [{"type": "image_url", "image_url": {"url": data_url}}],
            }
            reply = requests.post(
                JINA_URL, headers=JINA_HEADERS, json=request_body, timeout=30
            )
            if reply.status_code != 200:
                return 0.5
            relevance = reply.json()["results"][0]["relevance_score"]
            return round(float(relevance), 4)
        except Exception:
            return 0.5

    return [_relevance(caption) for caption in captions]
130
 
 
def compute_cosine_scores(image, captions, blip_processor, itm_model):
    """Compute cosine similarity between the image embedding and each caption embedding.

    Both embeddings are taken from position 0 of the respective encoder's
    ``last_hidden_state``, passed through the model's projection layer and
    L2-normalised before the similarity is computed.

    Args:
        image: Image passed to ``blip_processor`` as ``images``.
        captions: List of caption strings, tokenised together as one batch
            (padded, truncated to ``max_length=512``).
        blip_processor: Processor producing ``pixel_values`` / ``input_ids`` /
            ``attention_mask``.
        itm_model: Model exposing ``vision_model``, ``vision_proj``,
            ``text_encoder`` and ``text_proj``.

    Returns:
        list[float]: One similarity per caption, rounded to 4 decimals.
    """
    # Image side: encode, project the first token, normalise.
    pixel_batch = blip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        vision_output = itm_model.vision_model(pixel_values=pixel_batch["pixel_values"])
        image_vector = itm_model.vision_proj(
            vision_output.last_hidden_state[:, 0, :]
        ).numpy()
    image_vector = normalize(image_vector, norm="l2")

    # Text side: same recipe over the whole caption batch.
    token_batch = blip_processor(
        text=captions,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    with torch.no_grad():
        text_output = itm_model.text_encoder(
            input_ids=token_batch["input_ids"],
            attention_mask=token_batch["attention_mask"],
        )
        caption_vectors = itm_model.text_proj(
            text_output.last_hidden_state[:, 0, :]
        ).numpy()
    caption_vectors = normalize(caption_vectors, norm="l2")

    # Row 0 holds the similarities of the single image against every caption.
    similarities = cosine_similarity(image_vector, caption_vectors)[0]
    return [round(float(value), 4) for value in similarities]
150
 
 
def majority_voting(captions, itm_scores, jina_scores, cosine_scores):
    """Pick the two captions that the three scorers most often rank in their top two.

    Each scorer (ITM, Jina, cosine) votes for its two highest-scoring caption
    indices; the two indices with the most votes win. Ties in the vote count are
    resolved by ``Counter.most_common`` order, i.e. by which index was voted for
    first (ITM votes are cast first, then Jina, then cosine).

    Args:
        captions: Sequence of caption strings.
        itm_scores: Scores aligned with ``captions``.
        jina_scores: Scores aligned with ``captions``.
        cosine_scores: Scores aligned with ``captions``.

    Returns:
        tuple: ``(caption_1, caption_2, top2_indices, vote_counts)`` where
        ``top2_indices`` is a list of two ints and ``vote_counts`` is a plain
        dict mapping caption index to number of votes.

    Raises:
        IndexError: If the score lists are empty.
    """
    rankings = [
        np.argsort(scores)[::-1]
        for scores in (itm_scores, jina_scores, cosine_scores)
    ]

    # Slicing (instead of indexing [0] and [1]) tolerates a single caption, so
    # the fallback below can actually be reached rather than raising IndexError.
    votes = []
    for ranked in rankings:
        votes.extend(int(index) for index in ranked[:2])

    vote_counts = Counter(votes)
    top2_indices = [index for index, _ in vote_counts.most_common(2)]

    if len(top2_indices) < 2:
        # Fewer than two distinct indices received votes: fall back to the best
        # ITM and best Jina caption (these may be the same index).
        itm_ranked, jina_ranked = rankings[0], rankings[1]
        top2_indices = [int(itm_ranked[0]), int(jina_ranked[0])]

    first, second = top2_indices[0], top2_indices[1]
    return captions[first], captions[second], top2_indices, dict(vote_counts)
165
 
 
 
 
 
 
 
 
 
def detect_objects(image, dino_processor, dino_model, threshold=0.3):
    """Run Grounding DINO on the image and summarise the detected labels.

    Args:
        image: Image exposing a ``size`` attribute; it is reversed to build
            ``target_sizes`` (assumes PIL's ``(width, height)`` order - confirm).
        dino_processor: Processor used for both pre- and post-processing.
        dino_model: Detection model called with the processor's output.
        threshold: Minimum score a detection must reach to be kept.

    Returns:
        tuple[str, list[str]]: A display string of the form
        ``"Detected: [a, b]"`` plus the list of unique, lower-cased labels
        ordered by their best score (highest first). When nothing passes the
        threshold the result is ``("No objects detected", [])``.
    """
    model_inputs = dino_processor(
        images=image, text=DETECT_PROMPT, return_tensors="pt"
    )
    with torch.no_grad():
        raw_outputs = dino_model(**model_inputs)

    target_sizes = torch.tensor([image.size[::-1]])
    detection = dino_processor.post_process_grounded_object_detection(
        raw_outputs, model_inputs.input_ids, target_sizes=target_sizes
    )[0]

    all_scores = detection["scores"]
    all_labels = detection["labels"]
    keep_mask = all_scores >= threshold

    kept_labels = [
        label for label, keep in zip(all_labels, keep_mask.tolist()) if keep
    ]
    kept_scores = all_scores[keep_mask].tolist()

    if not kept_labels:
        return "No objects detected", []

    # Keep only the highest score seen for each normalised label.
    best_score = {}
    for raw_label, score in zip(kept_labels, kept_scores):
        label = raw_label.strip().lower()
        previous = best_score.get(label)
        if previous is None or previous < score:
            best_score[label] = score

    ranked = sorted(best_score.items(), key=lambda item: item[1], reverse=True)
    sorted_labels = [label for label, _ in ranked]
    label_str = "Detected: [" + ", ".join(sorted_labels) + "]"
    return label_str, sorted_labels
189
 
 
190
  def fuse_captions_api(cap1, cap2, dino_labels):
191
+ prompt = (
192
+ "You are given two captions and detected objects for the same image. "
193
+ "Write ONE fluent, natural, descriptive caption combining the best details. "
194
+ "Return ONLY the caption, no explanation, no prefix. "
195
+ "Caption 1: " + cap1 + " "
196
+ "Caption 2: " + cap2 + " "
197
+ "Detected objects: " + dino_labels + " "
198
+ "Fused caption:"
199
+ )
 
200
  try:
201
  response = requests.post(
202
  QWEN_LM_URL,
203
+ headers=HF_HEADERS,
204
+ json={
205
+ "inputs": prompt,
206
+ "parameters": {
207
+ "max_new_tokens" : 80,
208
+ "do_sample" : False,
209
+ "repetition_penalty": 1.1,
210
+ "return_full_text" : False
211
  }
212
  },
213
+ timeout=40
214
  )
215
  if response.status_code == 200:
216
  result = response.json()
 
218
  fused = result[0].get("generated_text", "").strip()
219
  else:
220
  fused = str(result).strip()
221
+ for prefix in ["Fused caption:", "Caption:"]:
 
 
222
  if fused.lower().startswith(prefix.lower()):
223
  fused = fused[len(prefix):].strip()
224
  return fused if fused else cap1
 
225
  else:
226
  return cap1
227
+ except Exception:
 
228
  return cap1
229
 
230
+ # ── SIDEBAR ──
 
 
 
 
# Sidebar: static description of the pipeline and of where each model runs.
# The step descriptions use an em dash; it was mis-encoded in this rendering.
_PIPELINE_STEPS = (
    "1. Qwen2-VL-2B — Generate 5 captions",
    "2. BLIP ITM — Image-text matching",
    "3. Jina Reranker M0 — Semantic reranking",
    "4. Cosine Similarity — Embedding similarity",
    "5. Majority Voting — Best 2 captions",
    "6. Grounding DINO — Object detection",
    "7. Qwen2.5-1.5B — Caption fusion",
)

with st.sidebar:
    st.title(" Image Caption Fusion")
    st.markdown("---")
    st.markdown("### Pipeline Steps")
    # One markdown call per step, as in the original layout.
    for _step in _PIPELINE_STEPS:
        st.markdown(_step)
    st.markdown("---")
    st.markdown("**Local:** BLIP ITM, DINO")
    st.markdown("**API:** Qwen2-VL, Jina, Qwen2.5")
245
 
246
+ # ── MAIN UI ──
247
  st.title(" Image Caption Fusion System")
248
  st.markdown("Upload any image and get a detailed, humanized caption.")
249
  st.markdown("---")
250
 
251
+ uploaded = st.file_uploader(" Upload an image", type=["jpg","jpeg","png"])
 
 
 
 
252
 
253
  if uploaded:
254
  image = Image.open(uploaded).convert("RGB")
 
255
  col1, col2 = st.columns([1, 1])
256
  with col1:
257
  st.image(image, caption="Uploaded Image", use_column_width=True)
 
258
  with col2:
259
  if st.button(" Generate Caption", type="primary", use_container_width=True):
260
+ with st.spinner("Loading local models..."):
 
 
261
  blip_processor, itm_model, dino_processor, dino_model = load_local_models()
262
 
263
  progress = st.progress(0)
264
  status = st.empty()
265
 
 
266
  status.info(" Step 1/7 β€” Generating 5 captions with Qwen2-VL...")
267
  captions = generate_captions_api(image)
268
  progress.progress(14)
269
+ with st.expander(" 5 Generated Captions"):
 
270
  for i, c in enumerate(captions):
271
+ st.write(str(i+1) + ". " + c)
272
 
 
273
  status.info(" Step 2/7 β€” Computing BLIP ITM scores...")
274
  itm_scores = compute_itm_scores(image, captions, blip_processor, itm_model)
275
  progress.progress(28)
276
 
 
277
  status.info(" Step 3/7 β€” Computing Jina Reranker scores...")
278
  jina_scores = compute_jina_scores(image, captions)
279
  progress.progress(42)
280
 
281
+ status.info(" Step 4/7 β€” Computing Cosine Similarity...")
 
282
  cosine_scores = compute_cosine_scores(image, captions, blip_processor, itm_model)
283
  progress.progress(57)
284
 
 
 
285
  score_df = pd.DataFrame({
286
+ "Caption" : ["Cap " + str(i+1) + ": " + c[:50] for i, c in enumerate(captions)],
287
+ "ITM" : itm_scores,
288
+ "Jina" : jina_scores,
289
+ "Cosine" : cosine_scores
290
  })
291
+ with st.expander(" All Scores"):
292
  st.dataframe(score_df, use_container_width=True)
293
 
294
+ status.info(" Step 5/7 β€” Majority Voting...")
 
295
  voted_cap1, voted_cap2, top2_idx, vote_counts = majority_voting(
296
  captions, itm_scores, jina_scores, cosine_scores
297
  )
 
300
  st.markdown("### Majority Voted Captions")
301
  col_a, col_b = st.columns(2)
302
  with col_a:
303
+ st.success(" Caption 1: " + voted_cap1)
 
 
304
  with col_b:
305
+ st.info(" Caption 2: " + voted_cap2)
306
 
 
 
 
307
  status.info(" Step 6/7 β€” Detecting objects with DINO...")
308
  label_str, label_list = detect_objects(image, dino_processor, dino_model)
309
  progress.progress(85)
310
 
311
  st.markdown("### Detected Objects")
312
  if label_list:
313
+ st.write(" | ".join(["πŸ” " + l for l in label_list]))
 
 
 
 
 
 
314
  else:
315
  st.write(label_str)
316
 
317
+ status.info(" Step 7/7 β€” Fusing with Qwen2.5-1.5B...")
 
318
  fused = fuse_captions_api(voted_cap1, voted_cap2, label_str)
319
  progress.progress(100)
320
  status.success(" Pipeline complete!")
321
 
 
322
  st.markdown("---")
323
  st.markdown("### Final Fused Caption")
324
  st.markdown(
325
+ "<div style='background:linear-gradient(135deg,#667eea,#764ba2);"
326
+ "padding:20px;border-radius:12px;color:white;font-size:18px;"
327
+ "font-weight:500;text-align:center;'> " + fused + "</div>",
 
 
328
  unsafe_allow_html=True
329
  )