Afsha001 committed on
Commit
bdb5d8d
·
1 Parent(s): b7d8863

fix Qwen2-VL and Jina API call formats

Browse files
Files changed (1) hide show
  1. app.py +88 -58
app.py CHANGED
@@ -18,10 +18,11 @@ HF_TOKEN = os.environ.get("HF_TOKEN", "")
18
  JINA_KEY = os.environ.get("JINA_KEY", "")
19
  DEVICE = "cpu"
20
 
21
- QWEN_VL_URL = "https://api-inference.huggingface.co/models/Qwen/Qwen2-VL-2B-Instruct"
22
- QWEN_LM_URL = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct"
 
23
  JINA_URL = "https://api.jina.ai/v1/rerank"
24
- HF_HEADERS = {"Authorization": "Bearer " + HF_TOKEN}
25
  JINA_HEADERS = {"Authorization": "Bearer " + JINA_KEY, "Content-Type": "application/json"}
26
 
27
  DETECT_PROMPT = (
@@ -51,10 +52,15 @@ def load_local_models():
51
  dino_model.eval()
52
  return blip_processor, itm_model, dino_processor, dino_model
53
 
54
- def generate_captions_api(image):
55
  buffered = BytesIO()
56
  image.save(buffered, format="JPEG")
57
- img_bytes = buffered.getvalue()
 
 
 
 
 
58
 
59
  PROMPTS = [
60
  "Describe this image in one detailed sentence.",
@@ -67,22 +73,34 @@ def generate_captions_api(image):
67
  captions = []
68
  for prompt in PROMPTS:
69
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  response = requests.post(
71
  QWEN_VL_URL,
72
  headers=HF_HEADERS,
73
- json={"inputs": prompt},
74
- timeout=30
75
  )
76
  if response.status_code == 200:
77
  result = response.json()
78
- if isinstance(result, list):
79
- cap = result[0].get("generated_text", "").strip().lower()
80
- else:
81
- cap = str(result).strip().lower()
82
- captions.append(cap if cap else "a scene with various objects and people")
83
  else:
84
- captions.append("a detailed scene with people and objects")
85
- except Exception:
 
 
86
  captions.append("a scene captured in the image")
87
 
88
  seen, unique = set(), []
@@ -104,28 +122,42 @@ def compute_itm_scores(image, captions, blip_processor, itm_model):
104
  scores.append(round(score, 4))
105
  return scores
106
 
 
107
  def compute_jina_scores(image, captions):
108
- buffered = BytesIO()
109
- image.save(buffered, format="JPEG")
110
- img_b64 = base64.b64encode(buffered.getvalue()).decode()
111
- scores = []
112
  for cap in captions:
113
  try:
114
- payload = {
115
- "model" : "jina-reranker-m0",
116
- "query" : cap,
117
- "documents" : [{"type": "image_url",
118
- "image_url": {"url": "data:image/jpeg;base64," + img_b64}}]
 
 
 
 
 
 
 
 
119
  }
120
- response = requests.post(JINA_URL, headers=JINA_HEADERS, json=payload, timeout=30)
 
 
 
 
 
121
  if response.status_code == 200:
122
  result = response.json()
123
  score = result["results"][0]["relevance_score"]
124
  scores.append(round(float(score), 4))
125
  else:
126
- scores.append(0.5)
127
- except Exception:
128
- scores.append(0.5)
 
 
129
  return scores
130
 
131
  def compute_cosine_scores(image, captions, blip_processor, itm_model):
@@ -187,44 +219,42 @@ def detect_objects(image, dino_processor, dino_model, threshold=0.3):
187
  label_str = "Detected: [" + ", ".join(sorted_labels) + "]"
188
  return label_str, sorted_labels
189
 
 
190
  def fuse_captions_api(cap1, cap2, dino_labels):
191
  prompt = (
192
  "You are given two captions and detected objects for the same image. "
193
  "Write ONE fluent, natural, descriptive caption combining the best details. "
194
- "Return ONLY the caption, no explanation, no prefix. "
195
- "Caption 1: " + cap1 + " "
196
- "Caption 2: " + cap2 + " "
197
- "Detected objects: " + dino_labels + " "
198
- "Fused caption:"
199
  )
200
  try:
 
 
 
 
 
 
 
 
 
 
201
  response = requests.post(
202
  QWEN_LM_URL,
203
  headers=HF_HEADERS,
204
- json={
205
- "inputs": prompt,
206
- "parameters": {
207
- "max_new_tokens" : 80,
208
- "do_sample" : False,
209
- "repetition_penalty": 1.1,
210
- "return_full_text" : False
211
- }
212
- },
213
  timeout=40
214
  )
215
  if response.status_code == 200:
216
  result = response.json()
217
- if isinstance(result, list):
218
- fused = result[0].get("generated_text", "").strip()
219
- else:
220
- fused = str(result).strip()
221
- for prefix in ["Fused caption:", "Caption:"]:
222
- if fused.lower().startswith(prefix.lower()):
223
- fused = fused[len(prefix):].strip()
224
  return fused if fused else cap1
225
  else:
 
226
  return cap1
227
- except Exception:
 
228
  return cap1
229
 
230
  # ── SIDEBAR ──
@@ -248,16 +278,16 @@ st.title(" Image Caption Fusion System")
248
  st.markdown("Upload any image and get a detailed, humanized caption.")
249
  st.markdown("---")
250
 
251
- uploaded = st.file_uploader(" Upload an image", type=["jpg","jpeg","png"])
252
 
253
  if uploaded:
254
  image = Image.open(uploaded).convert("RGB")
255
  col1, col2 = st.columns([1, 1])
256
  with col1:
257
- st.image(image, caption="Uploaded Image", use_column_width=True)
258
  with col2:
259
  if st.button(" Generate Caption", type="primary", use_container_width=True):
260
- with st.spinner("Loading local models..."):
261
  blip_processor, itm_model, dino_processor, dino_model = load_local_models()
262
 
263
  progress = st.progress(0)
@@ -283,10 +313,10 @@ if uploaded:
283
  progress.progress(57)
284
 
285
  score_df = pd.DataFrame({
286
- "Caption" : ["Cap " + str(i+1) + ": " + c[:50] for i, c in enumerate(captions)],
287
- "ITM" : itm_scores,
288
- "Jina" : jina_scores,
289
- "Cosine" : cosine_scores
290
  })
291
  with st.expander(" All Scores"):
292
  st.dataframe(score_df, use_container_width=True)
@@ -310,7 +340,7 @@ if uploaded:
310
 
311
  st.markdown("### Detected Objects")
312
  if label_list:
313
- st.write(" | ".join(["🔍 " + l for l in label_list]))
314
  else:
315
  st.write(label_str)
316
 
@@ -320,7 +350,7 @@ if uploaded:
320
  status.success(" Pipeline complete!")
321
 
322
  st.markdown("---")
323
- st.markdown("### Final Fused Caption")
324
  st.markdown(
325
  "<div style='background:linear-gradient(135deg,#667eea,#764ba2);"
326
  "padding:20px;border-radius:12px;color:white;font-size:18px;"
 
18
  JINA_KEY = os.environ.get("JINA_KEY", "")
19
  DEVICE = "cpu"
20
 
21
+ # ── Correct API endpoints ──
22
+ QWEN_VL_URL = "https://api-inference.huggingface.co/models/Qwen/Qwen2-VL-2B-Instruct/v1/chat/completions"
23
+ QWEN_LM_URL = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct/v1/chat/completions"
24
  JINA_URL = "https://api.jina.ai/v1/rerank"
25
+ HF_HEADERS = {"Authorization": "Bearer " + HF_TOKEN, "Content-Type": "application/json"}
26
  JINA_HEADERS = {"Authorization": "Bearer " + JINA_KEY, "Content-Type": "application/json"}
27
 
28
  DETECT_PROMPT = (
 
52
  dino_model.eval()
53
  return blip_processor, itm_model, dino_processor, dino_model
54
 
55
+ def image_to_base64(image):
56
  buffered = BytesIO()
57
  image.save(buffered, format="JPEG")
58
+ return base64.b64encode(buffered.getvalue()).decode()
59
+
60
+ # ── FIXED: Qwen2-VL via chat completions API ──
61
+ def generate_captions_api(image):
62
+ img_b64 = image_to_base64(image)
63
+ img_url = "data:image/jpeg;base64," + img_b64
64
 
65
  PROMPTS = [
66
  "Describe this image in one detailed sentence.",
 
73
  captions = []
74
  for prompt in PROMPTS:
75
  try:
76
+ payload = {
77
+ "model": "Qwen/Qwen2-VL-2B-Instruct",
78
+ "messages": [
79
+ {
80
+ "role": "user",
81
+ "content": [
82
+ {"type": "image_url", "image_url": {"url": img_url}},
83
+ {"type": "text", "text": prompt}
84
+ ]
85
+ }
86
+ ],
87
+ "max_tokens": 80
88
+ }
89
  response = requests.post(
90
  QWEN_VL_URL,
91
  headers=HF_HEADERS,
92
+ json=payload,
93
+ timeout=40
94
  )
95
  if response.status_code == 200:
96
  result = response.json()
97
+ cap = result["choices"][0]["message"]["content"].strip().lower()
98
+ captions.append(cap if cap else "a scene with various objects")
 
 
 
99
  else:
100
+ st.warning("Qwen2-VL API error: " + str(response.status_code) + " " + response.text[:100])
101
+ captions.append("a scene with various objects and people")
102
+ except Exception as e:
103
+ st.warning("Qwen2-VL exception: " + str(e))
104
  captions.append("a scene captured in the image")
105
 
106
  seen, unique = set(), []
 
122
  scores.append(round(score, 4))
123
  return scores
124
 
125
+ # ── FIXED: Jina Reranker M0 API ──
126
  def compute_jina_scores(image, captions):
127
+ img_b64 = image_to_base64(image)
128
+ scores = []
 
 
129
  for cap in captions:
130
  try:
131
+ payload = {
132
+ "model": "jina-reranker-m0",
133
+ "query": {
134
+ "type" : "text",
135
+ "text" : cap
136
+ },
137
+ "documents": [
138
+ {
139
+ "type" : "image_url",
140
+ "image_url": {"url": "data:image/jpeg;base64," + img_b64}
141
+ }
142
+ ],
143
+ "top_n": 1
144
  }
145
+ response = requests.post(
146
+ JINA_URL,
147
+ headers=JINA_HEADERS,
148
+ json=payload,
149
+ timeout=30
150
+ )
151
  if response.status_code == 200:
152
  result = response.json()
153
  score = result["results"][0]["relevance_score"]
154
  scores.append(round(float(score), 4))
155
  else:
156
+ st.warning("Jina API error: " + str(response.status_code) + " " + response.text[:100])
157
+ scores.append(0.0)
158
+ except Exception as e:
159
+ st.warning("Jina exception: " + str(e))
160
+ scores.append(0.0)
161
  return scores
162
 
163
  def compute_cosine_scores(image, captions, blip_processor, itm_model):
 
219
  label_str = "Detected: [" + ", ".join(sorted_labels) + "]"
220
  return label_str, sorted_labels
221
 
222
+ # ── FIXED: Qwen2.5-1.5B via chat completions ──
223
  def fuse_captions_api(cap1, cap2, dino_labels):
224
  prompt = (
225
  "You are given two captions and detected objects for the same image. "
226
  "Write ONE fluent, natural, descriptive caption combining the best details. "
227
+ "Return ONLY the fused caption, nothing else. "
228
+ "Caption 1: " + cap1 + ". "
229
+ "Caption 2: " + cap2 + ". "
230
+ "Detected objects: " + dino_labels + "."
 
231
  )
232
  try:
233
+ payload = {
234
+ "model": "Qwen/Qwen2.5-1.5B-Instruct",
235
+ "messages": [
236
+ {"role": "system", "content": "You write accurate image captions. Return only the caption."},
237
+ {"role": "user", "content": prompt}
238
+ ],
239
+ "max_tokens" : 80,
240
+ "temperature" : 0.1,
241
+ "repetition_penalty": 1.1
242
+ }
243
  response = requests.post(
244
  QWEN_LM_URL,
245
  headers=HF_HEADERS,
246
+ json=payload,
 
 
 
 
 
 
 
 
247
  timeout=40
248
  )
249
  if response.status_code == 200:
250
  result = response.json()
251
+ fused = result["choices"][0]["message"]["content"].strip()
 
 
 
 
 
 
252
  return fused if fused else cap1
253
  else:
254
+ st.warning("Qwen fusion API error: " + str(response.status_code))
255
  return cap1
256
+ except Exception as e:
257
+ st.warning("Qwen fusion exception: " + str(e))
258
  return cap1
259
 
260
  # ── SIDEBAR ──
 
278
  st.markdown("Upload any image and get a detailed, humanized caption.")
279
  st.markdown("---")
280
 
281
+ uploaded = st.file_uploader("Upload an image", type=["jpg","jpeg","png"])
282
 
283
  if uploaded:
284
  image = Image.open(uploaded).convert("RGB")
285
  col1, col2 = st.columns([1, 1])
286
  with col1:
287
+ st.image(image, caption="Uploaded Image", width=400)
288
  with col2:
289
  if st.button(" Generate Caption", type="primary", use_container_width=True):
290
+ with st.spinner("Loading local models (first time ~2 min)..."):
291
  blip_processor, itm_model, dino_processor, dino_model = load_local_models()
292
 
293
  progress = st.progress(0)
 
313
  progress.progress(57)
314
 
315
  score_df = pd.DataFrame({
316
+ "Caption": ["Cap " + str(i+1) + ": " + c[:50] for i, c in enumerate(captions)],
317
+ "ITM" : itm_scores,
318
+ "Jina" : jina_scores,
319
+ "Cosine" : cosine_scores
320
  })
321
  with st.expander(" All Scores"):
322
  st.dataframe(score_df, use_container_width=True)
 
340
 
341
  st.markdown("### Detected Objects")
342
  if label_list:
343
+ st.write(" | ".join([" " + l for l in label_list]))
344
  else:
345
  st.write(label_str)
346
 
 
350
  status.success(" Pipeline complete!")
351
 
352
  st.markdown("---")
353
+ st.markdown("### Final Fused Caption")
354
  st.markdown(
355
  "<div style='background:linear-gradient(135deg,#667eea,#764ba2);"
356
  "padding:20px;border-radius:12px;color:white;font-size:18px;"