Afsha001 committed on
Commit
1fc1228
·
verified ·
1 Parent(s): dcfb164

delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -359
app.py DELETED
@@ -1,359 +0,0 @@
1
# Bootstrap section. The original repeated these four statements twice
# verbatim; one copy is sufficient. FLORENCE_URL and HF_HEADERS are both
# assigned again further down in this file, after HF_TOKEN has been read
# from the environment.
import requests
from io import BytesIO

FLORENCE_URL = "https://api-inference.huggingface.co/models/microsoft/Florence-2-large"
# HF_TOKEN is only assigned later in the file, so unless something else has
# already defined it this guard evaluates to an empty header dict.
HF_HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if "HF_TOKEN" in locals() or "HF_TOKEN" in globals() else {}
9
-
10
- import os
11
- import torch
12
- import numpy as np
13
- import requests
14
- import streamlit as st
15
- from PIL import Image
16
- from io import BytesIO
17
- from collections import Counter
18
- from sklearn.metrics.pairwise import cosine_similarity
19
- from sklearn.preprocessing import normalize
20
- import base64
21
- import pandas as pd
22
-
23
st.set_page_config(page_title="Image Caption Fusion", page_icon="🖼️", layout="wide")

# Credentials are read from the environment; a missing variable becomes "".
HF_TOKEN = os.getenv("HF_TOKEN", "")
JINA_KEY = os.getenv("JINA_KEY", "")
DEVICE = "cpu"

# ── Remote API endpoints ──
FLORENCE_URL = "https://api-inference.huggingface.co/models/microsoft/Florence-2-large"
QWEN_LM_URL = "https://api-inference.huggingface.co/v1/chat/completions"
JINA_URL = "https://api.jina.ai/v1/rerank"

# Both services take a bearer token and a JSON body.
HF_HEADERS = {
    "Authorization": f"Bearer {HF_TOKEN}",
    "Content-Type": "application/json",
}
JINA_HEADERS = {
    "Authorization": f"Bearer {JINA_KEY}",
    "Content-Type": "application/json",
}

# Text prompt handed to the object detector: candidate labels separated by
# " . ", grouped here by theme purely for readability.
DETECT_PROMPT = " ".join(
    [
        "person . child . man . woman . boy . girl .",
        "dog . cat . horse . bird . animal .",
        "ball . toy . bicycle . car . bench .",
        "tree . grass . water . sky . mountain .",
        "building . stairs . door . fence . floor .",
        "jacket . dress . shirt . hat . bag .",
    ]
)
44
-
45
@st.cache_resource
def load_local_models():
    """Load the BLIP and Grounding DINO components that run locally.

    The function is wrapped in ``st.cache_resource``. ``transformers`` is
    imported inside the function rather than at module import time.

    Returns:
        A 4-tuple ``(blip_processor, itm_model, dino_processor, dino_model)``.
        Both models are loaded with float32 weights and switched to eval mode.
    """
    import transformers

    blip_processor = transformers.BlipProcessor.from_pretrained(
        "Salesforce/blip-image-captioning-large"
    )
    itm_model = transformers.BlipForImageTextRetrieval.from_pretrained(
        "Salesforce/blip-itm-large-coco",
        torch_dtype=torch.float32,
    )
    itm_model.eval()

    dino_checkpoint = "IDEA-Research/grounding-dino-base"
    dino_processor = transformers.AutoProcessor.from_pretrained(dino_checkpoint)
    dino_model = transformers.AutoModelForZeroShotObjectDetection.from_pretrained(
        dino_checkpoint,
        torch_dtype=torch.float32,
    )
    dino_model.eval()

    return blip_processor, itm_model, dino_processor, dino_model
62
-
63
def image_to_base64(image):
    """Encode an image as a base64 string of its JPEG bytes.

    Args:
        image: An object with PIL-style ``save(fp, format=...)`` and
            ``convert(mode)`` methods.

    Returns:
        str: Base64 text (no ``data:`` prefix) of the JPEG-encoded image.

    The image is first saved as-is. If that raises ``OSError`` (Pillow does
    this for modes JPEG cannot store, e.g. images with an alpha channel), it
    is converted to RGB and saved again, so inputs that already encoded
    successfully produce exactly the same bytes as before.
    """
    buffer = BytesIO()
    try:
        image.save(buffer, format="JPEG")
    except OSError:
        # Start from a clean buffer: the failed attempt may have written
        # partial data before raising.
        buffer = BytesIO()
        image.convert("RGB").save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode()
67
-
68
- # ── FIXED: Qwen2-VL via chat completions API ──
69
def generate_captions_api(image):
    """Request five one-sentence captions for ``image`` from Qwen2-VL.

    One chat-completions request is sent per prompt. Each successful reply is
    stripped and lower-cased; a non-200 status or any exception yields a
    generic placeholder caption and a Streamlit warning instead of aborting.

    Args:
        image: Image accepted by ``image_to_base64``.

    Returns:
        list[str]: Exactly five captions. Duplicates are removed (first
        occurrence wins) and the list is padded back to five by repeating the
        first caption.
    """
    img_url = "data:image/jpeg;base64," + image_to_base64(image)

    prompts = [
        "Describe this image in one detailed sentence.",
        "What is happening in this image? Write one descriptive sentence.",
        "Describe the main subjects, actions and setting in one sentence.",
        "Write a detailed caption focusing on people, animals and objects visible.",
        "Describe this scene including background details and activities shown.",
    ]

    captions = []
    for prompt in prompts:
        payload = {
            "model": "Qwen/Qwen2-VL-2B-Instruct",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": img_url}},
                        {"type": "text", "text": prompt},
                    ],
                }
            ],
            "max_tokens": 80,
        }
        try:
            # The payload and the response parsing below use the
            # chat-completions schema, so the request must go to QWEN_LM_URL
            # (the original posted it to FLORENCE_URL).
            response = requests.post(
                QWEN_LM_URL,
                headers=HF_HEADERS,
                json=payload,
                timeout=40,
            )
            if response.status_code == 200:
                result = response.json()
                cap = result["choices"][0]["message"]["content"].strip().lower()
                captions.append(cap if cap else "a scene with various objects")
            else:
                st.warning("Qwen2-VL API error: " + str(response.status_code) + " " + response.text[:100])
                captions.append("a scene with various objects and people")
        except Exception as e:
            # Deliberate best-effort: network errors and unexpected response
            # shapes both fall back to a placeholder caption.
            st.warning("Qwen2-VL exception: " + str(e))
            captions.append("a scene captured in the image")

    # Order-preserving de-duplication; every prompt contributes one entry, so
    # ``unique`` is never empty here.
    unique = list(dict.fromkeys(captions))
    unique.extend([unique[0]] * (5 - len(unique)))
    return unique[:5]
122
-
123
def compute_itm_scores(image, captions, blip_processor, itm_model):
    """Score each caption against the image with the BLIP ITM head.

    Args:
        image: Image passed to ``blip_processor`` as ``images``.
        captions: Iterable of caption strings, scored one at a time.
        blip_processor: Callable producing the model inputs as "pt" tensors.
        itm_model: Model whose output exposes ``itm_score``.

    Returns:
        list[float]: One value per caption — the softmax (over dim 1) of
        ``itm_score`` at column 1, rounded to 4 decimals. Column 1 is
        presumably the "match" class; confirm against the model docs.
    """

    def _score_one(caption):
        encoded = blip_processor(
            images=image, text=caption, return_tensors="pt", padding=True
        )
        with torch.no_grad():
            output = itm_model(**encoded)
        probabilities = torch.nn.functional.softmax(output.itm_score, dim=1)
        return round(probabilities[:, 1].item(), 4)

    return [_score_one(caption) for caption in captions]
132
-
133
- # ── FIXED: Jina Reranker M0 API ──
134
def compute_jina_scores(image, captions):
    """Score each caption against the image with the Jina reranker API.

    One request is sent per caption, with the caption as ``query`` and the
    base64 JPEG data URL of the image as the single document. A non-200
    status or any exception is reported via ``st.warning`` and recorded as a
    score of ``0.0``.

    Args:
        image: Image accepted by ``image_to_base64``.
        captions: Iterable of caption strings.

    Returns:
        list[float]: ``relevance_score`` of the first result per caption,
        rounded to 4 decimals, in the same order as ``captions``.
    """
    # The image document is the same for every caption, so build it once.
    documents = ["data:image/jpeg;base64," + image_to_base64(image)]

    scores = []
    for cap in captions:
        payload = {
            "model": "jina-reranker-m0",
            "query": cap,
            "documents": documents,
            "top_n": 1,
        }
        try:
            # Post to the Jina endpoint only. The original call passed
            # FLORENCE_URL as the URL and JINA_URL as the second positional
            # argument, which ``requests.post`` treats as the request body.
            response = requests.post(
                JINA_URL,
                headers=JINA_HEADERS,
                json=payload,
                timeout=30,
            )
            if response.status_code == 200:
                result = response.json()
                score = result["results"][0]["relevance_score"]
                scores.append(round(float(score), 4))
            else:
                st.warning("Jina API error: " + str(response.status_code) + " " + response.text[:100])
                scores.append(0.0)
        except Exception as e:
            # Deliberate best-effort: keep the pipeline running with a
            # neutral score when the request or parsing fails.
            st.warning("Jina exception: " + str(e))
            scores.append(0.0)
    return scores
162
-
163
def compute_cosine_scores(image, captions, blip_processor, itm_model):
    """Cosine similarity between the projected image and caption features.

    The image goes through ``itm_model.vision_model`` + ``vision_proj`` and
    the captions through ``itm_model.text_encoder`` + ``text_proj``; in both
    cases the feature at sequence position 0 is projected, L2-normalised and
    compared.

    Args:
        image: Image passed to ``blip_processor`` as ``images``.
        captions: List of caption strings, encoded together as one batch.
        blip_processor: Callable producing "pt" tensors for images / text.
        itm_model: Model exposing ``vision_model``, ``vision_proj``,
            ``text_encoder`` and ``text_proj``.

    Returns:
        list[float]: One similarity per caption, rounded to 4 decimals.
    """
    pixel_values = blip_processor(images=image, return_tensors="pt")["pixel_values"]
    text_batch = blip_processor(
        text=captions,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )

    # Projections stay inside no_grad so the results can be turned into
    # NumPy arrays directly.
    with torch.no_grad():
        vision_hidden = itm_model.vision_model(pixel_values=pixel_values).last_hidden_state
        image_vec = itm_model.vision_proj(vision_hidden[:, 0, :]).numpy()

        text_hidden = itm_model.text_encoder(
            input_ids=text_batch["input_ids"],
            attention_mask=text_batch["attention_mask"],
        ).last_hidden_state
        caption_vecs = itm_model.text_proj(text_hidden[:, 0, :]).numpy()

    image_vec = normalize(image_vec, norm="l2")
    caption_vecs = normalize(caption_vecs, norm="l2")

    similarities = cosine_similarity(image_vec, caption_vecs)[0]
    return [round(float(value), 4) for value in similarities]
182
-
183
def majority_voting(captions, itm_scores, jina_scores, cosine_scores):
    """Pick two captions by pooling the top-2 picks of three scorers.

    Each score list is ranked in descending order and contributes its best
    two indices (or fewer, if fewer exist) as votes. The two most-voted
    indices win; ties keep first-vote order (ITM, then Jina, then cosine).

    Args:
        captions: Sequence of caption strings.
        itm_scores: Scores aligned with ``captions``.
        jina_scores: Scores aligned with ``captions``.
        cosine_scores: Scores aligned with ``captions``.

    Returns:
        tuple: ``(caption_a, caption_b, top2_indices, vote_counts)`` where
        ``top2_indices`` is a list of two ints and ``vote_counts`` a plain
        dict mapping caption index to number of votes.

    Raises:
        IndexError: If the score lists are empty.
    """
    rankings = [
        np.argsort(scores)[::-1]
        for scores in (itm_scores, jina_scores, cosine_scores)
    ]

    # Slicing (instead of indexing [0] and [1]) lets a single-caption input
    # reach the fallback below; previously it raised before getting there.
    votes = [int(idx) for ranking in rankings for idx in ranking[:2]]
    vote_counts = Counter(votes)

    top2_indices = [idx for idx, _ in vote_counts.most_common(2)]
    if len(top2_indices) < 2:
        # Fewer than two distinct candidates: fall back to each scorer's best.
        itm_ranked, jina_ranked = rankings[0], rankings[1]
        top2_indices = [int(itm_ranked[0]), int(jina_ranked[0])]

    first, second = top2_indices[0], top2_indices[1]
    return captions[first], captions[second], top2_indices, dict(vote_counts)
197
-
198
def detect_objects(image, dino_processor, dino_model, threshold=0.3):
    """Detect objects named in ``DETECT_PROMPT`` and list the distinct labels.

    Args:
        image: Image with a ``size`` attribute; ``size[::-1]`` is passed as
            the target size for post-processing.
        dino_processor: Processor providing
            ``post_process_grounded_object_detection``.
        dino_model: Detection model called on the processor output.
        threshold: Minimum score a post-processed detection needs to be kept.

    Returns:
        tuple[str, list[str]]: ``("No objects detected", [])`` when nothing
        passes the threshold; otherwise ``("Detected: [a, b, ...]", labels)``
        with stripped, lower-cased, de-duplicated labels ordered by their
        best score, highest first.
    """
    encoded = dino_processor(images=image, text=DETECT_PROMPT, return_tensors="pt")
    with torch.no_grad():
        raw_outputs = dino_model(**encoded)

    detections = dino_processor.post_process_grounded_object_detection(
        raw_outputs,
        encoded.input_ids,
        target_sizes=torch.tensor([image.size[::-1]]),
    )[0]

    all_scores = detections["scores"]
    passes = all_scores >= threshold
    kept_labels = [name for name, ok in zip(detections["labels"], passes) if ok]
    if not kept_labels:
        return "No objects detected", []
    kept_scores = all_scores[passes].tolist()

    # Track the highest score seen for each normalised label.
    best_score = {}
    for name, value in zip(kept_labels, kept_scores):
        key = name.strip().lower()
        if key in best_score and best_score[key] >= value:
            continue
        best_score[key] = value

    ordered = sorted(best_score, key=best_score.get, reverse=True)
    return "Detected: [" + ", ".join(ordered) + "]", ordered
221
-
222
- # ── FIXED: Qwen2.5-1.5B via chat completions ──
223
def fuse_captions_api(cap1, cap2, dino_labels):
    """Merge two captions and the detected-object text into one caption.

    A single chat-completions request is sent to the Qwen2.5-1.5B-Instruct
    model. A non-200 status, an exception, or an empty reply all fall back
    to ``cap1`` (failures are also reported via ``st.warning``).

    Args:
        cap1: First caption; also the fallback return value.
        cap2: Second caption.
        dino_labels: String describing the detected objects.

    Returns:
        str: The stripped model reply, or ``cap1`` on failure / empty reply.
    """
    prompt = (
        "You are given two captions and detected objects for the same image. "
        "Write ONE fluent, natural, descriptive caption combining the best details. "
        "Return ONLY the fused caption, nothing else. "
        "Caption 1: " + cap1 + ". "
        "Caption 2: " + cap2 + ". "
        "Detected objects: " + dino_labels + "."
    )
    payload = {
        "model": "Qwen/Qwen2.5-1.5B-Instruct",
        "messages": [
            {"role": "system", "content": "You write accurate image captions. Return only the caption."},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": 80,
        "temperature": 0.1,
        "repetition_penalty": 1.1,
    }
    try:
        # Post to the chat-completions endpoint only. The original call
        # passed FLORENCE_URL as the URL and QWEN_LM_URL as the second
        # positional argument, which ``requests.post`` treats as the body.
        response = requests.post(
            QWEN_LM_URL,
            headers=HF_HEADERS,
            json=payload,
            timeout=40,
        )
        if response.status_code != 200:
            st.warning("Qwen fusion API error: " + str(response.status_code))
            return cap1
        result = response.json()
        fused = result["choices"][0]["message"]["content"].strip()
        return fused if fused else cap1
    except Exception as e:
        # Deliberate best-effort: any request or parsing failure degrades to
        # the first voted caption rather than breaking the UI.
        st.warning("Qwen fusion exception: " + str(e))
        return cap1
259
-
260
- # ── SIDEBAR ──
261
# Static sidebar: title, the seven pipeline steps, and where each part runs.
with st.sidebar:
    st.title(" Image Caption Fusion")
    st.markdown("---")
    st.markdown("### Pipeline Steps")
    for step_text in (
        "1. Florence-2 — Generate 4 captions + BLIP local",
        "2. BLIP ITM — Image-text matching",
        "3. Jina Reranker M0 — Semantic reranking",
        "4. Cosine Similarity — Embedding similarity",
        "5. Majority Voting — Best 2 captions",
        "6. Grounding DINO — Object detection",
        "7. Qwen2.5-1.5B — Caption fusion",
    ):
        st.markdown(step_text)
    st.markdown("---")
    for runtime_note in (
        "**Local:** BLIP ITM, DINO",
        "**API:** Florence-2, Jina, Qwen2.5",
    ):
        st.markdown(runtime_note)
275
-
276
- # ── MAIN UI ──
277
# Page body: upload an image, then run the seven-step captioning pipeline
# when the button is pressed. Block nesting was reconstructed from the
# statement order, since the source view carried no indentation.
import html  # used to escape model output before it is rendered as raw HTML

st.title(" Image Caption Fusion System")
st.markdown("Upload any image and get a detailed, humanized caption.")
st.markdown("---")

uploaded = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])

if uploaded:
    image = Image.open(uploaded).convert("RGB")
    col1, col2 = st.columns([1, 1])
    with col1:
        st.image(image, caption="Uploaded Image", width=400)
    with col2:
        if st.button(" Generate Caption", type="primary", use_container_width=True):
            with st.spinner("Loading local models (first time ~2 min)..."):
                blip_processor, itm_model, dino_processor, dino_model = load_local_models()

            progress = st.progress(0)
            status = st.empty()

            # Step 1: candidate captions from the remote vision-language model.
            status.info(" Step 1/7 — Generating captions with Florence-2 + BLIP...")
            captions = generate_captions_api(image)
            progress.progress(14)
            with st.expander(" 5 Generated Captions"):
                for i, c in enumerate(captions):
                    st.write(str(i + 1) + ". " + c)

            # Steps 2-4: three independent caption/image relevance scores.
            status.info(" Step 2/7 — Computing BLIP ITM scores...")
            itm_scores = compute_itm_scores(image, captions, blip_processor, itm_model)
            progress.progress(28)

            status.info(" Step 3/7 — Computing Jina Reranker scores...")
            jina_scores = compute_jina_scores(image, captions)
            progress.progress(42)

            status.info(" Step 4/7 — Computing Cosine Similarity...")
            cosine_scores = compute_cosine_scores(image, captions, blip_processor, itm_model)
            progress.progress(57)

            score_df = pd.DataFrame({
                "Caption": ["Cap " + str(i + 1) + ": " + c[:50] for i, c in enumerate(captions)],
                "ITM": itm_scores,
                "Jina": jina_scores,
                "Cosine": cosine_scores,
            })
            with st.expander(" All Scores"):
                st.dataframe(score_df, use_container_width=True)

            # Step 5: combine the three rankings into two winning captions.
            status.info(" Step 5/7 — Majority Voting...")
            voted_cap1, voted_cap2, top2_idx, vote_counts = majority_voting(
                captions, itm_scores, jina_scores, cosine_scores
            )
            progress.progress(71)

            st.markdown("### Majority Voted Captions")
            col_a, col_b = st.columns(2)
            with col_a:
                st.success(" Caption 1: " + voted_cap1)
            with col_b:
                st.info(" Caption 2: " + voted_cap2)

            # Step 6: local object detection.
            status.info(" Step 6/7 — Detecting objects with DINO...")
            label_str, label_list = detect_objects(image, dino_processor, dino_model)
            progress.progress(85)

            st.markdown("### Detected Objects")
            if label_list:
                st.write(" | ".join([" " + label for label in label_list]))
            else:
                st.write(label_str)

            # Step 7: fuse the two captions and the detected labels.
            status.info(" Step 7/7 — Fusing with Qwen2.5-1.5B...")
            fused = fuse_captions_api(voted_cap1, voted_cap2, label_str)
            progress.progress(100)
            status.success(" Pipeline complete!")

            st.markdown("---")
            st.markdown("### Final Fused Caption")
            # ``fused`` is text returned by a remote model and is rendered with
            # unsafe_allow_html=True, so escape it to prevent any markup in the
            # reply from being interpreted as HTML.
            st.markdown(
                "<div style='background:linear-gradient(135deg,#667eea,#764ba2);"
                "padding:20px;border-radius:12px;color:white;font-size:18px;"
                "font-weight:500;text-align:center;'> " + html.escape(fused) + "</div>",
                unsafe_allow_html=True
            )