Afsha001 committed on
Commit
e67f5ad
·
verified ·
1 Parent(s): 7be1ea6

update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -71
app.py CHANGED
@@ -12,26 +12,15 @@ from collections import Counter
12
  from sklearn.metrics.pairwise import cosine_similarity
13
  from sklearn.preprocessing import normalize
14
 
15
- # ============================================================================
16
- # PAGE CONFIG
17
- # ============================================================================
18
  st.set_page_config(
19
  page_title="Image Caption Fusion System",
20
  layout="wide",
21
  initial_sidebar_state="expanded"
22
  )
23
 
24
- # ============================================================================
25
- # CREDENTIALS
26
- # ============================================================================
27
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
28
  JINA_KEY = os.environ.get("JINA_KEY", "")
29
 
30
- # ============================================================================
31
- # API ENDPOINTS
32
- # Qwen2.5: model-specific endpoint for caption fusion
33
- # Jina: query=plain string, documents=list of data URI strings
34
- # ============================================================================
35
  QWEN_URL = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct/v1/chat/completions"
36
  HF_HEADERS = {
37
  "Authorization": f"Bearer {HF_TOKEN}",
@@ -53,9 +42,6 @@ DETECT_PROMPT = (
53
  "jacket . dress . shirt . hat . bag ."
54
  )
55
 
56
- # ============================================================================
57
- # CREDENTIAL CHECK
58
- # ============================================================================
59
  if not HF_TOKEN:
60
  st.error("HF_TOKEN missing. Go to Space Settings → Secrets and add it.")
61
  st.stop()
@@ -65,24 +51,26 @@ if not JINA_KEY:
65
  st.stop()
66
 
67
  # ============================================================================
68
- # LOAD LOCAL MODELS
69
- # Moondream2: caption generation via official moondream package
70
- # BLIP ITM: image-text matching + cosine similarity
71
- # DINO: object detection
72
  # ============================================================================
73
  @st.cache_resource
74
  def load_local_models():
75
- import moondream as md
76
  from transformers import (
 
 
77
  BlipProcessor,
78
  BlipForImageTextRetrieval,
79
- AutoProcessor,
80
  AutoModelForZeroShotObjectDetection
81
  )
82
  gc.collect()
83
 
84
- # Moondream2 — official package avoids transformers version conflict
85
- moon_model = md.vl(model="moondream-2b")
 
 
 
 
 
86
 
87
  # BLIP — for ITM scoring and cosine similarity
88
  blip_processor = BlipProcessor.from_pretrained(
@@ -104,11 +92,8 @@ def load_local_models():
104
  )
105
  dino_model.eval()
106
 
107
- return moon_model, blip_processor, blip_itm_model, dino_processor, dino_model
108
 
109
- # ============================================================================
110
- # HELPERS
111
- # ============================================================================
112
  def image_to_bytes(image: Image.Image) -> bytes:
113
  buf = BytesIO()
114
  image.save(buf, format="JPEG", quality=85)
@@ -120,29 +105,34 @@ def image_to_data_uri(image: Image.Image) -> str:
120
  return f"data:image/jpeg;base64,{b64}"
121
 
122
  # ============================================================================
123
- # STEP 1 — MOONDREAM2 (LOCAL): GENERATE 5 DIVERSE CAPTIONS
124
- # Official moondream package — no transformers conflict
125
- # 5 different prompts produce diverse caption perspectives
126
  # ============================================================================
127
- def generate_captions_moondream(image: Image.Image, moon_mod) -> list:
128
-
129
- prompts = [
130
- "Describe this image in detail.",
131
- "What is happening in this image?",
132
- "Describe the people, objects, and setting in this image.",
133
- "What do you see in this photograph?",
134
- "Describe the scene including background and foreground in detail."
135
- ]
136
-
137
- captions = []
138
 
139
- for prompt in prompts:
140
  try:
141
- result = moon_mod.query(image, prompt)
142
- cap = result["answer"].strip().lower()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  captions.append(cap if cap else "a scene shown in the image")
 
144
  except Exception as e:
145
- st.warning(f"Moondream error: {str(e)[:80]}")
146
  captions.append("a scene shown in the image")
147
 
148
  seen, unique = set(), []
@@ -155,9 +145,7 @@ def generate_captions_moondream(image: Image.Image, moon_mod) -> list:
155
 
156
  return unique[:5]
157
 
158
- # ============================================================================
159
- # STEP 2 — BLIP ITM: IMAGE-TEXT MATCHING SCORES
160
- # ============================================================================
161
  def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
162
  scores = []
163
  for cap in captions:
@@ -177,9 +165,7 @@ def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
177
  scores.append(0.0)
178
  return scores
179
 
180
- # ============================================================================
181
- # STEP 3 — JINA RERANKER M0: SEMANTIC SCORES
182
- # ============================================================================
183
  def compute_jina_scores(image: Image.Image, captions: list) -> list:
184
  img_data_uri = image_to_data_uri(image)
185
  scores = []
@@ -213,9 +199,7 @@ def compute_jina_scores(image: Image.Image, captions: list) -> list:
213
  scores.append(0.0)
214
  return scores
215
 
216
- # ============================================================================
217
- # STEP 4 — COSINE SIMILARITY: EMBEDDING SCORES
218
- # ============================================================================
219
  def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
220
  try:
221
  img_inp = blip_proc(images=image, return_tensors="pt")
@@ -243,9 +227,7 @@ def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
243
  st.warning(f"Cosine error: {str(e)[:60]}")
244
  return [0.0] * len(captions)
245
 
246
- # ============================================================================
247
- # STEP 5 — MAJORITY VOTING: SELECT TOP 2 CAPTIONS
248
- # ============================================================================
249
  def majority_voting(captions, itm, jina, cosine) -> tuple:
250
  itm_r = np.argsort(itm)[::-1]
251
  jina_r = np.argsort(jina)[::-1]
@@ -263,9 +245,7 @@ def majority_voting(captions, itm, jina, cosine) -> tuple:
263
 
264
  return captions[top2[0]], captions[top2[1]], top2, dict(counts)
265
 
266
- # ============================================================================
267
- # STEP 6 — GROUNDING DINO: OBJECT DETECTION
268
- # ============================================================================
269
  def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
270
  try:
271
  inputs = dino_proc(
@@ -308,9 +288,7 @@ def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
308
  st.warning(f"DINO error: {str(e)[:80]}")
309
  return "Object detection unavailable", []
310
 
311
- # ============================================================================
312
- # STEP 7 — QWEN2.5-1.5B: CAPTION FUSION
313
- # ============================================================================
314
  def fuse_captions(cap1: str, cap2: str, objects: str) -> str:
315
  system_prompt = (
316
  "You are an expert image captioning assistant. "
@@ -354,14 +332,14 @@ def fuse_captions(cap1: str, cap2: str, objects: str) -> str:
354
  return cap1
355
 
356
  # ============================================================================
357
- # SIDEBAR
358
  # ============================================================================
359
  with st.sidebar:
360
  st.title("Image Caption Fusion")
361
  st.markdown("---")
362
  st.markdown("### Pipeline Steps")
363
  st.markdown("""
364
- **1. Moondream2** (Local)
365
  Generate 5 captions
366
 
367
  **2. BLIP ITM** (Local)
@@ -383,12 +361,9 @@ Object detection
383
  Caption fusion
384
  """)
385
  st.markdown("---")
386
- st.markdown("**Local:** Moondream2, BLIP ITM, DINO")
387
  st.markdown("**API:** Jina, Qwen2.5")
388
 
389
- # ============================================================================
390
- # MAIN UI
391
- # ============================================================================
392
  st.title("Image Caption Fusion System")
393
  st.markdown("Upload an image to generate a refined, grounded caption.")
394
  st.markdown("---")
@@ -410,13 +385,15 @@ if uploaded_file is not None:
410
  if st.button("Generate Caption", type="primary", use_container_width=True):
411
 
412
  with st.spinner("Loading local models (first run takes 2-3 min)..."):
413
- moon_mod, blip_proc, blip_itm, dino_proc, dino_mod = load_local_models()
 
414
 
415
  progress = st.progress(0)
416
  status = st.empty()
417
 
418
- status.info("Step 1/7: Generating captions with Moondream2...")
419
- captions = generate_captions_moondream(input_image, moon_mod)
 
420
  progress.progress(14)
421
 
422
  with st.expander("5 Generated Captions", expanded=True):
 
12
  from sklearn.metrics.pairwise import cosine_similarity
13
  from sklearn.preprocessing import normalize
14
 
 
 
 
15
  st.set_page_config(
16
  page_title="Image Caption Fusion System",
17
  layout="wide",
18
  initial_sidebar_state="expanded"
19
  )
20
 
 
 
 
21
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
22
  JINA_KEY = os.environ.get("JINA_KEY", "")
23
 
 
 
 
 
 
24
  QWEN_URL = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct/v1/chat/completions"
25
  HF_HEADERS = {
26
  "Authorization": f"Bearer {HF_TOKEN}",
 
42
  "jacket . dress . shirt . hat . bag ."
43
  )
44
 
 
 
 
45
  if not HF_TOKEN:
46
  st.error("HF_TOKEN missing. Go to Space Settings → Secrets and add it.")
47
  st.stop()
 
51
  st.stop()
52
 
53
  # ============================================================================
54
+ # CHANGE 1: load_local_models — replaced moondream with GIT-Large-COCO
 
 
 
55
  # ============================================================================
56
  @st.cache_resource
57
  def load_local_models():
 
58
  from transformers import (
59
+ AutoProcessor,
60
+ AutoModelForCausalLM,
61
  BlipProcessor,
62
  BlipForImageTextRetrieval,
 
63
  AutoModelForZeroShotObjectDetection
64
  )
65
  gc.collect()
66
 
67
+ # GIT-Large-COCO — local caption generation, no API, no auth needed
68
+ git_processor = AutoProcessor.from_pretrained("microsoft/git-large-coco")
69
+ git_model = AutoModelForCausalLM.from_pretrained(
70
+ "microsoft/git-large-coco",
71
+ torch_dtype=torch.float32
72
+ )
73
+ git_model.eval()
74
 
75
  # BLIP — for ITM scoring and cosine similarity
76
  blip_processor = BlipProcessor.from_pretrained(
 
92
  )
93
  dino_model.eval()
94
 
95
+ return git_processor, git_model, blip_processor, blip_itm_model, dino_processor, dino_model
96
 
 
 
 
97
  def image_to_bytes(image: Image.Image) -> bytes:
98
  buf = BytesIO()
99
  image.save(buf, format="JPEG", quality=85)
 
105
  return f"data:image/jpeg;base64,{b64}"
106
 
107
  # ============================================================================
108
+ # CHANGE 2: generate_captions_git — replaced moondream caption function
 
 
109
  # ============================================================================
110
+ def generate_captions_git(image: Image.Image, git_proc, git_mod) -> list:
111
+ length_params = [30, 50, 60, 70, 40]
112
+ captions = []
 
 
 
 
 
 
 
 
113
 
114
+ for max_tokens in length_params:
115
  try:
116
+ pixel_values = git_proc(
117
+ images=image,
118
+ return_tensors="pt"
119
+ ).pixel_values
120
+
121
+ with torch.no_grad():
122
+ generated_ids = git_mod.generate(
123
+ pixel_values=pixel_values,
124
+ max_new_tokens=max_tokens
125
+ )
126
+
127
+ cap = git_proc.batch_decode(
128
+ generated_ids,
129
+ skip_special_tokens=True
130
+ )[0].strip().lower()
131
+
132
  captions.append(cap if cap else "a scene shown in the image")
133
+
134
  except Exception as e:
135
+ st.warning(f"GIT error: {str(e)[:80]}")
136
  captions.append("a scene shown in the image")
137
 
138
  seen, unique = set(), []
 
145
 
146
  return unique[:5]
147
 
148
+ # unchanged
 
 
149
  def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
150
  scores = []
151
  for cap in captions:
 
165
  scores.append(0.0)
166
  return scores
167
 
168
+ # unchanged
 
 
169
  def compute_jina_scores(image: Image.Image, captions: list) -> list:
170
  img_data_uri = image_to_data_uri(image)
171
  scores = []
 
199
  scores.append(0.0)
200
  return scores
201
 
202
+ # unchanged
 
 
203
  def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
204
  try:
205
  img_inp = blip_proc(images=image, return_tensors="pt")
 
227
  st.warning(f"Cosine error: {str(e)[:60]}")
228
  return [0.0] * len(captions)
229
 
230
+ # unchanged
 
 
231
  def majority_voting(captions, itm, jina, cosine) -> tuple:
232
  itm_r = np.argsort(itm)[::-1]
233
  jina_r = np.argsort(jina)[::-1]
 
245
 
246
  return captions[top2[0]], captions[top2[1]], top2, dict(counts)
247
 
248
+ # unchanged
 
 
249
  def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
250
  try:
251
  inputs = dino_proc(
 
288
  st.warning(f"DINO error: {str(e)[:80]}")
289
  return "Object detection unavailable", []
290
 
291
+ # unchanged
 
 
292
  def fuse_captions(cap1: str, cap2: str, objects: str) -> str:
293
  system_prompt = (
294
  "You are an expert image captioning assistant. "
 
332
  return cap1
333
 
334
  # ============================================================================
335
+ # CHANGE 3: sidebar — updated step 1 label
336
  # ============================================================================
337
  with st.sidebar:
338
  st.title("Image Caption Fusion")
339
  st.markdown("---")
340
  st.markdown("### Pipeline Steps")
341
  st.markdown("""
342
+ **1. GIT-Large-COCO** (Local)
343
  Generate 5 captions
344
 
345
  **2. BLIP ITM** (Local)
 
361
  Caption fusion
362
  """)
363
  st.markdown("---")
364
+ st.markdown("**Local:** GIT-Large, BLIP ITM, DINO")
365
  st.markdown("**API:** Jina, Qwen2.5")
366
 
 
 
 
367
  st.title("Image Caption Fusion System")
368
  st.markdown("Upload an image to generate a refined, grounded caption.")
369
  st.markdown("---")
 
385
  if st.button("Generate Caption", type="primary", use_container_width=True):
386
 
387
  with st.spinner("Loading local models (first run takes 2-3 min)..."):
388
+ # CHANGE 4: updated unpacking — git_proc, git_mod instead of moon_mod
389
+ git_proc, git_mod, blip_proc, blip_itm, dino_proc, dino_mod = load_local_models()
390
 
391
  progress = st.progress(0)
392
  status = st.empty()
393
 
394
+ status.info("Step 1/7: Generating captions with GIT-Large-COCO...")
395
+ # CHANGE 4: updated function call
396
+ captions = generate_captions_git(input_image, git_proc, git_mod)
397
  progress.progress(14)
398
 
399
  with st.expander("5 Generated Captions", expanded=True):