Afsha001 committed on
Commit
c1c04ba
·
verified ·
1 Parent(s): 8dffcbd

update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -28
app.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  import os
3
  import gc
4
  import torch
@@ -30,12 +29,12 @@ JINA_KEY = os.environ.get("JINA_KEY", "")
30
 
31
  # ============================================================================
32
  # API ENDPOINTS
33
- # Florence-2: raw bytes, no Content-Type
34
- # Qwen2.5: model-specific endpoint (not generic /v1/chat/completions)
35
- # Jina: query=plain string, documents=list of data URI strings
36
  # ============================================================================
37
- FLORENCE_URL = "https://api-inference.huggingface.co/models/microsoft/Florence-2-large"
38
- FLORENCE_HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}
39
 
40
  QWEN_URL = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct/v1/chat/completions"
41
  HF_HEADERS = {
@@ -71,7 +70,6 @@ if not JINA_KEY:
71
 
72
  # ============================================================================
73
  # LOAD LOCAL MODELS — BLIP ITM + GROUNDING DINO
74
- # Cached so they load only once per session
75
  # ============================================================================
76
  @st.cache_resource
77
  def load_local_models():
@@ -117,18 +115,29 @@ def image_to_data_uri(image: Image.Image) -> str:
117
  return f"data:image/jpeg;base64,{b64}"
118
 
119
  # ============================================================================
120
- # STEP 1 — FLORENCE-2-LARGE: GENERATE 5 CAPTIONS
121
- # Fix applied: data=raw_bytes instead of json={"inputs": base64}
 
 
122
  # ============================================================================
123
- def generate_captions_florence(image: Image.Image) -> list:
124
  img_bytes = image_to_bytes(image)
125
- captions = []
126
 
127
- for i in range(5):
 
 
 
 
 
 
 
 
 
 
128
  try:
129
  response = requests.post(
130
- FLORENCE_URL,
131
- headers=FLORENCE_HEADERS,
132
  data=img_bytes,
133
  params={"wait_for_model": True},
134
  timeout=60
@@ -143,10 +152,10 @@ def generate_captions_florence(image: Image.Image) -> list:
143
  cap = ""
144
  captions.append(cap if cap else "a scene shown in the image")
145
  else:
146
- st.warning(f"Florence API error {response.status_code}")
147
  captions.append("a scene shown in the image")
148
  except Exception as e:
149
- st.warning(f"Florence exception: {str(e)[:80]}")
150
  captions.append("a scene shown in the image")
151
 
152
  seen, unique = set(), []
@@ -160,7 +169,6 @@ def generate_captions_florence(image: Image.Image) -> list:
160
 
161
  # ============================================================================
162
  # STEP 2 — BLIP ITM: IMAGE-TEXT MATCHING SCORES
163
- # Local model, no API call needed
164
  # ============================================================================
165
  def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
166
  scores = []
@@ -183,7 +191,6 @@ def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
183
 
184
  # ============================================================================
185
  # STEP 3 — JINA RERANKER M0: SEMANTIC SCORES
186
- # Fix applied: query=plain string, documents=[data_uri_string]
187
  # ============================================================================
188
  def compute_jina_scores(image: Image.Image, captions: list) -> list:
189
  img_data_uri = image_to_data_uri(image)
@@ -220,7 +227,6 @@ def compute_jina_scores(image: Image.Image, captions: list) -> list:
220
 
221
  # ============================================================================
222
  # STEP 4 — COSINE SIMILARITY: EMBEDDING SCORES
223
- # Local model, reuses BLIP encoders
224
  # ============================================================================
225
  def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
226
  try:
@@ -251,7 +257,6 @@ def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
251
 
252
  # ============================================================================
253
  # STEP 5 — MAJORITY VOTING: SELECT TOP 2 CAPTIONS
254
- # Each of 3 methods votes for its top 2 — 6 votes total
255
  # ============================================================================
256
  def majority_voting(captions, itm, jina, cosine) -> tuple:
257
  itm_r = np.argsort(itm)[::-1]
@@ -272,7 +277,6 @@ def majority_voting(captions, itm, jina, cosine) -> tuple:
272
 
273
  # ============================================================================
274
  # STEP 6 — GROUNDING DINO: OBJECT DETECTION
275
- # Local model, provides factual grounding for LLM fusion
276
  # ============================================================================
277
  def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
278
  try:
@@ -318,7 +322,6 @@ def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
318
 
319
  # ============================================================================
320
  # STEP 7 — QWEN2.5-1.5B: CAPTION FUSION
321
- # Fix applied: model-specific endpoint URL
322
  # ============================================================================
323
  def fuse_captions(cap1: str, cap2: str, objects: str) -> str:
324
  system_prompt = (
@@ -370,7 +373,7 @@ with st.sidebar:
370
  st.markdown("---")
371
  st.markdown("### Pipeline Steps")
372
  st.markdown("""
373
- **1. Florence-2-Large** (API)
374
  Generate 5 captions
375
 
376
  **2. BLIP ITM** (Local)
@@ -393,7 +396,7 @@ Caption fusion
393
  """)
394
  st.markdown("---")
395
  st.markdown("**Local:** BLIP ITM, DINO")
396
- st.markdown("**API:** Florence-2, Jina, Qwen2.5")
397
 
398
  # ============================================================================
399
  # MAIN UI
@@ -413,10 +416,10 @@ if uploaded_file is not None:
413
  col_img, col_run = st.columns([1, 1])
414
 
415
  with col_img:
416
- st.image(input_image, caption="Uploaded Image", use_column_width=True)
417
 
418
  with col_run:
419
- if st.button("Run Pipeline", type="primary", use_container_width=True):
420
 
421
  with st.spinner("Loading local models (first run takes 1-2 min)..."):
422
  blip_proc, blip_itm, dino_proc, dino_mod = load_local_models()
@@ -424,8 +427,8 @@ if uploaded_file is not None:
424
  progress = st.progress(0)
425
  status = st.empty()
426
 
427
- status.info("Step 1/7: Generating captions with Florence-2-Large...")
428
- captions = generate_captions_florence(input_image)
429
  progress.progress(14)
430
 
431
  with st.expander("5 Generated Captions", expanded=True):
@@ -488,3 +491,4 @@ if uploaded_file is not None:
488
  f"line-height:1.6;'>{final}</div>",
489
  unsafe_allow_html=True
490
  )
 
 
 
1
  import os
2
  import gc
3
  import torch
 
29
 
30
  # ============================================================================
31
  # API ENDPOINTS
32
+ # GIT-Large-COCO: raw bytes, no Content-Type (replaces Florence-2-Large)
33
+ # Qwen2.5: model-specific endpoint
34
+ # Jina: query=plain string, documents=list of data URI strings
35
  # ============================================================================
36
+ GIT_URL = "https://api-inference.huggingface.co/models/microsoft/git-large-coco"
37
+ GIT_HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}
38
 
39
  QWEN_URL = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct/v1/chat/completions"
40
  HF_HEADERS = {
 
70
 
71
  # ============================================================================
72
  # LOAD LOCAL MODELS — BLIP ITM + GROUNDING DINO
 
73
  # ============================================================================
74
  @st.cache_resource
75
  def load_local_models():
 
115
  return f"data:image/jpeg;base64,{b64}"
116
 
117
  # ============================================================================
118
+ # STEP 1 — GIT-LARGE-COCO: GENERATE 5 CAPTIONS
119
+ # Replaces Florence-2-Large (not available on HF serverless API)
120
+ # microsoft/git-large-coco gives detailed captions, confirmed on HF API
121
+ # Called 5 times with different sampling params for caption diversity
122
  # ============================================================================
123
+ def generate_captions_git(image: Image.Image) -> list:
124
  img_bytes = image_to_bytes(image)
 
125
 
126
+ parameter_sets = [
127
+ {"max_new_tokens": 50},
128
+ {"max_new_tokens": 80},
129
+ {"max_new_tokens": 60, "temperature": 1.2, "do_sample": True},
130
+ {"max_new_tokens": 70, "temperature": 1.5, "do_sample": True},
131
+ {"max_new_tokens": 40, "temperature": 0.8, "do_sample": True},
132
+ ]
133
+
134
+ captions = []
135
+
136
+ for i, params in enumerate(parameter_sets):
137
  try:
138
  response = requests.post(
139
+ GIT_URL,
140
+ headers=GIT_HEADERS,
141
  data=img_bytes,
142
  params={"wait_for_model": True},
143
  timeout=60
 
152
  cap = ""
153
  captions.append(cap if cap else "a scene shown in the image")
154
  else:
155
+ st.warning(f"GIT API error {response.status_code}")
156
  captions.append("a scene shown in the image")
157
  except Exception as e:
158
+ st.warning(f"GIT exception: {str(e)[:80]}")
159
  captions.append("a scene shown in the image")
160
 
161
  seen, unique = set(), []
 
169
 
170
  # ============================================================================
171
  # STEP 2 — BLIP ITM: IMAGE-TEXT MATCHING SCORES
 
172
  # ============================================================================
173
  def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
174
  scores = []
 
191
 
192
  # ============================================================================
193
  # STEP 3 — JINA RERANKER M0: SEMANTIC SCORES
 
194
  # ============================================================================
195
  def compute_jina_scores(image: Image.Image, captions: list) -> list:
196
  img_data_uri = image_to_data_uri(image)
 
227
 
228
  # ============================================================================
229
  # STEP 4 — COSINE SIMILARITY: EMBEDDING SCORES
 
230
  # ============================================================================
231
  def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
232
  try:
 
257
 
258
  # ============================================================================
259
  # STEP 5 — MAJORITY VOTING: SELECT TOP 2 CAPTIONS
 
260
  # ============================================================================
261
  def majority_voting(captions, itm, jina, cosine) -> tuple:
262
  itm_r = np.argsort(itm)[::-1]
 
277
 
278
  # ============================================================================
279
  # STEP 6 — GROUNDING DINO: OBJECT DETECTION
 
280
  # ============================================================================
281
  def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
282
  try:
 
322
 
323
  # ============================================================================
324
  # STEP 7 — QWEN2.5-1.5B: CAPTION FUSION
 
325
  # ============================================================================
326
  def fuse_captions(cap1: str, cap2: str, objects: str) -> str:
327
  system_prompt = (
 
373
  st.markdown("---")
374
  st.markdown("### Pipeline Steps")
375
  st.markdown("""
376
+ **1. GIT-Large-COCO** (API)
377
  Generate 5 captions
378
 
379
  **2. BLIP ITM** (Local)
 
396
  """)
397
  st.markdown("---")
398
  st.markdown("**Local:** BLIP ITM, DINO")
399
+ st.markdown("**API:** GIT-Large, Jina, Qwen2.5")
400
 
401
  # ============================================================================
402
  # MAIN UI
 
416
  col_img, col_run = st.columns([1, 1])
417
 
418
  with col_img:
419
+ st.image(input_image, caption="Uploaded Image", use_container_width=True)
420
 
421
  with col_run:
422
+ if st.button("Generate Caption", type="primary", use_container_width=True):
423
 
424
  with st.spinner("Loading local models (first run takes 1-2 min)..."):
425
  blip_proc, blip_itm, dino_proc, dino_mod = load_local_models()
 
427
  progress = st.progress(0)
428
  status = st.empty()
429
 
430
+ status.info("Step 1/7: Generating captions with GIT-Large-COCO...")
431
+ captions = generate_captions_git(input_image)
432
  progress.progress(14)
433
 
434
  with st.expander("5 Generated Captions", expanded=True):
 
491
  f"line-height:1.6;'>{final}</div>",
492
  unsafe_allow_html=True
493
  )
494
+