Afsha001 committed on
Commit
1ae65de
·
verified ·
1 Parent(s): 58f42b0

return florence

Browse files
Files changed (1) hide show
  1. app.py +144 -122
app.py CHANGED
@@ -6,7 +6,6 @@ import pandas as pd
6
  import requests
7
  import base64
8
  import streamlit as st
9
- import google.generativeai as genai
10
  from PIL import Image
11
  from io import BytesIO
12
  from collections import Counter
@@ -19,11 +18,8 @@ st.set_page_config(
19
  initial_sidebar_state="expanded"
20
  )
21
 
22
- # ============================================================================
23
- # CREDENTIALS
24
- # ============================================================================
25
- JINA_KEY = os.environ.get("JINA_KEY", "")
26
- GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
27
 
28
  JINA_URL = "https://api.jina.ai/v1/rerank"
29
  JINA_HEADERS = {
@@ -45,38 +41,33 @@ DETECT_PROMPT = (
45
  "car . bicycle . motorcycle . bus . truck . street . kitchen . restaurant . cafe"
46
  )
47
 
48
- # ============================================================================
49
- # CREDENTIAL CHECK
50
- # ============================================================================
51
  if not JINA_KEY:
52
  st.error("JINA_KEY missing. Go to Space Settings → Secrets and add it.")
53
  st.stop()
54
 
55
- if not GOOGLE_API_KEY:
56
- st.error("GOOGLE_API_KEY missing. Go to Space Settings → Secrets and add it.")
57
- st.stop()
58
-
59
- # Configure Gemini after credentials are defined
60
- genai.configure(api_key=GOOGLE_API_KEY)
61
-
62
- # ============================================================================
63
- # LOAD LOCAL MODELS
64
- # Local: BLIP ITM, DINO, Qwen2.5
65
- # API: Gemini 2.0 Flash, Jina Reranker
66
- # ============================================================================
67
  @st.cache_resource
68
  def load_local_models():
69
  from transformers import (
 
70
  AutoModelForCausalLM,
71
  AutoTokenizer,
72
  BlipProcessor,
73
  BlipForImageTextRetrieval,
74
- AutoProcessor,
75
  AutoModelForZeroShotObjectDetection
76
  )
77
  gc.collect()
78
 
79
- # BLIP — ITM scoring and cosine similarity
 
 
 
 
 
 
 
 
 
 
80
  blip_processor = BlipProcessor.from_pretrained(
81
  "Salesforce/blip-image-captioning-large"
82
  )
@@ -86,7 +77,6 @@ def load_local_models():
86
  )
87
  blip_itm_model.eval()
88
 
89
- # DINO — object detection
90
  dino_processor = AutoProcessor.from_pretrained(
91
  "IDEA-Research/grounding-dino-base"
92
  )
@@ -96,7 +86,6 @@ def load_local_models():
96
  )
97
  dino_model.eval()
98
 
99
- # Qwen2.5-1.5B — caption fusion
100
  qwen_tokenizer = AutoTokenizer.from_pretrained(
101
  "Qwen/Qwen2.5-1.5B-Instruct"
102
  )
@@ -107,14 +96,12 @@ def load_local_models():
107
  qwen_model.eval()
108
 
109
  return (
 
110
  blip_processor, blip_itm_model,
111
  dino_processor, dino_model,
112
  qwen_tokenizer, qwen_model
113
  )
114
 
115
- # ============================================================================
116
- # HELPERS
117
- # ============================================================================
118
  def image_to_bytes(image: Image.Image) -> bytes:
119
  buf = BytesIO()
120
  image.save(buf, format="JPEG", quality=85)
@@ -125,74 +112,125 @@ def image_to_data_uri(image: Image.Image) -> str:
125
  b64 = base64.b64encode(raw).decode()
126
  return f"data:image/jpeg;base64,{b64}"
127
 
128
- # ============================================================================
129
- # STEP 1 — GEMINI 2.0 FLASH: GENERATE 5 DIVERSE CAPTIONS
130
- # Single API call — all 5 captions in one request
131
- # Retry logic: tries gemini-2.0-flash first, falls back to gemini-1.5-flash-8b
132
- # gemini-1.5-flash-8b has separate quota pool from gemini-2.0-flash
133
- # ============================================================================
134
- def generate_captions_gemini(image: Image.Image) -> list:
135
-
136
- prompt = """Look at this image carefully and write 5 different captions from different perspectives.
137
-
138
- 1. Overall scene: One sentence describing the general scene.
139
- 2. People: Describe the people, their clothing colors, style, and what they are doing in detail.
140
- 3. Background: Describe the background, setting, and surroundings.
141
- 4. Objects: Describe the objects, plants, and items visible in the image.
142
- 5. Full description: A complete description covering who is in the image, what they are doing, their appearance, and where the scene takes place.
143
-
144
- Reply in this exact format:
145
- CAPTION_1: [your caption here]
146
- CAPTION_2: [your caption here]
147
- CAPTION_3: [your caption here]
148
- CAPTION_4: [your caption here]
149
- CAPTION_5: [your caption here]"""
150
-
151
- # Try primary model first, fallback to secondary if quota exceeded
152
- models_to_try = [
153
- "gemini-2.0-flash",
154
- "gemini-1.5-flash-8b",
155
- "gemini-1.5-flash"
156
- ]
157
 
158
- raw_text = None
 
159
 
160
- for model_name in models_to_try:
161
- try:
162
- model = genai.GenerativeModel(model_name)
163
- response = model.generate_content([prompt, image])
164
- raw_text = response.text.strip()
165
- break
166
- except Exception as e:
167
- error_msg = str(e)
168
- if "429" in error_msg:
169
- st.warning(f"{model_name} quota exceeded, trying next model...")
170
- continue
171
- else:
172
- st.warning(f"Gemini error ({model_name}): {error_msg[:80]}")
173
- continue
174
-
175
- if raw_text is None:
176
- st.error(
177
- "All Gemini models hit quota limit. "
178
- "Quota resets at midnight (Pacific Time). "
179
- "Using fallback captions for now."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  )
181
- return ["a scene shown in the image"] * 5
182
-
183
- # Parse the 5 captions from structured response
184
- captions = []
185
- for i in range(1, 6):
186
- marker = f"CAPTION_{i}:"
187
- next_marker = f"CAPTION_{i+1}:" if i < 5 else None
188
-
189
- if marker in raw_text:
190
- start = raw_text.index(marker) + len(marker)
191
- end = raw_text.index(next_marker) if next_marker and next_marker in raw_text else len(raw_text)
192
- cap = raw_text[start:end].strip().lower()
193
- captions.append(cap if cap else "a scene shown in the image")
 
 
 
 
194
  else:
195
- captions.append("a scene shown in the image")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
  seen, unique = set(), []
198
  for c in captions:
@@ -208,9 +246,6 @@ CAPTION_5: [your caption here]"""
208
 
209
  return unique[:5]
210
 
211
- # ============================================================================
212
- # STEP 2 — BLIP ITM: IMAGE-TEXT MATCHING SCORES
213
- # ============================================================================
214
  def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
215
  scores = []
216
  for cap in captions:
@@ -230,9 +265,6 @@ def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
230
  scores.append(0.0)
231
  return scores
232
 
233
- # ============================================================================
234
- # STEP 3 — JINA RERANKER M0: SEMANTIC SCORES
235
- # ============================================================================
236
  def compute_jina_scores(image: Image.Image, captions: list) -> list:
237
  img_data_uri = image_to_data_uri(image)
238
  scores = []
@@ -263,9 +295,6 @@ def compute_jina_scores(image: Image.Image, captions: list) -> list:
263
  scores.append(0.0)
264
  return scores
265
 
266
- # ============================================================================
267
- # STEP 4 — COSINE SIMILARITY: EMBEDDING SCORES
268
- # ============================================================================
269
  def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
270
  try:
271
  img_inp = blip_proc(images=image, return_tensors="pt")
@@ -292,9 +321,6 @@ def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
292
  st.warning(f"Cosine error: {str(e)[:60]}")
293
  return [0.0] * len(captions)
294
 
295
- # ============================================================================
296
- # STEP 5 — MAJORITY VOTING: SELECT TOP 2 CAPTIONS
297
- # ============================================================================
298
  def majority_voting(captions, itm, jina, cosine) -> tuple:
299
  itm_r = np.argsort(itm)[::-1]
300
  jina_r = np.argsort(jina)[::-1]
@@ -312,9 +338,6 @@ def majority_voting(captions, itm, jina, cosine) -> tuple:
312
 
313
  return captions[top2[0]], captions[top2[1]], top2, dict(counts)
314
 
315
- # ============================================================================
316
- # STEP 6 — GROUNDING DINO: OBJECT DETECTION
317
- # ============================================================================
318
  def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
319
  try:
320
  inputs = dino_proc(
@@ -355,7 +378,11 @@ def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
355
  return "Object detection unavailable", []
356
 
357
  # ============================================================================
358
- # STEP 7 QWEN2.5-1.5B (LOCAL): CAPTION FUSION
 
 
 
 
359
  # ============================================================================
360
  def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
361
 
@@ -368,7 +395,7 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
368
  "what each person looks like and what they are doing, "
369
  "the objects and plants visible around them, "
370
  "and the setting or background of the scene. "
371
- "Write 3 to 4 sentences. Use simple, clear, everyday words. "
372
  "Do NOT summarize or shorten — keep every specific detail. "
373
  "Only include what is clearly visible. "
374
  "Return ONLY the caption, nothing else."
@@ -379,7 +406,7 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
379
  f"Caption B: {cap2}\n"
380
  f"{objects}\n\n"
381
  "Write a detailed caption that includes all the clothing, "
382
- "people, objects and background details:"
383
  )
384
 
385
  try:
@@ -416,15 +443,12 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
416
  st.warning(f"Qwen fusion error: {str(e)[:80]}")
417
  return cap1
418
 
419
- # ============================================================================
420
- # SIDEBAR
421
- # ============================================================================
422
  with st.sidebar:
423
  st.title("Image Caption Fusion")
424
  st.markdown("---")
425
  st.markdown("### Pipeline Steps")
426
  st.markdown("""
427
- **1. Gemini 2.0 Flash** (API)
428
  Generate 5 captions
429
 
430
  **2. BLIP ITM** (Local)
@@ -446,12 +470,9 @@ Object detection
446
  Caption fusion
447
  """)
448
  st.markdown("---")
449
- st.markdown("**Local:** BLIP ITM, DINO, Qwen2.5")
450
- st.markdown("**API:** Gemini 2.0 Flash, Jina")
451
 
452
- # ============================================================================
453
- # MAIN UI
454
- # ============================================================================
455
  st.title("Image Caption Fusion System")
456
  st.markdown("Upload an image to generate a refined, grounded caption.")
457
  st.markdown("---")
@@ -472,8 +493,9 @@ if uploaded_file is not None:
472
  with col_run:
473
  if st.button("Generate Caption", type="primary", use_container_width=True):
474
 
475
- with st.spinner("Loading local models (first run takes 2-3 min)..."):
476
  (
 
477
  blip_proc, blip_itm,
478
  dino_proc, dino_mod,
479
  qwen_tok, qwen_mod
@@ -482,8 +504,8 @@ if uploaded_file is not None:
482
  progress = st.progress(0)
483
  status = st.empty()
484
 
485
- status.info("Step 1/7: Generating captions with Gemini 2.0 Flash...")
486
- captions = generate_captions_gemini(input_image)
487
  progress.progress(14)
488
 
489
  with st.expander("5 Generated Captions", expanded=True):
 
6
  import requests
7
  import base64
8
  import streamlit as st
 
9
  from PIL import Image
10
  from io import BytesIO
11
  from collections import Counter
 
18
  initial_sidebar_state="expanded"
19
  )
20
 
21
+ HF_TOKEN = os.environ.get("HF_TOKEN", "")
22
+ JINA_KEY = os.environ.get("JINA_KEY", "")
 
 
 
23
 
24
  JINA_URL = "https://api.jina.ai/v1/rerank"
25
  JINA_HEADERS = {
 
41
  "car . bicycle . motorcycle . bus . truck . street . kitchen . restaurant . cafe"
42
  )
43
 
 
 
 
44
  if not JINA_KEY:
45
  st.error("JINA_KEY missing. Go to Space Settings → Secrets and add it.")
46
  st.stop()
47
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  @st.cache_resource
49
  def load_local_models():
50
  from transformers import (
51
+ AutoProcessor,
52
  AutoModelForCausalLM,
53
  AutoTokenizer,
54
  BlipProcessor,
55
  BlipForImageTextRetrieval,
 
56
  AutoModelForZeroShotObjectDetection
57
  )
58
  gc.collect()
59
 
60
+ florence_processor = AutoProcessor.from_pretrained(
61
+ "microsoft/Florence-2-large",
62
+ trust_remote_code=True
63
+ )
64
+ florence_model = AutoModelForCausalLM.from_pretrained(
65
+ "microsoft/Florence-2-large",
66
+ trust_remote_code=True,
67
+ torch_dtype=torch.float32
68
+ )
69
+ florence_model.eval()
70
+
71
  blip_processor = BlipProcessor.from_pretrained(
72
  "Salesforce/blip-image-captioning-large"
73
  )
 
77
  )
78
  blip_itm_model.eval()
79
 
 
80
  dino_processor = AutoProcessor.from_pretrained(
81
  "IDEA-Research/grounding-dino-base"
82
  )
 
86
  )
87
  dino_model.eval()
88
 
 
89
  qwen_tokenizer = AutoTokenizer.from_pretrained(
90
  "Qwen/Qwen2.5-1.5B-Instruct"
91
  )
 
96
  qwen_model.eval()
97
 
98
  return (
99
+ florence_processor, florence_model,
100
  blip_processor, blip_itm_model,
101
  dino_processor, dino_model,
102
  qwen_tokenizer, qwen_model
103
  )
104
 
 
 
 
105
  def image_to_bytes(image: Image.Image) -> bytes:
106
  buf = BytesIO()
107
  image.save(buf, format="JPEG", quality=85)
 
112
  b64 = base64.b64encode(raw).decode()
113
  return f"data:image/jpeg;base64,{b64}"
114
 
115
+ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
+ captions = []
118
+ image_size = (image.width, image.height)
119
 
120
+ # Task 1: Short caption
121
+ try:
122
+ inputs = florence_proc(
123
+ text="<CAPTION>", images=image, return_tensors="pt"
124
+ )
125
+ with torch.no_grad():
126
+ ids = florence_mod.generate(
127
+ input_ids=inputs["input_ids"],
128
+ pixel_values=inputs["pixel_values"],
129
+ max_new_tokens=50, num_beams=3
130
+ )
131
+ raw = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
132
+ parsed = florence_proc.post_process_generation(raw, task="<CAPTION>", image_size=image_size)
133
+ cap = parsed.get("<CAPTION>", "").strip().lower()
134
+ captions.append(cap if cap else "a scene shown in the image")
135
+ except Exception as e:
136
+ st.warning(f"Florence CAPTION error: {str(e)[:80]}")
137
+ captions.append("a scene shown in the image")
138
+
139
+ # Task 2: Detailed caption
140
+ try:
141
+ inputs = florence_proc(
142
+ text="<DETAILED_CAPTION>", images=image, return_tensors="pt"
143
+ )
144
+ with torch.no_grad():
145
+ ids = florence_mod.generate(
146
+ input_ids=inputs["input_ids"],
147
+ pixel_values=inputs["pixel_values"],
148
+ max_new_tokens=100, num_beams=3
149
+ )
150
+ raw = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
151
+ parsed = florence_proc.post_process_generation(raw, task="<DETAILED_CAPTION>", image_size=image_size)
152
+ cap = parsed.get("<DETAILED_CAPTION>", "").strip().lower()
153
+ captions.append(cap if cap else "a scene shown in the image")
154
+ except Exception as e:
155
+ st.warning(f"Florence DETAILED_CAPTION error: {str(e)[:80]}")
156
+ captions.append("a scene shown in the image")
157
+
158
+ # Task 3: More detailed caption
159
+ try:
160
+ inputs = florence_proc(
161
+ text="<MORE_DETAILED_CAPTION>", images=image, return_tensors="pt"
162
+ )
163
+ with torch.no_grad():
164
+ ids = florence_mod.generate(
165
+ input_ids=inputs["input_ids"],
166
+ pixel_values=inputs["pixel_values"],
167
+ max_new_tokens=150, num_beams=3
168
+ )
169
+ raw = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
170
+ parsed = florence_proc.post_process_generation(raw, task="<MORE_DETAILED_CAPTION>", image_size=image_size)
171
+ cap = parsed.get("<MORE_DETAILED_CAPTION>", "").strip().lower()
172
+ captions.append(cap if cap else "a scene shown in the image")
173
+ except Exception as e:
174
+ st.warning(f"Florence MORE_DETAILED_CAPTION error: {str(e)[:80]}")
175
+ captions.append("a scene shown in the image")
176
+
177
+ # Task 4: Dense region caption
178
+ try:
179
+ inputs = florence_proc(
180
+ text="<DENSE_REGION_CAPTION>", images=image, return_tensors="pt"
181
  )
182
+ with torch.no_grad():
183
+ ids = florence_mod.generate(
184
+ input_ids=inputs["input_ids"],
185
+ pixel_values=inputs["pixel_values"],
186
+ max_new_tokens=200, num_beams=3
187
+ )
188
+ raw = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
189
+ parsed = florence_proc.post_process_generation(raw, task="<DENSE_REGION_CAPTION>", image_size=image_size)
190
+ labels = parsed.get("<DENSE_REGION_CAPTION>", {}).get("labels", [])
191
+
192
+ if labels:
193
+ seen_r, unique_r = set(), []
194
+ for l in labels:
195
+ if l.lower() not in seen_r:
196
+ seen_r.add(l.lower())
197
+ unique_r.append(l.lower())
198
+ cap = ", ".join(unique_r[:6]) + " visible in the scene"
199
  else:
200
+ cap = "a scene shown in the image"
201
+ captions.append(cap)
202
+ except Exception as e:
203
+ st.warning(f"Florence DENSE_REGION error: {str(e)[:80]}")
204
+ captions.append("a scene shown in the image")
205
+
206
+ # Task 5: Object detection
207
+ try:
208
+ inputs = florence_proc(
209
+ text="<OD>", images=image, return_tensors="pt"
210
+ )
211
+ with torch.no_grad():
212
+ ids = florence_mod.generate(
213
+ input_ids=inputs["input_ids"],
214
+ pixel_values=inputs["pixel_values"],
215
+ max_new_tokens=200, num_beams=3
216
+ )
217
+ raw = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
218
+ parsed = florence_proc.post_process_generation(raw, task="<OD>", image_size=image_size)
219
+ labels = parsed.get("<OD>", {}).get("labels", [])
220
+
221
+ if labels:
222
+ seen_o, unique_o = set(), []
223
+ for l in labels:
224
+ if l.lower() not in seen_o:
225
+ seen_o.add(l.lower())
226
+ unique_o.append(l.lower())
227
+ cap = "a scene containing " + ", ".join(unique_o[:6])
228
+ else:
229
+ cap = "a scene shown in the image"
230
+ captions.append(cap)
231
+ except Exception as e:
232
+ st.warning(f"Florence OD error: {str(e)[:80]}")
233
+ captions.append("a scene shown in the image")
234
 
235
  seen, unique = set(), []
236
  for c in captions:
 
246
 
247
  return unique[:5]
248
 
 
 
 
249
  def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
250
  scores = []
251
  for cap in captions:
 
265
  scores.append(0.0)
266
  return scores
267
 
 
 
 
268
  def compute_jina_scores(image: Image.Image, captions: list) -> list:
269
  img_data_uri = image_to_data_uri(image)
270
  scores = []
 
295
  scores.append(0.0)
296
  return scores
297
 
 
 
 
298
  def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
299
  try:
300
  img_inp = blip_proc(images=image, return_tensors="pt")
 
321
  st.warning(f"Cosine error: {str(e)[:60]}")
322
  return [0.0] * len(captions)
323
 
 
 
 
324
  def majority_voting(captions, itm, jina, cosine) -> tuple:
325
  itm_r = np.argsort(itm)[::-1]
326
  jina_r = np.argsort(jina)[::-1]
 
338
 
339
  return captions[top2[0]], captions[top2[1]], top2, dict(counts)
340
 
 
 
 
341
  def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
342
  try:
343
  inputs = dino_proc(
 
378
  return "Object detection unavailable", []
379
 
380
  # ============================================================================
381
+ # fuse_captions: CHANGED
382
+ # system_prompt: explicitly covers clothing, colors, people, objects, setting
383
+ # user_prompt: asks for all specific details including clothing and background
384
+ # max_new_tokens: 100 → 180 (more room for the 5-6 sentences the system prompt requests)
385
+ # temperature: 0.2 → 0.4 (more expressive while staying factual)
386
  # ============================================================================
387
  def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
388
 
 
395
  "what each person looks like and what they are doing, "
396
  "the objects and plants visible around them, "
397
  "and the setting or background of the scene. "
398
+ "Write 5 to 6 sentences. Use simple, clear, everyday words. "
399
  "Do NOT summarize or shorten — keep every specific detail. "
400
  "Only include what is clearly visible. "
401
  "Return ONLY the caption, nothing else."
 
406
  f"Caption B: {cap2}\n"
407
  f"{objects}\n\n"
408
  "Write a detailed caption that includes all the clothing, "
409
+ "people, objects and background in details:"
410
  )
411
 
412
  try:
 
443
  st.warning(f"Qwen fusion error: {str(e)[:80]}")
444
  return cap1
445
 
 
 
 
446
  with st.sidebar:
447
  st.title("Image Caption Fusion")
448
  st.markdown("---")
449
  st.markdown("### Pipeline Steps")
450
  st.markdown("""
451
+ **1. Florence-2-Large** (Local)
452
  Generate 5 captions
453
 
454
  **2. BLIP ITM** (Local)
 
470
  Caption fusion
471
  """)
472
  st.markdown("---")
473
+ st.markdown("**Local:** Florence-2, BLIP ITM, DINO, Qwen2.5")
474
+ st.markdown("**API:** Jina")
475
 
 
 
 
476
  st.title("Image Caption Fusion System")
477
  st.markdown("Upload an image to generate a refined, grounded caption.")
478
  st.markdown("---")
 
493
  with col_run:
494
  if st.button("Generate Caption", type="primary", use_container_width=True):
495
 
496
+ with st.spinner("Loading local models (first run takes 3-4 min)..."):
497
  (
498
+ florence_proc, florence_mod,
499
  blip_proc, blip_itm,
500
  dino_proc, dino_mod,
501
  qwen_tok, qwen_mod
 
504
  progress = st.progress(0)
505
  status = st.empty()
506
 
507
+ status.info("Step 1/7: Generating captions with Florence-2-Large...")
508
+ captions = generate_captions_florence(input_image, florence_proc, florence_mod)
509
  progress.progress(14)
510
 
511
  with st.expander("5 Generated Captions", expanded=True):