Afsha001 committed on
Commit
9ef58b8
·
verified ·
1 Parent(s): 4b0137c

update Florence2

Browse files
Files changed (1) hide show
  1. app.py +80 -52
app.py CHANGED
@@ -27,13 +27,26 @@ JINA_HEADERS = {
27
  "Content-Type": "application/json"
28
  }
29
 
 
 
 
 
30
  DETECT_PROMPT = (
31
- "person . child . man . woman . boy . girl . "
32
- "dog . cat . horse . bird . animal . "
33
- "ball . toy . bicycle . car . bench . "
34
- "tree . grass . water . sky . mountain . "
35
- "building . stairs . door . fence . floor . "
36
- "jacket . dress . shirt . hat . bag ."
 
 
 
 
 
 
 
 
 
37
  )
38
 
39
  if not JINA_KEY:
@@ -41,11 +54,8 @@ if not JINA_KEY:
41
  st.stop()
42
 
43
  # ============================================================================
44
- # LOAD LOCAL MODELS
45
- # GIT-Large-COCO: caption generation
46
- # BLIP ITM: image-text matching + cosine similarity
47
- # DINO: object detection
48
- # Qwen2.5-1.5B: caption fusion (moved local — API was returning 404)
49
  # ============================================================================
50
  @st.cache_resource
51
  def load_local_models():
@@ -59,13 +69,17 @@ def load_local_models():
59
  )
60
  gc.collect()
61
 
62
- # GIT-Large-COCO — caption generation
63
- git_processor = AutoProcessor.from_pretrained("microsoft/git-large-coco")
64
- git_model = AutoModelForCausalLM.from_pretrained(
65
- "microsoft/git-large-coco",
 
 
 
 
66
  torch_dtype=torch.float32
67
  )
68
- git_model.eval()
69
 
70
  # BLIP — ITM scoring and cosine similarity
71
  blip_processor = BlipProcessor.from_pretrained(
@@ -87,7 +101,7 @@ def load_local_models():
87
  )
88
  dino_model.eval()
89
 
90
- # Qwen2.5-1.5B — caption fusion (local, no API)
91
  qwen_tokenizer = AutoTokenizer.from_pretrained(
92
  "Qwen/Qwen2.5-1.5B-Instruct"
93
  )
@@ -98,7 +112,7 @@ def load_local_models():
98
  qwen_model.eval()
99
 
100
  return (
101
- git_processor, git_model,
102
  blip_processor, blip_itm_model,
103
  dino_processor, dino_model,
104
  qwen_tokenizer, qwen_model
@@ -114,32 +128,53 @@ def image_to_data_uri(image: Image.Image) -> str:
114
  b64 = base64.b64encode(raw).decode()
115
  return f"data:image/jpeg;base64,{b64}"
116
 
117
- def generate_captions_git(image: Image.Image, git_proc, git_mod) -> list:
118
-
119
- strategies = [
120
- {"max_new_tokens": 30},
121
- {"max_new_tokens": 50, "num_beams": 5, "early_stopping": True},
122
- {"max_new_tokens": 60, "do_sample": True, "temperature": 0.7, "top_k": 50},
123
- {"max_new_tokens": 70, "do_sample": True, "temperature": 1.3, "top_k": 100},
124
- {"max_new_tokens": 55, "do_sample": True, "top_p": 0.9, "temperature": 1.0},
 
 
 
 
 
125
  ]
126
 
127
- captions = []
128
- pixel_values = git_proc(images=image, return_tensors="pt").pixel_values
129
 
130
- for strategy in strategies:
131
  try:
 
 
 
 
 
 
132
  with torch.no_grad():
133
- generated_ids = git_mod.generate(
134
- pixel_values=pixel_values,
135
- **strategy
 
136
  )
137
- cap = git_proc.batch_decode(
138
- generated_ids, skip_special_tokens=True
139
- )[0].strip().lower()
 
 
 
 
 
 
 
 
 
140
  captions.append(cap if cap else "a scene shown in the image")
 
141
  except Exception as e:
142
- st.warning(f"GIT error: {str(e)[:80]}")
143
  captions.append("a scene shown in the image")
144
 
145
  seen, unique = set(), []
@@ -287,14 +322,7 @@ def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
287
  st.warning(f"DINO error: {str(e)[:80]}")
288
  return "Object detection unavailable", []
289
 
290
- # ============================================================================
291
- # STEP 7 — QWEN2.5-1.5B (LOCAL): CAPTION FUSION
292
- # Moved from API to local — API was consistently returning 404
293
- # Uses chat template for proper instruct format
294
- # Prompt asks Qwen to enrich and add detail using detected objects
295
- # ============================================================================
296
  def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
297
-
298
  system_prompt = (
299
  "You are an expert image captioning assistant. "
300
  "Write ONE natural, fluent, detailed and descriptive caption. "
@@ -315,9 +343,7 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
315
  ]
316
 
317
  text = qwen_tok.apply_chat_template(
318
- messages,
319
- tokenize=False,
320
- add_generation_prompt=True
321
  )
322
 
323
  model_inputs = qwen_tok([text], return_tensors="pt")
@@ -331,7 +357,6 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
331
  top_p=0.9
332
  )
333
 
334
- # Strip input tokens from output
335
  output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
336
  fused = qwen_tok.decode(output_ids, skip_special_tokens=True).strip()
337
 
@@ -345,12 +370,15 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
345
  st.warning(f"Qwen fusion error: {str(e)[:80]}")
346
  return cap1
347
 
 
 
 
348
  with st.sidebar:
349
  st.title("Image Caption Fusion")
350
  st.markdown("---")
351
  st.markdown("### Pipeline Steps")
352
  st.markdown("""
353
- **1. GIT-Large-COCO** (Local)
354
  Generate 5 captions
355
 
356
  **2. BLIP ITM** (Local)
@@ -372,7 +400,7 @@ Object detection
372
  Caption fusion
373
  """)
374
  st.markdown("---")
375
- st.markdown("**Local:** GIT-Large, BLIP ITM, DINO, Qwen2.5")
376
  st.markdown("**API:** Jina")
377
 
378
  st.title("Image Caption Fusion System")
@@ -397,7 +425,7 @@ if uploaded_file is not None:
397
 
398
  with st.spinner("Loading local models (first run takes 3-4 min)..."):
399
  (
400
- git_proc, git_mod,
401
  blip_proc, blip_itm,
402
  dino_proc, dino_mod,
403
  qwen_tok, qwen_mod
@@ -406,8 +434,8 @@ if uploaded_file is not None:
406
  progress = st.progress(0)
407
  status = st.empty()
408
 
409
- status.info("Step 1/7: Generating captions with GIT-Large-COCO...")
410
- captions = generate_captions_git(input_image, git_proc, git_mod)
411
  progress.progress(14)
412
 
413
  with st.expander("5 Generated Captions", expanded=True):
 
27
  "Content-Type": "application/json"
28
  }
29
 
30
+ # ============================================================================
31
+ # CHANGE 1: DETECT_PROMPT — expanded with colours, furniture, objects
32
+ # More labels = richer grounding for Qwen fusion
33
+ # ============================================================================
34
  DETECT_PROMPT = (
35
+ "person . man . woman . boy . girl . child . baby . "
36
+ "red . blue . green . yellow . black . white . orange . purple . brown . "
37
+ "shirt . jacket . dress . coat . hat . glasses . bag . shoes . "
38
+ "table . chair . bench . sofa . desk . stool . wooden chair . dining table . "
39
+ "cup . glass . bottle . plate . bowl . fork . spoon . knife . "
40
+ "car . bicycle . motorcycle . bus . truck . "
41
+ "tree . grass . flower . sky . water . river . mountain . road . "
42
+ "building . wall . door . window . floor . ceiling . stairs . "
43
+ "lamp . light . candle . fire . smoke . "
44
+ "phone . laptop . book . bag . umbrella . "
45
+ "dog . cat . bird . horse . animal . "
46
+ "food . pizza . cake . bread . fruit . "
47
+ "bar . restaurant . pub . cafe . kitchen . "
48
+ "wood . metal . glass . brick . "
49
+ "dark . bright . colorful ."
50
  )
51
 
52
  if not JINA_KEY:
 
54
  st.stop()
55
 
56
  # ============================================================================
57
+ # CHANGE 2: load_local_models — replaced GIT with Florence-2-Large
58
+ # Florence-2 has 3 built-in task tokens — accurate, grounded, no hallucination
 
 
 
59
  # ============================================================================
60
  @st.cache_resource
61
  def load_local_models():
 
69
  )
70
  gc.collect()
71
 
72
+ # Florence-2-Large — accurate caption generation with task tokens
73
+ florence_processor = AutoProcessor.from_pretrained(
74
+ "microsoft/Florence-2-large",
75
+ trust_remote_code=True
76
+ )
77
+ florence_model = AutoModelForCausalLM.from_pretrained(
78
+ "microsoft/Florence-2-large",
79
+ trust_remote_code=True,
80
  torch_dtype=torch.float32
81
  )
82
+ florence_model.eval()
83
 
84
  # BLIP — ITM scoring and cosine similarity
85
  blip_processor = BlipProcessor.from_pretrained(
 
101
  )
102
  dino_model.eval()
103
 
104
+ # Qwen2.5-1.5B — caption fusion (local)
105
  qwen_tokenizer = AutoTokenizer.from_pretrained(
106
  "Qwen/Qwen2.5-1.5B-Instruct"
107
  )
 
112
  qwen_model.eval()
113
 
114
  return (
115
+ florence_processor, florence_model,
116
  blip_processor, blip_itm_model,
117
  dino_processor, dino_model,
118
  qwen_tokenizer, qwen_model
 
128
  b64 = base64.b64encode(raw).decode()
129
  return f"data:image/jpeg;base64,{b64}"
130
 
131
+ # ============================================================================
132
+ # CHANGE 3: generate_captions_florence — replaces generate_captions_git
133
+ # Uses Florence-2 task tokens for naturally diverse and accurate captions
134
+ # <CAPTION> / <DETAILED_CAPTION> / <MORE_DETAILED_CAPTION>
135
+ # ============================================================================
136
+ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
137
+
138
+ tasks = [
139
+ ("<CAPTION>", {"max_new_tokens": 50, "num_beams": 3}),
140
+ ("<DETAILED_CAPTION>", {"max_new_tokens": 100, "num_beams": 3}),
141
+ ("<MORE_DETAILED_CAPTION>", {"max_new_tokens": 150, "num_beams": 3}),
142
+ ("<DETAILED_CAPTION>", {"max_new_tokens": 100, "num_beams": 5}),
143
+ ("<CAPTION>", {"max_new_tokens": 80, "num_beams": 5}),
144
  ]
145
 
146
+ captions = []
 
147
 
148
+ for task_prompt, gen_kwargs in tasks:
149
  try:
150
+ inputs = florence_proc(
151
+ text=task_prompt,
152
+ images=image,
153
+ return_tensors="pt"
154
+ )
155
+
156
  with torch.no_grad():
157
+ generated_ids = florence_mod.generate(
158
+ input_ids=inputs["input_ids"],
159
+ pixel_values=inputs["pixel_values"],
160
+ **gen_kwargs
161
  )
162
+
163
+ generated_text = florence_proc.batch_decode(
164
+ generated_ids, skip_special_tokens=False
165
+ )[0]
166
+
167
+ parsed = florence_proc.post_process_generation(
168
+ generated_text,
169
+ task=task_prompt,
170
+ image_size=(image.width, image.height)
171
+ )
172
+
173
+ cap = parsed.get(task_prompt, "").strip().lower()
174
  captions.append(cap if cap else "a scene shown in the image")
175
+
176
  except Exception as e:
177
+ st.warning(f"Florence error: {str(e)[:80]}")
178
  captions.append("a scene shown in the image")
179
 
180
  seen, unique = set(), []
 
322
  st.warning(f"DINO error: {str(e)[:80]}")
323
  return "Object detection unavailable", []
324
 
 
 
 
 
 
 
325
  def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
 
326
  system_prompt = (
327
  "You are an expert image captioning assistant. "
328
  "Write ONE natural, fluent, detailed and descriptive caption. "
 
343
  ]
344
 
345
  text = qwen_tok.apply_chat_template(
346
+ messages, tokenize=False, add_generation_prompt=True
 
 
347
  )
348
 
349
  model_inputs = qwen_tok([text], return_tensors="pt")
 
357
  top_p=0.9
358
  )
359
 
 
360
  output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
361
  fused = qwen_tok.decode(output_ids, skip_special_tokens=True).strip()
362
 
 
370
  st.warning(f"Qwen fusion error: {str(e)[:80]}")
371
  return cap1
372
 
373
+ # ============================================================================
374
+ # CHANGE 4: sidebar — updated step 1 label to Florence-2-Large
375
+ # ============================================================================
376
  with st.sidebar:
377
  st.title("Image Caption Fusion")
378
  st.markdown("---")
379
  st.markdown("### Pipeline Steps")
380
  st.markdown("""
381
+ **1. Florence-2-Large** (Local)
382
  Generate 5 captions
383
 
384
  **2. BLIP ITM** (Local)
 
400
  Caption fusion
401
  """)
402
  st.markdown("---")
403
+ st.markdown("**Local:** Florence-2, BLIP ITM, DINO, Qwen2.5")
404
  st.markdown("**API:** Jina")
405
 
406
  st.title("Image Caption Fusion System")
 
425
 
426
  with st.spinner("Loading local models (first run takes 3-4 min)..."):
427
  (
428
+ florence_proc, florence_mod,
429
  blip_proc, blip_itm,
430
  dino_proc, dino_mod,
431
  qwen_tok, qwen_mod
 
434
  progress = st.progress(0)
435
  status = st.empty()
436
 
437
+ status.info("Step 1/7: Generating captions with Florence-2-Large...")
438
+ captions = generate_captions_florence(input_image, florence_proc, florence_mod)
439
  progress.progress(14)
440
 
441
  with st.expander("5 Generated Captions", expanded=True):