Afsha001 committed on
Commit
380cdcf
·
verified ·
1 Parent(s): 6abcc47
Files changed (1) hide show
  1. app.py +29 -17
app.py CHANGED
@@ -112,16 +112,17 @@ def image_to_data_uri(image: Image.Image) -> str:
112
  return f"data:image/jpeg;base64,{b64}"
113
 
114
  # ============================================================================
115
- # CHANGED: generate_captions_florence — speed optimized
116
  #
117
- # What changed:
118
- # 1. num_beams 3 → 1 (greedy decoding) — 3x faster, near-identical quality
119
- # 2. max_new_tokens reduced: 50→30, 100→80, 150→120 — only generate what is needed
120
- # 3. Removed DENSE_REGION_CAPTION and OD tasks — slowest tasks (200 tokens each)
121
- # and they return structured bounding box data not natural captions anyway
122
  #
123
- # Speed result: ~2-3 min → ~25 sec
124
- # Quality result: no meaningful loss — 3 caption tasks still give full diversity
 
 
 
125
  # ============================================================================
126
  def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
127
 
@@ -129,12 +130,24 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
129
  image_size = (image.width, image.height)
130
 
131
  tasks = [
132
- ("<CAPTION>", 30, 1),
133
- ("<DETAILED_CAPTION>", 80, 1),
134
- ("<MORE_DETAILED_CAPTION>", 120, 1),
 
 
 
 
 
 
 
 
 
 
 
 
135
  ]
136
 
137
- for task_prompt, max_tokens, num_beams in tasks:
138
  try:
139
  inputs = florence_proc(
140
  text=task_prompt, images=image, return_tensors="pt"
@@ -144,7 +157,7 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
144
  input_ids=inputs["input_ids"],
145
  pixel_values=inputs["pixel_values"],
146
  max_new_tokens=max_tokens,
147
- num_beams=num_beams
148
  )
149
  raw = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
150
  parsed = florence_proc.post_process_generation(
@@ -157,6 +170,7 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
157
  st.warning(f"Florence {task_prompt} error: {str(e)[:80]}")
158
  captions.append("a scene shown in the image")
159
 
 
160
  seen, unique = set(), []
161
  for c in captions:
162
  if c not in seen:
@@ -342,10 +356,8 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
342
  with torch.no_grad():
343
  generated_ids = qwen_mod.generate(
344
  **model_inputs,
345
- max_new_tokens=180,
346
- temperature=0.4,
347
- do_sample=True,
348
- top_p=0.9
349
  )
350
 
351
  output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
 
112
  return f"data:image/jpeg;base64,{b64}"
113
 
114
  # ============================================================================
115
+ # generate_captions_florence — speed optimized + diversity fixed
116
  #
117
+ # Problem: num_beams=1 greedy produces near-identical captions across tasks
118
+ # Fix: Task 1 stays greedy (baseline), Tasks 2 and 3 use sampling
119
+ # with increasing temperature — each task explores different word paths
 
 
120
  #
121
+ # Task 1: greedy → deterministic, short, factual baseline
122
+ # Task 2: temp=0.7 → slightly varied, focuses on detail
123
+ # Task 3: temp=1.1 → more varied phrasing, different sentence structure
124
+ #
125
+ # Speed: sampling is as fast or faster than beam search — no regression
126
  # ============================================================================
127
  def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
128
 
 
130
  image_size = (image.width, image.height)
131
 
132
  tasks = [
133
+ (
134
+ "<CAPTION>",
135
+ 30,
136
+ {"num_beams": 1}
137
+ ),
138
+ (
139
+ "<DETAILED_CAPTION>",
140
+ 80,
141
+ {"do_sample": True, "temperature": 0.7, "top_p": 0.9}
142
+ ),
143
+ (
144
+ "<MORE_DETAILED_CAPTION>",
145
+ 120,
146
+ {"do_sample": True, "temperature": 1.1, "top_p": 0.95}
147
+ ),
148
  ]
149
 
150
+ for task_prompt, max_tokens, gen_params in tasks:
151
  try:
152
  inputs = florence_proc(
153
  text=task_prompt, images=image, return_tensors="pt"
 
157
  input_ids=inputs["input_ids"],
158
  pixel_values=inputs["pixel_values"],
159
  max_new_tokens=max_tokens,
160
+ **gen_params
161
  )
162
  raw = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
163
  parsed = florence_proc.post_process_generation(
 
170
  st.warning(f"Florence {task_prompt} error: {str(e)[:80]}")
171
  captions.append("a scene shown in the image")
172
 
173
+ # Deduplicate while keeping order
174
  seen, unique = set(), []
175
  for c in captions:
176
  if c not in seen:
 
356
  with torch.no_grad():
357
  generated_ids = qwen_mod.generate(
358
  **model_inputs,
359
+ max_new_tokens=120,
360
+ do_sample=False
 
 
361
  )
362
 
363
  output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]