Spaces:
Running
Running
update
Browse files
app.py
CHANGED
|
@@ -112,16 +112,17 @@ def image_to_data_uri(image: Image.Image) -> str:
|
|
| 112 |
return f"data:image/jpeg;base64,{b64}"
|
| 113 |
|
| 114 |
# ============================================================================
|
| 115 |
-
#
|
| 116 |
#
|
| 117 |
-
#
|
| 118 |
-
#
|
| 119 |
-
#
|
| 120 |
-
# 3. Removed DENSE_REGION_CAPTION and OD tasks — slowest tasks (200 tokens each)
|
| 121 |
-
# and they return structured bounding box data not natural captions anyway
|
| 122 |
#
|
| 123 |
-
#
|
| 124 |
-
#
|
|
|
|
|
|
|
|
|
|
| 125 |
# ============================================================================
|
| 126 |
def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
|
| 127 |
|
|
@@ -129,12 +130,24 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
|
|
| 129 |
image_size = (image.width, image.height)
|
| 130 |
|
| 131 |
tasks = [
|
| 132 |
-
(
|
| 133 |
-
|
| 134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
]
|
| 136 |
|
| 137 |
-
for task_prompt, max_tokens,
|
| 138 |
try:
|
| 139 |
inputs = florence_proc(
|
| 140 |
text=task_prompt, images=image, return_tensors="pt"
|
|
@@ -144,7 +157,7 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
|
|
| 144 |
input_ids=inputs["input_ids"],
|
| 145 |
pixel_values=inputs["pixel_values"],
|
| 146 |
max_new_tokens=max_tokens,
|
| 147 |
-
|
| 148 |
)
|
| 149 |
raw = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
|
| 150 |
parsed = florence_proc.post_process_generation(
|
|
@@ -157,6 +170,7 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
|
|
| 157 |
st.warning(f"Florence {task_prompt} error: {str(e)[:80]}")
|
| 158 |
captions.append("a scene shown in the image")
|
| 159 |
|
|
|
|
| 160 |
seen, unique = set(), []
|
| 161 |
for c in captions:
|
| 162 |
if c not in seen:
|
|
@@ -342,10 +356,8 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
|
|
| 342 |
with torch.no_grad():
|
| 343 |
generated_ids = qwen_mod.generate(
|
| 344 |
**model_inputs,
|
| 345 |
-
max_new_tokens=
|
| 346 |
-
|
| 347 |
-
do_sample=True,
|
| 348 |
-
top_p=0.9
|
| 349 |
)
|
| 350 |
|
| 351 |
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
|
|
|
|
| 112 |
return f"data:image/jpeg;base64,{b64}"
|
| 113 |
|
| 114 |
# ============================================================================
|
| 115 |
+
# generate_captions_florence — speed optimized + diversity fixed
|
| 116 |
#
|
| 117 |
+
# Problem: num_beams=1 greedy produces near-identical captions across tasks
|
| 118 |
+
# Fix: Task 1 stays greedy (baseline), Tasks 2 and 3 use sampling
|
| 119 |
+
# with increasing temperature — each task explores different word paths
|
|
|
|
|
|
|
| 120 |
#
|
| 121 |
+
# Task 1: greedy — deterministic, short, factual baseline
|
| 122 |
+
# Task 2: temp=0.7 — slightly varied, focuses on detail
|
| 123 |
+
# Task 3: temp=1.1 — more varied phrasing, different sentence structure
|
| 124 |
+
#
|
| 125 |
+
# Speed: sampling is as fast or faster than beam search — no regression
|
| 126 |
# ============================================================================
|
| 127 |
def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
|
| 128 |
|
|
|
|
| 130 |
image_size = (image.width, image.height)
|
| 131 |
|
| 132 |
tasks = [
|
| 133 |
+
(
|
| 134 |
+
"<CAPTION>",
|
| 135 |
+
30,
|
| 136 |
+
{"num_beams": 1}
|
| 137 |
+
),
|
| 138 |
+
(
|
| 139 |
+
"<DETAILED_CAPTION>",
|
| 140 |
+
80,
|
| 141 |
+
{"do_sample": True, "temperature": 0.7, "top_p": 0.9}
|
| 142 |
+
),
|
| 143 |
+
(
|
| 144 |
+
"<MORE_DETAILED_CAPTION>",
|
| 145 |
+
120,
|
| 146 |
+
{"do_sample": True, "temperature": 1.1, "top_p": 0.95}
|
| 147 |
+
),
|
| 148 |
]
|
| 149 |
|
| 150 |
+
for task_prompt, max_tokens, gen_params in tasks:
|
| 151 |
try:
|
| 152 |
inputs = florence_proc(
|
| 153 |
text=task_prompt, images=image, return_tensors="pt"
|
|
|
|
| 157 |
input_ids=inputs["input_ids"],
|
| 158 |
pixel_values=inputs["pixel_values"],
|
| 159 |
max_new_tokens=max_tokens,
|
| 160 |
+
**gen_params
|
| 161 |
)
|
| 162 |
raw = florence_proc.batch_decode(ids, skip_special_tokens=False)[0]
|
| 163 |
parsed = florence_proc.post_process_generation(
|
|
|
|
| 170 |
st.warning(f"Florence {task_prompt} error: {str(e)[:80]}")
|
| 171 |
captions.append("a scene shown in the image")
|
| 172 |
|
| 173 |
+
# Deduplicate while keeping order
|
| 174 |
seen, unique = set(), []
|
| 175 |
for c in captions:
|
| 176 |
if c not in seen:
|
|
|
|
| 356 |
with torch.no_grad():
|
| 357 |
generated_ids = qwen_mod.generate(
|
| 358 |
**model_inputs,
|
| 359 |
+
max_new_tokens=120,
|
| 360 |
+
do_sample=False
|
|
|
|
|
|
|
| 361 |
)
|
| 362 |
|
| 363 |
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
|