Spaces:
Running
Running
update Florence2
Browse files
app.py
CHANGED
|
@@ -27,13 +27,26 @@ JINA_HEADERS = {
|
|
| 27 |
"Content-Type": "application/json"
|
| 28 |
}
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
DETECT_PROMPT = (
|
| 31 |
-
"person .
|
| 32 |
-
"
|
| 33 |
-
"
|
| 34 |
-
"
|
| 35 |
-
"
|
| 36 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
)
|
| 38 |
|
| 39 |
if not JINA_KEY:
|
|
@@ -41,11 +54,8 @@ if not JINA_KEY:
|
|
| 41 |
st.stop()
|
| 42 |
|
| 43 |
# ============================================================================
|
| 44 |
-
#
|
| 45 |
-
#
|
| 46 |
-
# BLIP ITM: image-text matching + cosine similarity
|
| 47 |
-
# DINO: object detection
|
| 48 |
-
# Qwen2.5-1.5B: caption fusion (moved local — API was returning 404)
|
| 49 |
# ============================================================================
|
| 50 |
@st.cache_resource
|
| 51 |
def load_local_models():
|
|
@@ -59,13 +69,17 @@ def load_local_models():
|
|
| 59 |
)
|
| 60 |
gc.collect()
|
| 61 |
|
| 62 |
-
#
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
torch_dtype=torch.float32
|
| 67 |
)
|
| 68 |
-
|
| 69 |
|
| 70 |
# BLIP — ITM scoring and cosine similarity
|
| 71 |
blip_processor = BlipProcessor.from_pretrained(
|
|
@@ -87,7 +101,7 @@ def load_local_models():
|
|
| 87 |
)
|
| 88 |
dino_model.eval()
|
| 89 |
|
| 90 |
-
# Qwen2.5-1.5B — caption fusion (local
|
| 91 |
qwen_tokenizer = AutoTokenizer.from_pretrained(
|
| 92 |
"Qwen/Qwen2.5-1.5B-Instruct"
|
| 93 |
)
|
|
@@ -98,7 +112,7 @@ def load_local_models():
|
|
| 98 |
qwen_model.eval()
|
| 99 |
|
| 100 |
return (
|
| 101 |
-
|
| 102 |
blip_processor, blip_itm_model,
|
| 103 |
dino_processor, dino_model,
|
| 104 |
qwen_tokenizer, qwen_model
|
|
@@ -114,32 +128,53 @@ def image_to_data_uri(image: Image.Image) -> str:
|
|
| 114 |
b64 = base64.b64encode(raw).decode()
|
| 115 |
return f"data:image/jpeg;base64,{b64}"
|
| 116 |
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
]
|
| 126 |
|
| 127 |
-
captions
|
| 128 |
-
pixel_values = git_proc(images=image, return_tensors="pt").pixel_values
|
| 129 |
|
| 130 |
-
for
|
| 131 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
with torch.no_grad():
|
| 133 |
-
generated_ids =
|
| 134 |
-
|
| 135 |
-
|
|
|
|
| 136 |
)
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
captions.append(cap if cap else "a scene shown in the image")
|
|
|
|
| 141 |
except Exception as e:
|
| 142 |
-
st.warning(f"
|
| 143 |
captions.append("a scene shown in the image")
|
| 144 |
|
| 145 |
seen, unique = set(), []
|
|
@@ -287,14 +322,7 @@ def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
|
|
| 287 |
st.warning(f"DINO error: {str(e)[:80]}")
|
| 288 |
return "Object detection unavailable", []
|
| 289 |
|
| 290 |
-
# ============================================================================
|
| 291 |
-
# STEP 7 — QWEN2.5-1.5B (LOCAL): CAPTION FUSION
|
| 292 |
-
# Moved from API to local — API was consistently returning 404
|
| 293 |
-
# Uses chat template for proper instruct format
|
| 294 |
-
# Prompt asks Qwen to enrich and add detail using detected objects
|
| 295 |
-
# ============================================================================
|
| 296 |
def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
|
| 297 |
-
|
| 298 |
system_prompt = (
|
| 299 |
"You are an expert image captioning assistant. "
|
| 300 |
"Write ONE natural, fluent, detailed and descriptive caption. "
|
|
@@ -315,9 +343,7 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
|
|
| 315 |
]
|
| 316 |
|
| 317 |
text = qwen_tok.apply_chat_template(
|
| 318 |
-
messages,
|
| 319 |
-
tokenize=False,
|
| 320 |
-
add_generation_prompt=True
|
| 321 |
)
|
| 322 |
|
| 323 |
model_inputs = qwen_tok([text], return_tensors="pt")
|
|
@@ -331,7 +357,6 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
|
|
| 331 |
top_p=0.9
|
| 332 |
)
|
| 333 |
|
| 334 |
-
# Strip input tokens from output
|
| 335 |
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
|
| 336 |
fused = qwen_tok.decode(output_ids, skip_special_tokens=True).strip()
|
| 337 |
|
|
@@ -345,12 +370,15 @@ def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str
|
|
| 345 |
st.warning(f"Qwen fusion error: {str(e)[:80]}")
|
| 346 |
return cap1
|
| 347 |
|
|
|
|
|
|
|
|
|
|
| 348 |
with st.sidebar:
|
| 349 |
st.title("Image Caption Fusion")
|
| 350 |
st.markdown("---")
|
| 351 |
st.markdown("### Pipeline Steps")
|
| 352 |
st.markdown("""
|
| 353 |
-
**1.
|
| 354 |
Generate 5 captions
|
| 355 |
|
| 356 |
**2. BLIP ITM** (Local)
|
|
@@ -372,7 +400,7 @@ Object detection
|
|
| 372 |
Caption fusion
|
| 373 |
""")
|
| 374 |
st.markdown("---")
|
| 375 |
-
st.markdown("**Local:**
|
| 376 |
st.markdown("**API:** Jina")
|
| 377 |
|
| 378 |
st.title("Image Caption Fusion System")
|
|
@@ -397,7 +425,7 @@ if uploaded_file is not None:
|
|
| 397 |
|
| 398 |
with st.spinner("Loading local models (first run takes 3-4 min)..."):
|
| 399 |
(
|
| 400 |
-
|
| 401 |
blip_proc, blip_itm,
|
| 402 |
dino_proc, dino_mod,
|
| 403 |
qwen_tok, qwen_mod
|
|
@@ -406,8 +434,8 @@ if uploaded_file is not None:
|
|
| 406 |
progress = st.progress(0)
|
| 407 |
status = st.empty()
|
| 408 |
|
| 409 |
-
status.info("Step 1/7: Generating captions with
|
| 410 |
-
captions =
|
| 411 |
progress.progress(14)
|
| 412 |
|
| 413 |
with st.expander("5 Generated Captions", expanded=True):
|
|
|
|
| 27 |
"Content-Type": "application/json"
|
| 28 |
}
|
| 29 |
|
| 30 |
+
# ============================================================================
|
| 31 |
+
# CHANGE 1: DETECT_PROMPT — expanded with colours, furniture, objects
|
| 32 |
+
# More labels = richer grounding for Qwen fusion
|
| 33 |
+
# ============================================================================
|
| 34 |
DETECT_PROMPT = (
|
| 35 |
+
"person . man . woman . boy . girl . child . baby . "
|
| 36 |
+
"red . blue . green . yellow . black . white . orange . purple . brown . "
|
| 37 |
+
"shirt . jacket . dress . coat . hat . glasses . bag . shoes . "
|
| 38 |
+
"table . chair . bench . sofa . desk . stool . wooden chair . dining table . "
|
| 39 |
+
"cup . glass . bottle . plate . bowl . fork . spoon . knife . "
|
| 40 |
+
"car . bicycle . motorcycle . bus . truck . "
|
| 41 |
+
"tree . grass . flower . sky . water . river . mountain . road . "
|
| 42 |
+
"building . wall . door . window . floor . ceiling . stairs . "
|
| 43 |
+
"lamp . light . candle . fire . smoke . "
|
| 44 |
+
"phone . laptop . book . bag . umbrella . "
|
| 45 |
+
"dog . cat . bird . horse . animal . "
|
| 46 |
+
"food . pizza . cake . bread . fruit . "
|
| 47 |
+
"bar . restaurant . pub . cafe . kitchen . "
|
| 48 |
+
"wood . metal . glass . brick . "
|
| 49 |
+
"dark . bright . colorful ."
|
| 50 |
)
|
| 51 |
|
| 52 |
if not JINA_KEY:
|
|
|
|
| 54 |
st.stop()
|
| 55 |
|
| 56 |
# ============================================================================
|
| 57 |
+
# CHANGE 2: load_local_models — replaced GIT with Florence-2-Large
|
| 58 |
+
# Florence-2 has 3 built-in task tokens — accurate, grounded, no hallucination
|
|
|
|
|
|
|
|
|
|
| 59 |
# ============================================================================
|
| 60 |
@st.cache_resource
|
| 61 |
def load_local_models():
|
|
|
|
| 69 |
)
|
| 70 |
gc.collect()
|
| 71 |
|
| 72 |
+
# Florence-2-Large — accurate caption generation with task tokens
|
| 73 |
+
florence_processor = AutoProcessor.from_pretrained(
|
| 74 |
+
"microsoft/Florence-2-large",
|
| 75 |
+
trust_remote_code=True
|
| 76 |
+
)
|
| 77 |
+
florence_model = AutoModelForCausalLM.from_pretrained(
|
| 78 |
+
"microsoft/Florence-2-large",
|
| 79 |
+
trust_remote_code=True,
|
| 80 |
torch_dtype=torch.float32
|
| 81 |
)
|
| 82 |
+
florence_model.eval()
|
| 83 |
|
| 84 |
# BLIP — ITM scoring and cosine similarity
|
| 85 |
blip_processor = BlipProcessor.from_pretrained(
|
|
|
|
| 101 |
)
|
| 102 |
dino_model.eval()
|
| 103 |
|
| 104 |
+
# Qwen2.5-1.5B — caption fusion (local)
|
| 105 |
qwen_tokenizer = AutoTokenizer.from_pretrained(
|
| 106 |
"Qwen/Qwen2.5-1.5B-Instruct"
|
| 107 |
)
|
|
|
|
| 112 |
qwen_model.eval()
|
| 113 |
|
| 114 |
return (
|
| 115 |
+
florence_processor, florence_model,
|
| 116 |
blip_processor, blip_itm_model,
|
| 117 |
dino_processor, dino_model,
|
| 118 |
qwen_tokenizer, qwen_model
|
|
|
|
| 128 |
b64 = base64.b64encode(raw).decode()
|
| 129 |
return f"data:image/jpeg;base64,{b64}"
|
| 130 |
|
| 131 |
+
# ============================================================================
|
| 132 |
+
# CHANGE 3: generate_captions_florence — replaces generate_captions_git
|
| 133 |
+
# Uses Florence-2 task tokens for naturally diverse and accurate captions
|
| 134 |
+
# <CAPTION> / <DETAILED_CAPTION> / <MORE_DETAILED_CAPTION>
|
| 135 |
+
# ============================================================================
|
| 136 |
+
def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
|
| 137 |
+
|
| 138 |
+
tasks = [
|
| 139 |
+
("<CAPTION>", {"max_new_tokens": 50, "num_beams": 3}),
|
| 140 |
+
("<DETAILED_CAPTION>", {"max_new_tokens": 100, "num_beams": 3}),
|
| 141 |
+
("<MORE_DETAILED_CAPTION>", {"max_new_tokens": 150, "num_beams": 3}),
|
| 142 |
+
("<DETAILED_CAPTION>", {"max_new_tokens": 100, "num_beams": 5}),
|
| 143 |
+
("<CAPTION>", {"max_new_tokens": 80, "num_beams": 5}),
|
| 144 |
]
|
| 145 |
|
| 146 |
+
captions = []
|
|
|
|
| 147 |
|
| 148 |
+
for task_prompt, gen_kwargs in tasks:
|
| 149 |
try:
|
| 150 |
+
inputs = florence_proc(
|
| 151 |
+
text=task_prompt,
|
| 152 |
+
images=image,
|
| 153 |
+
return_tensors="pt"
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
with torch.no_grad():
|
| 157 |
+
generated_ids = florence_mod.generate(
|
| 158 |
+
input_ids=inputs["input_ids"],
|
| 159 |
+
pixel_values=inputs["pixel_values"],
|
| 160 |
+
**gen_kwargs
|
| 161 |
)
|
| 162 |
+
|
| 163 |
+
generated_text = florence_proc.batch_decode(
|
| 164 |
+
generated_ids, skip_special_tokens=False
|
| 165 |
+
)[0]
|
| 166 |
+
|
| 167 |
+
parsed = florence_proc.post_process_generation(
|
| 168 |
+
generated_text,
|
| 169 |
+
task=task_prompt,
|
| 170 |
+
image_size=(image.width, image.height)
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
cap = parsed.get(task_prompt, "").strip().lower()
|
| 174 |
captions.append(cap if cap else "a scene shown in the image")
|
| 175 |
+
|
| 176 |
except Exception as e:
|
| 177 |
+
st.warning(f"Florence error: {str(e)[:80]}")
|
| 178 |
captions.append("a scene shown in the image")
|
| 179 |
|
| 180 |
seen, unique = set(), []
|
|
|
|
| 322 |
st.warning(f"DINO error: {str(e)[:80]}")
|
| 323 |
return "Object detection unavailable", []
|
| 324 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 325 |
def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
|
|
|
|
| 326 |
system_prompt = (
|
| 327 |
"You are an expert image captioning assistant. "
|
| 328 |
"Write ONE natural, fluent, detailed and descriptive caption. "
|
|
|
|
| 343 |
]
|
| 344 |
|
| 345 |
text = qwen_tok.apply_chat_template(
|
| 346 |
+
messages, tokenize=False, add_generation_prompt=True
|
|
|
|
|
|
|
| 347 |
)
|
| 348 |
|
| 349 |
model_inputs = qwen_tok([text], return_tensors="pt")
|
|
|
|
| 357 |
top_p=0.9
|
| 358 |
)
|
| 359 |
|
|
|
|
| 360 |
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
|
| 361 |
fused = qwen_tok.decode(output_ids, skip_special_tokens=True).strip()
|
| 362 |
|
|
|
|
| 370 |
st.warning(f"Qwen fusion error: {str(e)[:80]}")
|
| 371 |
return cap1
|
| 372 |
|
| 373 |
+
# ============================================================================
|
| 374 |
+
# CHANGE 4: sidebar β updated step 1 label to Florence-2-Large
|
| 375 |
+
# ============================================================================
|
| 376 |
with st.sidebar:
|
| 377 |
st.title("Image Caption Fusion")
|
| 378 |
st.markdown("---")
|
| 379 |
st.markdown("### Pipeline Steps")
|
| 380 |
st.markdown("""
|
| 381 |
+
**1. Florence-2-Large** (Local)
|
| 382 |
Generate 5 captions
|
| 383 |
|
| 384 |
**2. BLIP ITM** (Local)
|
|
|
|
| 400 |
Caption fusion
|
| 401 |
""")
|
| 402 |
st.markdown("---")
|
| 403 |
+
st.markdown("**Local:** Florence-2, BLIP ITM, DINO, Qwen2.5")
|
| 404 |
st.markdown("**API:** Jina")
|
| 405 |
|
| 406 |
st.title("Image Caption Fusion System")
|
|
|
|
| 425 |
|
| 426 |
with st.spinner("Loading local models (first run takes 3-4 min)..."):
|
| 427 |
(
|
| 428 |
+
florence_proc, florence_mod,
|
| 429 |
blip_proc, blip_itm,
|
| 430 |
dino_proc, dino_mod,
|
| 431 |
qwen_tok, qwen_mod
|
|
|
|
| 434 |
progress = st.progress(0)
|
| 435 |
status = st.empty()
|
| 436 |
|
| 437 |
+
status.info("Step 1/7: Generating captions with Florence-2-Large...")
|
| 438 |
+
captions = generate_captions_florence(input_image, florence_proc, florence_mod)
|
| 439 |
progress.progress(14)
|
| 440 |
|
| 441 |
with st.expander("5 Generated Captions", expanded=True):
|