Spaces:
Running
Running
update
Browse files
app.py
CHANGED
|
@@ -28,31 +28,16 @@ JINA_HEADERS = {
|
|
| 28 |
}
|
| 29 |
|
| 30 |
DETECT_PROMPT = (
|
| 31 |
-
# Core Subjects & Actions
|
| 32 |
"person . man . woman . boy . girl . child . baby . a group of people . "
|
| 33 |
"sitting on a chair . riding a bicycle . holding an object . walking on the road . "
|
| 34 |
-
|
| 35 |
-
# Textures & Materials
|
| 36 |
"wooden surface . shiny metal . smooth glass . brick wall . leather bag . denim clothing . "
|
| 37 |
-
|
| 38 |
-
# Detailed Apparel & Wearables
|
| 39 |
"shirt . jacket . dress . coat . hat . glasses . backpack . shoes . tie . "
|
| 40 |
-
|
| 41 |
-
# Common Interior Objects
|
| 42 |
"table . chair . bench . sofa . desk . laptop . phone . book . umbrella . "
|
| 43 |
"cup . glass . bottle . plate . bowl . fork . spoon . knife . "
|
| 44 |
-
|
| 45 |
-
# Environmental & Spatial Elements
|
| 46 |
"in the foreground . in the background . tree . grass . flower . sky . "
|
| 47 |
"water . river . mountain . road . building . wall . door . window . floor . "
|
| 48 |
-
|
| 49 |
-
# Lighting & Atmospheric Context
|
| 50 |
"dark shadow . bright light . sunny day . indoor lamp . reflection . colorful texture . "
|
| 51 |
-
|
| 52 |
-
# Animals & Food
|
| 53 |
"dog . cat . bird . horse . animal . pizza . cake . bread . fruit . "
|
| 54 |
-
|
| 55 |
-
# Transportation & Setting
|
| 56 |
"car . bicycle . motorcycle . bus . truck . street . kitchen . restaurant . cafe"
|
| 57 |
)
|
| 58 |
|
|
@@ -127,22 +112,6 @@ def image_to_data_uri(image: Image.Image) -> str:
|
|
| 127 |
b64 = base64.b64encode(raw).decode()
|
| 128 |
return f"data:image/jpeg;base64,{b64}"
|
| 129 |
|
| 130 |
-
# ============================================================================
|
| 131 |
-
# CHANGE 1: generate_captions_florence
|
| 132 |
-
# 5 different Florence-2 task tokens — each gives a different perspective
|
| 133 |
-
#
|
| 134 |
-
# Task breakdown:
|
| 135 |
-
# <CAPTION> → short overall scene description
|
| 136 |
-
# <DETAILED_CAPTION> → longer overall scene description
|
| 137 |
-
# <MORE_DETAILED_CAPTION> → most detailed overall description
|
| 138 |
-
# <DENSE_REGION_CAPTION> → describes individual regions of the image
|
| 139 |
-
# (returns region labels → joined into a sentence)
|
| 140 |
-
# <OD> → object detection labels
|
| 141 |
-
# (returns detected objects → formatted as caption)
|
| 142 |
-
#
|
| 143 |
-
# OD and DENSE_REGION_CAPTION return structured data not plain text,
|
| 144 |
-
# so we extract their labels and convert to readable captions manually.
|
| 145 |
-
# ============================================================================
|
| 146 |
def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
|
| 147 |
|
| 148 |
captions = []
|
|
@@ -206,13 +175,11 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
|
|
| 206 |
captions.append("a scene shown in the image")
|
| 207 |
|
| 208 |
# Task 4: Dense region caption
|
| 209 |
-
# Returns descriptions per image region — join them into one sentence
|
| 210 |
try:
|
| 211 |
inputs = florence_proc(
|
| 212 |
text="<DENSE_REGION_CAPTION>", images=image, return_tensors="pt"
|
| 213 |
)
|
| 214 |
with torch.no_grad():
|
| 215 |
-
ids = florence_proc.post_process_generation
|
| 216 |
ids = florence_mod.generate(
|
| 217 |
input_ids=inputs["input_ids"],
|
| 218 |
pixel_values=inputs["pixel_values"],
|
|
@@ -223,7 +190,6 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
|
|
| 223 |
labels = parsed.get("<DENSE_REGION_CAPTION>", {}).get("labels", [])
|
| 224 |
|
| 225 |
if labels:
|
| 226 |
-
# Remove duplicates while preserving order
|
| 227 |
seen_r, unique_r = set(), []
|
| 228 |
for l in labels:
|
| 229 |
if l.lower() not in seen_r:
|
|
@@ -238,7 +204,6 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
|
|
| 238 |
captions.append("a scene shown in the image")
|
| 239 |
|
| 240 |
# Task 5: Object detection
|
| 241 |
-
# Returns detected object labels — format as descriptive caption
|
| 242 |
try:
|
| 243 |
inputs = florence_proc(
|
| 244 |
text="<OD>", images=image, return_tensors="pt"
|
|
@@ -267,7 +232,6 @@ def generate_captions_florence(image: Image.Image, florence_proc, florence_mod)
|
|
| 267 |
st.warning(f"Florence OD error: {str(e)[:80]}")
|
| 268 |
captions.append("a scene shown in the image")
|
| 269 |
|
| 270 |
-
# Deduplicate while preserving order
|
| 271 |
seen, unique = set(), []
|
| 272 |
for c in captions:
|
| 273 |
if c not in seen:
|
|
@@ -414,29 +378,30 @@ def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
|
|
| 414 |
return "Object detection unavailable", []
|
| 415 |
|
| 416 |
# ============================================================================
|
| 417 |
-
#
|
| 418 |
-
#
|
| 419 |
-
#
|
|
|
|
| 420 |
# ============================================================================
|
| 421 |
def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
|
| 422 |
|
| 423 |
system_prompt = (
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
)
|
| 433 |
|
| 434 |
-
user_prompt = (
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
)
|
| 440 |
|
| 441 |
try:
|
| 442 |
messages = [
|
|
@@ -453,7 +418,7 @@ user_prompt = (
|
|
| 453 |
with torch.no_grad():
|
| 454 |
generated_ids = qwen_mod.generate(
|
| 455 |
**model_inputs,
|
| 456 |
-
max_new_tokens=
|
| 457 |
temperature=0.2,
|
| 458 |
do_sample=True,
|
| 459 |
top_p=0.9
|
|
|
|
| 28 |
}
|
| 29 |
|
| 30 |
DETECT_PROMPT = (
|
|
|
|
| 31 |
"person . man . woman . boy . girl . child . baby . a group of people . "
|
| 32 |
"sitting on a chair . riding a bicycle . holding an object . walking on the road . "
|
|
|
|
|
|
|
| 33 |
"wooden surface . shiny metal . smooth glass . brick wall . leather bag . denim clothing . "
|
|
|
|
|
|
|
| 34 |
"shirt . jacket . dress . coat . hat . glasses . backpack . shoes . tie . "
|
|
|
|
|
|
|
| 35 |
"table . chair . bench . sofa . desk . laptop . phone . book . umbrella . "
|
| 36 |
"cup . glass . bottle . plate . bowl . fork . spoon . knife . "
|
|
|
|
|
|
|
| 37 |
"in the foreground . in the background . tree . grass . flower . sky . "
|
| 38 |
"water . river . mountain . road . building . wall . door . window . floor . "
|
|
|
|
|
|
|
| 39 |
"dark shadow . bright light . sunny day . indoor lamp . reflection . colorful texture . "
|
|
|
|
|
|
|
| 40 |
"dog . cat . bird . horse . animal . pizza . cake . bread . fruit . "
|
|
|
|
|
|
|
| 41 |
"car . bicycle . motorcycle . bus . truck . street . kitchen . restaurant . cafe"
|
| 42 |
)
|
| 43 |
|
|
|
|
| 112 |
b64 = base64.b64encode(raw).decode()
|
| 113 |
return f"data:image/jpeg;base64,{b64}"
|
| 114 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
def generate_captions_florence(image: Image.Image, florence_proc, florence_mod) -> list:
|
| 116 |
|
| 117 |
captions = []
|
|
|
|
| 175 |
captions.append("a scene shown in the image")
|
| 176 |
|
| 177 |
# Task 4: Dense region caption
|
|
|
|
| 178 |
try:
|
| 179 |
inputs = florence_proc(
|
| 180 |
text="<DENSE_REGION_CAPTION>", images=image, return_tensors="pt"
|
| 181 |
)
|
| 182 |
with torch.no_grad():
|
|
|
|
| 183 |
ids = florence_mod.generate(
|
| 184 |
input_ids=inputs["input_ids"],
|
| 185 |
pixel_values=inputs["pixel_values"],
|
|
|
|
| 190 |
labels = parsed.get("<DENSE_REGION_CAPTION>", {}).get("labels", [])
|
| 191 |
|
| 192 |
if labels:
|
|
|
|
| 193 |
seen_r, unique_r = set(), []
|
| 194 |
for l in labels:
|
| 195 |
if l.lower() not in seen_r:
|
|
|
|
| 204 |
captions.append("a scene shown in the image")
|
| 205 |
|
| 206 |
# Task 5: Object detection
|
|
|
|
| 207 |
try:
|
| 208 |
inputs = florence_proc(
|
| 209 |
text="<OD>", images=image, return_tensors="pt"
|
|
|
|
| 232 |
st.warning(f"Florence OD error: {str(e)[:80]}")
|
| 233 |
captions.append("a scene shown in the image")
|
| 234 |
|
|
|
|
| 235 |
seen, unique = set(), []
|
| 236 |
for c in captions:
|
| 237 |
if c not in seen:
|
|
|
|
| 378 |
return "Object detection unavailable", []
|
| 379 |
|
| 380 |
# ============================================================================
|
| 381 |
+
# fuse_captions — updated prompt; fixed the indentation error that had left user_prompt dedented to module level
|
| 382 |
+
# The caption covers: who is in the image, what they are doing, the objects around them, and where the scene takes place
|
| 383 |
+
# 2-3 sentences, simple language, only visible facts
|
| 384 |
+
# max_new_tokens increased to 100 for full 2-3 sentence output
|
| 385 |
# ============================================================================
|
| 386 |
def fuse_captions(cap1: str, cap2: str, objects: str, qwen_tok, qwen_mod) -> str:
|
| 387 |
|
| 388 |
system_prompt = (
|
| 389 |
+
"You write image captions. "
|
| 390 |
+
"Look at the two captions and detected objects provided. "
|
| 391 |
+
"Write ONE caption that covers: who is in the image, what they are doing, "
|
| 392 |
+
"what objects are around them, and where the scene is taking place. "
|
| 393 |
+
"Use simple, everyday words. Write 2 to 3 sentences. "
|
| 394 |
+
"Only describe what is clearly visible. "
|
| 395 |
+
"Do not guess, invent, or add dramatic language. "
|
| 396 |
+
"Return ONLY the caption, nothing else."
|
| 397 |
+
)
|
| 398 |
|
| 399 |
+
user_prompt = (
|
| 400 |
+
f"Caption A: {cap1}\n"
|
| 401 |
+
f"Caption B: {cap2}\n"
|
| 402 |
+
f"{objects}\n\n"
|
| 403 |
+
"Write a clear, natural caption covering the person, action, objects and setting:"
|
| 404 |
+
)
|
| 405 |
|
| 406 |
try:
|
| 407 |
messages = [
|
|
|
|
| 418 |
with torch.no_grad():
|
| 419 |
generated_ids = qwen_mod.generate(
|
| 420 |
**model_inputs,
|
| 421 |
+
max_new_tokens=100,
|
| 422 |
temperature=0.2,
|
| 423 |
do_sample=True,
|
| 424 |
top_p=0.9
|