Spaces:
Running
Running
add app.py and requirements.txt
Browse files
app.py
CHANGED
|
@@ -72,6 +72,54 @@ def load_local_models():
|
|
| 72 |
|
| 73 |
return blip_processor, itm_model, dino_processor, dino_model
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
# ── Step 2: BLIP ITM Scoring (local CPU) ──
|
| 76 |
def compute_itm_scores(image, captions, blip_processor, itm_model):
|
| 77 |
scores = []
|
|
|
|
| 72 |
|
| 73 |
return blip_processor, itm_model, dino_processor, dino_model
|
| 74 |
|
| 75 |
+
# ── Step 1: Generate 5 captions via Qwen2-VL API ──
|
| 76 |
+
def generate_captions_api(image: Image.Image) -> list:
    """Generate five captions for *image* by querying the Qwen2-VL endpoint.

    The image is JPEG-encoded once and posted to ``QWEN_VL_URL`` together
    with each of five fixed prompts. Every prompt contributes exactly one
    caption: the lower-cased model output, or a fixed fallback sentence when
    the response is empty, the status is not 200, or the request fails.

    Args:
        image: The PIL image to caption.

    Returns:
        A list of exactly five caption strings. Duplicates are removed
        (first occurrence wins) and the list is then padded back to five
        entries by repeating the first caption.
    """
    import logging

    logger = logging.getLogger(__name__)
    caption_count = 5

    # JPEG cannot store alpha or palette data; saving such an image raises
    # OSError, so normalise the mode before encoding.
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    img_bytes = buffered.getvalue()

    prompts = (
        "Describe this image in one detailed sentence.",
        "What is happening in this image? Write one descriptive sentence.",
        "Describe the main subjects, actions and setting in one sentence.",
        "Write a detailed caption focusing on people, animals and objects visible.",
        "Describe this scene including background details and activities shown.",
    )

    captions = []
    for prompt in prompts:
        try:
            # requests ignores ``json=`` whenever ``files=`` is supplied, so
            # the prompt is sent as a multipart form field instead.
            # NOTE(review): confirm the endpoint accepts multipart bodies and
            # that HF_HEADERS does not pin a JSON Content-Type.
            response = requests.post(
                QWEN_VL_URL,
                headers=HF_HEADERS,
                data={"inputs": prompt},
                files={"image": img_bytes},
                timeout=30,
            )
            if response.status_code != 200:
                logger.warning(
                    "Caption request returned HTTP %s", response.status_code
                )
                captions.append("a detailed scene with people and objects")
                continue

            result = response.json()
            if isinstance(result, list):
                cap = result[0].get("generated_text", "").strip().lower()
            else:
                cap = str(result).strip().lower()
            captions.append(cap if cap else "a scene with various objects and people")
        except Exception:
            # Deliberately best-effort: one failed prompt must not abort the
            # whole pipeline, but the failure is recorded instead of hidden.
            logger.exception("Caption request failed for prompt %r", prompt)
            captions.append("a scene captured in the image")

    # Order-preserving de-duplication (dicts keep insertion order).
    unique = list(dict.fromkeys(captions))

    # Downstream steps expect a fixed number of candidates; pad by repeating
    # the first caption. ``unique`` is never empty because every prompt
    # appends exactly one caption above.
    while len(unique) < caption_count:
        unique.append(unique[0])

    return unique[:caption_count]
|
| 122 |
+
|
| 123 |
# ── Step 2: BLIP ITM Scoring (local CPU) ──
|
| 124 |
def compute_itm_scores(image, captions, blip_processor, itm_model):
|
| 125 |
scores = []
|