Spaces:

Afsha001
/

Image_captioning

Running

App Files Files Community

Afsha001 committed on 23 days ago

Commit

a0d9361

1 Parent(s): 25245f2

add app.py and requirements.txt

Browse files

Files changed (1) hide show

app.py +48 -0

app.py CHANGED Viewed

@@ -72,6 +72,54 @@ def load_local_models():
     return blip_processor, itm_model, dino_processor, dino_model
 # ── Step 2: BLIP ITM Scoring (local CPU) ──
 def compute_itm_scores(image, captions, blip_processor, itm_model):
     scores = []

     return blip_processor, itm_model, dino_processor, dino_model
+# ── Step 1: Generate 5 captions via Qwen2-VL API ──
+def generate_captions_api(image: Image.Image) -> list:
+    """Generate candidate captions for *image* via the Qwen2-VL endpoint.
+
+    One request per entry in ``PROMPTS`` is POSTed to ``QWEN_VL_URL`` with
+    ``HF_HEADERS``. The remote call is best-effort: a non-200 response, an
+    exception, or an empty answer is replaced by a generic fallback caption,
+    so a failing API never aborts the caller.
+
+    Args:
+        image: PIL image to caption. It is JPEG-encoded before upload.
+
+    Returns:
+        A list of ``len(PROMPTS)`` (five) lower-cased caption strings.
+        Duplicates are dropped first, then the list is padded back to full
+        length by repeating its first entry.
+    """
+    # Local import: the file's top-level import block is outside this hunk.
+    import logging
+    log = logging.getLogger(__name__)
+    # JPEG cannot store alpha or palette data; Pillow raises OSError when asked
+    # to save e.g. an RGBA or P image as JPEG, so normalise to RGB first.
+    if image.mode != "RGB":
+        image = image.convert("RGB")
+    buffered = BytesIO()
+    image.save(buffered, format="JPEG")
+    img_bytes = buffered.getvalue()
+    PROMPTS = [
+        "Describe this image in one detailed sentence.",
+        "What is happening in this image? Write one descriptive sentence.",
+        "Describe the main subjects, actions and setting in one sentence.",
+        "Write a detailed caption focusing on people, animals and objects visible.",
+        "Describe this scene including background details and activities shown.",
+    ]
+    captions = []
+    for prompt in PROMPTS:
+        try:
+            # The prompt travels as a multipart form field (data=) next to the
+            # image: requests ignores json= whenever files= is also passed.
+            # NOTE(review): confirm the field names against the endpoint schema.
+            response = requests.post(
+                QWEN_VL_URL,
+                headers=HF_HEADERS,
+                data={"inputs": prompt},
+                files={"image": img_bytes},
+                timeout=30,
+            )
+            if response.status_code != 200:
+                log.warning("Caption request failed with HTTP %s", response.status_code)
+                captions.append("a detailed scene with people and objects")
+                continue
+            result = response.json()
+            if isinstance(result, list):
+                # An empty list or a non-dict first item carries no caption.
+                first = result[0] if result else {}
+                raw = first.get("generated_text", "") if isinstance(first, dict) else first
+            elif isinstance(result, dict) and "generated_text" in result:
+                raw = result["generated_text"]
+            else:
+                raw = result
+            cap = str(raw or "").strip().lower()
+            captions.append(cap or "a scene with various objects and people")
+        except Exception:
+            # Deliberately broad: captioning must degrade to a fallback rather
+            # than crash the app, but the failure is recorded for debugging.
+            log.exception("Caption request raised for prompt %r", prompt)
+            captions.append("a scene captured in the image")
+    # Deduplicate while keeping first-seen order.
+    unique = list(dict.fromkeys(captions))
+    # captions always holds one entry per prompt, so unique is never empty here.
+    unique.extend([unique[0]] * (len(PROMPTS) - len(unique)))
+    return unique
 # ── Step 2: BLIP ITM Scoring (local CPU) ──
 def compute_itm_scores(image, captions, blip_processor, itm_model):
     scores = []