Spaces:

Afsha001
/

Image_captioning

Running

App Files Community

Afsha001 committed 20 days ago

Commit

bdb5d8d

1 Parent(s): b7d8863

fix Qwen2-VL and Jina API call formats

Browse files

Files changed (1) hide show

app.py +88 -58

app.py CHANGED Viewed

@@ -18,10 +18,11 @@ HF_TOKEN = os.environ.get("HF_TOKEN", "")
 JINA_KEY = os.environ.get("JINA_KEY", "")
 DEVICE   = "cpu"
-QWEN_VL_URL  = "https://api-inference.huggingface.co/models/Qwen/Qwen2-VL-2B-Instruct"
-QWEN_LM_URL  = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct"
 JINA_URL     = "https://api.jina.ai/v1/rerank"
-HF_HEADERS   = {"Authorization": "Bearer " + HF_TOKEN}
 JINA_HEADERS = {"Authorization": "Bearer " + JINA_KEY, "Content-Type": "application/json"}
 DETECT_PROMPT = (
@@ -51,10 +52,15 @@ def load_local_models():
     dino_model.eval()
     return blip_processor, itm_model, dino_processor, dino_model
-def generate_captions_api(image):
     buffered = BytesIO()
     image.save(buffered, format="JPEG")
-    img_bytes = buffered.getvalue()
     PROMPTS = [
         "Describe this image in one detailed sentence.",
@@ -67,22 +73,34 @@ def generate_captions_api(image):
     captions = []
     for prompt in PROMPTS:
         try:
             response = requests.post(
                 QWEN_VL_URL,
                 headers=HF_HEADERS,
-                json={"inputs": prompt},
-                timeout=30
             )
             if response.status_code == 200:
                 result = response.json()
-                if isinstance(result, list):
-                    cap = result[0].get("generated_text", "").strip().lower()
-                else:
-                    cap = str(result).strip().lower()
-                captions.append(cap if cap else "a scene with various objects and people")
             else:
-                captions.append("a detailed scene with people and objects")
-        except Exception:
             captions.append("a scene captured in the image")
     seen, unique = set(), []
@@ -104,28 +122,42 @@ def compute_itm_scores(image, captions, blip_processor, itm_model):
         scores.append(round(score, 4))
     return scores
 def compute_jina_scores(image, captions):
-    buffered = BytesIO()
-    image.save(buffered, format="JPEG")
-    img_b64  = base64.b64encode(buffered.getvalue()).decode()
-    scores   = []
     for cap in captions:
         try:
-            payload  = {
-                "model"     : "jina-reranker-m0",
-                "query"     : cap,
-                "documents" : [{"type": "image_url",
-                                "image_url": {"url": "data:image/jpeg;base64," + img_b64}}]
             }
-            response = requests.post(JINA_URL, headers=JINA_HEADERS, json=payload, timeout=30)
             if response.status_code == 200:
                 result = response.json()
                 score  = result["results"][0]["relevance_score"]
                 scores.append(round(float(score), 4))
             else:
-                scores.append(0.5)
-        except Exception:
-            scores.append(0.5)
     return scores
 def compute_cosine_scores(image, captions, blip_processor, itm_model):
@@ -187,44 +219,42 @@ def detect_objects(image, dino_processor, dino_model, threshold=0.3):
     label_str = "Detected: [" + ", ".join(sorted_labels) + "]"
     return label_str, sorted_labels
 def fuse_captions_api(cap1, cap2, dino_labels):
     prompt = (
         "You are given two captions and detected objects for the same image. "
         "Write ONE fluent, natural, descriptive caption combining the best details. "
-        "Return ONLY the caption, no explanation, no prefix. "
-        "Caption 1: " + cap1 + " "
-        "Caption 2: " + cap2 + " "
-        "Detected objects: " + dino_labels + " "
-        "Fused caption:"
     )
     try:
         response = requests.post(
             QWEN_LM_URL,
             headers=HF_HEADERS,
-            json={
-                "inputs": prompt,
-                "parameters": {
-                    "max_new_tokens"    : 80,
-                    "do_sample"         : False,
-                    "repetition_penalty": 1.1,
-                    "return_full_text"  : False
-                }
-            },
             timeout=40
         )
         if response.status_code == 200:
             result = response.json()
-            if isinstance(result, list):
-                fused = result[0].get("generated_text", "").strip()
-            else:
-                fused = str(result).strip()
-            for prefix in ["Fused caption:", "Caption:"]:
-                if fused.lower().startswith(prefix.lower()):
-                    fused = fused[len(prefix):].strip()
             return fused if fused else cap1
         else:
             return cap1
-    except Exception:
         return cap1
 # ── SIDEBAR ──
@@ -248,16 +278,16 @@ st.title(" Image Caption Fusion System")
 st.markdown("Upload any image and get a detailed, humanized caption.")
 st.markdown("---")
-uploaded = st.file_uploader(" Upload an image", type=["jpg","jpeg","png"])
 if uploaded:
     image = Image.open(uploaded).convert("RGB")
     col1, col2 = st.columns([1, 1])
     with col1:
-        st.image(image, caption="Uploaded Image", use_column_width=True)
     with col2:
         if st.button(" Generate Caption", type="primary", use_container_width=True):
-            with st.spinner("Loading local models..."):
                 blip_processor, itm_model, dino_processor, dino_model = load_local_models()
             progress = st.progress(0)
@@ -283,10 +313,10 @@ if uploaded:
             progress.progress(57)
             score_df = pd.DataFrame({
-                "Caption" : ["Cap " + str(i+1) + ": " + c[:50] for i, c in enumerate(captions)],
-                "ITM"     : itm_scores,
-                "Jina"    : jina_scores,
-                "Cosine"  : cosine_scores
             })
             with st.expander(" All Scores"):
                 st.dataframe(score_df, use_container_width=True)
@@ -310,7 +340,7 @@ if uploaded:
             st.markdown("###  Detected Objects")
             if label_list:
-                st.write(" | ".join(["🔍 " + l for l in label_list]))
             else:
                 st.write(label_str)
@@ -320,7 +350,7 @@ if uploaded:
             status.success(" Pipeline complete!")
             st.markdown("---")
-            st.markdown("###  Final Fused Caption")
             st.markdown(
                 "<div style='background:linear-gradient(135deg,#667eea,#764ba2);"
                 "padding:20px;border-radius:12px;color:white;font-size:18px;"

 JINA_KEY = os.environ.get("JINA_KEY", "")
 DEVICE   = "cpu"
+# ── Correct API endpoints ──
+QWEN_VL_URL  = "https://api-inference.huggingface.co/models/Qwen/Qwen2-VL-2B-Instruct/v1/chat/completions"
+QWEN_LM_URL  = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct/v1/chat/completions"
 JINA_URL     = "https://api.jina.ai/v1/rerank"
+HF_HEADERS   = {"Authorization": "Bearer " + HF_TOKEN, "Content-Type": "application/json"}
 JINA_HEADERS = {"Authorization": "Bearer " + JINA_KEY, "Content-Type": "application/json"}
 DETECT_PROMPT = (
     dino_model.eval()
     return blip_processor, itm_model, dino_processor, dino_model
+def image_to_base64(image):
     buffered = BytesIO()
     image.save(buffered, format="JPEG")
+    return base64.b64encode(buffered.getvalue()).decode()
+# ── FIXED: Qwen2-VL via chat completions API ──
+def generate_captions_api(image):
+    img_b64 = image_to_base64(image)
+    img_url = "data:image/jpeg;base64," + img_b64
     PROMPTS = [
         "Describe this image in one detailed sentence.",
     captions = []
     for prompt in PROMPTS:
         try:
+            payload = {
+                "model": "Qwen/Qwen2-VL-2B-Instruct",
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "image_url", "image_url": {"url": img_url}},
+                            {"type": "text", "text": prompt}
+                        ]
+                    }
+                ],
+                "max_tokens": 80
+            }
             response = requests.post(
                 QWEN_VL_URL,
                 headers=HF_HEADERS,
+                json=payload,
+                timeout=40
             )
             if response.status_code == 200:
                 result = response.json()
+                cap = result["choices"][0]["message"]["content"].strip().lower()
+                captions.append(cap if cap else "a scene with various objects")
             else:
+                st.warning("Qwen2-VL API error: " + str(response.status_code) + " " + response.text[:100])
+                captions.append("a scene with various objects and people")
+        except Exception as e:
+            st.warning("Qwen2-VL exception: " + str(e))
             captions.append("a scene captured in the image")
     seen, unique = set(), []
         scores.append(round(score, 4))
     return scores
+# ── FIXED: Jina Reranker M0 API ──
 def compute_jina_scores(image, captions):
+    img_b64 = image_to_base64(image)
+    scores  = []
     for cap in captions:
         try:
+            payload = {
+                "model": "jina-reranker-m0",
+                "query": {
+                    "type" : "text",
+                    "text" : cap
+                },
+                "documents": [
+                    {
+                        "type"     : "image_url",
+                        "image_url": {"url": "data:image/jpeg;base64," + img_b64}
+                    }
+                ],
+                "top_n": 1
             }
+            response = requests.post(
+                JINA_URL,
+                headers=JINA_HEADERS,
+                json=payload,
+                timeout=30
+            )
             if response.status_code == 200:
                 result = response.json()
                 score  = result["results"][0]["relevance_score"]
                 scores.append(round(float(score), 4))
             else:
+                st.warning("Jina API error: " + str(response.status_code) + " " + response.text[:100])
+                scores.append(0.0)
+        except Exception as e:
+            st.warning("Jina exception: " + str(e))
+            scores.append(0.0)
     return scores
 def compute_cosine_scores(image, captions, blip_processor, itm_model):
     label_str = "Detected: [" + ", ".join(sorted_labels) + "]"
     return label_str, sorted_labels
+# ── FIXED: Qwen2.5-1.5B via chat completions ──
 def fuse_captions_api(cap1, cap2, dino_labels):
     prompt = (
         "You are given two captions and detected objects for the same image. "
         "Write ONE fluent, natural, descriptive caption combining the best details. "
+        "Return ONLY the fused caption, nothing else. "
+        "Caption 1: " + cap1 + ". "
+        "Caption 2: " + cap2 + ". "
+        "Detected objects: " + dino_labels + "."
     )
     try:
+        payload = {
+            "model": "Qwen/Qwen2.5-1.5B-Instruct",
+            "messages": [
+                {"role": "system", "content": "You write accurate image captions. Return only the caption."},
+                {"role": "user",   "content": prompt}
+            ],
+            "max_tokens"        : 80,
+            "temperature"       : 0.1,
+            "repetition_penalty": 1.1
+        }
         response = requests.post(
             QWEN_LM_URL,
             headers=HF_HEADERS,
+            json=payload,
             timeout=40
         )
         if response.status_code == 200:
             result = response.json()
+            fused  = result["choices"][0]["message"]["content"].strip()
             return fused if fused else cap1
         else:
+            st.warning("Qwen fusion API error: " + str(response.status_code))
             return cap1
+    except Exception as e:
+        st.warning("Qwen fusion exception: " + str(e))
         return cap1
 # ── SIDEBAR ──
 st.markdown("Upload any image and get a detailed, humanized caption.")
 st.markdown("---")
+uploaded = st.file_uploader("Upload an image", type=["jpg","jpeg","png"])
 if uploaded:
     image = Image.open(uploaded).convert("RGB")
     col1, col2 = st.columns([1, 1])
     with col1:
+        st.image(image, caption="Uploaded Image", width=400)
     with col2:
         if st.button(" Generate Caption", type="primary", use_container_width=True):
+            with st.spinner("Loading local models (first time ~2 min)..."):
                 blip_processor, itm_model, dino_processor, dino_model = load_local_models()
             progress = st.progress(0)
             progress.progress(57)
             score_df = pd.DataFrame({
+                "Caption": ["Cap " + str(i+1) + ": " + c[:50] for i, c in enumerate(captions)],
+                "ITM"    : itm_scores,
+                "Jina"   : jina_scores,
+                "Cosine" : cosine_scores
             })
             with st.expander(" All Scores"):
                 st.dataframe(score_df, use_container_width=True)
             st.markdown("###  Detected Objects")
             if label_list:
+                st.write(" | ".join([" " + l for l in label_list]))
             else:
                 st.write(label_str)
             status.success(" Pipeline complete!")
             st.markdown("---")
+            st.markdown("### Final Fused Caption")
             st.markdown(
                 "<div style='background:linear-gradient(135deg,#667eea,#764ba2);"
                 "padding:20px;border-radius:12px;color:white;font-size:18px;"