Spaces:

Afsha001
/

Image_captioning

Running

App Files Community

Afsha001 committed 7 days ago

Commit

c1c04ba

verified ·

1 Parent(s): 8dffcbd

update app.py

Browse files

Files changed (1) hide show

app.py +32 -28

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
 import os
 import gc
 import torch
@@ -30,12 +29,12 @@ JINA_KEY = os.environ.get("JINA_KEY", "")
 # ============================================================================
 # API ENDPOINTS
-# Florence-2: raw bytes, no Content-Type
-# Qwen2.5:   model-specific endpoint (not generic /v1/chat/completions)
-# Jina:      query=plain string, documents=list of data URI strings
 # ============================================================================
-FLORENCE_URL     = "https://api-inference.huggingface.co/models/microsoft/Florence-2-large"
-FLORENCE_HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}
 QWEN_URL   = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct/v1/chat/completions"
 HF_HEADERS = {
@@ -71,7 +70,6 @@ if not JINA_KEY:
 # ============================================================================
 # LOAD LOCAL MODELS — BLIP ITM + GROUNDING DINO
-# Cached so they load only once per session
 # ============================================================================
 @st.cache_resource
 def load_local_models():
@@ -117,18 +115,29 @@ def image_to_data_uri(image: Image.Image) -> str:
     return f"data:image/jpeg;base64,{b64}"
 # ============================================================================
-# STEP 1 — FLORENCE-2-LARGE: GENERATE 5 CAPTIONS
-# Fix applied: data=raw_bytes instead of json={"inputs": base64}
 # ============================================================================
-def generate_captions_florence(image: Image.Image) -> list:
     img_bytes = image_to_bytes(image)
-    captions  = []
-    for i in range(5):
         try:
             response = requests.post(
-                FLORENCE_URL,
-                headers=FLORENCE_HEADERS,
                 data=img_bytes,
                 params={"wait_for_model": True},
                 timeout=60
@@ -143,10 +152,10 @@ def generate_captions_florence(image: Image.Image) -> list:
                     cap = ""
                 captions.append(cap if cap else "a scene shown in the image")
             else:
-                st.warning(f"Florence API error {response.status_code}")
                 captions.append("a scene shown in the image")
         except Exception as e:
-            st.warning(f"Florence exception: {str(e)[:80]}")
             captions.append("a scene shown in the image")
     seen, unique = set(), []
@@ -160,7 +169,6 @@ def generate_captions_florence(image: Image.Image) -> list:
 # ============================================================================
 # STEP 2 — BLIP ITM: IMAGE-TEXT MATCHING SCORES
-# Local model, no API call needed
 # ============================================================================
 def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
     scores = []
@@ -183,7 +191,6 @@ def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
 # ============================================================================
 # STEP 3 — JINA RERANKER M0: SEMANTIC SCORES
-# Fix applied: query=plain string, documents=[data_uri_string]
 # ============================================================================
 def compute_jina_scores(image: Image.Image, captions: list) -> list:
     img_data_uri = image_to_data_uri(image)
@@ -220,7 +227,6 @@ def compute_jina_scores(image: Image.Image, captions: list) -> list:
 # ============================================================================
 # STEP 4 — COSINE SIMILARITY: EMBEDDING SCORES
-# Local model, reuses BLIP encoders
 # ============================================================================
 def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
     try:
@@ -251,7 +257,6 @@ def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
 # ============================================================================
 # STEP 5 — MAJORITY VOTING: SELECT TOP 2 CAPTIONS
-# Each of 3 methods votes for its top 2 — 6 votes total
 # ============================================================================
 def majority_voting(captions, itm, jina, cosine) -> tuple:
     itm_r    = np.argsort(itm)[::-1]
@@ -272,7 +277,6 @@ def majority_voting(captions, itm, jina, cosine) -> tuple:
 # ============================================================================
 # STEP 6 — GROUNDING DINO: OBJECT DETECTION
-# Local model, provides factual grounding for LLM fusion
 # ============================================================================
 def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
     try:
@@ -318,7 +322,6 @@ def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
 # ============================================================================
 # STEP 7 — QWEN2.5-1.5B: CAPTION FUSION
-# Fix applied: model-specific endpoint URL
 # ============================================================================
 def fuse_captions(cap1: str, cap2: str, objects: str) -> str:
     system_prompt = (
@@ -370,7 +373,7 @@ with st.sidebar:
     st.markdown("---")
     st.markdown("### Pipeline Steps")
     st.markdown("""
-**1. Florence-2-Large** (API)
 Generate 5 captions
 **2. BLIP ITM** (Local)
@@ -393,7 +396,7 @@ Caption fusion
     """)
     st.markdown("---")
     st.markdown("**Local:** BLIP ITM, DINO")
-    st.markdown("**API:** Florence-2, Jina, Qwen2.5")
 # ============================================================================
 # MAIN UI
@@ -413,10 +416,10 @@ if uploaded_file is not None:
     col_img, col_run = st.columns([1, 1])
     with col_img:
-        st.image(input_image, caption="Uploaded Image", use_column_width=True)
     with col_run:
-        if st.button("Run Pipeline", type="primary", use_container_width=True):
             with st.spinner("Loading local models (first run takes 1-2 min)..."):
                 blip_proc, blip_itm, dino_proc, dino_mod = load_local_models()
@@ -424,8 +427,8 @@ if uploaded_file is not None:
             progress = st.progress(0)
             status   = st.empty()
-            status.info("Step 1/7: Generating captions with Florence-2-Large...")
-            captions = generate_captions_florence(input_image)
             progress.progress(14)
             with st.expander("5 Generated Captions", expanded=True):
@@ -488,3 +491,4 @@ if uploaded_file is not None:
                 f"line-height:1.6;'>{final}</div>",
                 unsafe_allow_html=True
             )

 import os
 import gc
 import torch
 # ============================================================================
 # API ENDPOINTS
+# GIT-Large-COCO: raw bytes, no Content-Type (replaces Florence-2-Large)
+# Qwen2.5: model-specific endpoint
+# Jina: query=plain string, documents=list of data URI strings
 # ============================================================================
+GIT_URL     = "https://api-inference.huggingface.co/models/microsoft/git-large-coco"
+GIT_HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}
 QWEN_URL   = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct/v1/chat/completions"
 HF_HEADERS = {
 # ============================================================================
 # LOAD LOCAL MODELS — BLIP ITM + GROUNDING DINO
 # ============================================================================
 @st.cache_resource
 def load_local_models():
     return f"data:image/jpeg;base64,{b64}"
 # ============================================================================
+# STEP 1 — GIT-LARGE-COCO: GENERATE 5 CAPTIONS
+# Replaces Florence-2-Large (not available on HF serverless API)
+# microsoft/git-large-coco gives detailed captions, confirmed on HF API
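+# Called 5 times, once per entry in parameter_sets, to diversify the captions
+# NOTE(review): the request shown sends only the raw image bytes; confirm the sampling params are actually transmitted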
 # ============================================================================
+def generate_captions_git(image: Image.Image) -> list:
     img_bytes = image_to_bytes(image)
+    parameter_sets = [
+        {"max_new_tokens": 50},
+        {"max_new_tokens": 80},
+        {"max_new_tokens": 60, "temperature": 1.2, "do_sample": True},
+        {"max_new_tokens": 70, "temperature": 1.5, "do_sample": True},
+        {"max_new_tokens": 40, "temperature": 0.8, "do_sample": True},
+    ]
+    captions = []
+    for i, params in enumerate(parameter_sets):
         try:
             response = requests.post(
+                GIT_URL,
+                headers=GIT_HEADERS,
                 data=img_bytes,
                 params={"wait_for_model": True},
                 timeout=60
                     cap = ""
                 captions.append(cap if cap else "a scene shown in the image")
             else:
+                st.warning(f"GIT API error {response.status_code}")
                 captions.append("a scene shown in the image")
         except Exception as e:
+            st.warning(f"GIT exception: {str(e)[:80]}")
             captions.append("a scene shown in the image")
     seen, unique = set(), []
 # ============================================================================
 # STEP 2 — BLIP ITM: IMAGE-TEXT MATCHING SCORES
 # ============================================================================
 def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
     scores = []
 # ============================================================================
 # STEP 3 — JINA RERANKER M0: SEMANTIC SCORES
 # ============================================================================
 def compute_jina_scores(image: Image.Image, captions: list) -> list:
     img_data_uri = image_to_data_uri(image)
 # ============================================================================
 # STEP 4 — COSINE SIMILARITY: EMBEDDING SCORES
 # ============================================================================
 def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
     try:
 # ============================================================================
 # STEP 5 — MAJORITY VOTING: SELECT TOP 2 CAPTIONS
 # ============================================================================
 def majority_voting(captions, itm, jina, cosine) -> tuple:
     itm_r    = np.argsort(itm)[::-1]
 # ============================================================================
 # STEP 6 — GROUNDING DINO: OBJECT DETECTION
 # ============================================================================
 def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
     try:
 # ============================================================================
 # STEP 7 — QWEN2.5-1.5B: CAPTION FUSION
 # ============================================================================
 def fuse_captions(cap1: str, cap2: str, objects: str) -> str:
     system_prompt = (
     st.markdown("---")
     st.markdown("### Pipeline Steps")
     st.markdown("""
+**1. GIT-Large-COCO** (API)
 Generate 5 captions
 **2. BLIP ITM** (Local)
     """)
     st.markdown("---")
     st.markdown("**Local:** BLIP ITM, DINO")
+    st.markdown("**API:** GIT-Large, Jina, Qwen2.5")
 # ============================================================================
 # MAIN UI
     col_img, col_run = st.columns([1, 1])
     with col_img:
+        st.image(input_image, caption="Uploaded Image", use_container_width=True)
     with col_run:
+        if st.button("Generate Caption", type="primary", use_container_width=True):
             with st.spinner("Loading local models (first run takes 1-2 min)..."):
                 blip_proc, blip_itm, dino_proc, dino_mod = load_local_models()
             progress = st.progress(0)
             status   = st.empty()
+            status.info("Step 1/7: Generating captions with GIT-Large-COCO...")
+            captions = generate_captions_git(input_image)
             progress.progress(14)
             with st.expander("5 Generated Captions", expanded=True):
                 f"line-height:1.6;'>{final}</div>",
                 unsafe_allow_html=True
             )