Spaces:

Afsha001
/

Image_captioning

Running

App Files Community

Afsha001 committed 17 days ago

Commit

e67f5ad

verified ·

1 Parent(s): 7be1ea6

update app.py

Browse files

Files changed (1) hide show

app.py +48 -71

app.py CHANGED Viewed

@@ -12,26 +12,15 @@ from collections import Counter
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.preprocessing import normalize
-# ============================================================================
-# PAGE CONFIG
-# ============================================================================
 st.set_page_config(
     page_title="Image Caption Fusion System",
     layout="wide",
     initial_sidebar_state="expanded"
 )
-# ============================================================================
-# CREDENTIALS
-# ============================================================================
 HF_TOKEN = os.environ.get("HF_TOKEN", "")
 JINA_KEY = os.environ.get("JINA_KEY", "")
-# ============================================================================
-# API ENDPOINTS
-# Qwen2.5: model-specific endpoint for caption fusion
-# Jina: query=plain string, documents=list of data URI strings
-# ============================================================================
 QWEN_URL   = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct/v1/chat/completions"
 HF_HEADERS = {
     "Authorization": f"Bearer {HF_TOKEN}",
@@ -53,9 +42,6 @@ DETECT_PROMPT = (
     "jacket . dress . shirt . hat . bag ."
 )
-# ============================================================================
-# CREDENTIAL CHECK
-# ============================================================================
 if not HF_TOKEN:
     st.error("HF_TOKEN missing. Go to Space Settings → Secrets and add it.")
     st.stop()
@@ -65,24 +51,26 @@ if not JINA_KEY:
     st.stop()
 # ============================================================================
-# LOAD LOCAL MODELS
-# Moondream2: caption generation via official moondream package
-# BLIP ITM:   image-text matching + cosine similarity
-# DINO:       object detection
 # ============================================================================
 @st.cache_resource
 def load_local_models():
-    import moondream as md
     from transformers import (
         BlipProcessor,
         BlipForImageTextRetrieval,
-        AutoProcessor,
         AutoModelForZeroShotObjectDetection
     )
     gc.collect()
-    # Moondream2 — official package avoids transformers version conflict
-    moon_model = md.vl(model="moondream-2b")
     # BLIP — for ITM scoring and cosine similarity
     blip_processor = BlipProcessor.from_pretrained(
@@ -104,11 +92,8 @@ def load_local_models():
     )
     dino_model.eval()
-    return moon_model, blip_processor, blip_itm_model, dino_processor, dino_model
-# ============================================================================
-# HELPERS
-# ============================================================================
 def image_to_bytes(image: Image.Image) -> bytes:
     buf = BytesIO()
     image.save(buf, format="JPEG", quality=85)
@@ -120,29 +105,34 @@ def image_to_data_uri(image: Image.Image) -> str:
     return f"data:image/jpeg;base64,{b64}"
 # ============================================================================
-# STEP 1 — MOONDREAM2 (LOCAL): GENERATE 5 DIVERSE CAPTIONS
-# Official moondream package — no transformers conflict
-# 5 different prompts produce diverse caption perspectives
 # ============================================================================
-def generate_captions_moondream(image: Image.Image, moon_mod) -> list:
-    prompts = [
-        "Describe this image in detail.",
-        "What is happening in this image?",
-        "Describe the people, objects, and setting in this image.",
-        "What do you see in this photograph?",
-        "Describe the scene including background and foreground in detail."
-    ]
-    captions = []
-    for prompt in prompts:
         try:
-            result = moon_mod.query(image, prompt)
-            cap    = result["answer"].strip().lower()
             captions.append(cap if cap else "a scene shown in the image")
         except Exception as e:
-            st.warning(f"Moondream error: {str(e)[:80]}")
             captions.append("a scene shown in the image")
     seen, unique = set(), []
@@ -155,9 +145,7 @@ def generate_captions_moondream(image: Image.Image, moon_mod) -> list:
     return unique[:5]
-# ============================================================================
-# STEP 2 — BLIP ITM: IMAGE-TEXT MATCHING SCORES
-# ============================================================================
 def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
     scores = []
     for cap in captions:
@@ -177,9 +165,7 @@ def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
             scores.append(0.0)
     return scores
-# ============================================================================
-# STEP 3 — JINA RERANKER M0: SEMANTIC SCORES
-# ============================================================================
 def compute_jina_scores(image: Image.Image, captions: list) -> list:
     img_data_uri = image_to_data_uri(image)
     scores       = []
@@ -213,9 +199,7 @@ def compute_jina_scores(image: Image.Image, captions: list) -> list:
             scores.append(0.0)
     return scores
-# ============================================================================
-# STEP 4 — COSINE SIMILARITY: EMBEDDING SCORES
-# ============================================================================
 def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
     try:
         img_inp = blip_proc(images=image, return_tensors="pt")
@@ -243,9 +227,7 @@ def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
         st.warning(f"Cosine error: {str(e)[:60]}")
         return [0.0] * len(captions)
-# ============================================================================
-# STEP 5 — MAJORITY VOTING: SELECT TOP 2 CAPTIONS
-# ============================================================================
 def majority_voting(captions, itm, jina, cosine) -> tuple:
     itm_r    = np.argsort(itm)[::-1]
     jina_r   = np.argsort(jina)[::-1]
@@ -263,9 +245,7 @@ def majority_voting(captions, itm, jina, cosine) -> tuple:
     return captions[top2[0]], captions[top2[1]], top2, dict(counts)
-# ============================================================================
-# STEP 6 — GROUNDING DINO: OBJECT DETECTION
-# ============================================================================
 def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
     try:
         inputs = dino_proc(
@@ -308,9 +288,7 @@ def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
         st.warning(f"DINO error: {str(e)[:80]}")
         return "Object detection unavailable", []
-# ============================================================================
-# STEP 7 — QWEN2.5-1.5B: CAPTION FUSION
-# ============================================================================
 def fuse_captions(cap1: str, cap2: str, objects: str) -> str:
     system_prompt = (
         "You are an expert image captioning assistant. "
@@ -354,14 +332,14 @@ def fuse_captions(cap1: str, cap2: str, objects: str) -> str:
         return cap1
 # ============================================================================
-# SIDEBAR
 # ============================================================================
 with st.sidebar:
     st.title("Image Caption Fusion")
     st.markdown("---")
     st.markdown("### Pipeline Steps")
     st.markdown("""
-**1. Moondream2** (Local)
 Generate 5 captions
 **2. BLIP ITM** (Local)
@@ -383,12 +361,9 @@ Object detection
 Caption fusion
     """)
     st.markdown("---")
-    st.markdown("**Local:** Moondream2, BLIP ITM, DINO")
     st.markdown("**API:** Jina, Qwen2.5")
-# ============================================================================
-# MAIN UI
-# ============================================================================
 st.title("Image Caption Fusion System")
 st.markdown("Upload an image to generate a refined, grounded caption.")
 st.markdown("---")
@@ -410,13 +385,15 @@ if uploaded_file is not None:
         if st.button("Generate Caption", type="primary", use_container_width=True):
             with st.spinner("Loading local models (first run takes 2-3 min)..."):
-                moon_mod, blip_proc, blip_itm, dino_proc, dino_mod = load_local_models()
             progress = st.progress(0)
             status   = st.empty()
-            status.info("Step 1/7: Generating captions with Moondream2...")
-            captions = generate_captions_moondream(input_image, moon_mod)
             progress.progress(14)
             with st.expander("5 Generated Captions", expanded=True):

 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.preprocessing import normalize
 st.set_page_config(
     page_title="Image Caption Fusion System",
     layout="wide",
     initial_sidebar_state="expanded"
 )
 HF_TOKEN = os.environ.get("HF_TOKEN", "")
 JINA_KEY = os.environ.get("JINA_KEY", "")
 QWEN_URL   = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct/v1/chat/completions"
 HF_HEADERS = {
     "Authorization": f"Bearer {HF_TOKEN}",
     "jacket . dress . shirt . hat . bag ."
 )
 if not HF_TOKEN:
     st.error("HF_TOKEN missing. Go to Space Settings → Secrets and add it.")
     st.stop()
     st.stop()
 # ============================================================================
+# CHANGE 1: load_local_models — replaced moondream with GIT-Large-COCO
 # ============================================================================
 @st.cache_resource
 def load_local_models():
     from transformers import (
+        AutoProcessor,
+        AutoModelForCausalLM,
         BlipProcessor,
         BlipForImageTextRetrieval,
         AutoModelForZeroShotObjectDetection
     )
     gc.collect()
+    # GIT-Large-COCO — local caption generation, no API, no auth needed
+    git_processor = AutoProcessor.from_pretrained("microsoft/git-large-coco")
+    git_model     = AutoModelForCausalLM.from_pretrained(
+        "microsoft/git-large-coco",
+        torch_dtype=torch.float32
+    )
+    git_model.eval()
     # BLIP — for ITM scoring and cosine similarity
     blip_processor = BlipProcessor.from_pretrained(
     )
     dino_model.eval()
+    return git_processor, git_model, blip_processor, blip_itm_model, dino_processor, dino_model
 def image_to_bytes(image: Image.Image) -> bytes:
     buf = BytesIO()
     image.save(buf, format="JPEG", quality=85)
     return f"data:image/jpeg;base64,{b64}"
 # ============================================================================
+# CHANGE 2: generate_captions_git — replaced moondream caption function
 # ============================================================================
+def generate_captions_git(image: Image.Image, git_proc, git_mod) -> list:
+    length_params = [30, 50, 60, 70, 40]
+    captions      = []
+    for max_tokens in length_params:
         try:
+            pixel_values = git_proc(
+                images=image,
+                return_tensors="pt"
+            ).pixel_values
+            with torch.no_grad():
+                generated_ids = git_mod.generate(
+                    pixel_values=pixel_values,
+                    max_new_tokens=max_tokens
+                )
+            cap = git_proc.batch_decode(
+                generated_ids,
+                skip_special_tokens=True
+            )[0].strip().lower()
             captions.append(cap if cap else "a scene shown in the image")
         except Exception as e:
+            st.warning(f"GIT error: {str(e)[:80]}")
             captions.append("a scene shown in the image")
     seen, unique = set(), []
     return unique[:5]
+# unchanged
 def compute_itm_scores(image, captions, blip_proc, blip_itm) -> list:
     scores = []
     for cap in captions:
             scores.append(0.0)
     return scores
+# unchanged
 def compute_jina_scores(image: Image.Image, captions: list) -> list:
     img_data_uri = image_to_data_uri(image)
     scores       = []
             scores.append(0.0)
     return scores
+# unchanged
 def compute_cosine_scores(image, captions, blip_proc, blip_itm) -> list:
     try:
         img_inp = blip_proc(images=image, return_tensors="pt")
         st.warning(f"Cosine error: {str(e)[:60]}")
         return [0.0] * len(captions)
+# unchanged
 def majority_voting(captions, itm, jina, cosine) -> tuple:
     itm_r    = np.argsort(itm)[::-1]
     jina_r   = np.argsort(jina)[::-1]
     return captions[top2[0]], captions[top2[1]], top2, dict(counts)
+# unchanged
 def detect_objects(image, dino_proc, dino_mod, threshold=0.3) -> tuple:
     try:
         inputs = dino_proc(
         st.warning(f"DINO error: {str(e)[:80]}")
         return "Object detection unavailable", []
+# unchanged
 def fuse_captions(cap1: str, cap2: str, objects: str) -> str:
     system_prompt = (
         "You are an expert image captioning assistant. "
         return cap1
 # ============================================================================
+# CHANGE 3: sidebar — updated step 1 label
 # ============================================================================
 with st.sidebar:
     st.title("Image Caption Fusion")
     st.markdown("---")
     st.markdown("### Pipeline Steps")
     st.markdown("""
+**1. GIT-Large-COCO** (Local)
 Generate 5 captions
 **2. BLIP ITM** (Local)
 Caption fusion
     """)
     st.markdown("---")
+    st.markdown("**Local:** GIT-Large, BLIP ITM, DINO")
     st.markdown("**API:** Jina, Qwen2.5")
 st.title("Image Caption Fusion System")
 st.markdown("Upload an image to generate a refined, grounded caption.")
 st.markdown("---")
         if st.button("Generate Caption", type="primary", use_container_width=True):
             with st.spinner("Loading local models (first run takes 2-3 min)..."):
+                # CHANGE 4: updated unpacking — git_proc, git_mod instead of moon_mod
+                git_proc, git_mod, blip_proc, blip_itm, dino_proc, dino_mod = load_local_models()
             progress = st.progress(0)
             status   = st.empty()
+            status.info("Step 1/7: Generating captions with GIT-Large-COCO...")
+            # CHANGE 4: updated function call
+            captions = generate_captions_git(input_image, git_proc, git_mod)
             progress.progress(14)
             with st.expander("5 Generated Captions", expanded=True):