Spaces:

skodan
/

multimodal-caption-retrieval

Sleeping

App Files Files Community

skodan committed on Jan 20

Commit

aa6ef7a

1 Parent(s): 8fc4a6f

fixing incorrect references

Browse files

Files changed (4) hide show

app.py +19 -18
models/resnet_lstm_attention/model.py +8 -0
models/resnet_lstm_attention/retrieval.py +123 -28
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -20,7 +20,7 @@ def is_port_free(port):
 if is_port_free(8001):
     subprocess.Popen(["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "8001"])
 else:
-    st.warning("Port 8001 already in use — backend may not start. Restart Space.")
 time.sleep(5)  # longer wait
 API_BASE = "http://localhost:8001"
@@ -75,20 +75,21 @@ with tab_text2img:
         if resp.status_code == 200:
             results = resp.json()
             if results:
                 cols = st.columns(3)
                 for idx, res in enumerate(results):
                     with cols[idx % 3]:
-                        try:
-                            st.image(res["image_path"],
-                                   caption=f"Score: {res['score']:.3f}",
-                                   use_column_width=True)
-                        except Exception as e:
-                            st.warning(f"Could not load: {res['image_path']}")
-                            st.write(f"Score: {res['score']:.3f}")
             else:
-                st.info("No matching images in demo set.")
         else:
-            st.error(f"Backend error: {resp.status_code} - {resp.text}")
 with tab_img2text:
     if image_file and st.button("Retrieve Text"):
@@ -121,20 +122,20 @@ with tab_img2img:
         if resp.status_code == 200:
             results = resp.json()
             if results:
                 cols = st.columns(3)
                 for idx, res in enumerate(results):
                     with cols[idx % 3]:
-                        try:
                             st.image(
-                                res["image_path"],
-                                caption=f"Score: {res['score']:.3f}",
-                                use_column_width=True
                             )
-                        except Exception as e:
-                            st.warning(f"Could not load image: {res['image_path']}")
-                            st.write(f"Score: {res['score']:.3f}")
             else:
-                st.info("No similar images found in the demo set.")
         else:
             st.error(f"Backend error: {resp.status_code} - {resp.text}")

 if is_port_free(8001):
     subprocess.Popen(["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "8001"])
 else:
+    print("Port 8001 in use - skipping backend startup")
 time.sleep(5)  # longer wait
 API_BASE = "http://localhost:8001"
         if resp.status_code == 200:
             results = resp.json()
             if results:
+                st.subheader("Retrieved Images")
                 cols = st.columns(3)
                 for idx, res in enumerate(results):
                     with cols[idx % 3]:
+                        if res["image"] is not None:
+                            st.image(res["image"], width=200)
+                            st.caption(f"Score: {res['score']:.3f}")
+                            if "caption" in res:  # if you add caption to results later
+                                st.write(res["caption"])
+                        else:
+                            st.caption(f"Score: {res['score']:.3f} (Image not found)")
             else:
+                st.info("No results found.")
         else:
+            st.error(f"Error: {resp.status_code} - {resp.text}")
 with tab_img2text:
     if image_file and st.button("Retrieve Text"):
         if resp.status_code == 200:
             results = resp.json()
             if results:
+                st.subheader("Retrieved Similar Images")
                 cols = st.columns(3)
                 for idx, res in enumerate(results):
                     with cols[idx % 3]:
+                        if "image" in res and res["image"] is not None:
                             st.image(
+                                res["image"],
+                                width=200,        # recommended instead of use_column_width
+                                caption=f"Score: {res['score']:.3f}"
                             )
+                        else:
+                            st.caption(f"Score: {res['score']:.3f} (Image not available)")
             else:
+                st.info("No similar images found in the dataset.")
         else:
             st.error(f"Backend error: {resp.status_code} - {resp.text}")

models/resnet_lstm_attention/model.py CHANGED Viewed

@@ -5,6 +5,7 @@ from huggingface_hub import hf_hub_download
 from PIL import Image
 import numpy as np
 from typing import List, Dict, Any
 from models.resnet_lstm_attention.loader import load_captioning_model
 from models.resnet_lstm_attention.retrieval import RetrievalService
@@ -17,6 +18,7 @@ class ResNetLSTMAttentionModel(UnifiedModelInterface):
         self.caption_bundle = None
         self.retrieval_service = None
         self.device = torch.device("cpu")
         #self.model_repo = "skodan/resnet-lstm-attention-weights"
     def load(self) -> None:
@@ -91,6 +93,12 @@ class ResNetLSTMAttentionModel(UnifiedModelInterface):
             preprocess=preprocess_cfg
         )
         print("Model components loaded successfully.")
     @torch.no_grad()

 from PIL import Image
 import numpy as np
 from typing import List, Dict, Any
+from datasets import load_dataset
 from models.resnet_lstm_attention.loader import load_captioning_model
 from models.resnet_lstm_attention.retrieval import RetrievalService
         self.caption_bundle = None
         self.retrieval_service = None
         self.device = torch.device("cpu")
+        self.dataset = None
         #self.model_repo = "skodan/resnet-lstm-attention-weights"
     def load(self) -> None:
             preprocess=preprocess_cfg
         )
+        if self.dataset is None:
+            print("Loading Flickr8k test split from Hugging Face...")
+            ds = load_dataset("jxie/flickr8k")
+            self.dataset = ds["train"].concatenate(ds["validation"]).concatenate(ds["test"])
+            print(f"Loaded {len(self.dataset)} images/captions from full dataset.")
         print("Model components loaded successfully.")
     @torch.no_grad()

models/resnet_lstm_attention/retrieval.py CHANGED Viewed

@@ -2,6 +2,7 @@ import faiss
 import pickle
 import torch
 import numpy as np
 from PIL import Image
 from torchvision import transforms
@@ -32,20 +33,67 @@ class RetrievalService:
     def _normalize(self, x):
         return x / np.linalg.norm(x, axis=1, keepdims=True)
-    def text_to_image(self, text, top_k=5):
-        with torch.no_grad():
-            emb = self.clip_model.encode_text(text).cpu().numpy()
-        emb = self._normalize(emb)
-        scores, idxs = self.image_index.search(emb, top_k)
-        return [
-            {
-                "image_path": self.image_id_map[i],
-                "score": float(scores[0][j])
-            }
-            for j, i in enumerate(idxs[0])
-        ]
     def image_to_text(self, image: Image.Image, top_k=5):
         image = self.image_transform(image).unsqueeze(0)
@@ -58,6 +106,7 @@ class RetrievalService:
         print(f"DEBUG: Returning results: {results}")
         return results
     def text_to_text(self, text: str, top_k: int = 5):
         with torch.no_grad():
             emb = self.clip_model.encode_text(text).cpu().numpy()
@@ -76,20 +125,66 @@ class RetrievalService:
         print(f"DEBUG: Text-to-text results: {results}")
         return results
-    def image_to_image(self, image: Image.Image, top_k=5):
-        """
-        Image → Image retrieval: encode input image, search image index, return image IDs and scores.
-        """
-        image = self.image_transform(image).unsqueeze(0).to(self.device)
-        with torch.no_grad():
-            emb = self.clip_model.encode_image(image).cpu().numpy()
-        emb = self._normalize(emb)
-        scores, idxs = self.image_index.search(emb, top_k)
-        return [
-            {
-                "image_path": self.image_id_map[i],  # integer ID
-                "score": float(scores[0][j])
-            }
-            for j, i in enumerate(idxs[0])
-        ]

 import pickle
 import torch
 import numpy as np
+import os
 from PIL import Image
 from torchvision import transforms
     def _normalize(self, x):
         return x / np.linalg.norm(x, axis=1, keepdims=True)
+    def text_to_image(self, text: str, top_k: int = 5) -> List[Dict[str, Any]]:  # Text -> image retrieval: returns one {"image", "score"} dict per raw hit.
+        raw_results = self.retrieval_service.text_to_image(text, top_k)  # NOTE(review): delegates to self.retrieval_service, yet this hunk sits under `class RetrievalService` -- confirm the attribute exists here
+        formatted = []
+        for res in raw_results:
+            idx = int(res["image_path"])  # "image_path" is converted to an int and used as a row index into self.dataset
+            try:
+                pil_img = self.dataset[idx]["image"]  # presumably a PIL.Image from the HF dataset -- TODO confirm; self.dataset is not set in the visible part of this file
+                formatted.append({
+                    "image": pil_img,  # NOTE(review): handed on as-is; confirm the API layer can serialize it
+                    "score": float(res["score"])
+                })
+            except (IndexError, KeyError):  # failed lookup -> keep the score, report no image
+                formatted.append({
+                    "image": None,
+                    "score": float(res["score"])
+                })
+        return formatted
+    # def text_to_image(self, text: str, top_k: int = 5) -> List[Dict[str, Any]]:
+    #     raw_results = self.retrieval_service.text_to_image(text, top_k)
+    #     formatted = []
+    #     for res in raw_results:
+    #         img_id = res["image_path"]  # int or str
+    #         img_id_str = str(img_id)
+    #         img_filename = f"{img_id_str}.jpg"  # always append .jpg, no .endswith
+    #         full_path = os.path.join("flickr8k_images", img_filename)
+    #         if os.path.exists(full_path):
+    #             formatted.append({
+    #                 "image_path": full_path,
+    #                 "score": float(res["score"])
+    #             })
+    #         else:
+    #             formatted.append({
+    #                 "image_path": "https://via.placeholder.com/300?text=Not+in+demo",
+    #                 "score": float(res["score"])
+    #             })
+    #     return formatted
+    # def text_to_image(self, text, top_k=5):
+    #     with torch.no_grad():
+    #         emb = self.clip_model.encode_text(text).cpu().numpy()
+    #     emb = self._normalize(emb)
+    #     scores, idxs = self.image_index.search(emb, top_k)
+    #     return [
+    #         {
+    #             "image_path": self.image_id_map[i],
+    #             "score": float(scores[0][j])
+    #         }
+    #         for j, i in enumerate(idxs[0])
+    #     ]
     def image_to_text(self, image: Image.Image, top_k=5):
         image = self.image_transform(image).unsqueeze(0)
         print(f"DEBUG: Returning results: {results}")
         return results
     def text_to_text(self, text: str, top_k: int = 5):
         with torch.no_grad():
             emb = self.clip_model.encode_text(text).cpu().numpy()
         print(f"DEBUG: Text-to-text results: {results}")
         return results
+    # def image_to_image(self, image: Image.Image, top_k=5):
+    #     """
+    #     Image → Image retrieval: encode input image, search image index, return image IDs and scores.
+    #     """
+    #     image = self.image_transform(image).unsqueeze(0).to(self.device)
+    #     with torch.no_grad():
+    #         emb = self.clip_model.encode_image(image).cpu().numpy()
+    #     emb = self._normalize(emb)
+    #     scores, idxs = self.image_index.search(emb, top_k)
+    #     return [
+    #         {
+    #             "image_path": self.image_id_map[i],  # integer ID
+    #             "score": float(scores[0][j])
+    #         }
+    #         for j, i in enumerate(idxs[0])
+    #     ]
+    # def image_to_image(self, image: Image.Image, top_k: int = 5) -> List[Dict[str, Any]]:
+    #     raw_results = self.retrieval_service.image_to_image(image, top_k)  # now exists
+    #     # ... same logic as above ...
+    #     formatted = []
+    #     for res in raw_results:
+    #         img_id = res["image_path"]
+    #         img_id_str = str(img_id)
+    #         img_filename = f"{img_id_str}.jpg"
+    #         full_path = os.path.join("flickr8k_images", img_filename)
+    #         if os.path.exists(full_path):
+    #             formatted.append({
+    #                 "image_path": full_path,
+    #                 "score": float(res["score"])
+    #             })
+    #         else:
+    #             formatted.append({
+    #                 "image_path": "https://via.placeholder.com/300?text=Not+in+demo",
+    #                 "score": float(res["score"])
+    #             })
+    #     return formatted
+    def image_to_image(self, image: Image.Image, top_k: int = 5) -> List[Dict[str, Any]]:  # Image -> image retrieval: returns one {"image", "score"} dict per raw hit.
+        raw_results = self.retrieval_service.image_to_image(image, top_k)  # NOTE(review): delegates to self.retrieval_service, yet this hunk sits under `class RetrievalService` -- confirm the attribute exists here
+        formatted = []
+        for res in raw_results:
+            idx = int(res["image_path"])  # "image_path" is converted to an int and used as a row index into self.dataset
+            try:
+                pil_img = self.dataset[idx]["image"]  # presumably a PIL.Image from the HF dataset -- TODO confirm; self.dataset is not set in the visible part of this file
+                formatted.append({
+                    "image": pil_img,  # NOTE(review): handed on as-is; confirm the API layer can serialize it
+                    "score": float(res["score"])
+                })
+            except (IndexError, KeyError):  # failed lookup -> keep the score, report no image
+                formatted.append({
+                    "image": None,
+                    "score": float(res["score"])
+                })
+        return formatted

requirements.txt CHANGED Viewed

@@ -11,4 +11,5 @@ numpy>=1.26.0
 altair
 pandas
 python-multipart>=0.0.9
-matplotlib>=3.9.0

 altair
 pandas
 python-multipart>=0.0.9
+matplotlib>=3.9.0
+datasets>=2.18.0