Spaces:
Sleeping
Sleeping
added project files
Browse files- .gitignore +8 -0
- requirements.txt +11 -2
- src/api.py +56 -0
- src/app.py +82 -0
- src/configs/caption_config.json +24 -0
- src/configs/preprocess_config.json +27 -0
- src/model_registry.py +26 -0
- src/models/resnet_lstm_attention/__init__.py +0 -0
- src/models/resnet_lstm_attention/cap_mod_defs.py +101 -0
- src/models/resnet_lstm_attention/captioning.py +48 -0
- src/models/resnet_lstm_attention/clip_loader.py +63 -0
- src/models/resnet_lstm_attention/loader.py +79 -0
- src/models/resnet_lstm_attention/model.py +130 -0
- src/models/resnet_lstm_attention/ret_mod_defs.py +57 -0
- src/models/resnet_lstm_attention/retrieval.py +59 -0
- src/models/resnet_lstm_attention/schemas.py +13 -0
- src/models/resnet_lstm_attention/utils.py +10 -0
- src/{streamlit_app.py → streamlit_app_old.py} +0 -0
- src/utils/interfaces.py +30 -0
.gitignore
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
.venv/
|
| 4 |
+
.ipynb_checkpoints/
|
| 5 |
+
.vscode/
|
| 6 |
+
*.pth
|
| 7 |
+
*.faiss
|
| 8 |
+
*.pkl
|
requirements.txt
CHANGED
|
@@ -1,3 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
altair
|
| 2 |
-
pandas
|
| 3 |
-
streamlit
|
|
|
|
| 1 |
+
streamlit>=1.38.0
|
| 2 |
+
fastapi>=0.115.0
|
| 3 |
+
uvicorn>=0.30.0
|
| 4 |
+
torch>=2.4.0
|
| 5 |
+
torchvision>=0.19.0
|
| 6 |
+
pillow>=10.0.0
|
| 7 |
+
huggingface_hub>=0.25.0
|
| 8 |
+
faiss-cpu>=1.8.0
|
| 9 |
+
pydantic>=2.0.0
|
| 10 |
+
numpy>=1.26.0
|
| 11 |
altair
|
| 12 |
+
pandas
|
|
|
src/api.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# api.py
"""FastAPI service exposing captioning and cross-modal retrieval endpoints.

Every endpoint takes a ``model_name`` form field and dispatches to the
matching model instance via ``model_registry.get_model`` (models are cached
there after first load).
"""
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from PIL import Image, UnidentifiedImageError
from typing import List
from pydantic import BaseModel

from model_registry import get_model
from models.resnet_lstm_attention.schemas import CaptionResult, ImageResult, TextQuery

app = FastAPI(title="Multimodal Retrieval & Captioning API")

# CORS is wide open because the Streamlit front-end runs on a different port.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

class InferenceRequest(BaseModel):
    # Kept for JSON clients; the form-based endpoints below do not use it.
    model_name: str
    top_k: int = 5


def _read_image(file: UploadFile) -> Image.Image:
    """Decode an uploaded file into an RGB PIL image.

    Raises HTTP 400 instead of letting ``UnidentifiedImageError`` bubble up
    as an opaque 500 when the upload is not a valid image.
    """
    try:
        return Image.open(file.file).convert("RGB")
    except UnidentifiedImageError:
        raise HTTPException(status_code=400, detail="Uploaded file is not a valid image")


@app.post("/caption")
async def caption_image(model_name: str = Form(...), file: UploadFile = File(...)):
    """Generate a natural-language caption for the uploaded image."""
    image = _read_image(file)
    model = get_model(model_name)
    caption = model.generate_caption(image)
    return {"caption": caption}


@app.post("/search/text2img")
async def text_to_image(model_name: str = Form(...), query: str = Form(...), top_k: int = Form(5)):
    """Retrieve the top_k images matching a free-text query."""
    model = get_model(model_name)
    results = model.text_to_image(query, top_k)
    return results


@app.post("/search/img2text")
async def image_to_text(model_name: str = Form(...), file: UploadFile = File(...), top_k: int = Form(5)):
    """Retrieve the top_k captions matching the uploaded image."""
    image = _read_image(file)
    model = get_model(model_name)
    results = model.image_to_text(image, top_k)
    return results


@app.post("/search/img2img")
async def image_to_image(model_name: str = Form(...), file: UploadFile = File(...), top_k: int = Form(5)):
    """Retrieve the top_k images most similar to the uploaded image."""
    image = _read_image(file)
    model = get_model(model_name)
    results = model.image_to_image(image, top_k)
    return results


@app.get("/health")
def health_check():
    """Liveness probe used by the front-end / platform."""
    return {"status": "healthy"}
|
src/app.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py
"""Streamlit front-end that proxies all requests to the FastAPI backend."""
import streamlit as st
import requests
import subprocess
import time
from PIL import Image
import io
import base64  # For displaying retrieved images if needed


@st.cache_resource
def _start_api_server():
    """Start the FastAPI backend exactly once per Streamlit server process.

    BUG FIX: Streamlit re-executes this script top-to-bottom on every widget
    interaction, so an unguarded ``subprocess.Popen`` spawned a new uvicorn
    process on each rerun, piling up duplicates fighting over port 8001.
    ``st.cache_resource`` makes this run once for the process lifetime.
    """
    proc = subprocess.Popen(["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "8001"])
    time.sleep(2)  # Wait for server to start
    return proc


API_BASE = "http://localhost:8001"

st.set_page_config(page_title="Multimodal Retrieval & Captioning", layout="wide")

_start_api_server()

st.title("Multimodal Retrieval & Captioning System")

# Model selection (add more later)
model_name = st.sidebar.selectbox("Select Model", ["resnet_lstm_attention", "vit_lstm_attention", "vit_transformer"], index=0)

# Common inputs
input_method = st.sidebar.radio("Image Input", ["Upload", "Camera"])
image_file = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"]) if input_method == "Upload" else st.camera_input("Capture Image")
text_input = st.text_input("Text Input")
top_k = st.sidebar.slider("Top K", 1, 10, 5)

# Tabs for tasks
tab_caption, tab_text2img, tab_img2text, tab_img2img, tab_text2text = st.tabs([
    "Image → Caption",
    "Text → Image",
    "Image → Text",
    "Image → Image",
    "Text → Text"
])

with tab_caption:
    if image_file and st.button("Generate Caption"):
        files = {"file": image_file.getvalue()}
        data = {"model_name": model_name}
        resp = requests.post(f"{API_BASE}/caption", files=files, data=data)
        if resp.status_code == 200:
            st.write("Caption:", resp.json()["caption"])
        else:
            st.error("Error: " + resp.text)

with tab_text2img:
    if text_input and st.button("Search Images"):
        data = {"model_name": model_name, "query": text_input, "top_k": top_k}
        resp = requests.post(f"{API_BASE}/search/text2img", data=data)
        if resp.status_code == 200:
            results = resp.json()
            for res in results:
                st.image(res["image_path"], caption=f"Score: {res['score']:.3f}")
        else:
            st.error("Error: " + resp.text)

with tab_img2text:
    if image_file and st.button("Retrieve Text"):
        files = {"file": image_file.getvalue()}
        data = {"model_name": model_name, "top_k": top_k}
        resp = requests.post(f"{API_BASE}/search/img2text", files=files, data=data)
        if resp.status_code == 200:
            st.write("Retrieved Texts:", resp.json())
        else:
            st.error("Error: " + resp.text)

with tab_img2img:
    if image_file and st.button("Retrieve Similar Images"):
        files = {"file": image_file.getvalue()}
        data = {"model_name": model_name, "top_k": top_k}
        resp = requests.post(f"{API_BASE}/search/img2img", files=files, data=data)
        if resp.status_code == 200:
            results = resp.json()
            for res in results:
                st.image(res["image_path"], caption=f"Score: {res['score']:.3f}")
        else:
            st.error("Error: " + resp.text)

with tab_text2text:
    st.info("Text → Text not implemented yet. Add to model interface if needed.")
|
src/configs/caption_config.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"max_caption_len": 25,
|
| 3 |
+
"bos_token": "<start>",
|
| 4 |
+
"eos_token": "<end>",
|
| 5 |
+
"pad_token": "<pad>",
|
| 6 |
+
"unk_token": "<unk>",
|
| 7 |
+
"bos_index": 2,
|
| 8 |
+
"eos_index": 3,
|
| 9 |
+
"pad_index": 0,
|
| 10 |
+
"unk_index": 1,
|
| 11 |
+
"vocab_size": 2541,
|
| 12 |
+
"trained_epochs": 15,
|
| 13 |
+
"image_size": 224,
|
| 14 |
+
"mean": [
|
| 15 |
+
0.485,
|
| 16 |
+
0.456,
|
| 17 |
+
0.406
|
| 18 |
+
],
|
| 19 |
+
"std": [
|
| 20 |
+
0.229,
|
| 21 |
+
0.224,
|
| 22 |
+
0.225
|
| 23 |
+
]
|
| 24 |
+
}
|
src/configs/preprocess_config.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"image_size": 224,
|
| 3 |
+
"mean": [0.485, 0.456, 0.406],
|
| 4 |
+
"std": [0.229, 0.224, 0.225],
|
| 5 |
+
"embed_dim": 512,
|
| 6 |
+
"model_type": "resnet50_lstm_attention_retrieval",
|
| 7 |
+
"trained_epochs": 40,
|
| 8 |
+
"batch_size_train": 128,
|
| 9 |
+
"batch_size_eval": 256,
|
| 10 |
+
"learning_rate_image": 3e-5,
|
| 11 |
+
"learning_rate_text": 1e-4,
|
| 12 |
+
"weight_decay": 0.01,
|
| 13 |
+
"temperature": 0.07,
|
| 14 |
+
"scheduler": "ReduceLROnPlateau",
|
| 15 |
+
"scheduler_factor": 0.5,
|
| 16 |
+
"scheduler_patience": 3,
|
| 17 |
+
"unfrozen_layers": "layer3 and layer4",
|
| 18 |
+
"train_augmentation": [
|
| 19 |
+
"RandomResizedCrop(scale=(0.8,1.0))",
|
| 20 |
+
"RandomHorizontalFlip",
|
| 21 |
+
"ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2)"
|
| 22 |
+
],
|
| 23 |
+
"val_test_augmentation": "None (only resize + normalize)",
|
| 24 |
+
"dataset": "jxie/flickr8k",
|
| 25 |
+
"date_trained": "January 18, 2026",
|
| 26 |
+
"note": "Final configuration with all recommended improvements applied"
|
| 27 |
+
}
|
src/model_registry.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict
from utils.interfaces import UnifiedModelInterface

# Singleton cache: registry name -> constructed-and-loaded model instance.
_LOADED_MODELS: Dict[str, UnifiedModelInterface] = {}


def get_model(model_name: str) -> UnifiedModelInterface:
    """Return the model registered under *model_name*, loading it on first use.

    Raises NotImplementedError for registered-but-unbuilt architectures and
    ValueError for unknown names.
    """
    cached = _LOADED_MODELS.get(model_name)
    if cached is not None:
        return cached

    if model_name == "resnet_lstm_attention":
        # Imported lazily so importing this module stays cheap.
        from models.resnet_lstm_attention.model import ResNetLSTMAttentionModel
        model = ResNetLSTMAttentionModel()
    elif model_name == "vit_lstm_attention":
        # Placeholder: from models.vit_lstm_attention.model import VitLSTMAttentionModel
        raise NotImplementedError("ViT + LSTM Attention not implemented yet")
    elif model_name == "vit_transformer":
        # Placeholder: from models.vit_transformer.model import VitTransformerModel
        raise NotImplementedError("ViT + Transformer not implemented yet")
    else:
        raise ValueError(f"Unknown model: {model_name}")

    model.load()
    _LOADED_MODELS[model_name] = model
    return model
|
src/models/resnet_lstm_attention/__init__.py
ADDED
|
File without changes
|
src/models/resnet_lstm_attention/cap_mod_defs.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.optim as optim
|
| 4 |
+
from torch.utils.data import Dataset, DataLoader
|
| 5 |
+
import torchvision.transforms as transforms
|
| 6 |
+
import torchvision.models as models
|
| 7 |
+
from torchvision.models import resnet50, ResNet50_Weights
|
| 8 |
+
|
| 9 |
+
|
# Module-level device selection: every model defined in this file is moved to
# the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class EncoderCNN(nn.Module):
    """Frozen ResNet-50 trunk producing a 7x7 grid of 2048-d region features."""

    def __init__(self):
        super().__init__()
        resnet = resnet50(weights=ResNet50_Weights.DEFAULT)
        # Drop avgpool + fc: keep only the convolutional feature extractor.
        self.features = nn.Sequential(*list(resnet.children())[:-2])
        # The encoder is used as a fixed feature extractor; only the decoder trains.
        for param in self.features.parameters():
            param.requires_grad = False

    def forward(self, images):
        """Map (B, 3, 224, 224) images to (B, 49, 2048) flattened region features."""
        features = self.features(images)          # (B, 2048, 7, 7)
        features = features.permute(0, 2, 3, 1)   # (B, 7, 7, 2048)
        # BUG FIX: .view() raises ("view size is not compatible ...") on the
        # non-contiguous permuted tensor; .reshape() copies when necessary.
        return features.reshape(features.size(0), -1, 2048)  # (B, 49, 2048)
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class Attention(nn.Module):
    """Additive (Bahdanau-style) attention over the encoder's region features."""

    def __init__(self, hidden_dim):
        super().__init__()
        # Learned projections: decoder hidden state and 2048-d encoder features.
        self.W_h = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.W_e = nn.Linear(2048, hidden_dim, bias=False)
        # Scores the combined energy down to one scalar per region.
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Return a (B, 2048) context vector attending over *encoder_outputs*.

        hidden:          (B, hidden_dim) current decoder state
        encoder_outputs: (B, seq_len, 2048) region features
        """
        # energy = tanh(W_h h + W_e e), broadcast over the region axis.
        energy = torch.tanh(
            self.W_h(hidden).unsqueeze(1) + self.W_e(encoder_outputs)
        )  # (B, seq_len, hidden_dim)

        # Per-region scores -> softmax attention weights.
        weights = torch.softmax(self.v(energy).squeeze(2), dim=1)  # (B, seq_len)

        # Weighted sum of the raw encoder features.
        return torch.bmm(weights.unsqueeze(1), encoder_outputs).squeeze(1)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class DecoderRNN(nn.Module):
    """LSTM decoder with additive attention over ResNet region features."""

    def __init__(self, vocab_size, embed_size=256, hidden_size=512):
        super().__init__()
        # BUG FIX: the hidden size was hard-coded as 512 in forward()/generate(),
        # silently breaking any hidden_size != 512. Store and use it instead.
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.attention = Attention(hidden_size)
        # LSTM input = word embedding concatenated with the 2048-d context.
        self.lstm = nn.LSTMCell(embed_size + 2048, hidden_size)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, captions, encoder_outputs):
        """Teacher-forced decoding.

        captions:        (B, seq_len) token ids (last token is dropped as input)
        encoder_outputs: (B, 49, 2048) region features
        Returns (B, seq_len-1, vocab_size) logits.
        """
        embedded = self.embedding(captions[:, :-1])  # (B, seq_len-1, embed_size)

        # Derive the device from the inputs rather than the module-level global,
        # so the model works wherever its tensors live.
        dev = encoder_outputs.device
        h = torch.zeros(embedded.size(0), self.hidden_size, device=dev)
        c = torch.zeros(embedded.size(0), self.hidden_size, device=dev)

        outputs = []
        for t in range(embedded.size(1)):
            context = self.attention(h, encoder_outputs)
            inp = torch.cat([embedded[:, t, :], context], dim=1)
            h, c = self.lstm(inp, (h, c))
            outputs.append(self.fc(h))

        return torch.stack(outputs, dim=1)  # (B, seq_len-1, vocab_size)

    @torch.no_grad()
    def generate(self, encoder_outputs, vocab, inv_vocab, max_len=25):
        """Greedy decoding for a single image (batch size 1).

        Returns the caption as a list of word strings, excluding '<end>'.
        """
        dev = encoder_outputs.device
        h = torch.zeros(1, self.hidden_size, device=dev)
        c = torch.zeros(1, self.hidden_size, device=dev)

        token = torch.tensor([[vocab['<start>']]], device=dev)
        caption = []

        for _ in range(max_len):
            emb = self.embedding(token)                      # (1, 1, embed_size)
            context = self.attention(h, encoder_outputs)     # (1, 2048)
            inp = torch.cat([emb.squeeze(0), context], dim=1)
            h, c = self.lstm(inp, (h, c))
            pred = self.fc(h).argmax(1)
            caption.append(pred.item())
            token = pred.unsqueeze(0)

            if pred.item() == vocab['<end>']:
                break

        return [inv_vocab.get(i, '<unk>') for i in caption if i != vocab['<end>']]
|
src/models/resnet_lstm_attention/captioning.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import pickle
|
| 3 |
+
from torchvision import transforms
|
| 4 |
+
from PIL import Image
|
| 5 |
+
from .cap_mod_defs import EncoderCNN, DecoderRNN # reuse your exact classes
|
| 6 |
+
|
| 7 |
+
class CaptioningService:
    """Loads the caption encoder/decoder from a checkpoint and generates captions.

    Parameters
    ----------
    model_path : path to a checkpoint holding encoder/decoder state dicts and vocab_size
    vocab_path : path to the pickled token -> index vocabulary
    config     : dict loaded from caption_config.json
    """

    def __init__(self, model_path, vocab_path, config):
        self.device = torch.device("cpu")

        # Vocabulary (token -> index) plus the inverse mapping for decoding.
        with open(vocab_path, "rb") as f:
            self.vocab = pickle.load(f)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

        # Checkpoint stores both state dicts and the training-time vocab size.
        ckpt = torch.load(model_path, map_location=self.device)

        self.encoder = EncoderCNN().to(self.device)
        self.encoder.load_state_dict(ckpt["encoder_state_dict"])
        self.encoder.eval()

        self.decoder = DecoderRNN(vocab_size=ckpt["vocab_size"]).to(self.device)
        self.decoder.load_state_dict(ckpt["decoder_state_dict"])
        self.decoder.eval()

        # BUG FIX: caption_config.json stores the limit under "max_caption_len";
        # the old config["max_length"] lookup raised KeyError. Accept either
        # key, falling back to the config file's documented default of 25.
        self.max_len = config.get("max_caption_len", config.get("max_length", 25))

        # ImageNet normalisation, matching the training-time preprocessing.
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]
            )
        ])

    @torch.no_grad()
    def generate_caption(self, image: Image.Image) -> str:
        """Encode *image* and greedily decode a caption string."""
        image = self.transform(image).unsqueeze(0).to(self.device)
        features = self.encoder(image)
        tokens = self.decoder.generate(
            features,
            vocab=self.vocab,
            inv_vocab=self.inv_vocab,
            max_len=self.max_len
        )
        return " ".join(tokens)
|
src/models/resnet_lstm_attention/clip_loader.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch

from .ret_mod_defs import ImageEncoder, TextEncoder
from .utils import simple_tokenize

# BUG FIX: removed unused `from matplotlib import text` — matplotlib is not
# listed in requirements.txt, so the stray import crashed the service with
# ModuleNotFoundError at startup in deployment.

print("LOADED clip_loader.py - UPDATED VERSION with encode_text(text: str)")
|
| 8 |
+
|
| 9 |
+
class CLIPRetrievalModel:
    """
    CLIP-style facade over the trained encoders:
    - encode_image(images): embed a batch of preprocessed image tensors
    - encode_text(text):    embed a single raw string (tokenized internally)
    """

    def __init__(self, image_encoder, text_encoder, vocab, max_caption_len=30):
        print("CLIPRetrievalModel initialized - single-arg encode_text version")
        self.image_encoder = image_encoder
        self.text_encoder = text_encoder
        self.vocab = vocab
        self.max_caption_len = max_caption_len

    @torch.no_grad()
    def encode_image(self, images):
        """Embed a batch of preprocessed image tensors."""
        return self.image_encoder(images)

    @torch.no_grad()
    def encode_text(self, text: str):
        """
        Accept raw text string → tokenize → pad → encode
        No need to pass 'lengths' from outside anymore
        """
        # Tokenize with the same logic used at training time.
        unk = self.vocab['<unk>']
        ids = [self.vocab.get(tok, unk) for tok in simple_tokenize(text.lower())]

        # Truncate, then wrap with the start/end markers.
        ids = [self.vocab['<start>']] + ids[:self.max_caption_len] + [self.vocab['<end>']]
        seq_len = len(ids)

        # Pad out to the fixed training length (max_caption_len + 2 markers).
        ids = ids + [self.vocab['<pad>']] * (self.max_caption_len + 2 - seq_len)

        target = self.text_encoder.embedding.weight.device
        captions = torch.tensor([ids], dtype=torch.long).to(target)
        lengths = torch.tensor([seq_len], dtype=torch.long).to(target)

        return self.text_encoder(captions, lengths)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def load_clip_model(model_path: str, vocab: dict, device: torch.device = torch.device("cpu")):
    """Rebuild both retrieval encoders from *model_path* and wrap them.

    The checkpoint supplies the vocab size the text encoder was trained with,
    so the architecture always matches the stored weights.
    """
    ckpt = torch.load(model_path, map_location=device)

    img_enc = ImageEncoder(embed_dim=512).to(device)
    # IMPORTANT: vocab_size must come from the checkpoint, not len(vocab).
    txt_enc = TextEncoder(vocab_size=ckpt["vocab_size"]).to(device)

    img_enc.load_state_dict(ckpt["image_encoder_state_dict"])
    txt_enc.load_state_dict(ckpt["text_encoder_state_dict"])

    img_enc.eval()
    txt_enc.eval()

    return CLIPRetrievalModel(img_enc, txt_enc, vocab)
|
| 63 |
+
|
src/models/resnet_lstm_attention/loader.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import pickle
|
| 3 |
+
import json
|
| 4 |
+
from torchvision import transforms
|
| 5 |
+
|
| 6 |
+
from .cap_mod_defs import EncoderCNN, DecoderRNN
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def load_captioning_model(
    model_path: str,
    vocab_path: str,
    config_path: str,
    device: torch.device = torch.device("cpu")
):
    """Assemble the captioning stack from its saved artifacts.

    Returns a dict bundle with keys: encoder, decoder, vocab, inv_vocab,
    max_len, transform.
    """
    # Config (JSON) — only the caption-length limit is needed here.
    with open(config_path, "r") as f:
        config = json.load(f)
    max_len = config["max_caption_len"]

    # Vocabulary (pickle) — tolerate both the bare-dict and wrapped formats.
    with open(vocab_path, "rb") as f:
        vocab_data = pickle.load(f)

    if isinstance(vocab_data, dict) and "vocab" in vocab_data:
        vocab = vocab_data["vocab"]
        inv_vocab = vocab_data.get("inv_vocab") or {v: k for k, v in vocab.items()}
    else:
        vocab = vocab_data
        inv_vocab = {v: k for k, v in vocab.items()}

    # Checkpoint: rebuild the architecture, then restore the weights.
    checkpoint = torch.load(model_path, map_location=device)

    encoder = EncoderCNN().to(device)
    decoder = DecoderRNN(vocab_size=checkpoint["vocab_size"]).to(device)

    encoder.load_state_dict(checkpoint["encoder_state_dict"])
    decoder.load_state_dict(checkpoint["decoder_state_dict"])

    encoder.eval()
    decoder.eval()

    # Inference-time preprocessing: resize + ImageNet normalisation.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
    ])

    return {
        "encoder": encoder,
        "decoder": decoder,
        "vocab": vocab,
        "inv_vocab": inv_vocab,
        "max_len": max_len,
        "transform": transform
    }
|
src/models/resnet_lstm_attention/model.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import torch
|
| 4 |
+
from huggingface_hub import hf_hub_download
|
| 5 |
+
from PIL import Image
|
| 6 |
+
import numpy as np
|
| 7 |
+
from typing import List, Dict, Any
|
| 8 |
+
|
| 9 |
+
from .loader import load_captioning_model
|
| 10 |
+
from .retrieval import RetrievalService
|
| 11 |
+
from .clip_loader import load_clip_model
|
| 12 |
+
from .captioning import CaptioningService # Not directly used, but for reference
|
| 13 |
+
from utils.interfaces import UnifiedModelInterface # Adjust path if needed
|
| 14 |
+
|
| 15 |
+
class ResNetLSTMAttentionModel(UnifiedModelInterface):
    """Unified wrapper bundling the captioning stack and the retrieval stack.

    Weights and FAISS indexes are fetched from the Hugging Face Hub on first
    load(); small JSON configs are read from the repo's src/configs directory.
    """

    def __init__(self):
        # Populated lazily by load(); None means "not loaded yet".
        self.caption_bundle = None
        self.retrieval_service = None
        self.device = torch.device("cpu")
        #self.model_repo = "skodan/resnet-lstm-attention-weights"

    def load(self) -> None:
        """Download all artifacts and build the caption + retrieval services.

        Idempotent: returns immediately if both services already exist.
        Raises RuntimeError when any Hub download fails.
        """
        if self.caption_bundle is not None and self.retrieval_service is not None:
            return

        MODEL_REPO = "skodan/resnet-lstm-attention-weights"

        files_to_download = [
            "caption_model.pth",
            "flickr8k_retrieval_model.pth",
            "image_embeddings.faiss",
            "text_embeddings.faiss",
            "image_id_map.pkl",
            "text_id_map.pkl",
            "vocab.pkl"
        ]

        downloaded_paths = {}
        for fname in files_to_download:
            try:
                path = hf_hub_download(
                    repo_id=MODEL_REPO,
                    filename=fname,
                    repo_type="model",
                )
                downloaded_paths[fname] = path
            except Exception as e:
                # Fail fast with the offending filename; a partial download
                # set would leave the services half-constructed.
                raise RuntimeError(f"Failed to download {fname} from {MODEL_REPO}: {e}")

        # Download large files from HF Hub
        caption_pth = downloaded_paths["caption_model.pth"]
        retrieval_pth = downloaded_paths["flickr8k_retrieval_model.pth"]
        image_index_faiss = downloaded_paths["image_embeddings.faiss"]
        text_index_faiss = downloaded_paths["text_embeddings.faiss"]
        image_map_pkl = downloaded_paths["image_id_map.pkl"]
        text_map_pkl = downloaded_paths["text_id_map.pkl"]
        vocab_pkl = downloaded_paths["vocab.pkl"]

        # Load configs (assume small, committed to repo).
        # NOTE: three dirname() calls from .../src/models/resnet_lstm_attention/model.py
        # resolve to the src/ directory, where configs/ lives.
        base_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
        config_path = os.path.join(base_dir, "configs", "caption_config.json")
        preprocess_cfg_path = os.path.join(base_dir, "configs", "preprocess_config.json")

        with open(config_path, "r") as f:
            caption_config = json.load(f)

        with open(preprocess_cfg_path, "r") as f:
            preprocess_cfg = json.load(f)

        # Captioning stack: encoder/decoder/vocab/transform bundle.
        self.caption_bundle = load_captioning_model(
            model_path=caption_pth,
            vocab_path=vocab_pkl,
            config_path=config_path,
            device=self.device
        )

        # Retrieval stack: CLIP-style dual encoder sharing the caption vocab.
        clip_model = load_clip_model(
            model_path=retrieval_pth,
            vocab=self.caption_bundle["vocab"],
            device=self.device
        )

        self.retrieval_service = RetrievalService(
            clip_model=clip_model,
            image_index_path=image_index_faiss,
            text_index_path=text_index_faiss,
            image_map_path=image_map_pkl,
            text_map_path=text_map_pkl,
            preprocess=preprocess_cfg
        )

        print("Model components loaded successfully.")

    @torch.no_grad()
    def generate_caption(self, image: Image.Image) -> str:
        """Greedily decode a caption for a PIL image; requires load() first."""
        encoder = self.caption_bundle["encoder"]
        decoder = self.caption_bundle["decoder"]
        vocab = self.caption_bundle["vocab"]
        inv_vocab = self.caption_bundle["inv_vocab"]
        max_len = self.caption_bundle["max_len"]
        transform = self.caption_bundle["transform"]

        image_tensor = transform(image).unsqueeze(0).to(self.device)
        features = encoder(image_tensor)
        tokens = decoder.generate(
            features,
            vocab=vocab,
            inv_vocab=inv_vocab,
            max_len=max_len
        )
        return " ".join(tokens)

    def text_to_image(self, text: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Delegate text→image retrieval to the retrieval service."""
        return self.retrieval_service.text_to_image(text, top_k)

    def image_to_text(self, image: Image.Image, top_k: int = 5) -> List[str]:
        """Delegate image→text retrieval to the retrieval service."""
        return self.retrieval_service.image_to_text(image, top_k)

    def image_to_image(self, image: Image.Image, top_k: int = 5) -> List[Dict[str, Any]]:
        """Embed the query image and search the FAISS image index directly.

        NOTE(review): relies on RetrievalService exposing image_transform,
        _normalize, image_index and image_id_map — confirm against retrieval.py.
        """
        image_tensor = self.retrieval_service.image_transform(image).unsqueeze(0).to(self.device)
        with torch.no_grad():
            emb = self.retrieval_service.clip_model.encode_image(image_tensor).cpu().numpy()
        emb = self.retrieval_service._normalize(emb)
        scores, idxs = self.retrieval_service.image_index.search(emb, top_k)
        return [
            {"image_path": self.retrieval_service.image_id_map[i], "score": float(scores[0][j])}
            for j, i in enumerate(idxs[0])
        ]
|
src/models/resnet_lstm_attention/ret_mod_defs.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.optim as optim
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
from torch.utils.data import Dataset, DataLoader
|
| 6 |
+
import torchvision.transforms as transforms
|
| 7 |
+
import torchvision.models as models
|
| 8 |
+
from torchvision.models import resnet50, ResNet50_Weights
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# Resolve the compute device once at import time: CUDA when available, else CPU.
_use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if _use_cuda else "cpu")
print(f"Using device: {device}")
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class ImageEncoder(nn.Module):
    """ResNet-50 image tower projecting pooled features into a joint embedding.

    Outputs L2-normalized embeddings of size ``embed_dim`` so cosine
    similarity reduces to a dot product against the text tower.
    """

    def __init__(self, embed_dim=512):
        super().__init__()
        resnet = resnet50(weights=ResNet50_Weights.DEFAULT)
        # Drop the final avgpool + fc; keep the convolutional trunk.
        self.backbone = nn.Sequential(*list(resnet.children())[:-2])
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(resnet.fc.in_features, embed_dim)

        # Freeze the first 7 children of the trunk: conv1, bn1, relu, maxpool,
        # layer1, layer2 and layer3.  Only layer4 (and the projection head)
        # remains trainable.  NOTE(review): the original comment and print
        # claimed layer3 was also trainable, which contradicted freeze_until=7.
        freeze_until = 7
        for child in list(self.backbone.children())[:freeze_until]:
            for p in child.parameters():
                p.requires_grad = False

        print("ImageEncoder: layers 0-6 frozen, layer4 trainable.")

    def forward(self, x):
        """Map a batch of images (B, 3, H, W) to L2-normalized embeddings (B, embed_dim)."""
        feat = self.backbone(x)
        feat = self.pool(feat)
        feat = torch.flatten(feat, 1)
        emb = self.fc(feat)
        return F.normalize(emb, p=2, dim=1)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class TextEncoder(nn.Module):
    """Caption tower: embedding -> LSTM -> additive attention pooling -> projection.

    Returns L2-normalized embeddings of size ``out_dim`` matching the image tower.
    """

    def __init__(self, vocab_size, embed_dim=300, hidden_dim=512, out_dim=512):
        super().__init__()
        # Module creation order is kept stable for reproducible initialization.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.attn = nn.Linear(hidden_dim, 1)
        self.fc = nn.Linear(hidden_dim, out_dim)

    def forward(self, captions, lengths):
        """Encode padded token id batches (B, T) with per-sample lengths (B,)."""
        embedded_seq = self.embedding(captions)
        packed_seq = nn.utils.rnn.pack_padded_sequence(
            embedded_seq, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        hidden_seq, _ = self.lstm(packed_seq)
        hidden_seq, _ = nn.utils.rnn.pad_packed_sequence(hidden_seq, batch_first=True)

        # Attention weights over timesteps.
        # NOTE(review): the softmax also covers padded positions (whose LSTM
        # outputs are zero after pad_packed_sequence, but exp(0) still
        # contributes weight) — confirm this matches how the weights were trained.
        attn_weights = F.softmax(self.attn(hidden_seq), dim=1)
        pooled = torch.sum(attn_weights * hidden_seq, dim=1)

        return F.normalize(self.fc(pooled), p=2, dim=1)
|
src/models/resnet_lstm_attention/retrieval.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import faiss
|
| 2 |
+
import pickle
|
| 3 |
+
import torch
|
| 4 |
+
import numpy as np
|
| 5 |
+
from PIL import Image
|
| 6 |
+
from torchvision import transforms
|
| 7 |
+
|
| 8 |
+
class RetrievalService:
    """Cross-modal retrieval over prebuilt FAISS indices.

    Wraps a CLIP-style model, two FAISS indices (image and text) and the
    pickled row-id maps that translate index hits back into image paths
    and caption strings.
    """

    def __init__(self, clip_model, image_index_path, text_index_path,
                 image_map_path, text_map_path, preprocess):
        """Load indices and id maps from disk; ``preprocess`` supplies mean/std."""
        self.device = torch.device("cpu")
        self.clip_model = clip_model

        self.image_index = faiss.read_index(image_index_path)
        self.text_index = faiss.read_index(text_index_path)

        # Row-id -> item maps produced alongside the FAISS indices.
        with open(image_map_path, "rb") as f:
            self.image_id_map = pickle.load(f)

        with open(text_map_path, "rb") as f:
            self.text_id_map = pickle.load(f)

        self.image_transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=preprocess["mean"],
                std=preprocess["std"]
            )
        ])

    def _normalize(self, x):
        """Row-wise L2 normalization.

        Guards against all-zero rows, which previously produced NaN via a
        division by zero.
        """
        norms = np.linalg.norm(x, axis=1, keepdims=True)
        return x / np.maximum(norms, 1e-12)

    def text_to_image(self, text, top_k=5):
        """Return the top_k most similar images for a text query.

        Each result is ``{"image_path": str, "score": float}``.
        """
        with torch.no_grad():
            emb = self.clip_model.encode_text(text).cpu().numpy()
        emb = self._normalize(emb)

        scores, idxs = self.image_index.search(emb, top_k)
        return [
            {
                "image_path": self.image_id_map[i],
                "score": float(scores[0][j])
            }
            for j, i in enumerate(idxs[0])
        ]

    # Annotation kept as a string so the class can be defined without PIL
    # imported at definition time; runtime behavior is unchanged.
    def image_to_text(self, image: "Image.Image", top_k=5):
        """Return the top_k most similar captions for a query image."""
        image = self.image_transform(image).unsqueeze(0)
        with torch.no_grad():
            emb = self.clip_model.encode_image(image).cpu().numpy()
        emb = self._normalize(emb)

        scores, idxs = self.text_index.search(emb, top_k)
        # Leftover DEBUG print removed.
        return [self.text_id_map[i] for i in idxs[0]]
|
src/models/resnet_lstm_attention/schemas.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel
|
| 2 |
+
from typing import List
|
| 3 |
+
|
| 4 |
+
class TextQuery(BaseModel):
    """Request body for text-based retrieval endpoints."""

    # Free-text search query.
    query: str
    # Number of results to return.
    top_k: int = 5
|
| 7 |
+
|
| 8 |
+
class ImageResult(BaseModel):
    """A single retrieved image and its similarity score."""

    # Path (or identifier) of the matched image.
    image_path: str
    # Similarity score reported by the retrieval backend.
    score: float
|
| 11 |
+
|
| 12 |
+
class CaptionResult(BaseModel):
    """Response body carrying one generated caption."""

    # Generated caption text.
    caption: str
|
src/models/resnet_lstm_attention/utils.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# backend/utils.py
|
| 2 |
+
import re
|
| 3 |
+
|
| 4 |
+
def simple_tokenize(text: str) -> list[str]:
    """Lowercase *text*, strip punctuation, and split into whitespace tokens.

    Mirrors the tokenizer used during training so inference-time queries map
    onto the same vocabulary.
    """
    # \w keeps letters, digits and underscores; all other punctuation is dropped.
    text = re.sub(r'[^\w\s]', '', text.lower())
    # str.split() with no argument already collapses whitespace runs and
    # ignores leading/trailing space, so the former r'\s+' pass was redundant.
    return text.split()
|
src/{streamlit_app.py → streamlit_app_old.py}
RENAMED
|
File without changes
|
src/utils/interfaces.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC, abstractmethod
|
| 2 |
+
from typing import List, Dict, Any
|
| 3 |
+
from PIL import Image
|
| 4 |
+
|
| 5 |
+
class UnifiedModelInterface(ABC):
    """Common contract every model backend must implement.

    Bundles captioning and cross-modal retrieval behind one object so the
    serving layer can swap model implementations without code changes.
    """

    @abstractmethod
    def load(self) -> None:
        """Lazy load all required components (models, indices, vocab, etc.)"""
        pass

    @abstractmethod
    def generate_caption(self, image: Image.Image) -> str:
        """Return a single caption string for the given PIL image."""
        pass

    @abstractmethod
    def text_to_image(self, text: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Returns [{'image_path': str, 'score': float}, ...]"""
        pass

    @abstractmethod
    def image_to_text(self, image: Image.Image, top_k: int = 5) -> List[str]:
        """Returns list of caption strings"""
        pass

    @abstractmethod
    def image_to_image(self, image: Image.Image, top_k: int = 5) -> List[Dict[str, Any]]:
        """Returns [{'image_path': str, 'score': float}, ...] of similar images."""
        pass

    def text_to_text(self, text: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Optional capability; the default implementation signals unsupported."""
        raise NotImplementedError("Text-to-text not supported by this model")
|