YoungjaeDev committed on
Commit 2e15a8b · verified · 1 Parent(s): 150e5f4

Upload folder using huggingface_hub

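For context, commits with this message are typically produced by `huggingface_hub`'s folder upload. A minimal sketch of how such a commit could be reproduced — the repo id and local path below are assumptions for illustration, not taken from the commit:

```python
# Hypothetical reproduction of this commit; repo_id and folder_path are assumed.
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path=".",                           # local Space folder
    repo_id="YoungjaeDev/multimodal-search",   # assumed Space id
    repo_type="space",
    commit_message="Upload folder using huggingface_hub",
)
```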
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/embeddings/image_index.faiss filter=lfs diff=lfs merge=lfs -text
+ data/embeddings/image_index.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,43 @@
  ---
  title: Multimodal Search
- emoji: 🐨
+ emoji: 🔍
- colorFrom: indigo
+ colorFrom: green
- colorTo: purple
+ colorTo: gray
  sdk: gradio
- sdk_version: 6.2.0
+ sdk_version: 5.50.0
  app_file: app.py
  pinned: false
+ license: mit
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Multimodal Search
+
+ Search Flickr30k images using **Text**, **Image**, or **Composed queries (CIR)**.
+
+ ## Features
+
+ - **Text Search**: Find images matching text descriptions
+ - **Image Search**: Find similar images using a reference image
+ - **Composed Image Retrieval (CIR)**: Combine a reference image with a text modification
+   - Formula: `q = normalize(image_embedding + lambda * text_embedding)`
+
+ ## Tech Stack
+
+ - **Model**: SigLIP 2 (`google/siglip2-so400m-patch14-384`)
+ - **Index**: FAISS (IndexFlatIP for cosine similarity)
+ - **Dataset**: Flickr30k (31,014 images, 155,070 captions)
+ - **UI**: Gradio with custom theme
+
+ ## Usage
+
+ 1. **Text Search**: Enter text in the query box
+ 2. **Image Search**: Upload an image or click one from the results
+ 3. **CIR**: Combine text + image for a composed search
+
+ Adjust the **Lambda** weight to balance image vs. text influence in CIR.
+
+ ## Links
+
+ - [GitHub Repository](https://github.com/YoungjaeDev/multimodal-search-mvp)
+ - [SigLIP 2 Model](https://huggingface.co/google/siglip2-so400m-patch14-384)
+ - [Flickr30k Dataset](https://huggingface.co/datasets/nlphuji/flickr30k)
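For reference, a minimal NumPy sketch of the CIR formula above, assuming both inputs are unit-normalized 1152-d SigLIP 2 vectors (this mirrors `_compose_query` in `core/search.py` below; `lam` stands in for the Lambda slider value):

```python
import numpy as np

def compose_query(image_emb: np.ndarray, text_emb: np.ndarray, lam: float = 1.0) -> np.ndarray:
    """q = normalize(image_embedding + lambda * text_embedding)."""
    q = image_emb + lam * text_emb
    norm = np.linalg.norm(q)
    return q / norm if norm > 0 else q

# With an inner-product index over unit vectors (FAISS IndexFlatIP), scoring q
# against each image embedding is exactly cosine similarity.
```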
app.py ADDED
@@ -0,0 +1,1018 @@
+ """Gradio application for Multimodal Search MVP - HF Spaces Edition."""
+
+ from __future__ import annotations
+
+ import os
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Iterable
+
+ import gradio as gr
+ import spaces
+ from gradio.themes import Soft
+ from gradio.themes.utils import colors, fonts, sizes
+ from PIL import Image as PILImage
+
+ if TYPE_CHECKING:
+     from core.search import MultimodalSearch
+
+ # Global search engine (lazy loaded)
+ _search_engine: MultimodalSearch | None = None
+
+ # Global dataset (lazy loaded)
+ _flickr30k_dataset = None
+
+ # Data paths (HF Spaces uses HF Hub for data)
+ DATA_DIR = Path(__file__).parent / "data"
+ EMBEDDINGS_DIR = DATA_DIR / "embeddings"
+
+
+ def get_flickr30k_dataset():
+     """Get or load the Flickr30k dataset (lazy loading).
+
+     Returns:
+         Flickr30k dataset with images.
+     """
+     global _flickr30k_dataset
+
+     if _flickr30k_dataset is None:
+         from datasets import load_dataset
+
+         _flickr30k_dataset = load_dataset(
+             "nlphuji/flickr30k",
+             split="test",
+         )
+
+     return _flickr30k_dataset
+
+
+ class RefinedTheme(Soft):
+     """Editorial/Documentation style theme.
+
+     Features:
+     - No gradients, solid colors only
+     - Single accent color (Emerald)
+     - High contrast, professional look
+     - Pretendard font (Korean support)
+     """
+
+     def __init__(
+         self,
+         *,
+         primary_hue: colors.Color | str = colors.zinc,
+         secondary_hue: colors.Color | str = colors.emerald,
+         neutral_hue: colors.Color | str = colors.zinc,
+         text_size: sizes.Size | str = sizes.text_md,
+         font: fonts.Font | str | Iterable[fonts.Font | str] = (
+             fonts.GoogleFont("Pretendard"),
+             "Pretendard",
+             "-apple-system",
+             "BlinkMacSystemFont",
+             "system-ui",
+             "sans-serif",
+         ),
+         font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
+             fonts.GoogleFont("JetBrains Mono"),
+             "ui-monospace",
+             "monospace",
+         ),
+     ):
+         super().__init__(
+             primary_hue=primary_hue,
+             secondary_hue=secondary_hue,
+             neutral_hue=neutral_hue,
+             text_size=text_size,
+             font=font,
+             font_mono=font_mono,
+         )
+         super().set(
+             # Background - Light mode (solid colors, no gradients)
+             body_background_fill="#fafafa",
+             background_fill_primary="#ffffff",
+             background_fill_secondary="#f4f4f5",
+             # Background - Dark mode
+             body_background_fill_dark="#18181b",
+             background_fill_primary_dark="#27272a",
+             background_fill_secondary_dark="#3f3f46",
+             # Text colors
+             body_text_color="*neutral_800",
+             body_text_color_dark="#fafafa",
+             # Buttons - Light mode (solid colors, no gradients)
+             button_primary_background_fill="*secondary_600",
+             button_primary_background_fill_hover="*secondary_700",
+             button_primary_text_color="white",
+             button_secondary_background_fill="*neutral_100",
+             button_secondary_background_fill_hover="*neutral_200",
+             # Buttons - Dark mode
+             button_primary_background_fill_dark="*secondary_500",
+             button_primary_background_fill_hover_dark="*secondary_600",
+             button_secondary_background_fill_dark="*neutral_700",
+             button_secondary_background_fill_hover_dark="*neutral_600",
+             # Minimal styling
+             block_border_width="1px",
+             block_border_color="*neutral_200",
+             block_border_color_dark="*neutral_700",
+             block_shadow="none",
+             button_primary_shadow="none",
+             button_secondary_shadow="none",
+             # Title styling
+             block_title_text_weight="600",
+             block_title_text_size="*text_md",
+             # Input fields - Light mode
+             input_background_fill="*neutral_50",
+             input_border_color="*neutral_300",
+             input_border_width="1px",
+             # Input fields - Dark mode
+             input_background_fill_dark="*neutral_800",
+             input_border_color_dark="*neutral_600",
+             # Accent colors - for tabs, links, and interactive elements
+             # Use secondary (emerald) instead of primary (zinc) for visibility
+             color_accent="*secondary_500",
+             color_accent_soft="*secondary_100",
+             color_accent_soft_dark="*secondary_800",
+             border_color_accent="*secondary_400",
+             border_color_accent_dark="*secondary_600",
+         )
+
+
+ css = """
+ /* Container */
+ #col-container {
+     margin: 0 auto;
+     max-width: 1400px;
+ }
+
+ /* Header row - 3 column grid layout */
+ #header-row {
+     display: grid !important;
+     grid-template-columns: auto 1fr auto;
+     align-items: center;
+     gap: 16px;
+ }
+
+ /* Header logo - left column */
+ #header-logo {
+     background: transparent !important;
+     border: none !important;
+     min-width: 120px;
+     max-width: 120px;
+ }
+
+ /* Title - center column (screen-centered) */
+ #main-title {
+     text-align: center;
+     justify-self: center;
+ }
+ #main-title h1 {
+     font-size: 1.75rem;
+     font-weight: 600;
+     color: var(--body-text-color);
+     margin: 0;
+ }
+ #main-title p {
+     color: var(--body-text-color-subdued);
+     font-size: 0.95rem;
+     margin: 0;
+ }
+
+ /* Header controls - right column (inner div for horizontal layout) */
+ #header-controls-inner {
+     display: flex;
+     flex-direction: row;
+     align-items: center;
+     gap: 8px;
+ }
+
+ /* Theme transition */
+ body, .gradio-container {
+     transition: background-color 0.2s ease, color 0.2s ease;
+ }
+
+ /* Theme toggle button (native HTML button) */
+ .theme-toggle-btn {
+     min-width: 40px;
+     height: 40px;
+     padding: 8px;
+     border: 1px solid var(--border-color-primary);
+     border-radius: 8px;
+     background-color: var(--background-fill-primary);
+     color: var(--body-text-color);
+     cursor: pointer;
+     display: inline-flex;
+     align-items: center;
+     justify-content: center;
+     transition: border-color 0.2s ease, background-color 0.2s ease;
+ }
+ .theme-toggle-btn:hover {
+     border-color: var(--color-accent);
+     background-color: var(--background-fill-secondary);
+ }
+ .theme-toggle-btn:focus {
+     outline: none;
+     border-color: var(--color-accent);
+     box-shadow: 0 0 0 2px rgba(var(--color-accent-rgb), 0.2);
+ }
+
+ /* Theme toggle icons - show moon in light, sun in dark */
+ #theme-toggle .icon-moon { display: inline-flex; }
+ #theme-toggle .icon-sun { display: none; }
+ .dark #theme-toggle .icon-moon { display: none; }
+ .dark #theme-toggle .icon-sun { display: inline-flex; }
+
+ /* Language selector (native select) */
+ #lang-selector {
+     min-width: 100px;
+     padding: 8px 12px;
+     font-size: 14px;
+     font-family: inherit;
+     border: 1px solid var(--border-color-primary);
+     border-radius: 8px;
+     background-color: var(--background-fill-primary);
+     color: var(--body-text-color);
+     cursor: pointer;
+     outline: none;
+     appearance: none;
+     background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='12' height='12' viewBox='0 0 24 24' fill='none' stroke='%23666' stroke-width='2'%3E%3Cpath d='M6 9l6 6 6-6'/%3E%3C/svg%3E");
+     background-repeat: no-repeat;
+     background-position: right 8px center;
+     padding-right: 28px;
+ }
+ #lang-selector:hover {
+     border-color: var(--color-accent);
+ }
+ #lang-selector:focus {
+     border-color: var(--color-accent);
+     box-shadow: 0 0 0 2px rgba(var(--color-accent-rgb), 0.2);
+ }
+ .dark #lang-selector {
+     background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='12' height='12' viewBox='0 0 24 24' fill='none' stroke='%23aaa' stroke-width='2'%3E%3Cpath d='M6 9l6 6 6-6'/%3E%3C/svg%3E");
+ }
+
+ /* Buttons */
+ .submit-btn {
+     font-weight: 500 !important;
+ }
+
+ /* Text areas */
+ textarea {
+     font-size: 0.9rem !important;
+ }
+
+ /* Labels */
+ .label-wrap {
+     font-weight: 500 !important;
+ }
+
+ /* Gallery styling - unified interface */
+ .gallery-container {
+     min-height: 500px;
+ }
+
+ /* Main content layout - unified interface */
+ #main-content {
+     display: flex;
+     gap: 24px;
+ }
+
+ /* Input panel styling */
+ #input-panel {
+     min-width: 280px;
+     max-width: 320px;
+ }
+
+ /* Results panel styling */
+ #results-panel {
+     flex: 1;
+ }
+
+ /* Reference image container */
+ #ref-image-container {
+     border: 2px dashed var(--border-color-primary);
+     border-radius: 12px;
+     padding: 8px;
+     background: var(--background-fill-secondary);
+     transition: border-color 0.2s ease;
+ }
+ #ref-image-container:hover {
+     border-color: var(--color-accent);
+ }
+
+ /* Search mode indicator */
+ #search-mode-indicator {
+     padding: 8px 16px;
+     border-radius: 8px;
+     font-size: 0.875rem;
+     font-weight: 500;
+     text-align: center;
+ }
+ .mode-text {
+     background: var(--secondary-100);
+     color: var(--secondary-700);
+ }
+ .mode-image {
+     background: var(--secondary-200);
+     color: var(--secondary-800);
+ }
+ .mode-composed {
+     background: var(--secondary-300);
+     color: var(--secondary-900);
+ }
+ .mode-none {
+     background: var(--background-fill-secondary);
+     color: var(--body-text-color-subdued);
+ }
+
+ /* Click hint text */
+ .click-hint {
+     font-size: 0.8rem;
+     color: var(--body-text-color-subdued);
+     text-align: center;
+     margin-top: 8px;
+ }
+
+ /* Clear button */
+ #clear-image-btn {
+     margin-top: 8px;
+ }
+
+ /* Slider group */
+ .slider-group {
+     margin-top: 16px;
+ }
+ .slider-group p {
+     white-space: nowrap;
+ }
+
+ /* CIR info box */
+ .cir-info {
+     background: var(--background-fill-secondary);
+     border-radius: 8px;
+     padding: 12px;
+     margin-top: 16px;
+     font-size: 0.85rem;
+ }
+ .cir-info code {
+     background: var(--background-fill-primary);
+     padding: 2px 6px;
+     border-radius: 4px;
+     font-family: var(--font-mono);
+ }
+ """
+
+ # Header controls HTML (combined for horizontal layout - avoids gr.Group column issue)
+ HEADER_CONTROLS_HTML = """
+ <div id="header-controls-inner">
+     <select id="lang-selector" class="lang-select" aria-label="Select language">
+         <option value="en">English</option>
+         <option value="ko">한국어</option>
+     </select>
+     <button id="theme-toggle" class="theme-toggle-btn" type="button" aria-label="Toggle theme">
+         <span class="icon-moon"><svg xmlns="http://www.w3.org/2000/svg" width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 3a6 6 0 0 0 9 9 9 9 0 1 1-9-9Z"/></svg></span>
+         <span class="icon-sun"><svg xmlns="http://www.w3.org/2000/svg" width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="4"/><path d="M12 2v2"/><path d="M12 20v2"/><path d="m4.93 4.93 1.41 1.41"/><path d="m17.66 17.66 1.41 1.41"/><path d="M2 12h2"/><path d="M20 12h2"/><path d="m6.34 17.66-1.41 1.41"/><path d="m19.07 4.93-1.41 1.41"/></svg></span>
+     </button>
+ </div>
+ """
+
+ # Theme initialization JavaScript (runs on page load, includes click handler)
+ INIT_THEME_JS = """
+ () => {
+     // Initialize theme from localStorage or system preference
+     const saved = localStorage.getItem('theme');
+     const prefersDark = window.matchMedia('(prefers-color-scheme: dark)').matches;
+     const shouldBeDark = saved === 'dark' || (!saved && prefersDark);
+     if (shouldBeDark) {
+         document.documentElement.classList.add('dark');
+     }
+
+     // Theme toggle click handler
+     const toggleBtn = document.getElementById('theme-toggle');
+     if (toggleBtn) {
+         toggleBtn.addEventListener('click', (e) => {
+             e.preventDefault();
+             document.documentElement.classList.toggle('dark');
+             const isDark = document.documentElement.classList.contains('dark');
+             localStorage.setItem('theme', isDark ? 'dark' : 'light');
+         });
+     }
+ }
+ """
+
+ # Initial language detection and selector initialization JavaScript
+ INIT_LANG_JS = """
+ () => {
+     const params = new URLSearchParams(window.location.search);
+     const lang = params.get('lang') || 'en';
+
+     // Set language selector value based on URL parameter
+     const langSelector = document.querySelector('#lang-selector');
+     if (langSelector) {
+         langSelector.value = lang;
+
+         // Add change event listener for language switching
+         langSelector.addEventListener('change', (e) => {
+             const targetLang = e.target.value;
+             const currentUrl = new URL(window.location);
+             currentUrl.searchParams.set('lang', targetLang);
+             window.location.href = currentUrl.toString();
+         });
+     }
+
+     // Update UI text based on language
+     if (lang === 'ko') {
+         const titleContainer = document.querySelector('#main-title');
+         if (titleContainer) {
+             const h1 = titleContainer.querySelector('h1');
+             if (h1) h1.textContent = '멀티모달 검색';
+             const p = titleContainer.querySelector('p');
+             if (p) p.textContent = '텍스트, 이미지 또는 합성 쿼리(CIR)를 사용하여 Flickr30k 이미지를 검색합니다.';
+         }
+     }
+     // English is the default, no need to update text
+ }
+ """
+
+ # Internationalization labels
+ LABELS = {
+     "en": {
+         "title": "Multimodal Search",
+         "subtitle": "Search Flickr30k images using text, images, or composed queries (CIR).",
+         "text_search": "Text Search",
+         "image_search": "Image Search",
+         "composed_search": "Composed Search",
+         "top_k": "Top-K Results",
+         "top_k_info": "Number of results to return",
+         "search_query": "Search Query",
+         "search_query_placeholder": "e.g., a dog running on the beach",
+         "text_search_desc": "Enter a text query to find matching images.",
+         "search_results": "Search Results",
+         "query_image": "Query Image",
+         "image_search_desc": "Upload an image to find similar images.",
+         "similar_images": "Similar Images",
+         "reference_image": "Reference Image",
+         "modification_text": "Modification Text",
+         "modification_placeholder": "e.g., make it red, on the beach, with a dog",
+         "lambda_weight": "Lambda Weight",
+         "lambda_info": "< 1: Image | 1: Balanced | > 1: Text",
+         "composed_results": "Composed Search Results",
+         "cir_desc": "**Composed Image Retrieval (CIR)** combines a reference image with text modification.",
+         "cir_formula": "Formula: `q = normalize(image_embedding + lambda * text_embedding)`",
+         "footer": "Multimodal Search MVP | SigLIP 2 + FAISS + Flickr30k",
+         # Unified interface labels
+         "text_query_label": "Text Query",
+         "text_query_info": "Text only: text search | Text + Image: composed search (CIR)",
+         "click_to_select": "Click any result image to use as reference",
+         "search_mode_text": "Mode: Text Search",
+         "search_mode_image": "Mode: Image Search",
+         "search_mode_composed": "Mode: Composed Search (CIR)",
+         "search_mode_none": "Enter text or upload an image to search",
+         "clear_image": "Clear Image",
+     },
+     "ko": {
+         "title": "멀티모달 검색",
+         "subtitle": "텍스트, 이미지 또는 합성 쿼리(CIR)를 사용하여 Flickr30k 이미지를 검색합니다.",
+         "text_search": "텍스트 검색",
+         "image_search": "이미지 검색",
+         "composed_search": "합성 검색",
+         "top_k": "결과 개수",
+         "top_k_info": "반환할 결과 수",
+         "search_query": "검색어",
+         "search_query_placeholder": "예: 해변을 달리는 강아지",
+         "text_search_desc": "텍스트 쿼리를 입력하여 일치하는 이미지를 찾습니다.",
+         "search_results": "검색 결과",
+         "query_image": "쿼리 이미지",
+         "image_search_desc": "이미지를 업로드하여 유사한 이미지를 찾습니다.",
+         "similar_images": "유사 이미지",
+         "reference_image": "참조 이미지",
+         "modification_text": "수정 텍스트",
+         "modification_placeholder": "예: 빨간색으로, 해변에서, 강아지와 함께",
+         "lambda_weight": "람다 가중치",
+         "lambda_info": "< 1: 이미지 | 1: 균형 | > 1: 텍스트",
+         "composed_results": "합성 검색 결과",
+         "cir_desc": "**합성 이미지 검색 (CIR)**은 참조 이미지와 텍스트 수정을 결합합니다.",
+         "cir_formula": "공식: `q = normalize(이미지_임베딩 + lambda * 텍스트_임베딩)`",
+         "footer": "멀티모달 검색 MVP | SigLIP 2 + FAISS + Flickr30k",
+         # Unified interface labels
+         "text_query_label": "텍스트 쿼리",
+         "text_query_info": "텍스트만: 텍스트 검색 | 텍스트 + 이미지: 합성 검색 (CIR)",
+         "click_to_select": "결과 이미지를 클릭하면 참조 이미지로 설정됩니다",
+         "search_mode_text": "모드: 텍스트 검색",
+         "search_mode_image": "모드: 이미지 검색",
+         "search_mode_composed": "모드: 합성 검색 (CIR)",
+         "search_mode_none": "텍스트를 입력하거나 이미지를 업로드하세요",
+         "clear_image": "이미지 초기화",
+     },
+ }
+
+
+ def L(key: str, lang: str = "en") -> str:
+     """Get localized label.
+
+     Args:
+         key: Label key to look up.
+         lang: Language code ('en' or 'ko').
+
+     Returns:
+         Localized string, or key if not found.
+     """
+     return LABELS.get(lang, LABELS["en"]).get(key, key)
+
+
+ @spaces.GPU
+ def get_search_engine() -> MultimodalSearch:
+     """Get or create the search engine (lazy loading with GPU).
+
+     Returns:
+         MultimodalSearch instance.
+     """
+     global _search_engine
+
+     if _search_engine is None:
+         from core.embeddings import EmbeddingModel
+         from core.index import FaissIndex
+         from core.search import MultimodalSearch
+
+         # Initialize embedding model with GPU
+         device = "cuda"
+         embedding_model = EmbeddingModel(device=device)
+
+         # Load FAISS index
+         index = FaissIndex(device=device)
+         index_path = EMBEDDINGS_DIR / "image_index"
+         index.load(index_path)
+
+         # Create search engine
+         _search_engine = MultimodalSearch(
+             embedding_model=embedding_model,
+             index=index,
+             default_lambda=1.0,
+         )
+
+     return _search_engine
+
+
+ def get_image_path(filename: str) -> str | None:
+     """Get the full path to a Flickr30k image.
+
+     Note: This function is kept for backwards compatibility.
+     The actual image retrieval is done via get_image_by_index().
+
+     Args:
+         filename: Image filename (e.g., "1000092795.jpg").
+
+     Returns:
+         The filename itself (used as a key for image lookup).
+     """
+     return filename
+
+
+ def get_image_by_index(index: int) -> PILImage.Image | None:
+     """Get a Flickr30k image by its index.
+
+     Args:
+         index: Index in the dataset (0-31013).
+
+     Returns:
+         PIL Image or None if not found.
+     """
+     try:
+         dataset = get_flickr30k_dataset()
+         if 0 <= index < len(dataset):
+             return dataset[index]["image"]
+         return None
+     except Exception:
+         return None
+
+
+ def format_results(results: list[dict]) -> list[tuple[PILImage.Image | str, str]]:
+     """Format search results for Gradio Gallery.
+
+     Args:
+         results: List of result dicts from MultimodalSearch.
+
+     Returns:
+         List of (image, caption) tuples for gr.Gallery.
+         Images are PIL Image objects or paths.
+     """
+     if not results:
+         return []
+
+     formatted = []
+     for result in results:
+         index = result.get("index", -1)
+         image = get_image_by_index(index)
+
+         if image is None:
+             continue
+
+         # Get first caption and score
+         captions = result.get("captions", [])
+         score = result.get("score", 0.0)
+
+         # Format caption with score
+         caption = captions[0] if captions else "No caption"
+         caption_with_score = f"[{score:.3f}] {caption}"
+
+         formatted.append((image, caption_with_score))
+
+     return formatted
+
+
+ @spaces.GPU
+ def search_by_text_handler(query: str, top_k: int) -> list[tuple[PILImage.Image | str, str]]:
+     """Handle text search requests with GPU acceleration.
+
+     Args:
+         query: Text query string.
+         top_k: Number of results to return.
+
+     Returns:
+         List of (image, caption) tuples.
+     """
+     if not query or not query.strip():
+         return []
+
+     try:
+         engine = get_search_engine()
+         results = engine.search_by_text(query.strip(), k=int(top_k))
+         return format_results(results)
+     except Exception as e:
+         raise gr.Error(f"Search failed: {e}")
+
+
+ @spaces.GPU
+ def search_by_image_handler(
+     image: PILImage.Image | None, top_k: int
+ ) -> list[tuple[PILImage.Image | str, str]]:
+     """Handle image search requests with GPU acceleration.
+
+     Args:
+         image: Query image (PIL Image).
+         top_k: Number of results to return.
+
+     Returns:
+         List of (image, caption) tuples.
+     """
+     if image is None:
+         return []
+
+     try:
+         engine = get_search_engine()
+         results = engine.search_by_image(image, k=int(top_k))
+         return format_results(results)
+     except Exception as e:
+         raise gr.Error(f"Search failed: {e}")
+
+
+ @spaces.GPU
+ def search_composed_handler(
+     image: PILImage.Image | None,
+     modification_text: str,
+     top_k: int,
+     lambda_weight: float,
+ ) -> list[tuple[PILImage.Image | str, str]]:
+     """Handle composed image retrieval requests with GPU acceleration.
+
+     Args:
+         image: Reference image (PIL Image).
+         modification_text: Text describing desired modification.
+         top_k: Number of results to return.
+         lambda_weight: Weight for text embedding in CIR.
+
+     Returns:
+         List of (image, caption) tuples.
+     """
+     if image is None:
+         return []
+
+     if not modification_text or not modification_text.strip():
+         return []
+
+     try:
+         engine = get_search_engine()
+         results = engine.search_composed(
+             image,
+             modification_text.strip(),
+             k=int(top_k),
+             lambda_weight=float(lambda_weight),
+         )
+         return format_results(results)
+     except Exception as e:
+         raise gr.Error(f"Search failed: {e}")
+
+
+ def get_mode_label(mode: str, lang: str = "en") -> str:
+     """Get localized label for search mode."""
+     mode_labels = {
+         "text": "search_mode_text",
+         "image": "search_mode_image",
+         "composed": "search_mode_composed",
+         "none": "search_mode_none",
+     }
+     label_key = mode_labels.get(mode, "search_mode_none")
+     return L(label_key, lang)
+
+
+ def get_random_samples(top_k: int = 10) -> list[tuple[PILImage.Image | str, str]]:
+     """Get random sample images for initial display.
+
+     Args:
+         top_k: Number of random samples to return.
+
+     Returns:
+         List of (image, caption) tuples.
+     """
+     import random
+
+     try:
+         dataset = get_flickr30k_dataset()
+         total = len(dataset)
+         indices = random.sample(range(total), min(top_k, total))
+
+         samples = []
+         for idx in indices:
+             try:
+                 item = dataset[idx]
+                 image = item["image"]
+                 captions = item.get("captions", item.get("caption", []))
+                 caption = captions[0] if captions else "No caption"
+                 samples.append((image, caption))
+             except Exception:
+                 continue
+
+         return samples
+     except Exception as e:
+         print(f"Error loading samples: {e}")
+         return []
+
+
+ @spaces.GPU
+ def unified_search_handler(
+     text_query: str,
+     image: PILImage.Image | None,
+     top_k: int,
+     lambda_weight: float,
+ ) -> tuple[list[tuple[PILImage.Image | str, str]], str]:
+     """Handle unified search requests with GPU acceleration.
+
+     Automatically determines search mode based on inputs:
+     - Text only: text_search
+     - Image only: image_search
+     - Text + Image: composed_search (CIR)
+
+     Args:
+         text_query: Text query string.
+         image: Reference image (PIL Image) or None.
+         top_k: Number of results to return.
+         lambda_weight: Weight for text embedding in CIR.
+
+     Returns:
+         Tuple of (gallery_results, search_mode_indicator).
+     """
+     has_text = text_query and text_query.strip()
+     has_image = image is not None
+
+     # Determine search mode
+     if has_text and has_image:
+         mode = "composed"
+     elif has_text:
+         mode = "text"
+     elif has_image:
+         mode = "image"
+     else:
+         mode = "none"
+         return [], mode
+
+     # Execute search with unified error handling
+     results: list = []
+     try:
+         engine = get_search_engine()
+         if mode == "composed":
+             results = engine.search_composed(
+                 image,
+                 text_query.strip(),
+                 k=int(top_k),
+                 lambda_weight=float(lambda_weight),
+             )
+         elif mode == "text":
+             results = engine.search_by_text(text_query.strip(), k=int(top_k))
+         elif mode == "image":
+             results = engine.search_by_image(image, k=int(top_k))
+     except Exception as e:
+         raise gr.Error(f"Search failed: {e}")
+
+     gallery = format_results(results)
+     return gallery, mode
+
+
+ def on_gallery_select(
+     evt: gr.SelectData,
+     gallery_data: list[tuple[PILImage.Image | str, str]],
+ ) -> PILImage.Image | None:
+     """Handle gallery selection to set reference image.
+
+     Args:
+         evt: Gradio SelectData event containing selection index.
+         gallery_data: Current gallery data.
+
+     Returns:
+         Selected PIL Image or None.
+     """
+     if gallery_data is None or len(gallery_data) == 0:
+         return None
+
+     try:
+         idx = evt.index
+         if 0 <= idx < len(gallery_data):
+             image_data = gallery_data[idx]
+             # Gallery data is (image, caption) tuple
+             if isinstance(image_data, tuple):
+                 image = image_data[0]
+                 if isinstance(image, PILImage.Image):
+                     return image
+         return None
+     except Exception:
+         return None
+
+
+ def create_app(lang: str = "en") -> gr.Blocks:
+     """Create and configure the Gradio application.
+
+     Args:
+         lang: Language code ('en' or 'ko').
+
+     Returns:
+         Gradio Blocks application.
+     """
+     theme = RefinedTheme()
+     with gr.Blocks(title=L("title", lang), theme=theme, css=css) as app:
+         # State for gallery data (used by gallery select handler)
+         gallery_state = gr.State([])
+
+         with gr.Column(elem_id="col-container"):
+             # Header: 3-column grid [logo] [title - centered] [controls - right]
+             with gr.Row(elem_id="header-row"):
+                 # Left: Logo
+                 gr.Image(
+                     value="assets/logo.png",
+                     show_label=False,
+                     show_download_button=False,
+                     show_fullscreen_button=False,
+                     interactive=False,
+                     height=80,
+                     width=80,
+                     elem_id="header-logo",
+                 )
+                 # Center: Title (screen-centered via CSS Grid)
+                 gr.Markdown(
+                     f"# {L('title', lang)}\n{L('subtitle', lang)}",
+                     elem_id="main-title",
+                 )
+                 # Right: Controls (single HTML for horizontal layout)
+                 gr.HTML(value=HEADER_CONTROLS_HTML)
+
+             # Main content area - unified interface
+             with gr.Row(elem_id="main-content"):
+                 # Left panel: Input controls
+                 with gr.Column(scale=1, elem_id="input-panel"):
+                     # Reference Image
+                     ref_image = gr.Image(
+                         label=L("reference_image", lang),
+                         type="pil",
+                         height=200,
+                         elem_id="ref-image-container",
+                     )
+
+                     # Clear image button
+                     clear_btn = gr.Button(
+                         L("clear_image", lang),
+                         variant="secondary",
+                         size="sm",
+                         elem_id="clear-image-btn",
+                     )
+
+                     # Text query
+                     text_query = gr.Textbox(
+                         label=L("text_query_label", lang),
+                         placeholder=L("search_query_placeholder", lang),
+                         lines=2,
+                     )
+
+                     # Sliders group
+                     with gr.Group(elem_classes=["slider-group"]):
+                         lambda_slider = gr.Slider(
+                             minimum=0.3,
+                             maximum=2.0,
+                             value=1.0,
+                             step=0.1,
+                             label=L("lambda_weight", lang),
+                             info=L("lambda_info", lang),
+                         )
+
+                         top_k_slider = gr.Slider(
+                             minimum=1,
+                             maximum=50,
+                             value=10,
+                             step=1,
+                             label=L("top_k", lang),
+                             info=L("top_k_info", lang),
+                         )
+
+                 # Right panel: Results gallery
+                 with gr.Column(scale=3, elem_id="results-panel"):
+                     # Search mode indicator
+                     mode_indicator = gr.Markdown(
+                         f"<div id='search-mode-indicator' class='mode-none'>"
+                         f"{L('search_mode_none', lang)}</div>",
+                         elem_id="mode-indicator-container",
+                     )
+
+                     # Click hint
+                     gr.Markdown(
+                         f"<p class='click-hint'>{L('click_to_select', lang)}</p>",
+                     )
+
+                     # Results gallery
+                     results_gallery = gr.Gallery(
+                         label=L("search_results", lang),
+                         show_label=True,
+                         columns=5,
+                         rows=4,
+                         height="auto",
+                         object_fit="cover",
+                         elem_classes=["gallery-container"],
+                         allow_preview=True,
+                     )
+
+         # Helper function to run unified search and update state
+         def search_and_update_state(
+             text_query: str,
+             image: PILImage.Image | None,
+             top_k: int,
+             lambda_weight: float,
+         ) -> tuple[list, list, str]:
+             """Run search and return gallery data, state, and mode indicator."""
+             gallery, mode = unified_search_handler(
+                 text_query, image, top_k, lambda_weight
+             )
+             mode_html = (
+                 f"<div id='search-mode-indicator' class='mode-{mode}'>"
+                 f"{get_mode_label(mode, lang)}</div>"
+             )
+             return gallery, gallery, mode_html
+
+         # Real-time search on any input change
+         search_inputs = [text_query, ref_image, top_k_slider, lambda_slider]
+         search_outputs = [results_gallery, gallery_state, mode_indicator]
+
+         for input_component in [text_query, ref_image, lambda_slider, top_k_slider]:
+             input_component.change(
+                 fn=search_and_update_state,
+                 inputs=search_inputs,
+                 outputs=search_outputs,
+                 show_progress="hidden",
+             )
+
+         # Clear image button
+         clear_btn.click(
+             fn=lambda: None,
+             inputs=[],
+             outputs=[ref_image],
+         )
+
+         # Load initial samples on app start
+         def load_initial_samples(top_k: int) -> tuple[list, list]:
+             """Load random samples for initial display."""
+             samples = get_random_samples(int(top_k))
+             return samples, samples
+
+         app.load(
+             fn=load_initial_samples,
+             inputs=[top_k_slider],
+             outputs=[results_gallery, gallery_state],
+         )
+
+         # Initialize theme on page load
+         app.load(fn=None, js=INIT_THEME_JS)
+
+         # Initialize language from URL query parameter
+         app.load(fn=None, js=INIT_LANG_JS)
+
+     return app
+
+
+ def get_default_lang() -> str:
+     """Get default language from environment or URL."""
+     return os.environ.get("APP_LANG", "en")
+
+
+ # Create the demo instance for module-level access
+ demo = create_app(get_default_lang())
+
+
+ if __name__ == "__main__":
+     lang = get_default_lang()
+     app = create_app(lang)
+     app.queue(max_size=30).launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         show_error=True,
+     )
assets/logo.png ADDED
core/__init__.py ADDED
@@ -0,0 +1,7 @@
+ """Core module for multimodal search MVP."""
+
+ from core.embeddings import EmbeddingModel
+ from core.index import FaissIndex
+ from core.search import MultimodalSearch
+
+ __all__ = ["EmbeddingModel", "FaissIndex", "MultimodalSearch"]
core/embeddings.py ADDED
@@ -0,0 +1,177 @@
+ """SigLIP 2 embedding model wrapper."""
+
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING
+
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ from tqdm import tqdm
+ from transformers import AutoModel, AutoProcessor
+
+ if TYPE_CHECKING:
+     from PIL import Image
+
+
+ class EmbeddingModel:
+     """SigLIP 2 embedding model for text and image encoding.
+
+     Model: google/siglip2-so400m-patch14-384
+     Dimension: 1152
+     """
+
+     MODEL_ID = "google/siglip2-so400m-patch14-384"
+     EMBEDDING_DIM = 1152
+
+     def __init__(self, device: str = "cpu") -> None:
+         """Initialize the embedding model.
+
+         Args:
+             device: Device to run the model on ('cpu' or 'cuda').
+         """
+         self.device = device
+         self.model = None
+         self.processor = None
+
+     def load(self) -> None:
+         """Load the model and processor."""
+         self.processor = AutoProcessor.from_pretrained(self.MODEL_ID)
+         self.model = AutoModel.from_pretrained(self.MODEL_ID)
+         self.model.to(self.device)
+         # Set model to evaluation mode (disable dropout, etc.)
+         self.model.eval()
+
+     def _ensure_loaded(self) -> None:
+         """Ensure model is loaded before inference."""
+         if self.model is None or self.processor is None:
+             self.load()
+
+     def encode_image(self, image: Image.Image) -> np.ndarray:
+         """Encode a single image to embedding vector.
+
+         Args:
+             image: PIL Image to encode.
+
+         Returns:
+             Normalized embedding vector of shape (1152,).
+         """
+         self._ensure_loaded()
+
+         inputs = self.processor(images=image, return_tensors="pt")
+         inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+         with torch.no_grad():
+             features = self.model.get_image_features(**inputs)
+             features = F.normalize(features, dim=-1)
+
+         return features.cpu().numpy().squeeze(0)
+
+     def encode_images(
+         self,
+         images: list[Image.Image],
+         batch_size: int = 32,
+         show_progress: bool = True,
+     ) -> np.ndarray:
+         """Encode multiple images to embedding vectors.
+
+         Args:
+             images: List of PIL Images to encode.
+             batch_size: Batch size for processing.
+             show_progress: Show progress bar.
+
+         Returns:
+             Normalized embedding vectors of shape (N, 1152).
+         """
+         if not images:
+             return np.empty((0, self.EMBEDDING_DIM), dtype=np.float32)
+
+         self._ensure_loaded()
+
+         all_embeddings = []
+         iterator = range(0, len(images), batch_size)
+         if show_progress:
+             iterator = tqdm(iterator, desc="Encoding images", unit="batch")
+
+         for i in iterator:
+             batch_images = images[i : i + batch_size]
+             inputs = self.processor(images=batch_images, return_tensors="pt")
+             inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+             with torch.no_grad():
+                 features = self.model.get_image_features(**inputs)
+                 features = F.normalize(features, dim=-1)
+
+             all_embeddings.append(features.cpu().numpy())
+
+         return np.concatenate(all_embeddings, axis=0)
+
+     def encode_text(self, text: str) -> np.ndarray:
+         """Encode a single text to embedding vector.
+
+         Args:
+             text: Text string to encode.
+
+         Returns:
+             Normalized embedding vector of shape (1152,).
+         """
+         self._ensure_loaded()
+
+         # SigLIP requires padding="max_length" as trained
+         inputs = self.processor(
+             text=text,
+             padding="max_length",
+             truncation=True,
+             return_tensors="pt",
+         )
+         inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+         with torch.no_grad():
+             features = self.model.get_text_features(**inputs)
+             features = F.normalize(features, dim=-1)
+
+         return features.cpu().numpy().squeeze(0)
+
+     def encode_texts(
+         self,
+         texts: list[str],
+         batch_size: int = 32,
+         show_progress: bool = True,
+     ) -> np.ndarray:
+         """Encode multiple texts to embedding vectors.
+
+         Args:
+             texts: List of text strings to encode.
+             batch_size: Batch size for processing.
+             show_progress: Show progress bar.
+
+         Returns:
+             Normalized embedding vectors of shape (N, 1152).
+         """
+         if not texts:
+             return np.empty((0, self.EMBEDDING_DIM), dtype=np.float32)
+
+         self._ensure_loaded()
+
+         all_embeddings = []
+         iterator = range(0, len(texts), batch_size)
+         if show_progress:
+             iterator = tqdm(iterator, desc="Encoding texts", unit="batch")
+
+         for i in iterator:
+             batch_texts = texts[i : i + batch_size]
+             inputs = self.processor(
+                 text=batch_texts,
+                 padding="max_length",
+                 truncation=True,
+                 return_tensors="pt",
+             )
+             inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+             with torch.no_grad():
+                 features = self.model.get_text_features(**inputs)
+                 features = F.normalize(features, dim=-1)
+
+             all_embeddings.append(features.cpu().numpy())
+
+         return np.concatenate(all_embeddings, axis=0)
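Usage of the wrapper is straightforward; a small sketch (the model weights download on the first encode call, and `example.jpg` and the query string are illustrative placeholders):

```python
from PIL import Image

from core.embeddings import EmbeddingModel

model = EmbeddingModel(device="cpu")  # lazy: weights load on first encode
text_vec = model.encode_text("a dog running on the beach")  # shape (1152,)
image_vec = model.encode_image(Image.open("example.jpg"))   # shape (1152,)

# Both vectors are L2-normalized, so their dot product is cosine similarity.
similarity = float(text_vec @ image_vec)
```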
core/index.py ADDED
@@ -0,0 +1,190 @@
+ """FAISS index wrapper for efficient similarity search."""
+
+ from __future__ import annotations
+
+ import json
+ from pathlib import Path
+ from typing import TYPE_CHECKING
+
+ import faiss
+ import numpy as np
+
+ if TYPE_CHECKING:
+     pass
+
+
+ class FaissIndex:
+     """FAISS IndexFlatIP wrapper for normalized vector search.
+
+     Uses Inner Product (IP) for cosine similarity with pre-normalized vectors.
+     """
+
+     def __init__(self, dimension: int = 1152, device: str = "cpu") -> None:
+         """Initialize FAISS index.
+
+         Args:
+             dimension: Embedding dimension (1152 for SigLIP 2).
+             device: Device to use ('cpu' or 'cuda').
+         """
+         self.dimension = dimension
+         self.device = device
+         self.index: faiss.Index | None = None
+         self.metadata: list[dict] = []
+         self._gpu_resources: faiss.StandardGpuResources | None = None
+
+     def _cleanup_gpu_resources(self) -> None:
+         """Release GPU resources to prevent memory leaks."""
+         if self._gpu_resources is not None:
+             self._gpu_resources = None
+             self.index = None
+             import gc
+             gc.collect()
+
+     def __del__(self) -> None:
+         """Destructor to clean up GPU resources."""
+         self._cleanup_gpu_resources()
+
+     def build(self, embeddings: np.ndarray, metadata: list[dict] | None = None) -> None:
+         """Build the index from embeddings.
+
+         Args:
+             embeddings: Normalized embedding vectors of shape (N, dimension).
+             metadata: Optional list of metadata dicts for each embedding.
+
+         Raises:
+             ValueError: If embeddings are empty, wrong dimension, or metadata mismatch.
+         """
+         if embeddings.size == 0:
+             raise ValueError("Cannot build index from empty embeddings")
+
+         if embeddings.shape[1] != self.dimension:
+             raise ValueError(
+                 f"Embedding dimension {embeddings.shape[1]} does not match "
+                 f"index dimension {self.dimension}"
+             )
+
+         if metadata is not None and len(metadata) != len(embeddings):
+             raise ValueError(
+                 f"Metadata length {len(metadata)} does not match "
+                 f"embeddings count {len(embeddings)}"
+             )
+
+         # Convert to float32 if needed (FAISS requirement)
+         if embeddings.dtype != np.float32:
+             embeddings = embeddings.astype(np.float32)
+
+         # Clean up existing GPU resources before rebuilding
+         self._cleanup_gpu_resources()
+
+         # Create IndexFlatIP for inner product (cosine similarity with normalized vectors)
+         self.index = faiss.IndexFlatIP(self.dimension)
+
+         # Move to GPU if requested
+         if self.device == "cuda":
+             self._gpu_resources = faiss.StandardGpuResources()
+             self.index = faiss.index_cpu_to_gpu(self._gpu_resources, 0, self.index)
+
+         # Add embeddings to index
+         self.index.add(embeddings)
+
+         # Store metadata
+         self.metadata = metadata if metadata is not None else [{} for _ in range(len(embeddings))]
+
+     def search(
+         self, query: np.ndarray, k: int = 10
+     ) -> tuple[np.ndarray, np.ndarray, list[dict]]:
+         """Search for k nearest neighbors.
+
+         Args:
+             query: Query embedding of shape (1, dimension) or (dimension,).
+             k: Number of results to return.
+
+         Returns:
+             Tuple of (scores, indices, metadata_list).
+
+         Raises:
+             ValueError: If index is not built.
+         """
+         if self.index is None:
+             raise ValueError("Index not built. Call build() first.")
+
+         # Reshape 1D query to 2D
+         if query.ndim == 1:
+             query = query.reshape(1, -1)
+
+         # Convert to float32 if needed
+         if query.dtype != np.float32:
+             query = query.astype(np.float32)
+
+         # Limit k to index size
+         k = min(k, self.index.ntotal)
+
+         # Perform search
+         scores, indices = self.index.search(query, k)
+
+         # Flatten results (single query)
+         scores = scores[0]
+         indices = indices[0]
+
+         # Get metadata for results
+         result_metadata = [self.metadata[idx] for idx in indices]
+
+         return scores, indices, result_metadata
+
+     def save(self, path: str | Path) -> None:
+         """Save index and metadata to disk.
+
+         Args:
+             path: Path to save the index (without extension).
+                 Creates {path}.faiss and {path}.json files.
+
+         Raises:
+             ValueError: If index is not built.
+         """
+         if self.index is None:
+             raise ValueError("Index not built. Call build() first.")
+
+         path = Path(path)
+         path.parent.mkdir(parents=True, exist_ok=True)
+
+         # Convert GPU index to CPU before saving
+         index_to_save = self.index
+         if self.device == "cuda":
+             index_to_save = faiss.index_gpu_to_cpu(self.index)
+
+         # Save FAISS index
+         faiss.write_index(index_to_save, str(path.with_suffix(".faiss")))
+
+         # Save metadata as JSON
+         with open(path.with_suffix(".json"), "w", encoding="utf-8") as f:
+             json.dump(
+                 {"dimension": self.dimension, "metadata": self.metadata},
+                 f,
+                 ensure_ascii=False,
+             )
+
+     def load(self, path: str | Path) -> None:
+         """Load index and metadata from disk.
+
+         Args:
+             path: Path to load the index from (without extension).
+                 Expects {path}.faiss and {path}.json files.
+         """
+         path = Path(path)
+
+         # Clean up existing GPU resources before loading
+         self._cleanup_gpu_resources()
+
+         # Load FAISS index
+         self.index = faiss.read_index(str(path.with_suffix(".faiss")))
+
+         # Move to GPU if requested
+         if self.device == "cuda":
+             self._gpu_resources = faiss.StandardGpuResources()
+             self.index = faiss.index_cpu_to_gpu(self._gpu_resources, 0, self.index)
+
+         # Load metadata
+         with open(path.with_suffix(".json"), encoding="utf-8") as f:
+             data = json.load(f)
+         self.dimension = data["dimension"]
+         self.metadata = data["metadata"]
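A quick round-trip sketch of this wrapper with random unit vectors (purely illustrative data; a vector queried against itself should score ~1.0 under cosine/IP):

```python
import numpy as np

from core.index import FaissIndex

rng = np.random.default_rng(0)
vecs = rng.standard_normal((100, 1152)).astype(np.float32)
vecs /= np.linalg.norm(vecs, axis=1, keepdims=True)  # normalize so IP == cosine

index = FaissIndex(dimension=1152, device="cpu")
index.build(vecs, metadata=[{"i": i} for i in range(100)])

scores, indices, meta = index.search(vecs[0], k=5)
assert indices[0] == 0 and abs(scores[0] - 1.0) < 1e-5  # self-match scores ~1.0
```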
core/search.py ADDED
@@ -0,0 +1,146 @@
+ """Multimodal search logic for Text, Image, and Composed queries."""
+
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING
+
+ import numpy as np
+
+ if TYPE_CHECKING:
+     from PIL import Image
+
+     from core.embeddings import EmbeddingModel
+     from core.index import FaissIndex
+
+
+ class MultimodalSearch:
+     """Unified search interface for Text, Image, and CIR (Composed Image Retrieval).
+
+     CIR Zero-shot formula: q = normalize(img_emb + lambda * text_emb)
+     """
+
+     def __init__(
+         self,
+         embedding_model: EmbeddingModel,
+         index: FaissIndex,
+         default_lambda: float = 1.0,
+     ) -> None:
+         """Initialize multimodal search.
+
+         Args:
+             embedding_model: Loaded EmbeddingModel instance.
+             index: Loaded FaissIndex instance.
+             default_lambda: Default weight for text embedding in CIR.
+         """
+         self.embedding_model = embedding_model
+         self.index = index
+         self.default_lambda = default_lambda
+
+     def _format_results(
+         self,
+         scores: np.ndarray,
+         indices: np.ndarray,
+         metadata_list: list[dict],
+     ) -> list[dict]:
+         """Format FAISS search results into list of result dicts.
+
+         Args:
+             scores: Score array from FAISS.
+             indices: Index array from FAISS.
+             metadata_list: Metadata list from FAISS.
+
+         Returns:
+             List of result dicts with 'score', 'index', and metadata fields.
+         """
+         if len(scores) == 0:
+             return []
+
+         results = []
+         for score, idx, meta in zip(scores, indices, metadata_list):
+             result = {
+                 "score": float(score),
+                 "index": int(idx),
+             }
+             result.update(meta)
+             results.append(result)
+         return results
+
+     def search_by_text(self, query: str, k: int = 10) -> list[dict]:
+         """Search images by text query.
+
+         Args:
+             query: Text query string.
+             k: Number of results to return.
+
+         Returns:
+             List of result dicts with 'score', 'index', and metadata.
+         """
+         text_embedding = self.embedding_model.encode_text(query)
+         scores, indices, metadata_list = self.index.search(text_embedding, k=k)
+         return self._format_results(scores, indices, metadata_list)
+
+     def search_by_image(self, image: Image.Image, k: int = 10) -> list[dict]:
+         """Search similar images by reference image.
+
+         Args:
+             image: Query image.
+             k: Number of results to return.
+
+         Returns:
+             List of result dicts with 'score', 'index', and metadata.
+         """
+         image_embedding = self.embedding_model.encode_image(image)
+         scores, indices, metadata_list = self.index.search(image_embedding, k=k)
+         return self._format_results(scores, indices, metadata_list)
+
+     def search_composed(
+         self,
+         image: Image.Image,
+         modification_text: str,
+         k: int = 10,
+         lambda_weight: float | None = None,
+     ) -> list[dict]:
+         """Composed Image Retrieval: find images matching (image + text modification).
+
+         Uses zero-shot CIR: q = normalize(img_emb + lambda * text_emb)
+
+         Args:
+             image: Reference image.
+             modification_text: Text describing desired modification.
+             k: Number of results to return.
+             lambda_weight: Weight for text embedding (uses default if None).
+
+         Returns:
+             List of result dicts with 'score', 'index', and metadata.
+         """
+         if lambda_weight is None:
+             lambda_weight = self.default_lambda
+
+         image_embedding = self.embedding_model.encode_image(image)
+         text_embedding = self.embedding_model.encode_text(modification_text)
+         composed_query = self._compose_query(image_embedding, text_embedding, lambda_weight)
+
+         scores, indices, metadata_list = self.index.search(composed_query, k=k)
+         return self._format_results(scores, indices, metadata_list)
+
+     def _compose_query(
+         self,
+         image_emb: np.ndarray,
+         text_emb: np.ndarray,
+         lambda_weight: float,
+     ) -> np.ndarray:
+         """Compose query embedding from image and text.
+
+         Args:
+             image_emb: Image embedding vector.
+             text_emb: Text embedding vector.
+             lambda_weight: Weight for text embedding.
+
+         Returns:
+             Normalized composed query embedding.
+         """
+         composed = image_emb + lambda_weight * text_emb
+         norm = np.linalg.norm(composed)
+         if norm > 0:
+             composed = composed / norm
+         return composed
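Wiring the three pieces together follows `get_search_engine()` in `app.py`; a condensed sketch (the index path assumes the `data/embeddings/` files added below, and the query string is illustrative):

```python
from pathlib import Path

from core.embeddings import EmbeddingModel
from core.index import FaissIndex
from core.search import MultimodalSearch

index = FaissIndex(device="cpu")
index.load(Path("data/embeddings/image_index"))  # loads .faiss + .json pair

search = MultimodalSearch(EmbeddingModel(device="cpu"), index, default_lambda=1.0)
results = search.search_by_text("a dog running on the beach", k=5)

# CIR: reference image + text modification, weighted by lambda
# results = search.search_composed(pil_image, "on the beach", k=5, lambda_weight=1.0)
```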
data/embeddings/image_index.faiss ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:00fffaa18facc712fd49346af90954c47afa8ad7ec3453a782e0afe463e602e8
+ size 142912557
data/embeddings/image_index.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:48615896a09252765a532b70153cec8b35d2c537a16abdfa9a8b71e2a73079aa
+ size 12842559
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ # Core dependencies for HF Spaces
+ transformers>=4.49
+ torch>=2.0
+ faiss-cpu>=1.7.4
+ gradio>=5.0,<6.0
+ pillow>=10.0
+ numpy>=1.24
+ datasets>=2.14,<4.0
+ huggingface-hub>=0.20
+ tqdm>=4.65
+
+ # HF Spaces ZeroGPU
+ spaces