# TurkishCodeMan's Hugging Face Space — uploaded via huggingface_hub (revision 3f8c153, verified)
import gradio as gr
import torch
from safetensors.torch import load_file
from pathlib import Path
import json
from functools import lru_cache
# Import utilities
from models.model_loader import load_embed_model, load_rerank_model
from utils.search import search_embeddings, rerank_results
from utils.visualization import plot_results
# ============================================================================
# Data Loading Functions
# ============================================================================
@lru_cache(maxsize=1)
def load_metadata(metadata_path: str = "data/recipes/metadata.json") -> dict:
    """Parse the recipe metadata JSON file.

    Cached (maxsize=1) so repeated calls with the same path read the
    file from disk only once per process.
    """
    with open(metadata_path, "r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed
def get_recipes_and_paths(metadata_path: str = "data/recipes/metadata.json"):
    """Return (markdown texts, image paths, raw recipe dicts).

    Convenience wrapper over ``load_metadata`` that splits the recipe
    records into the parallel lists the search UI consumes.
    """
    print("📋 Loading metadata...")
    recipes = load_metadata(metadata_path)["recipes"]
    markdown_texts = [recipe["markdown"] for recipe in recipes]
    image_paths = [recipe["image_path"] for recipe in recipes]
    print(f"✅ Loaded {len(recipes)} recipes")
    return markdown_texts, image_paths, recipes
# ============================================================================
# Load Models & Data (once at startup)
# ============================================================================
print("🔄 Loading models...")
# Models are loaded once at module import so every Gradio request reuses the
# same instances instead of reloading them per query.
embed_model, embed_device = load_embed_model()
rerank_model, rerank_processor, rerank_device = load_rerank_model()

print("📦 Loading embeddings...")
# Each safetensors file holds one precomputed embedding tensor under a named key.
# Load Image embeddings
print(" - Loading image embeddings...")
image_embeddings = load_file("data/embeddings/image_embedding_500.safetensors")["image_embeddings_500"]
print(f" ✅ Image embeddings shape: {image_embeddings.shape}")
# Load Text embeddings
print(" - Loading text embeddings...")
text_embeddings = load_file("data/embeddings/text_embeddings_500.safetensors")["text_embeddings_500"]
print(f" ✅ Text embeddings shape: {text_embeddings.shape}")
# Load Image+Text embeddings
# NOTE(review): this tensor key lacks the "_500" suffix the other two files
# use — presumably it matches the exporter that produced the file; confirm
# if the embedding files are ever regenerated.
print(" - Loading image+text embeddings...")
image_text_embeddings = load_file("data/embeddings/image_text_embeddings_500.safetensors")["image_text_embeddings"]
print(f" ✅ Image+Text embeddings shape: {image_text_embeddings.shape}")

print("📋 Loading recipe data...")
# Parallel lists: markdown documents, image paths, and full recipe dicts.
all_recipes, all_image_paths, full_recipes = get_recipes_and_paths()

print("\n✅ System ready!")
print(f" 📚 Recipes loaded: {len(all_recipes)}")
print(f" 🖼️ Images loaded: {len(all_image_paths)}")
print(f" 📊 Embeddings:")
print(f" - Image: {image_embeddings.shape}")
print(f" - Text: {text_embeddings.shape}")
print(f" - Image+Text: {image_text_embeddings.shape}")
# ============================================================================
# Gradio Interface
# ============================================================================
def search_and_display(
    query: str,
    modality: str,
    use_rerank: bool,
    num_results: int = 3,
):
    """Run a recipe search and prepare outputs for the Gradio UI.

    Args:
        query: Free-text search query typed by the user.
        modality: One of "Image", "Text", "Image+Text" (the radio labels).
        use_rerank: Re-score top hits with the cross-encoder reranker.
            Skipped for the text-only modality (the reranker is
            vision-language and needs images).
        num_results: How many hits to display/return (slider value).

    Returns:
        Tuple of (PIL image or None, JSON-serializable results) matching
        the [output_image, output_json] Gradio outputs. The image is None
        for text-only searches and for empty queries.
    """
    if not query.strip():
        return None, {"error": "Please enter a search query"}

    # Pick the embedding matrix and accompanying documents per modality.
    if modality == "Image":
        embeddings = image_embeddings
        documents = [""] * len(all_image_paths)  # Empty for image-only
        search_modality = "image"
    elif modality == "Text":
        embeddings = text_embeddings
        documents = all_recipes
        search_modality = "text"
    else:  # Image+Text
        embeddings = image_text_embeddings
        documents = all_recipes
        search_modality = "image_text"

    print(f"\n🔍 Searching with modality: {modality}")
    print(f" Embeddings shape: {embeddings.shape}")
    print(f" Query: {query[:50]}...")

    # Initial dense retrieval over the precomputed embeddings.
    results = search_embeddings(
        query=query,
        query_image=None,
        model=embed_model,
        embeddings=embeddings,
        documents=documents,
        image_paths=all_image_paths,
        top_k=20,
        modality=search_modality,
    )
    print(f" ✅ Found {len(results)} results")

    # Optional reranking (only for image/image_text modalities).
    if use_rerank and modality != "Text":
        print(f" 🎯 Reranking top {min(10, len(results))} results...")
        results = rerank_results(
            query=query,
            results=results,
            rerank_model=rerank_model,
            rerank_processor=rerank_processor,
            device=rerank_device,
            top_k=min(10, len(results)),
        )
        print(f" ✅ Reranking complete")

    # Enrich the displayed hits with full recipe details, matched by image path.
    for result in results[:num_results]:
        img_path = result.get("image_path")
        if not img_path:
            continue
        try:
            idx = all_image_paths.index(img_path)
            result["recipe_details"] = {
                "name": full_recipes[idx]["name"],
                "description": full_recipes[idx]["description"][:200] + "...",
                "tags": full_recipes[idx]["tags"][:5],
                "ingredients_count": len(full_recipes[idx]["ingredients"]),
                "steps_count": len(full_recipes[idx]["steps"]),
            }
        except (ValueError, IndexError):
            # Path not found / malformed record — show the hit without details.
            pass

    if modality != "Text":
        output_img = plot_results(results, query, num_images=num_results)
        return output_img, results[:num_results]
    # Text-only: no image grid to draw. (The original computed a markdown
    # summary string here but never returned it; that dead code has been
    # removed — the JSON output panel already shows the ranked results.)
    return None, results[:num_results]
# ============================================================================
# Gradio UI
# ============================================================================
# Build the Gradio UI once at import time; `demo` is launched under __main__.
# Layout: left column = query controls, right column = image grid + JSON
# details, with example queries and a technical footer below.
with gr.Blocks(title="🍳 Multimodal Recipe Search", theme=gr.themes.Soft()) as demo:
    # Header / intro copy.
    gr.Markdown(
        """
# 🍳 Multimodal Recipe RAG System
Search **500 recipes** using text queries across images and documents.
**Three Search Modes:**
- 🖼️ **Image**: Visual similarity search (find similar-looking dishes)
- 📝 **Text**: Semantic text search (find by ingredients, instructions, reviews)
- 🎨 **Image+Text**: Combined multimodal search (best of both worlds)
Powered by **NVIDIA Nemotron Embed-VL** and **Rerank-VL** models.
"""
    )
    with gr.Row():
        # Left column: search controls (query, modality, rerank toggle, count).
        with gr.Column(scale=1):
            query_input = gr.Textbox(
                label="🔍 Search Query",
                placeholder="e.g., 'chocolate cake recipe', 'healthy breakfast', 'pasta with tomatoes'",
                lines=2
            )
            # Choices must match the string checks in search_and_display.
            modality_radio = gr.Radio(
                choices=["Image", "Text", "Image+Text"],
                value="Image+Text",  # ✅ Default to Image+Text
                label="📊 Search Modality",
                info="Choose how to search: visual, text, or combined"
            )
            rerank_check = gr.Checkbox(
                label="🎯 Use Reranking (Cross-Encoder)",
                value=True,
                info="Rerank top results for better accuracy (adds ~1-2s)"
            )
            num_results_slider = gr.Slider(
                minimum=1,
                maximum=5,
                value=3,
                step=1,
                label="📈 Number of Results to Display"
            )
            search_btn = gr.Button("🚀 Search", variant="primary", size="lg")
            # Add info box
            gr.Markdown(
                """
**💡 Tips:**
- Use **Image** for visual similarity (e.g., "desserts with chocolate")
- Use **Text** for ingredient/instruction search (e.g., "vegetarian pasta")
- Use **Image+Text** for best overall results
"""
            )
        # Right column: outputs fed by search_and_display.
        with gr.Column(scale=2):
            output_image = gr.Image(label="🖼️ Top Recipe Results", type="pil")
            output_json = gr.JSON(label="📋 Detailed Results")
    # Examples for each modality; columns mirror the `inputs` list order.
    with gr.Accordion("💡 Example Queries", open=False):
        gr.Examples(
            examples=[
                # Image searches
                ["recipes with steak", "Image", True, 3],
                ["chocolate desserts", "Image", True, 3],
                ["colorful salads", "Image", False, 4],
                # Text searches
                ["healthy breakfast ideas", "Text", False, 5],
                ["vegetarian meals with pasta", "Text", False, 3],
                ["quick dinner under 30 minutes", "Text", False, 4],
                # Image+Text searches (best results)
                ["creamy pasta dishes", "Image+Text", True, 3],
                ["spicy chicken recipes", "Image+Text", True, 3],
                ["fresh summer salads", "Image+Text", True, 4],
            ],
            inputs=[query_input, modality_radio, rerank_check, num_results_slider],
            label="Try these examples"
        )
    # Footer
    gr.Markdown(
        """
---
### 🔧 Technical Details
**Models:**
- 🔍 Embedding: [nvidia/llama-nemotron-embed-vl-1b-v2](https://huggingface.co/nvidia/llama-nemotron-embed-vl-1b-v2)
- 🎯 Reranking: [nvidia/llama-nemotron-rerank-vl-1b-v2](https://huggingface.co/nvidia/llama-nemotron-rerank-vl-1b-v2)
**Dataset:** [TurkishCodeMan/recipe-synthetic-images-10k](https://huggingface.co/datasets/TurkishCodeMan/recipe-synthetic-images-10k) (500 recipes)
**Embeddings:**
- Image: 2048-dim visual embeddings
- Text: 2048-dim semantic embeddings
- Image+Text: 2048-dim multimodal embeddings
"""
    )
    # Connect button: one click runs the search and fills both outputs.
    search_btn.click(
        fn=search_and_display,
        inputs=[query_input, modality_radio, rerank_check, num_results_slider],
        outputs=[output_image, output_json]
    )
# ============================================================================
# Launch
# ============================================================================
if __name__ == "__main__":
    # Bind on all interfaces at the standard Hugging Face Spaces port.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    demo.launch(**launch_options)