"""Gradio app for text-query recipe search over precomputed embeddings.

At import time this module loads the embedding and reranking models, three
precomputed embedding tensors (image, text, image+text) and the recipe
metadata, then builds a Gradio Blocks UI around ``search_and_display``.
"""

import json
from functools import lru_cache
from pathlib import Path

import gradio as gr
import torch
from safetensors.torch import load_file

# Import utilities
from models.model_loader import load_embed_model, load_rerank_model
from utils.search import search_embeddings, rerank_results
from utils.visualization import plot_results

# Number of candidates requested from the initial embedding search.
INITIAL_TOP_K = 20
# Upper bound on how many candidates are passed to the reranker.
RERANK_TOP_K = 10


# ============================================================================
# Data Loading Functions
# ============================================================================

@lru_cache(maxsize=1)
def load_metadata(metadata_path: str = "data/recipes/metadata.json") -> dict:
    """Load and parse the metadata JSON file.

    The result is cached (one entry), so repeated calls with the same path
    return the *same* dict object; callers should treat it as read-only.

    Args:
        metadata_path: Path to the metadata JSON file.

    Returns:
        The parsed JSON document.
    """
    with open(metadata_path, "r", encoding="utf-8") as f:
        return json.load(f)


def get_recipes_and_paths(metadata_path: str = "data/recipes/metadata.json"):
    """Load recipes, image paths, and full recipe records.

    Args:
        metadata_path: Path to the metadata JSON file.

    Returns:
        A 3-tuple ``(markdowns, image_paths, recipes)`` where ``recipes`` is
        the ``"recipes"`` list from the metadata and the other two lists hold
        each recipe's ``"markdown"`` and ``"image_path"`` value, in the same
        order.

    Raises:
        KeyError: If the metadata lacks ``"recipes"`` or a recipe lacks
            ``"markdown"`` / ``"image_path"``.
    """
    print("šŸ“‹ Loading metadata...")
    metadata = load_metadata(metadata_path)

    recipes = metadata["recipes"]
    all_recipes_markdown = [r["markdown"] for r in recipes]
    all_image_paths = [r["image_path"] for r in recipes]

    print(f"āœ… Loaded {len(recipes)} recipes")
    return all_recipes_markdown, all_image_paths, recipes


# ============================================================================
# Load Models & Data (once at startup)
# ============================================================================

print("šŸ”„ Loading models...")
embed_model, embed_device = load_embed_model()
rerank_model, rerank_processor, rerank_device = load_rerank_model()

print("šŸ“¦ Loading embeddings...")

# Load Image embeddings
print(" - Loading image embeddings...")
image_embeddings = load_file(
    "data/embeddings/image_embedding_500.safetensors"
)["image_embeddings_500"]
print(f" āœ… Image embeddings shape: {image_embeddings.shape}")

# Load Text embeddings
print(" - Loading text embeddings...")
text_embeddings = load_file(
    "data/embeddings/text_embeddings_500.safetensors"
)["text_embeddings_500"]
print(f" āœ… Text embeddings shape: {text_embeddings.shape}")

# Load Image+Text embeddings
print(" - Loading image+text embeddings...")
image_text_embeddings = load_file(
    "data/embeddings/image_text_embeddings_500.safetensors"
)["image_text_embeddings"]
print(f" āœ… Image+Text embeddings shape: {image_text_embeddings.shape}")

print("šŸ“‹ Loading recipe data...")
all_recipes, all_image_paths, full_recipes = get_recipes_and_paths()

print("\nāœ… System ready!")
print(f" šŸ“š Recipes loaded: {len(all_recipes)}")
print(f" šŸ–¼ļø Images loaded: {len(all_image_paths)}")
print(" šŸ“Š Embeddings:")
print(f" - Image: {image_embeddings.shape}")
print(f" - Text: {text_embeddings.shape}")
print(f" - Image+Text: {image_text_embeddings.shape}")


# ============================================================================
# Gradio Interface
# ============================================================================

def _select_modality(modality: str):
    """Return ``(embeddings, documents, search_modality)`` for a UI modality.

    Any value other than ``"Image"`` or ``"Text"`` is treated as Image+Text.
    """
    if modality == "Image":
        # Image-only search passes empty placeholder documents.
        return image_embeddings, [""] * len(all_image_paths), "image"
    if modality == "Text":
        return text_embeddings, all_recipes, "text"
    return image_text_embeddings, all_recipes, "image_text"


def _attach_recipe_details(results) -> None:
    """Add a ``"recipe_details"`` summary to each result, in place.

    Results are matched to recipes by ``"image_path"``. Results without an
    image path, or whose path is not found, are left unchanged (best effort).
    """
    for result in results:
        img_path = result.get("image_path")
        if not img_path:
            continue
        try:
            recipe = full_recipes[all_image_paths.index(img_path)]
        except (ValueError, IndexError):
            # Best effort: report the miss instead of silently ignoring it.
            print(f" Warning: no recipe metadata found for {img_path!r}")
            continue
        result["recipe_details"] = {
            "name": recipe["name"],
            "description": recipe["description"][:200] + "...",
            "tags": recipe["tags"][:5],
            "ingredients_count": len(recipe["ingredients"]),
            "steps_count": len(recipe["steps"]),
        }


def search_and_display(
    query: str,
    modality: str,
    use_rerank: bool,
    num_results: int = 3,
):
    """Run a recipe search and prepare the outputs for the Gradio UI.

    Args:
        query: Free-text search query. Empty/whitespace-only (or ``None``)
            queries short-circuit with an error payload.
        modality: ``"Image"``, ``"Text"`` or anything else (treated as
            ``"Image+Text"``); selects which embedding tensor is searched.
        use_rerank: Whether to rerank the initial candidates. Ignored when
            ``modality`` is ``"Text"``.
        num_results: Number of results to return / plot.

    Returns:
        A 2-tuple ``(image, payload)``. ``image`` is whatever
        ``plot_results`` returns for non-text modalities and ``None`` for
        ``"Text"`` or on an empty query. ``payload`` is the first
        ``num_results`` result dicts, or ``{"error": ...}`` on an empty query.
    """
    if not query or not query.strip():
        return None, {"error": "Please enter a search query"}

    # The value comes from a slider widget; make sure it is a valid slice
    # bound even if it arrives as a float.
    num_results = int(num_results)

    # Select embeddings and documents based on modality
    embeddings, documents, search_modality = _select_modality(modality)

    print(f"\nšŸ” Searching with modality: {modality}")
    print(f" Embeddings shape: {embeddings.shape}")
    print(f" Query: {query[:50]}...")

    # Initial search
    results = search_embeddings(
        query=query,
        query_image=None,
        model=embed_model,
        embeddings=embeddings,
        documents=documents,
        image_paths=all_image_paths,
        top_k=INITIAL_TOP_K,
        modality=search_modality,
    )
    print(f" āœ… Found {len(results)} results")

    # Optional reranking (only for image/image_text modalities)
    if use_rerank and modality != "Text":
        rerank_k = min(RERANK_TOP_K, len(results))
        print(f" šŸŽÆ Reranking top {rerank_k} results...")
        results = rerank_results(
            query=query,
            results=results,
            rerank_model=rerank_model,
            rerank_processor=rerank_processor,
            device=rerank_device,
            top_k=rerank_k,
        )
        print(" āœ… Reranking complete")

    # Add full recipe details to the results that will be displayed
    top_results = results[:num_results]
    _attach_recipe_details(top_results)

    # Text-only searches have nothing to plot; only the JSON payload is shown.
    if modality == "Text":
        return None, top_results

    output_img = plot_results(results, query, num_images=num_results)
    return output_img, top_results


# ============================================================================
# Gradio UI
# ============================================================================

with gr.Blocks(title="šŸ³ Multimodal Recipe Search", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # šŸ³ Multimodal Recipe RAG System

        Search **500 recipes** using text queries across images and documents.

        **Three Search Modes:**
        - šŸ–¼ļø **Image**: Visual similarity search (find similar-looking dishes)
        - šŸ“ **Text**: Semantic text search (find by ingredients, instructions, reviews)
        - šŸŽØ **Image+Text**: Combined multimodal search (best of both worlds)

        Powered by **NVIDIA Nemotron Embed-VL** and **Rerank-VL** models.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            query_input = gr.Textbox(
                label="šŸ” Search Query",
                placeholder="e.g., 'chocolate cake recipe', 'healthy breakfast', 'pasta with tomatoes'",
                lines=2,
            )

            modality_radio = gr.Radio(
                choices=["Image", "Text", "Image+Text"],
                value="Image+Text",  # Default to Image+Text
                label="šŸ“Š Search Modality",
                info="Choose how to search: visual, text, or combined",
            )

            rerank_check = gr.Checkbox(
                label="šŸŽÆ Use Reranking (Cross-Encoder)",
                value=True,
                info="Rerank top results for better accuracy (adds ~1-2s)",
            )

            num_results_slider = gr.Slider(
                minimum=1,
                maximum=5,
                value=3,
                step=1,
                label="šŸ“ˆ Number of Results to Display",
            )

            search_btn = gr.Button("šŸš€ Search", variant="primary", size="lg")

            # Add info box
            gr.Markdown(
                """
                **šŸ’” Tips:**
                - Use **Image** for visual similarity (e.g., "desserts with chocolate")
                - Use **Text** for ingredient/instruction search (e.g., "vegetarian pasta")
                - Use **Image+Text** for best overall results
                """
            )

        with gr.Column(scale=2):
            output_image = gr.Image(label="šŸ–¼ļø Top Recipe Results", type="pil")
            output_json = gr.JSON(label="šŸ“‹ Detailed Results")

    # Examples for each modality
    with gr.Accordion("šŸ’” Example Queries", open=False):
        gr.Examples(
            examples=[
                # Image searches
                ["recipes with steak", "Image", True, 3],
                ["chocolate desserts", "Image", True, 3],
                ["colorful salads", "Image", False, 4],
                # Text searches
                ["healthy breakfast ideas", "Text", False, 5],
                ["vegetarian meals with pasta", "Text", False, 3],
                ["quick dinner under 30 minutes", "Text", False, 4],
                # Image+Text searches (best results)
                ["creamy pasta dishes", "Image+Text", True, 3],
                ["spicy chicken recipes", "Image+Text", True, 3],
                ["fresh summer salads", "Image+Text", True, 4],
            ],
            inputs=[query_input, modality_radio, rerank_check, num_results_slider],
            label="Try these examples",
        )

    # Footer
    gr.Markdown(
        """
        ---
        ### šŸ”§ Technical Details

        **Models:**
        - šŸ” Embedding: [nvidia/llama-nemotron-embed-vl-1b-v2](https://huggingface.co/nvidia/llama-nemotron-embed-vl-1b-v2)
        - šŸŽÆ Reranking: [nvidia/llama-nemotron-rerank-vl-1b-v2](https://huggingface.co/nvidia/llama-nemotron-rerank-vl-1b-v2)

        **Dataset:** [TurkishCodeMan/recipe-synthetic-images-10k](https://huggingface.co/datasets/TurkishCodeMan/recipe-synthetic-images-10k) (500 recipes)

        **Embeddings:**
        - Image: 2048-dim visual embeddings
        - Text: 2048-dim semantic embeddings
        - Image+Text: 2048-dim multimodal embeddings
        """
    )

    # Connect button
    search_btn.click(
        fn=search_and_display,
        inputs=[query_input, modality_radio, rerank_check, num_results_slider],
        outputs=[output_image, output_json],
    )


# ============================================================================
# Launch
# ============================================================================

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
    )