Spaces:
Runtime error
Runtime error
| """ | |
| Complete Retrieval Setup Script | |
| Demonstrates how to set up the full retrieval pipeline with embeddings and vector store. | |
| """ | |
| from pathlib import Path | |
| import sys | |
| import os | |
| # Add src to path - try multiple strategies for compatibility | |
| current_file = Path(__file__).resolve() | |
| src_dir = current_file.parent | |
| if str(src_dir) not in sys.path: | |
| sys.path.insert(0, str(src_dir)) | |
| # Also try from app directory for HuggingFace Spaces | |
| app_src_dir = Path.cwd() / "src" | |
| if app_src_dir.exists() and str(app_src_dir) not in sys.path: | |
| sys.path.insert(0, str(app_src_dir)) | |
| from loader.ingest import load_upb_documents | |
| from processing.chunking import chunk_documents | |
| from embeddings.embeddings import get_embeddings | |
| from vectorstore.store import VectorStoreManager | |
| from retrieval.retriever import UPBRetriever | |
| def setup_retrieval_system( | |
| vectorstore_path: str = "vectorstore/faiss_index", | |
| use_existing: bool = True, | |
| chunk_size: int = 1000, | |
| chunk_overlap: int = 200, | |
| ): | |
| """ | |
| Set up complete retrieval system with embeddings and vector store. | |
| Args: | |
| vectorstore_path: Path to save/load FAISS index | |
| use_existing: If True and vectorstore exists, load it. Otherwise create new. | |
| chunk_size: Size of document chunks | |
| chunk_overlap: Overlap between chunks | |
| Returns: | |
| Tuple of (UPBRetriever, VectorStoreManager, chunks) | |
| """ | |
| print("=" * 70) | |
| print("UPB RAG - RETRIEVAL SYSTEM SETUP") | |
| print("=" * 70) | |
| # Step 1: Load and chunk documents | |
| print("\n[1/4] Loading documents...") | |
| documents = load_upb_documents(show_progress=True) | |
| print(f"✓ Loaded {len(documents)} documents") | |
| print("\n[2/4] Chunking documents...") | |
| chunks = chunk_documents(documents, chunk_size=chunk_size, chunk_overlap=chunk_overlap) | |
| print(f"✓ Created {len(chunks)} chunks") | |
| # Step 2: Initialize embeddings | |
| print("\n[3/4] Initializing embeddings...") | |
| embeddings = get_embeddings(provider="azure") | |
| print("✓ Embeddings ready") | |
| # Step 3: Create or load vector store | |
| print("\n[4/4] Setting up vector store...") | |
| vectorstore_manager = VectorStoreManager(embeddings) | |
| if use_existing and Path(vectorstore_path).exists(): | |
| print(f"Loading existing vector store from {vectorstore_path}...") | |
| vectorstore_manager.load(vectorstore_path) | |
| print("✓ Vector store loaded") | |
| else: | |
| print("Creating new vector store...") | |
| vectorstore_manager.create_from_documents(chunks) | |
| print("✓ Vector store created") | |
| print(f"Saving to {vectorstore_path}...") | |
| vectorstore_manager.save(vectorstore_path) | |
| print("✓ Vector store saved") | |
| # Step 4: Initialize retriever with vector store | |
| retriever = UPBRetriever(chunks, vectorstore=vectorstore_manager.vectorstore) | |
| print("\n" + "=" * 70) | |
| print("✅ RETRIEVAL SYSTEM READY") | |
| print("=" * 70) | |
| print(f"Documents: {len(documents)}") | |
| print(f"Chunks: {len(chunks)}") | |
| print(f"Embedding Model: Azure OpenAI") | |
| print(f"Vector Store: FAISS") | |
| print("\nAvailable retrieval methods:") | |
| print(" - bm25: Keyword-based sparse retrieval") | |
| print(" - similarity: Dense vector similarity search") | |
| print(" - mmr: Maximal Marginal Relevance (diverse results)") | |
| print(" - hybrid: BM25 + Vector search with RRF (recommended)") | |
| print("=" * 70) | |
| return retriever, vectorstore_manager, chunks | |
| def test_all_retrieval_methods(retriever: UPBRetriever): | |
| """ | |
| Test all retrieval methods with sample queries. | |
| Args: | |
| retriever: Initialized UPBRetriever instance | |
| """ | |
| print("\n\n" + "=" * 70) | |
| print("TESTING ALL RETRIEVAL METHODS") | |
| print("=" * 70) | |
| test_queries = [ | |
| "ingeniería de sistemas inteligencia artificial", | |
| "becas y financiación estudiantil", | |
| "requisitos de inscripción" | |
| ] | |
| methods = ["bm25", "similarity", "mmr", "hybrid"] | |
| for query in test_queries: | |
| print(f"\n{'=' * 70}") | |
| print(f"Query: '{query}'") | |
| print('=' * 70) | |
| for method in methods: | |
| print(f"\n--- {method.upper()} ---") | |
| try: | |
| results = retriever.retrieve(query, method=method, k=2) | |
| print(f"Retrieved {len(results)} documents:") | |
| for i, doc in enumerate(results, 1): | |
| category = doc.metadata.get('category', 'N/A') | |
| preview = doc.page_content[:100].replace('\n', ' ') | |
| print(f" {i}. [{category}] {preview}...") | |
| except Exception as e: | |
| print(f" Error: {e}") | |
| if __name__ == "__main__": | |
| # Setup the complete retrieval system | |
| retriever, vectorstore_manager, chunks = setup_retrieval_system( | |
| vectorstore_path="vectorstore/faiss_index", | |
| use_existing=True # Use existing index if available | |
| ) | |
| # Test all retrieval methods | |
| test_all_retrieval_methods(retriever) | |
| print("\n\n" + "=" * 70) | |
| print("QUICK START EXAMPLE") | |
| print("=" * 70) | |
| print(""" | |
| # To use the retrieval system in your code: | |
| from setup_retrieval import setup_retrieval_system | |
| # Initialize | |
| retriever, vectorstore_manager, chunks = setup_retrieval_system() | |
| # Use different retrieval methods | |
| query = "ingeniería de sistemas" | |
| # BM25 (keyword-based, no embeddings needed) | |
| results = retriever.retrieve(query, method="bm25", k=4) | |
| # Similarity search (dense vector) | |
| results = retriever.retrieve(query, method="similarity", k=4) | |
| # MMR for diverse results | |
| results = retriever.retrieve(query, method="mmr", k=4) | |
| # Hybrid (recommended - combines BM25 + vector with RRF) | |
| results = retriever.retrieve(query, method="hybrid", k=4) | |
| # Custom weights for hybrid | |
| results = retriever.retrieve( | |
| query, | |
| method="hybrid", | |
| k=4, | |
| weights=[0.3, 0.7] # [bm25_weight, vector_weight] | |
| ) | |
| """) | |
| print("=" * 70) | |