Spaces:
Runtime error
Runtime error
File size: 5,963 Bytes
ccea144 dced2d0 ccea144 dced2d0 ccea144 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 |
"""
Complete Retrieval Setup Script
Demonstrates how to set up the full retrieval pipeline with embeddings and vector store.
"""
import os
import sys
from pathlib import Path
from typing import Optional, Sequence
# Make the project packages importable no matter how the script is launched.
# Candidate 1: the directory that contains this file (expected to be "src").
current_file = Path(__file__).resolve()
src_dir = current_file.parent
if not (str(src_dir) in sys.path):
    sys.path.insert(0, str(src_dir))

# Candidate 2: a "src" folder below the current working directory, which is
# the layout used when the app is started from its root (HuggingFace Spaces).
app_src_dir = Path.cwd() / "src"
if app_src_dir.exists():
    if not (str(app_src_dir) in sys.path):
        sys.path.insert(0, str(app_src_dir))
from loader.ingest import load_upb_documents
from processing.chunking import chunk_documents
from embeddings.embeddings import get_embeddings
from vectorstore.store import VectorStoreManager
from retrieval.retriever import UPBRetriever
def setup_retrieval_system(
    vectorstore_path: str = "vectorstore/faiss_index",
    use_existing: bool = True,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    *,
    embedding_provider: str = "azure",
) -> tuple:
    """
    Set up the complete retrieval system with embeddings and vector store.

    Documents are always loaded and chunked, even when an existing index is
    reused, because the retriever is constructed from the chunks as well as
    from the vector store. When an index is loaded from disk, ``chunk_size``
    and ``chunk_overlap`` therefore only shape the chunks handed to the
    retriever; they do not change the vectors already stored in the index.

    Args:
        vectorstore_path: Path to save/load the FAISS index.
        use_existing: If True and ``vectorstore_path`` exists, load it.
            Otherwise build a new vector store from the chunks and save it
            to ``vectorstore_path``.
        chunk_size: Size of document chunks.
        chunk_overlap: Overlap between chunks.
        embedding_provider: Provider name forwarded to ``get_embeddings``.
            Defaults to ``"azure"``, the value that used to be hard-coded.
            Which other values are accepted depends on ``get_embeddings``.

    Returns:
        Tuple of (UPBRetriever, VectorStoreManager, chunks)
    """
    banner = "=" * 70

    print(banner)
    print("UPB RAG - RETRIEVAL SYSTEM SETUP")
    print(banner)

    # Progress step 1/4: load the raw documents.
    print("\n[1/4] Loading documents...")
    documents = load_upb_documents(show_progress=True)
    print(f"✓ Loaded {len(documents)} documents")

    # Progress step 2/4: split the documents into chunks.
    print("\n[2/4] Chunking documents...")
    chunks = chunk_documents(documents, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    print(f"✓ Created {len(chunks)} chunks")

    # Progress step 3/4: initialize the embedding backend.
    print("\n[3/4] Initializing embeddings...")
    embeddings = get_embeddings(provider=embedding_provider)
    print("✓ Embeddings ready")

    # Progress step 4/4: reuse the index on disk, or build and persist a new one.
    print("\n[4/4] Setting up vector store...")
    vectorstore_manager = VectorStoreManager(embeddings)
    if use_existing and Path(vectorstore_path).exists():
        # NOTE(review): a reloaded index presumably has to match the current
        # embedding provider - confirm against VectorStoreManager.load.
        print(f"Loading existing vector store from {vectorstore_path}...")
        vectorstore_manager.load(vectorstore_path)
        print("✓ Vector store loaded")
    else:
        print("Creating new vector store...")
        vectorstore_manager.create_from_documents(chunks)
        print("✓ Vector store created")
        print(f"Saving to {vectorstore_path}...")
        vectorstore_manager.save(vectorstore_path)
        print("✓ Vector store saved")

    # Final step (no progress label): the retriever gets both the chunks and
    # the vector store.
    retriever = UPBRetriever(chunks, vectorstore=vectorstore_manager.vectorstore)

    # Keep the historical label for the default provider so the summary output
    # is unchanged; any other provider is reported by name.
    if embedding_provider == "azure":
        embedding_label = "Azure OpenAI"
    else:
        embedding_label = embedding_provider

    print("\n" + banner)
    print("✅ RETRIEVAL SYSTEM READY")
    print(banner)
    print(f"Documents: {len(documents)}")
    print(f"Chunks: {len(chunks)}")
    print(f"Embedding Model: {embedding_label}")
    print("Vector Store: FAISS")
    print("\nAvailable retrieval methods:")
    print(" - bm25: Keyword-based sparse retrieval")
    print(" - similarity: Dense vector similarity search")
    print(" - mmr: Maximal Marginal Relevance (diverse results)")
    print(" - hybrid: BM25 + Vector search with RRF (recommended)")
    print(banner)

    return retriever, vectorstore_manager, chunks
def test_all_retrieval_methods(
    retriever: UPBRetriever,
    *,
    queries: Optional[Sequence[str]] = None,
    methods: Optional[Sequence[str]] = None,
    k: int = 2,
) -> None:
    """
    Run sample queries through the retriever and print a preview of the hits.

    Every combination of query and method is tried. A failure in one
    combination is printed as an error line and does not stop the remaining
    ones, so a single run shows which methods work and which do not.

    Args:
        retriever: Initialized UPBRetriever instance. Only its
            ``retrieve(query, method=..., k=...)`` method is used.
        queries: Queries to run. Defaults to three built-in Spanish sample
            queries. A single string is treated as one query.
        methods: Retrieval method names to try for each query. Defaults to
            ``bm25``, ``similarity``, ``mmr`` and ``hybrid``. A single string
            is treated as one method.
        k: Number of documents requested per retrieval call.
    """
    if queries is None:
        queries = (
            "ingeniería de sistemas inteligencia artificial",
            "becas y financiación estudiantil",
            "requisitos de inscripción",
        )
    elif isinstance(queries, str):
        # A bare string would otherwise be iterated character by character.
        queries = (queries,)

    if methods is None:
        methods = ("bm25", "similarity", "mmr", "hybrid")
    elif isinstance(methods, str):
        methods = (methods,)

    banner = "=" * 70

    print("\n\n" + banner)
    print("TESTING ALL RETRIEVAL METHODS")
    print(banner)

    for query in queries:
        print(f"\n{banner}")
        print(f"Query: '{query}'")
        print(banner)

        for method in methods:
            print(f"\n--- {method.upper()} ---")
            # Deliberately broad: this is a diagnostic run, so any failure of
            # one method (retrieval or result formatting) is reported and the
            # next method is still exercised.
            try:
                results = retriever.retrieve(query, method=method, k=k)
                print(f"Retrieved {len(results)} documents:")
                for i, doc in enumerate(results, 1):
                    category = doc.metadata.get('category', 'N/A')
                    # Single-line preview of the first 100 characters.
                    preview = doc.page_content[:100].replace('\n', ' ')
                    print(f" {i}. [{category}] {preview}...")
            except Exception as e:
                print(f" Error: {e}")
if __name__ == "__main__":
    rule = "=" * 70

    # Copy-paste snippet shown to the user after the demo run has finished.
    usage_example = """
# To use the retrieval system in your code:
from setup_retrieval import setup_retrieval_system
# Initialize
retriever, vectorstore_manager, chunks = setup_retrieval_system()
# Use different retrieval methods
query = "ingeniería de sistemas"
# BM25 (keyword-based, no embeddings needed)
results = retriever.retrieve(query, method="bm25", k=4)
# Similarity search (dense vector)
results = retriever.retrieve(query, method="similarity", k=4)
# MMR for diverse results
results = retriever.retrieve(query, method="mmr", k=4)
# Hybrid (recommended - combines BM25 + vector with RRF)
results = retriever.retrieve(query, method="hybrid", k=4)
# Custom weights for hybrid
results = retriever.retrieve(
query,
method="hybrid",
k=4,
weights=[0.3, 0.7] # [bm25_weight, vector_weight]
)
"""

    # Build the retrieval stack, reusing the saved index when one is present.
    retriever, vectorstore_manager, chunks = setup_retrieval_system(
        use_existing=True,
        vectorstore_path="vectorstore/faiss_index",
    )

    # Exercise every retrieval method against the sample queries.
    test_all_retrieval_methods(retriever)

    print("\n\n" + rule)
    print("QUICK START EXAMPLE")
    print(rule)
    print(usage_example)
    print(rule)
|