rosvend committed on
Commit
c03c816
·
1 Parent(s): 045cdca

feat: added chunking using MMR and BM25

Browse files
pyproject.toml CHANGED
@@ -15,4 +15,5 @@ dependencies = [
15
  "playwright>=1.55.0",
16
  "python-dotenv>=1.0.0",
17
  "faiss-cpu>=1.9.0",
 
18
  ]
 
15
  "playwright>=1.55.0",
16
  "python-dotenv>=1.0.0",
17
  "faiss-cpu>=1.9.0",
18
+ "rank-bm25>=0.2.2", # For BM25 sparse retrieval
19
  ]
src/embeddings/embeddings.py ADDED
File without changes
src/loader/ingest.py CHANGED
@@ -1,90 +1,70 @@
1
  """
2
- UPB Career Data Ingestion Pipeline
3
- Scrapes UPB engineering program pages and saves documents for RAG
4
  """
5
 
6
  from pathlib import Path
7
- import json
8
- from data_loader import load_upb_careers
9
- from config import UPB_ENGINEERING_URLS, TEST_URLS
10
 
11
- # Paths
12
- CURRENT_DIR = Path(__file__).resolve().parent
13
- DATA_DIR = CURRENT_DIR.parent / "data"
14
- RAW_HTML_DIR = DATA_DIR / "raw_html"
15
- PROCESSED_DIR = DATA_DIR / "processed"
16
 
17
- # Create directories
18
- RAW_HTML_DIR.mkdir(parents=True, exist_ok=True)
19
- PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
20
-
21
-
22
- def save_documents_json(documents, output_file: Path):
23
- """Save documents to JSON file"""
24
- doc_dicts = [
25
- {
26
- "page_content": doc.page_content,
27
- "metadata": doc.metadata
28
- }
29
- for doc in documents
30
- ]
31
-
32
- with open(output_file, "w", encoding="utf-8") as f:
33
- json.dump(doc_dicts, f, ensure_ascii=False, indent=2)
34
-
35
- print(f"💾 Saved {len(documents)} documents to {output_file}")
36
-
37
-
38
- def ingest_upb_data(test_mode: bool = False):
39
  """
40
- Main ingestion pipeline
41
 
42
  Args:
43
- test_mode: If True, only scrape TEST_URLS. Otherwise scrape all programs.
44
- """
45
- urls = TEST_URLS if test_mode else UPB_ENGINEERING_URLS
46
-
47
- print("=" * 70)
48
- print("UPB CAREER DATA INGESTION")
49
- print("=" * 70)
50
- print(f"Mode: {'TEST' if test_mode else 'FULL'}")
51
- print(f"URLs to scrape: {len(urls)}\n")
52
 
53
- # Load documents
54
- print("🚀 Starting data collection...\n")
55
- documents = load_upb_careers(urls, save_html=True)
 
 
 
56
 
57
- # Save processed documents
58
- output_file = PROCESSED_DIR / ("upb_careers_test.json" if test_mode else "upb_careers_all.json")
59
- save_documents_json(documents, output_file)
 
 
 
 
 
60
 
61
- # Print summary
62
- print("\n" + "=" * 70)
63
- print("INGESTION SUMMARY")
64
- print("=" * 70)
65
- print(f"✅ Documents loaded: {len(documents)}")
66
- print(f"📊 Total characters: {sum(doc.metadata['char_count'] for doc in documents):,}")
67
- print(f"📁 Raw HTML saved to: {RAW_HTML_DIR}")
68
- print(f"📁 Processed data saved to: {output_file}")
69
 
70
- # Show document titles
71
- print("\n📚 Loaded programs:")
72
- for i, doc in enumerate(documents, 1):
73
- print(f" {i}. {doc.metadata['title']} ({doc.metadata['char_count']:,} chars)")
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  return documents
76
 
77
 
78
  if __name__ == "__main__":
79
- import sys
 
80
 
81
- # Check if user wants full ingestion
82
- test_mode = True
83
- if len(sys.argv) > 1 and sys.argv[1] == "--full":
84
- test_mode = False
85
- print("⚠️ Running FULL ingestion (all engineering programs)")
86
- print("This will take several minutes...\n")
87
 
88
- documents = ingest_upb_data(test_mode=test_mode)
89
-
90
- print("\n✨ Ingestion complete! Documents are ready for RAG processing.")
 
 
 
 
 
 
 
1
  """
2
+ Document Loader Module
3
+ Loads markdown files from the data/ directory with metadata enrichment.
4
  """
5
 
6
  from pathlib import Path
7
+ from langchain_community.document_loaders import DirectoryLoader, TextLoader
 
 
8
 
 
 
 
 
 
9
 
10
def load_upb_documents(show_progress=True):
    """
    Load all markdown files from data/ directory and subdirectories.

    Each document's metadata is enriched with a ``category`` key derived
    from the first path component under ``data/`` (e.g. files under
    ``data/engineerings/`` get category ``engineering``). Files that do not
    match a known subdirectory — including files sitting directly in
    ``data/`` — fall back to ``general``.

    Args:
        show_progress: Whether to show progress bar (default: True)

    Returns:
        list: List of LangChain Document objects with content and metadata
    """
    # data/ lives two levels up from this file: src/loader/ -> project root -> data/
    current_dir = Path(__file__).resolve().parent
    data_dir = current_dir.parent.parent / "data"

    # Load all .md files recursively
    loader = DirectoryLoader(
        str(data_dir),
        glob="**/*.md",
        loader_cls=TextLoader,
        show_progress=show_progress,
        use_multithreading=True
    )
    documents = loader.load()

    # Map top-level subdirectory -> category label. A single dict lookup
    # replaces the previous if/elif chain and keeps the mapping easy to extend.
    category_by_dir = {
        'engineerings': 'engineering',
        'contact': 'contact',
        'enroll': 'enrollment',
        'scholarships': 'scholarships',
    }

    # Tag each document with its source category based on the subdirectory.
    for doc in documents:
        relative_path = Path(doc.metadata['source']).relative_to(data_dir)
        top_dir = relative_path.parts[0]
        doc.metadata['category'] = category_by_dir.get(top_dir, 'general')

    return documents
53
 
54
 
55
if __name__ == "__main__":
    print("🚀 Loading markdown files from data/ directory...\n")
    documents = load_upb_documents()

    # Corpus-level stats: document count and total character volume.
    total_chars = sum(len(doc.page_content) for doc in documents)
    print(f"\n✅ Loaded {len(documents)} documents")
    print(f"📊 Total characters: {total_chars:,}")

    # Tally how many documents landed in each metadata category.
    counts_by_category = {}
    for doc in documents:
        label = doc.metadata.get('category', 'unknown')
        counts_by_category[label] = counts_by_category.get(label, 0) + 1

    print("\n📚 Documents by category:")
    for label in sorted(counts_by_category):
        print(f" - {label}: {counts_by_category[label]} documents")
src/pipeline.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Main Data Pipeline
3
+ Orchestrates the complete flow: load → chunk → ready for retrieval.
4
+ """
5
+
6
+ from pathlib import Path
7
+ import sys
8
+
9
+ # Add src to path
10
+ sys.path.insert(0, str(Path(__file__).parent))
11
+
12
+ from loader.ingest import load_upb_documents
13
+ from processing.chunking import chunk_documents
14
+
15
+
16
def prepare_documents_for_rag(chunk_size=1000, chunk_overlap=200, show_progress=True):
    """
    Complete data preparation pipeline: load raw documents, then split them.

    Args:
        chunk_size: Maximum characters per chunk
        chunk_overlap: Overlap between chunks in characters
        show_progress: Show loading progress bar

    Returns:
        list: Chunked documents ready for embedding and retrieval
    """
    # Stage 1: load markdown documents (with category metadata attached).
    loaded = load_upb_documents(show_progress=show_progress)
    # Stage 2: split into retrieval-sized chunks and hand back to the caller.
    return chunk_documents(loaded, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
35
+
36
+
37
if __name__ == "__main__":
    print("=" * 70)
    print("UPB RAG DATA PIPELINE")
    print("=" * 70)
    print("\n📋 Pipeline: Load → Chunk → Ready for Retrieval\n")

    chunks = prepare_documents_for_rag()

    # Guard against an empty data/ directory: len(chunks) == 0 previously
    # crashed the average-size computation with ZeroDivisionError.
    avg_size = (sum(len(c.page_content) for c in chunks) // len(chunks)) if chunks else 0

    print(f"\n✅ Pipeline complete!")
    print(f"📊 Generated {len(chunks)} chunks")
    print(f"📊 Average size: {avg_size} chars")

    # Per-category chunk counts — a quick sanity check of the corpus mix.
    categories = {}
    for chunk in chunks:
        cat = chunk.metadata.get('category', 'unknown')
        categories[cat] = categories.get(cat, 0) + 1

    print("\n📦 Distribution:")
    # Sorted by count (descending); loop body is unreachable when chunks is
    # empty, so the percentage division is safe here.
    for cat, count in sorted(categories.items(), key=lambda x: -x[1]):
        percentage = (count / len(chunks)) * 100
        print(f" - {cat}: {count} chunks ({percentage:.1f}%)")

    print("\n✨ Ready for embedding and retrieval!")
src/processing/chunking.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Document Chunking Module
3
+ Splits documents into smaller chunks optimized for embedding and retrieval.
4
+ """
5
+
6
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
7
+
8
+
9
def chunk_documents(documents, chunk_size=1000, chunk_overlap=200):
    """
    Split documents into smaller chunks for embedding.

    Args:
        documents: List of LangChain Document objects
        chunk_size: Maximum size of each chunk in characters (default: 1000)
        chunk_overlap: Number of characters to overlap between chunks (default: 200)

    Returns:
        list: List of chunked Document objects with preserved metadata
    """
    # Split preferentially on paragraph breaks, then lines, then words,
    # falling back to single characters so no chunk ever exceeds the limit.
    separators = ["\n\n", "\n", " ", ""]

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,  # record each chunk's offset in its source document
        separators=separators,
    )

    return splitter.split_documents(documents)
36
+
37
+
38
if __name__ == "__main__":
    from pathlib import Path
    import sys

    # Make sibling packages (loader/) importable when run as a script.
    sys.path.insert(0, str(Path(__file__).parent.parent))

    from loader.ingest import load_upb_documents

    print("🚀 Loading documents...\n")
    documents = load_upb_documents()

    print(f"✅ Loaded {len(documents)} documents")
    print(f"📊 Total characters: {sum(len(doc.page_content) for doc in documents):,}\n")

    print("✂️ Chunking documents...")
    chunks = chunk_documents(documents)

    # Guard against an empty corpus: len(chunks) == 0 previously crashed the
    # average-size computation with ZeroDivisionError.
    avg_chunk_size = (sum(len(c.page_content) for c in chunks) // len(chunks)) if chunks else 0

    print(f"\n✅ Created {len(chunks)} chunks")
    print(f"📊 Average chunk size: {avg_chunk_size:,} characters")

    # Show chunks by category
    chunk_categories = {}
    for chunk in chunks:
        cat = chunk.metadata.get('category', 'unknown')
        chunk_categories[cat] = chunk_categories.get(cat, 0) + 1

    print("\n📦 Chunks by category:")
    for cat, count in sorted(chunk_categories.items()):
        print(f" - {cat}: {count} chunks")

    print("\n✨ Chunks ready for embedding!")
src/{rag → retrieval}/rag_pipeline.py RENAMED
File without changes
src/retrieval/retriever.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Retrieval Module with Multiple Search Strategies
3
+ Implements dense (vector), sparse (BM25), and MMR-based retrieval.
4
+ """
5
+
6
+ from typing import List, Literal
7
+ from langchain_core.documents import Document
8
+ from langchain_community.retrievers import BM25Retriever
9
+
10
+
11
class SimpleEnsembleRetriever:
    """Ensemble retriever fusing results from multiple retrievers.

    Uses weighted Reciprocal Rank Fusion (RRF): each document receives a
    contribution of ``weight / (rank + 1)`` from every retriever that
    returns it, and the merged list is ordered by total score (descending).
    Documents with identical ``page_content`` are merged into one entry.
    The previous implementation accepted ``weights`` but never used them
    and simply concatenated deduplicated results.
    """

    def __init__(self, retrievers: List, weights: List[float]):
        """
        Args:
            retrievers: Retriever objects exposing ``invoke(query)``.
            weights: Relative weight of each retriever, in the same order.
        """
        self.retrievers = retrievers
        self.weights = weights

    def invoke(self, query: str) -> "List[Document]":
        """Return documents from all retrievers, ranked by weighted RRF score."""
        scores = {}  # page_content -> accumulated fusion score
        docs = {}    # page_content -> first Document object seen with that content
        for retriever, weight in zip(self.retrievers, self.weights):
            for rank, doc in enumerate(retriever.invoke(query)):
                key = doc.page_content
                # Reciprocal-rank contribution: earlier hits count for more.
                scores[key] = scores.get(key, 0.0) + weight / (rank + 1)
                docs.setdefault(key, doc)
        # Stable sort: ties keep first-seen order across retrievers.
        ranked = sorted(docs, key=lambda key: scores[key], reverse=True)
        return [docs[key] for key in ranked]
32
+
33
+
34
class UPBRetriever:
    """
    Multi-strategy retriever for UPB career documents.
    Supports: similarity search, MMR, BM25, and hybrid retrieval.
    """

    def __init__(self, chunks: List[Document], vectorstore=None):
        """
        Initialize retriever with document chunks.

        Args:
            chunks: List of chunked Document objects
            vectorstore: Optional FAISS/ChromaDB vectorstore for dense retrieval
        """
        self.chunks = chunks
        self.vectorstore = vectorstore
        # BM25 index is built lazily on first use and then reused.
        self._bm25_retriever = None

    def get_bm25_retriever(self, k: int = 4) -> BM25Retriever:
        """
        Get or create BM25 retriever for sparse keyword-based search.

        Args:
            k: Number of documents to retrieve

        Returns:
            BM25Retriever instance
        """
        # Build the index only once; later calls just adjust k.
        if self._bm25_retriever is None:
            self._bm25_retriever = BM25Retriever.from_documents(self.chunks)
        self._bm25_retriever.k = k
        return self._bm25_retriever

    def get_dense_retriever(self, k: int = 4, search_type: Literal["similarity", "mmr"] = "similarity"):
        """
        Get dense retriever from vectorstore.

        Args:
            k: Number of documents to retrieve
            search_type: "similarity" for standard search, "mmr" for diverse results

        Returns:
            Vectorstore retriever

        Raises:
            ValueError: If no vectorstore was supplied at construction time.
        """
        if self.vectorstore is None:
            raise ValueError("Vectorstore not initialized. Please create embeddings first.")

        if search_type != "mmr":
            # Plain similarity search over the vector index.
            return self.vectorstore.as_retriever(search_kwargs={"k": k})

        # MMR trades a little relevance for diversity: over-fetch candidates,
        # then keep k results that are relevant yet mutually dissimilar.
        mmr_kwargs = {
            "k": k,
            "fetch_k": k * 5,    # candidate pool to pick diverse results from
            "lambda_mult": 0.7,  # 1.0 = pure relevance, 0.0 = pure diversity
        }
        return self.vectorstore.as_retriever(search_type="mmr", search_kwargs=mmr_kwargs)

    def get_hybrid_retriever(self, k: int = 4, weights: List[float] = None):
        """
        Get hybrid retriever combining BM25 (sparse) and vector (dense) search.

        Args:
            k: Number of documents to retrieve
            weights: [bm25_weight, vector_weight]. Default: [0.5, 0.5]

        Returns:
            SimpleEnsembleRetriever combining both approaches

        Raises:
            ValueError: If no vectorstore was supplied at construction time.
        """
        if self.vectorstore is None:
            raise ValueError("Vectorstore not initialized. Please create embeddings first.")

        return SimpleEnsembleRetriever(
            retrievers=[self.get_bm25_retriever(k=k), self.get_dense_retriever(k=k)],
            weights=weights or [0.5, 0.5],
        )

    def retrieve(
        self,
        query: str,
        method: Literal["bm25", "similarity", "mmr", "hybrid"] = "hybrid",
        k: int = 4,
        **kwargs
    ) -> List[Document]:
        """
        Retrieve relevant documents using specified method.

        Args:
            query: Search query
            method: Retrieval strategy
                - "bm25": Sparse keyword-based (no embeddings needed)
                - "similarity": Dense vector similarity search
                - "mmr": Maximal Marginal Relevance (diverse results)
                - "hybrid": Combination of BM25 + vector search
            k: Number of documents to retrieve
            **kwargs: Additional arguments for specific retrievers

        Returns:
            List of relevant Document objects

        Raises:
            ValueError: If *method* is not one of the supported strategies.
        """
        # Resolve the strategy name to a concrete retriever, then run it.
        if method == "hybrid":
            chosen = self.get_hybrid_retriever(k=k, weights=kwargs.get("weights", [0.5, 0.5]))
        elif method == "bm25":
            chosen = self.get_bm25_retriever(k=k)
        elif method in ("similarity", "mmr"):
            chosen = self.get_dense_retriever(k=k, search_type=method)
        else:
            raise ValueError(f"Unknown retrieval method: {method}")

        return chosen.invoke(query)
156
+
157
+
158
if __name__ == "__main__":
    from pathlib import Path
    import sys

    # Make sibling packages (loader/, processing/) importable as a script.
    sys.path.insert(0, str(Path(__file__).parent.parent))

    from loader.ingest import load_upb_documents
    from processing.chunking import chunk_documents

    print("🚀 Loading and chunking documents...\n")
    chunks = chunk_documents(load_upb_documents())

    print(f"✅ Loaded {len(chunks)} chunks\n")

    # Initialize retriever (without vectorstore for BM25 demo)
    retriever = UPBRetriever(chunks)

    # Exercise the keyword-based path, which needs no embeddings.
    banner = "=" * 70
    print(banner)
    print("TESTING BM25 RETRIEVAL (keyword-based)")
    print(banner)
    query = "ingeniería de sistemas inteligencia artificial"
    results = retriever.retrieve(query, method="bm25", k=3)

    print(f"\nQuery: '{query}'")
    print(f"Results: {len(results)} documents\n")

    for i, doc in enumerate(results, 1):
        print(f"Result {i}:")
        print(f" Category: {doc.metadata.get('category', 'N/A')}")
        print(f" Preview: {doc.page_content[:150]}...")
        print()

    print("✨ Retrieval module ready!")
    print("\nNote: For similarity, MMR, and hybrid search, initialize with a vectorstore.")
uv.lock CHANGED
@@ -791,6 +791,18 @@ wheels = [
791
  { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" },
792
  ]
793
 
 
 
 
 
 
 
 
 
 
 
 
 
794
  [[package]]
795
  name = "regex"
796
  version = "2025.10.23"
@@ -967,6 +979,7 @@ dependencies = [
967
  { name = "langchain-text-splitters" },
968
  { name = "playwright" },
969
  { name = "python-dotenv" },
 
970
  ]
971
 
972
  [package.metadata]
@@ -980,6 +993,7 @@ requires-dist = [
980
  { name = "langchain-text-splitters", specifier = ">=0.3.4" },
981
  { name = "playwright", specifier = ">=1.55.0" },
982
  { name = "python-dotenv", specifier = ">=1.0.0" },
 
983
  ]
984
 
985
  [[package]]
 
791
  { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" },
792
  ]
793
 
794
+ [[package]]
795
+ name = "rank-bm25"
796
+ version = "0.2.2"
797
+ source = { registry = "https://pypi.org/simple" }
798
+ dependencies = [
799
+ { name = "numpy" },
800
+ ]
801
+ sdist = { url = "https://files.pythonhosted.org/packages/fc/0a/f9579384aa017d8b4c15613f86954b92a95a93d641cc849182467cf0bb3b/rank_bm25-0.2.2.tar.gz", hash = "sha256:096ccef76f8188563419aaf384a02f0ea459503fdf77901378d4fd9d87e5e51d", size = 8347, upload-time = "2022-02-16T12:10:52.196Z" }
802
+ wheels = [
803
+ { url = "https://files.pythonhosted.org/packages/2a/21/f691fb2613100a62b3fa91e9988c991e9ca5b89ea31c0d3152a3210344f9/rank_bm25-0.2.2-py3-none-any.whl", hash = "sha256:7bd4a95571adadfc271746fa146a4bcfd89c0cf731e49c3d1ad863290adbe8ae", size = 8584, upload-time = "2022-02-16T12:10:50.626Z" },
804
+ ]
805
+
806
  [[package]]
807
  name = "regex"
808
  version = "2025.10.23"
 
979
  { name = "langchain-text-splitters" },
980
  { name = "playwright" },
981
  { name = "python-dotenv" },
982
+ { name = "rank-bm25" },
983
  ]
984
 
985
  [package.metadata]
 
993
  { name = "langchain-text-splitters", specifier = ">=0.3.4" },
994
  { name = "playwright", specifier = ">=1.55.0" },
995
  { name = "python-dotenv", specifier = ">=1.0.0" },
996
+ { name = "rank-bm25", specifier = ">=0.2.2" },
997
  ]
998
 
999
  [[package]]