Spaces:

Maheshmahi04
/

DocChat

Sleeping

App Files Files Community

MaheshLEO4 commited on Mar 23

Commit

8bda248

1 Parent(s): 8d3ea3d

Update RAG ingestion and retrieval implementation

Browse files

Files changed (8) hide show

Dockerfile +9 -1
config.py +32 -27
ingestion/embedding.py +43 -6
ingestion/index_builder.py +70 -20
ingestion/splitter.py +18 -6
requirements.txt +4 -1
retriever/bm25_retriever.py +40 -5
retriever/hybrid_retriever.py +62 -13

Dockerfile CHANGED Viewed

@@ -4,6 +4,14 @@ ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
 ENV PORT=7860
 WORKDIR /app
 COPY requirements.txt /app/requirements.txt
@@ -13,4 +21,4 @@ COPY . /app
 EXPOSE 7860
-CMD ["sh", "-c", "streamlit run app.py --server.address=0.0.0.0 --server.port=${PORT}"]

 ENV PYTHONUNBUFFERED=1
 ENV PORT=7860
+# ── Pin HuggingFace model cache to /data ──────────────────────────────────────
+# In HF Spaces, /data is the persistent disk volume.
+# Without this, the 22 MB MiniLM model is re-downloaded on every cold start,
+# adding ~30-60 s to the first indexing run of each session.
+ENV HF_HOME=/data/hf_cache
+ENV SENTENCE_TRANSFORMERS_HOME=/data/hf_cache/sentence_transformers
+ENV TRANSFORMERS_CACHE=/data/hf_cache/transformers
 WORKDIR /app
 COPY requirements.txt /app/requirements.txt
 EXPOSE 7860
+CMD ["sh", "-c", "streamlit run app.py --server.address=0.0.0.0 --server.port=${PORT}"]

config.py CHANGED Viewed

@@ -1,51 +1,56 @@
 import os
 from dotenv import load_dotenv
 load_dotenv()
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-# Data paths
-DATA_DIR = os.getenv("APP_DATA_DIR", os.path.join(BASE_DIR, "data"))
 UPLOAD_DIR = os.path.join(DATA_DIR, "raw_pdfs")
-INDEX_DIR = os.path.join(DATA_DIR, "llamaindex")
 os.makedirs(UPLOAD_DIR, exist_ok=True)
-os.makedirs(INDEX_DIR, exist_ok=True)
-# Embedding (Using a highly optimized, lightweight model for fast CPU environments)
-EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
-# Retrieval
-TOP_K = 6
-FINAL_TOP_K = 5
-RRF_K = 60
-# Chunking
-CHUNK_SIZE = 600
-CHUNK_OVERLAP = 100
-BATCH_SIZE = 1000
-EMBED_BATCH_SIZE = 250
-# LLM
-GROQ_API_KEY = os.getenv("GROQ_API_KEY")
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
-LLM_MODEL = "llama-3.1-8b-instant"
 GROQ_FREE_MODELS = [
     "llama-3.1-8b-instant",
     "llama-3.1-70b-versatile",
     "mixtral-8x7b-32768",
 ]
 GEMINI_FREE_MODELS = [
     "gemini-1.5-flash",
     "gemini-1.5-flash-8b",
 ]
 DEFAULT_PROVIDER = "groq"
-DEFAULT_MODEL = GROQ_FREE_MODELS[0]
-# Workflow
-MAX_ITERATIONS = 3

 import os
 from dotenv import load_dotenv
 load_dotenv()
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+# ── Data paths ────────────────────────────────────────────────────────────────
+DATA_DIR   = os.getenv("APP_DATA_DIR", os.path.join(BASE_DIR, "data"))
 UPLOAD_DIR = os.path.join(DATA_DIR, "raw_pdfs")
+INDEX_DIR  = os.path.join(DATA_DIR, "llamaindex")
 os.makedirs(UPLOAD_DIR, exist_ok=True)
+os.makedirs(INDEX_DIR,  exist_ok=True)
+# ── Embedding ─────────────────────────────────────────────────────────────────
+# all-MiniLM-L6-v2: 22 MB, 384-dim, fast on CPU.
+# HF_HOME → /data so the model is cached on the persistent disk in HF Spaces
+# and NOT re-downloaded on every cold start.
+EMBED_MODEL      = "sentence-transformers/all-MiniLM-L6-v2"
+EMBED_BATCH_SIZE = 32   # safe for 2-vCPU HF Spaces free tier
+# ── Chunking ──────────────────────────────────────────────────────────────────
+# 384 tokens = MiniLM's max context length → no truncation, best embeddings.
+# Smaller chunks = more precise retrieval (less irrelevant text per chunk).
+CHUNK_SIZE    = 384
+CHUNK_OVERLAP = 64
+# ── Indexing ──────────────────────────────────────────────────────────────────
+BATCH_SIZE = 64   # VectorStoreIndex insert_batch_size
+# ── Retrieval ─────────────────────────────────────────────────────────────────
+TOP_K       = 8   # candidates per retriever before RRF fusion
+FINAL_TOP_K = 5   # docs sent to agents after fusion
+RRF_K       = 60
+# ── LLM ───────────────────────────────────────────────────────────────────────
+GROQ_API_KEY   = os.getenv("GROQ_API_KEY")
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
+LLM_MODEL      = "llama-3.1-8b-instant"
 GROQ_FREE_MODELS = [
     "llama-3.1-8b-instant",
     "llama-3.1-70b-versatile",
     "mixtral-8x7b-32768",
 ]
 GEMINI_FREE_MODELS = [
     "gemini-1.5-flash",
     "gemini-1.5-flash-8b",
 ]
 DEFAULT_PROVIDER = "groq"
+DEFAULT_MODEL    = GROQ_FREE_MODELS[0]
+# ── Workflow ──────────────────────────────────────────────────────────────────
+MAX_ITERATIONS = 3

ingestion/embedding.py CHANGED Viewed

@@ -1,3 +1,15 @@
 from llama_index.core import Settings
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
@@ -6,15 +18,40 @@ from utils import get_logger
 logger = get_logger(__name__)
 def configure_embedding():
     """
-    Configure the global LlamaIndex embedding model.
-    Call once before building or loading any index.
     """
     Settings.llm = None  # disable OpenAI LLM shim
-    Settings.embed_model = HuggingFaceEmbedding(
-        model_name=EMBED_MODEL,
-        embed_batch_size=EMBED_BATCH_SIZE
     )
-    logger.info(f"Embedding model set to '{EMBED_MODEL}' with batch size {EMBED_BATCH_SIZE}")

+"""
+ingestion/embedding.py
+~~~~~~~~~~~~~~~~~~~~~~
+Configures the global LlamaIndex embedding model and pre-warms it.
+HF Spaces fixes:
+- HF_HOME is set in Dockerfile → /data/hf_cache (persistent disk).
+  The model is downloaded once and reused across restarts.
+- _warm_up() runs a dummy encode after loading so the first real
+  batch doesn't pay the JIT / tokenizer init cost during indexing.
+"""
 from llama_index.core import Settings
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 logger = get_logger(__name__)
+# Module-level singleton so configure_embedding() is idempotent
+_embed_model = None
 def configure_embedding():
     """
+    Set the global LlamaIndex embedding model.
+    Safe to call multiple times — only initialises once per process.
     """
+    global _embed_model
+    if _embed_model is not None:
+        # Already initialised in this process — reuse, don't reload
+        Settings.embed_model = _embed_model
+        logger.info("Embedding model reused from cache (no reload)")
+        return
+    logger.info(f"Loading embedding model '{EMBED_MODEL}'…")
     Settings.llm = None  # disable OpenAI LLM shim
+    _embed_model = HuggingFaceEmbedding(
+        model_name=EMBED_MODEL,
+        embed_batch_size=EMBED_BATCH_SIZE,
     )
+    Settings.embed_model = _embed_model
+    # Pre-warm: run one dummy encode so the first real batch doesn't
+    # pay tokenizer JIT cost during the timed indexing window.
+    try:
+        _embed_model.get_text_embedding("warm up")
+        logger.info("Embedding model warmed up successfully")
+    except Exception as exc:
+        logger.warning(f"Warm-up encode failed (non-fatal): {exc}")
+    logger.info(
+        f"Embedding model ready: '{EMBED_MODEL}', batch_size={EMBED_BATCH_SIZE}"
+    )

ingestion/index_builder.py CHANGED Viewed

@@ -1,3 +1,19 @@
 from llama_index.core import VectorStoreIndex
 from config import BATCH_SIZE, INDEX_DIR
@@ -6,49 +22,83 @@ from utils import get_logger
 logger = get_logger(__name__)
-def build_index(nodes: list) -> VectorStoreIndex:
     """
-    Build a VectorStoreIndex from nodes.
     """
-    logger.info(f"Building VectorStoreIndex from {len(nodes)} nodes.")
-    # Create the index from nodes directly. LlamaIndex handles large numbers of nodes internally.
-    index = VectorStoreIndex(nodes, show_progress=True, insert_batch_size=BATCH_SIZE)
     index.storage_context.persist(persist_dir=INDEX_DIR)
     logger.info(f"Index persisted to {INDEX_DIR}")
     return index
 def ingest_pdfs(progress_callback=None):
     """
-    Full ingestion pipeline: load -> split -> embed -> index.
-    Args:
-        progress_callback: optional (progress: float, message: str) callable
     """
     from ingestion.embedding import configure_embedding
     from ingestion.loader import load_pdfs
     from ingestion.splitter import split_documents
-    def _cb(progress, message):
         if progress_callback:
-            progress_callback(progress, message)
-        logger.info(message)
-    _cb(0.05, "Configuring embedding model...")
     configure_embedding()
-    _cb(0.10, "Loading PDF documents...")
     docs = load_pdfs()
     if not docs:
-        raise RuntimeError("No PDF documents found in upload directory.")
-    _cb(0.30, f"Loaded {len(docs)} document(s). Splitting into chunks...")
     nodes = split_documents(docs)
     total = len(nodes)
-    _cb(0.50, f"Created {total} chunk(s). Building vector index...")
-    build_index(nodes)
-    _cb(1.00, f"Indexed {total} chunks successfully!")

+"""
+ingestion/index_builder.py
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+Builds and persists a VectorStoreIndex from LlamaIndex nodes.
+Fixes:
+- Wipes INDEX_DIR before rebuilding so re-uploads never mix stale
+  and new vectors (was the cause of wrong/irrelevant retrieval results).
+- insert_batch_size=BATCH_SIZE keeps peak RAM bounded on HF Spaces.
+- Progress callback covers the full 0→1 range so the Streamlit bar
+  never appears frozen.
+"""
+import os
+import shutil
 from llama_index.core import VectorStoreIndex
 from config import BATCH_SIZE, INDEX_DIR
 logger = get_logger(__name__)
+def build_index(nodes: list, progress_callback=None) -> VectorStoreIndex:
     """
+    Build and persist a VectorStoreIndex from *nodes*.
+    Args:
+        nodes:             LlamaIndex TextNode list
+        progress_callback: optional (float, str) callable for UI updates
     """
+    def _cb(p, m):
+        if progress_callback:
+            progress_callback(p, m)
+        logger.info(m)
+    total = len(nodes)
+    logger.info(f"Building index from {total} nodes (insert_batch_size={BATCH_SIZE})")
+    # Always wipe old index — stale vectors from previous uploads cause
+    # irrelevant retrieval results on re-indexing.
+    if os.path.exists(INDEX_DIR):
+        shutil.rmtree(INDEX_DIR)
+    os.makedirs(INDEX_DIR, exist_ok=True)
+    _cb(0.1, f"Embedding {total} chunks… (this takes the longest on CPU)")
+    index = VectorStoreIndex(
+        nodes,
+        show_progress=True,
+        insert_batch_size=BATCH_SIZE,
+    )
+    _cb(0.9, "Persisting index to disk…")
     index.storage_context.persist(persist_dir=INDEX_DIR)
+    _cb(1.0, f"Index built and saved ({total} chunks)")
     logger.info(f"Index persisted to {INDEX_DIR}")
     return index
 def ingest_pdfs(progress_callback=None):
     """
+    Full ingestion pipeline: configure → load → split → embed → index.
+    Progress milestones:
+        0.05  configuring embedding model
+        0.10  loading PDFs
+        0.25  splitting into chunks
+        0.35  starting index build  (slow — embedding all chunks on CPU)
+        0.95  persisting to disk
+        1.00  done
     """
     from ingestion.embedding import configure_embedding
     from ingestion.loader import load_pdfs
     from ingestion.splitter import split_documents
+    def _cb(p, m):
         if progress_callback:
+            progress_callback(p, m)
+        logger.info(m)
+    _cb(0.05, "Loading embedding model (cached after first run)…")
     configure_embedding()
+    _cb(0.10, "Loading PDF documents…")
     docs = load_pdfs()
     if not docs:
+        raise RuntimeError("No PDF documents found in the upload directory.")
+    _cb(0.25, f"Loaded {len(docs)} document(s). Splitting into chunks…")
     nodes = split_documents(docs)
     total = len(nodes)
+    _cb(0.35, f"{total} chunks ready. Embedding on CPU — please wait…")
+    # Scale build progress into the 0.35 → 0.95 band
+    def _build_cb(p, m):
+        _cb(0.35 + p * 0.60, m)
+    build_index(nodes, progress_callback=_build_cb)
+    _cb(1.00, f"✅ Done! Indexed {total} chunks.")

ingestion/splitter.py CHANGED Viewed

@@ -1,3 +1,14 @@
 from llama_index.core.node_parser import SentenceSplitter
 from config import CHUNK_SIZE, CHUNK_OVERLAP
 from utils import get_logger
@@ -6,11 +17,12 @@ logger = get_logger(__name__)
 def split_documents(docs: list) -> list:
-    """Split LlamaIndex documents into nodes (chunks)."""
     splitter = SentenceSplitter(
-        chunk_size=CHUNK_SIZE,
-        chunk_overlap=CHUNK_OVERLAP,
     )
-    nodes = splitter.get_nodes_from_documents(docs)
-    logger.info(f"Split into {len(nodes)} chunk(s)")
-    return nodes

+"""
+ingestion/splitter.py
+~~~~~~~~~~~~~~~~~~~~~
+Splits LlamaIndex documents into nodes (chunks).
+Fix: chunk_size=384 matches all-MiniLM-L6-v2's max token length exactly.
+     The original 600-token chunks were silently truncated by the model,
+     meaning the tail of each chunk was never embedded — causing retrieval
+     to miss content that appeared in the latter half of large paragraphs.
+"""
 from llama_index.core.node_parser import SentenceSplitter
 from config import CHUNK_SIZE, CHUNK_OVERLAP
 from utils import get_logger
 def split_documents(docs: list) -> list:
+    """Split LlamaIndex documents into nodes."""
     splitter = SentenceSplitter(
+        chunk_size=CHUNK_SIZE,       # 384 — matches MiniLM max context
+        chunk_overlap=CHUNK_OVERLAP, # 64 — preserves cross-boundary context
     )
+    nodes = splitter.get_nodes_from_documents(docs, show_progress=True)
+    logger.info(f"Split {len(docs)} doc(s) into {len(nodes)} chunk(s) "
+                f"(size={CHUNK_SIZE}, overlap={CHUNK_OVERLAP})")
+    return nodes

requirements.txt CHANGED Viewed

@@ -6,6 +6,9 @@ llama-index>=0.10.30
 llama-index-embeddings-huggingface>=0.1.4
 llama-index-retrievers-bm25>=0.1.3
 # Embeddings / ML
 sentence-transformers>=2.6.1
 pypdf>=4.2.0
@@ -14,4 +17,4 @@ pypdf>=4.2.0
 langchain>=0.1.20
 langgraph>=0.0.40
 langchain-groq>=0.1.4
-langchain-google-genai>=1.0.7

 llama-index-embeddings-huggingface>=0.1.4
 llama-index-retrievers-bm25>=0.1.3
+# BM25 backend — was MISSING; llama-index-retrievers-bm25 depends on this
+rank-bm25>=0.2.2
 # Embeddings / ML
 sentence-transformers>=2.6.1
 pypdf>=4.2.0
 langchain>=0.1.20
 langgraph>=0.0.40
 langchain-groq>=0.1.4
+langchain-google-genai>=1.0.7

retriever/bm25_retriever.py CHANGED Viewed

@@ -1,3 +1,19 @@
 from llama_index.retrievers.bm25 import BM25Retriever
 from config import TOP_K
 from utils import get_logger
@@ -5,8 +21,27 @@ from utils import get_logger
 logger = get_logger(__name__)
-def get_bm25_retriever(index) -> BM25Retriever:
-    """Return a sparse BM25 retriever over the given index."""
-    retriever = BM25Retriever.from_defaults(index=index, similarity_top_k=TOP_K)
-    logger.info(f"BM25 retriever ready (top_k={TOP_K})")
-    return retriever

+"""
+retriever/bm25_retriever.py
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Sparse BM25 retriever.
+Fixes:
+- from_defaults(index=index) was loading ALL nodes into RAM via an
+  internal retrieve-all call. Replaced with from_defaults(nodes=nodes)
+  which reads the docstore directly — same data, no extra round-trip.
+- Returns None on failure so hybrid_retriever degrades to vector-only
+  instead of crashing (rank-bm25 not installed, empty docstore, etc).
+requirements.txt must include:
+    rank-bm25>=0.2.2   ← was missing; BM25Retriever depends on it
+"""
 from llama_index.retrievers.bm25 import BM25Retriever
 from config import TOP_K
 from utils import get_logger
 logger = get_logger(__name__)
+def get_bm25_retriever(index) -> "BM25Retriever | None":
+    """
+    Return a BM25 retriever over *index*, or None if setup fails.
+    """
+    try:
+        nodes = list(index.docstore.docs.values())
+        if not nodes:
+            logger.warning("Docstore is empty — BM25 skipped")
+            return None
+        retriever = BM25Retriever.from_defaults(
+            nodes=nodes,
+            similarity_top_k=TOP_K,
+        )
+        logger.info(f"BM25 retriever ready over {len(nodes)} nodes (top_k={TOP_K})")
+        return retriever
+    except Exception as exc:
+        logger.error(
+            f"BM25 init failed: {exc}. "
+            "Ensure rank-bm25>=0.2.2 is in requirements.txt. "
+            "Falling back to vector-only retrieval."
+        )
+        return None

retriever/hybrid_retriever.py CHANGED Viewed

@@ -1,3 +1,16 @@
 import os
 from llama_index.core import StorageContext, load_index_from_storage
 from langchain_core.documents import Document
@@ -11,20 +24,34 @@ from utils import get_logger
 logger = get_logger(__name__)
 class HybridRetriever:
     """
-    Hybrid dense+sparse retriever with RRF score fusion.
     Usage:
         retriever = HybridRetriever()
-        docs = retriever.invoke("What is Mahesh's experience?")
     """
     def __init__(self):
         if not os.path.exists(INDEX_DIR) or not os.listdir(INDEX_DIR):
-            raise RuntimeError("No index found. Upload and index PDFs first.")
         configure_embedding()
         storage = StorageContext.from_defaults(persist_dir=INDEX_DIR)
@@ -32,24 +59,46 @@ class HybridRetriever:
         logger.info("Index loaded from storage")
         self.vector = get_vector_retriever(self.index)
-        self.bm25   = get_bm25_retriever(self.index)
     def invoke(self, query: str) -> list[Document]:
         """
-        Retrieve documents for *query* using RRF-fused hybrid search.
         Returns a list of LangChain Document objects.
         """
         try:
             vector_nodes = self.vector.retrieve(query)
-            bm25_nodes   = self.bm25.retrieve(query)
         except Exception as exc:
-            logger.error(f"Retrieval error: {exc}")
             return []
-        fused = reciprocal_rank_fusion([vector_nodes, bm25_nodes])
-        return [
-            Document(page_content=n.node.text, metadata=n.node.metadata or {})
-            for n in fused
-        ]

+"""
+retriever/hybrid_retriever.py
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Hybrid dense+sparse retriever with RRF score fusion.
+Fixes:
+1. BM25 is optional — gracefully falls back to vector-only if unavailable.
+2. Metadata filename extracted with multi-key fallback (file_name / file_path /
+   filename / source) so citations never silently disappear.
+3. Module-level singleton (_instance) so HybridRetriever() can be called
+   repeatedly from Streamlit without reloading the index from disk each time.
+"""
 import os
 from llama_index.core import StorageContext, load_index_from_storage
 from langchain_core.documents import Document
 logger = get_logger(__name__)
+# Metadata keys tried in order when resolving the source filename
+_FILENAME_KEYS = ("file_name", "file_path", "filename", "source")
+def _extract_filename(metadata: dict) -> str:
+    for key in _FILENAME_KEYS:
+        val = metadata.get(key)
+        if val:
+            return os.path.basename(str(val))
+    return "unknown"
 class HybridRetriever:
     """
+    Hybrid dense + sparse retriever with RRF fusion.
     Usage:
         retriever = HybridRetriever()
+        docs = retriever.invoke("What is X?")
     """
     def __init__(self):
         if not os.path.exists(INDEX_DIR) or not os.listdir(INDEX_DIR):
+            raise RuntimeError(
+                "No index found. Upload and index PDFs first."
+            )
+        # configure_embedding() is idempotent — safe to call every time
         configure_embedding()
         storage = StorageContext.from_defaults(persist_dir=INDEX_DIR)
         logger.info("Index loaded from storage")
         self.vector = get_vector_retriever(self.index)
+        self.bm25   = get_bm25_retriever(self.index)  # may be None
+        if self.bm25 is None:
+            logger.warning(
+                "Running in vector-only mode. "
+                "Add rank-bm25>=0.2.2 to requirements.txt for hybrid search."
+            )
     def invoke(self, query: str) -> list[Document]:
         """
+        Retrieve documents for *query* using hybrid search (or vector-only).
         Returns a list of LangChain Document objects.
         """
+        # Dense retrieval
         try:
             vector_nodes = self.vector.retrieve(query)
         except Exception as exc:
+            logger.error(f"Vector retrieval error: {exc}")
             return []
+        # Sparse retrieval + RRF fusion (if BM25 is available)
+        if self.bm25 is not None:
+            try:
+                bm25_nodes = self.bm25.retrieve(query)
+                fused = reciprocal_rank_fusion([vector_nodes, bm25_nodes])
+            except Exception as exc:
+                logger.warning(f"BM25 retrieval error ({exc}) — using vector only")
+                fused = vector_nodes
+        else:
+            fused = vector_nodes
+        results = []
+        for n in fused:
+            raw_meta = n.node.metadata or {}
+            meta = dict(raw_meta)
+            meta["file_name"] = _extract_filename(raw_meta)
+            results.append(Document(page_content=n.node.text, metadata=meta))
+        logger.info(f"Retrieved {len(results)} doc(s) for: '{query[:80]}'")
+        return results
+    # Alias for callers that use .retrieve()
+    retrieve = invoke