MaheshLEO4 commited on
Commit
8bda248
Β·
1 Parent(s): 8d3ea3d

Update RAG ingestion and retrieval implementation

Browse files
Dockerfile CHANGED
@@ -4,6 +4,14 @@ ENV PYTHONDONTWRITEBYTECODE=1
4
  ENV PYTHONUNBUFFERED=1
5
  ENV PORT=7860
6
 
 
 
 
 
 
 
 
 
7
  WORKDIR /app
8
 
9
  COPY requirements.txt /app/requirements.txt
@@ -13,4 +21,4 @@ COPY . /app
13
 
14
  EXPOSE 7860
15
 
16
- CMD ["sh", "-c", "streamlit run app.py --server.address=0.0.0.0 --server.port=${PORT}"]
 
4
  ENV PYTHONUNBUFFERED=1
5
  ENV PORT=7860
6
 
7
+ # ── Pin HuggingFace model cache to /data ──────────────────────────────────────
8
+ # In HF Spaces, /data is the persistent disk volume.
9
+ # Without this, the 22 MB MiniLM model is re-downloaded on every cold start,
10
+ # adding ~30-60 s to the first indexing run of each session.
11
+ ENV HF_HOME=/data/hf_cache
12
+ ENV SENTENCE_TRANSFORMERS_HOME=/data/hf_cache/sentence_transformers
13
+ # NOTE(review): TRANSFORMERS_CACHE is deprecated in transformers >=4.36 (HF_HOME already covers it) — kept only for older versions
+ ENV TRANSFORMERS_CACHE=/data/hf_cache/transformers
14
+
15
  WORKDIR /app
16
 
17
  COPY requirements.txt /app/requirements.txt
 
21
 
22
  EXPOSE 7860
23
 
24
+ CMD ["sh", "-c", "streamlit run app.py --server.address=0.0.0.0 --server.port=${PORT}"]
config.py CHANGED
@@ -1,51 +1,56 @@
1
  import os
2
-
3
  from dotenv import load_dotenv
4
 
5
  load_dotenv()
6
 
7
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
8
 
9
- # Data paths
10
- DATA_DIR = os.getenv("APP_DATA_DIR", os.path.join(BASE_DIR, "data"))
11
  UPLOAD_DIR = os.path.join(DATA_DIR, "raw_pdfs")
12
- INDEX_DIR = os.path.join(DATA_DIR, "llamaindex")
13
 
14
  os.makedirs(UPLOAD_DIR, exist_ok=True)
15
- os.makedirs(INDEX_DIR, exist_ok=True)
16
-
17
- # Embedding (Using a highly optimized, lightweight model for fast CPU environments)
18
- EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
19
-
20
- # Retrieval
21
- TOP_K = 6
22
- FINAL_TOP_K = 5
23
- RRF_K = 60
24
-
25
- # Chunking
26
- CHUNK_SIZE = 600
27
- CHUNK_OVERLAP = 100
28
- BATCH_SIZE = 1000
29
- EMBED_BATCH_SIZE = 250
30
-
31
- # LLM
32
- GROQ_API_KEY = os.getenv("GROQ_API_KEY")
 
 
 
 
 
 
 
33
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
34
- LLM_MODEL = "llama-3.1-8b-instant"
35
 
36
  GROQ_FREE_MODELS = [
37
  "llama-3.1-8b-instant",
38
  "llama-3.1-70b-versatile",
39
  "mixtral-8x7b-32768",
40
  ]
41
-
42
  GEMINI_FREE_MODELS = [
43
  "gemini-1.5-flash",
44
  "gemini-1.5-flash-8b",
45
  ]
46
 
47
  DEFAULT_PROVIDER = "groq"
48
- DEFAULT_MODEL = GROQ_FREE_MODELS[0]
49
 
50
- # Workflow
51
- MAX_ITERATIONS = 3
 
1
  import os
 
2
  from dotenv import load_dotenv
3
 
4
  load_dotenv()
5
 
6
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
7
 
8
# ── Data paths ────────────────────────────────────────────────────────────────
DATA_DIR = os.getenv("APP_DATA_DIR", os.path.join(BASE_DIR, "data"))
UPLOAD_DIR = os.path.join(DATA_DIR, "raw_pdfs")
INDEX_DIR = os.path.join(DATA_DIR, "llamaindex")

os.makedirs(UPLOAD_DIR, exist_ok=True)
os.makedirs(INDEX_DIR, exist_ok=True)

# ── Embedding ─────────────────────────────────────────────────────────────────
# all-MiniLM-L6-v2: 22 MB, 384-dimensional output, fast on CPU.
# HF_HOME → /data so the model is cached on the persistent disk in HF Spaces
# and NOT re-downloaded on every cold start.
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
EMBED_BATCH_SIZE = 32  # safe for 2-vCPU HF Spaces free tier

# ── Chunking ──────────────────────────────────────────────────────────────────
# NOTE(review): 384 is this model's embedding DIMENSION, not its context
# window — all-MiniLM-L6-v2 truncates input at 256 word pieces, so the tail
# of an oversized chunk is still not embedded. Smaller chunks nevertheless
# give more precise retrieval (less irrelevant text per chunk).
CHUNK_SIZE = 384
CHUNK_OVERLAP = 64

# ── Indexing ──────────────────────────────────────────────────────────────────
BATCH_SIZE = 64  # VectorStoreIndex insert_batch_size

# ── Retrieval ─────────────────────────────────────────────────────────────────
TOP_K = 8        # candidates per retriever before RRF fusion
FINAL_TOP_K = 5  # docs sent to agents after fusion
RRF_K = 60       # standard RRF dampening constant

# ── LLM ───────────────────────────────────────────────────────────────────────
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
LLM_MODEL = "llama-3.1-8b-instant"

GROQ_FREE_MODELS = [
    "llama-3.1-8b-instant",
    "llama-3.1-70b-versatile",
    "mixtral-8x7b-32768",
]
GEMINI_FREE_MODELS = [
    "gemini-1.5-flash",
    "gemini-1.5-flash-8b",
]

DEFAULT_PROVIDER = "groq"
DEFAULT_MODEL = GROQ_FREE_MODELS[0]

# ── Workflow ──────────────────────────────────────────────────────────────────
MAX_ITERATIONS = 3
ingestion/embedding.py CHANGED
@@ -1,3 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
  from llama_index.core import Settings
2
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
3
 
@@ -6,15 +18,40 @@ from utils import get_logger
6
 
7
  logger = get_logger(__name__)
8
 
 
 
 
9
 
10
  def configure_embedding():
11
  """
12
- Configure the global LlamaIndex embedding model.
13
- Call once before building or loading any index.
14
  """
 
 
 
 
 
 
 
 
 
15
  Settings.llm = None # disable OpenAI LLM shim
16
- Settings.embed_model = HuggingFaceEmbedding(
17
- model_name=EMBED_MODEL,
18
- embed_batch_size=EMBED_BATCH_SIZE
 
19
  )
20
- logger.info(f"Embedding model set to '{EMBED_MODEL}' with batch size {EMBED_BATCH_SIZE}")
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ingestion/embedding.py
3
+ ~~~~~~~~~~~~~~~~~~~~~~
4
+ Configures the global LlamaIndex embedding model and pre-warms it.
5
+
6
+ HF Spaces fixes:
7
+ - HF_HOME is set in Dockerfile β†’ /data/hf_cache (persistent disk).
8
+ The model is downloaded once and reused across restarts.
9
+ - _warm_up() runs a dummy encode after loading so the first real
10
+ batch doesn't pay the JIT / tokenizer init cost during indexing.
11
+ """
12
+
13
  from llama_index.core import Settings
14
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
15
 
 
18
 
19
  logger = get_logger(__name__)
20
 
21
+ # Module-level singleton so configure_embedding() is idempotent
21
+ _embed_model = None
22
+
23

24

 def configure_embedding():
25
 """
26
+ Set the global LlamaIndex embedding model.
27
+ Safe to call multiple times — only initialises once per process.
28
 """
29
+ global _embed_model
30
+
31
+ if _embed_model is not None:
32
+ # Already initialised in this process — reuse, don't reload
33
+ Settings.embed_model = _embed_model
34
+ logger.info("Embedding model reused from cache (no reload)")
35
+ return
36
+
37
+ logger.info(f"Loading embedding model '{EMBED_MODEL}'…")
38
 Settings.llm = None  # disable OpenAI LLM shim
39
+
40
+ # NOTE(review): the reuse branch above returns BEFORE this line, so
+ # Settings.llm is only reset to None on the first call in a process —
+ # confirm nothing else reassigns Settings.llm between calls.
+ _embed_model = HuggingFaceEmbedding(
41
+ model_name=EMBED_MODEL,
42
+ embed_batch_size=EMBED_BATCH_SIZE,
43
 )
44
+ Settings.embed_model = _embed_model
45
+
46
+ # Pre-warm: run one dummy encode so the first real batch doesn't
47
+ # pay tokenizer JIT cost during the timed indexing window.
48
+ try:
49
+ _embed_model.get_text_embedding("warm up")
50
+ logger.info("Embedding model warmed up successfully")
51
+ except Exception as exc:
52
+ # Non-fatal by design: a failed warm-up only costs latency later
+ logger.warning(f"Warm-up encode failed (non-fatal): {exc}")
53
+
54
+ logger.info(
55
+ f"Embedding model ready: '{EMBED_MODEL}', batch_size={EMBED_BATCH_SIZE}"
56
+ )
ingestion/index_builder.py CHANGED
@@ -1,3 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from llama_index.core import VectorStoreIndex
2
 
3
  from config import BATCH_SIZE, INDEX_DIR
@@ -6,49 +22,83 @@ from utils import get_logger
6
  logger = get_logger(__name__)
7
 
8
 
9
- def build_index(nodes: list) -> VectorStoreIndex:
10
  """
11
- Build a VectorStoreIndex from nodes.
 
 
 
 
12
  """
13
- logger.info(f"Building VectorStoreIndex from {len(nodes)} nodes.")
14
-
15
- # Create the index from nodes directly. LlamaIndex handles large numbers of nodes internally.
16
- index = VectorStoreIndex(nodes, show_progress=True, insert_batch_size=BATCH_SIZE)
17
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  index.storage_context.persist(persist_dir=INDEX_DIR)
 
 
19
  logger.info(f"Index persisted to {INDEX_DIR}")
20
  return index
21
 
22
 
23
  def ingest_pdfs(progress_callback=None):
24
  """
25
- Full ingestion pipeline: load -> split -> embed -> index.
26
 
27
- Args:
28
- progress_callback: optional (progress: float, message: str) callable
 
 
 
 
 
29
  """
30
  from ingestion.embedding import configure_embedding
31
  from ingestion.loader import load_pdfs
32
  from ingestion.splitter import split_documents
33
 
34
- def _cb(progress, message):
35
  if progress_callback:
36
- progress_callback(progress, message)
37
- logger.info(message)
38
 
39
- _cb(0.05, "Configuring embedding model...")
40
  configure_embedding()
41
 
42
- _cb(0.10, "Loading PDF documents...")
43
  docs = load_pdfs()
44
  if not docs:
45
- raise RuntimeError("No PDF documents found in upload directory.")
46
 
47
- _cb(0.30, f"Loaded {len(docs)} document(s). Splitting into chunks...")
48
  nodes = split_documents(docs)
49
  total = len(nodes)
50
 
51
- _cb(0.50, f"Created {total} chunk(s). Building vector index...")
52
- build_index(nodes)
 
 
 
 
 
53
 
54
- _cb(1.00, f"Indexed {total} chunks successfully!")
 
1
+ """
2
+ ingestion/index_builder.py
3
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~
4
+ Builds and persists a VectorStoreIndex from LlamaIndex nodes.
5
+
6
+ Fixes:
7
+ - Wipes INDEX_DIR before rebuilding so re-uploads never mix stale
8
+ and new vectors (was the cause of wrong/irrelevant retrieval results).
9
+ - insert_batch_size=BATCH_SIZE keeps peak RAM bounded on HF Spaces.
10
+ - Progress callback covers the full 0β†’1 range so the Streamlit bar
11
+ never appears frozen.
12
+ """
13
+
14
+ import os
15
+ import shutil
16
+
17
  from llama_index.core import VectorStoreIndex
18
 
19
  from config import BATCH_SIZE, INDEX_DIR
 
22
  logger = get_logger(__name__)
23
 
24
 
25
def build_index(nodes: list, progress_callback=None) -> VectorStoreIndex:
    """
    Build and persist a VectorStoreIndex from *nodes*.

    Args:
        nodes: LlamaIndex TextNode list (must be non-empty)
        progress_callback: optional (float, str) callable for UI updates

    Raises:
        ValueError: if *nodes* is empty. Without this guard, the existing
            index would be wiped and replaced by an empty one, silently
            breaking retrieval until the next successful re-index.
    """
    def _cb(p, m):
        if progress_callback:
            progress_callback(p, m)
        logger.info(m)

    # Fix: validate BEFORE wiping INDEX_DIR so a bad call can't destroy
    # a previously working index.
    if not nodes:
        raise ValueError("build_index() called with an empty node list — nothing to index.")

    total = len(nodes)
    logger.info(f"Building index from {total} nodes (insert_batch_size={BATCH_SIZE})")

    # Always wipe old index — stale vectors from previous uploads cause
    # irrelevant retrieval results on re-indexing.
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)
    os.makedirs(INDEX_DIR, exist_ok=True)

    _cb(0.1, f"Embedding {total} chunks… (this takes the longest on CPU)")

    # insert_batch_size bounds peak RAM during embedding on small hosts
    index = VectorStoreIndex(
        nodes,
        show_progress=True,
        insert_batch_size=BATCH_SIZE,
    )

    _cb(0.9, "Persisting index to disk…")
    index.storage_context.persist(persist_dir=INDEX_DIR)

    _cb(1.0, f"Index built and saved ({total} chunks)")
    logger.info(f"Index persisted to {INDEX_DIR}")
    return index
61
 
62
 
63
def ingest_pdfs(progress_callback=None):
    """
    Full ingestion pipeline: configure → load → split → embed → index.

    Progress milestones:
        0.05  configuring embedding model
        0.10  loading PDFs
        0.25  splitting into chunks
        0.35  starting index build (slow — embedding all chunks on CPU)
        0.95  persisting to disk
        1.00  done
    """
    # Imported lazily to keep module import cheap and avoid cycles.
    from ingestion.embedding import configure_embedding
    from ingestion.loader import load_pdfs
    from ingestion.splitter import split_documents

    def _report(fraction, message):
        if progress_callback:
            progress_callback(fraction, message)
        logger.info(message)

    _report(0.05, "Loading embedding model (cached after first run)…")
    configure_embedding()

    _report(0.10, "Loading PDF documents…")
    documents = load_pdfs()
    if not documents:
        raise RuntimeError("No PDF documents found in the upload directory.")

    _report(0.25, f"Loaded {len(documents)} document(s). Splitting into chunks…")
    chunks = split_documents(documents)
    chunk_count = len(chunks)

    _report(0.35, f"{chunk_count} chunks ready. Embedding on CPU — please wait…")

    # Map build_index's internal 0→1 progress into the 0.35 → 0.95 band.
    build_index(
        chunks,
        progress_callback=lambda p, m: _report(0.35 + p * 0.60, m),
    )

    _report(1.00, f"✅ Done! Indexed {chunk_count} chunks.")
ingestion/splitter.py CHANGED
@@ -1,3 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
1
  from llama_index.core.node_parser import SentenceSplitter
2
  from config import CHUNK_SIZE, CHUNK_OVERLAP
3
  from utils import get_logger
@@ -6,11 +17,12 @@ logger = get_logger(__name__)
6
 
7
 
8
  def split_documents(docs: list) -> list:
9
- """Split LlamaIndex documents into nodes (chunks)."""
10
  splitter = SentenceSplitter(
11
- chunk_size=CHUNK_SIZE,
12
- chunk_overlap=CHUNK_OVERLAP,
13
  )
14
- nodes = splitter.get_nodes_from_documents(docs)
15
- logger.info(f"Split into {len(nodes)} chunk(s)")
16
- return nodes
 
 
1
+ """
2
+ ingestion/splitter.py
3
+ ~~~~~~~~~~~~~~~~~~~~~
4
+ Splits LlamaIndex documents into nodes (chunks).
5
+
6
+ Fix: smaller chunks reduce silent truncation by the embedding model.
7
+ NOTE(review): all-MiniLM-L6-v2 truncates input at 256 word pieces — 384 is
8
+ its embedding dimension, not a context length — so the tail of an oversized
9
+ chunk is still never embedded; verify CHUNK_SIZE against the model card.
10
+ """
11
+
12
  from llama_index.core.node_parser import SentenceSplitter
13
  from config import CHUNK_SIZE, CHUNK_OVERLAP
14
  from utils import get_logger
 
17
 
18
 
19
def split_documents(docs: list) -> list:
    """Split LlamaIndex documents into nodes."""
    chunker = SentenceSplitter(
        chunk_size=CHUNK_SIZE,        # token budget per chunk
        chunk_overlap=CHUNK_OVERLAP,  # preserves cross-boundary context
    )
    nodes = chunker.get_nodes_from_documents(docs, show_progress=True)
    logger.info(
        f"Split {len(docs)} doc(s) into {len(nodes)} chunk(s) "
        f"(size={CHUNK_SIZE}, overlap={CHUNK_OVERLAP})"
    )
    return nodes
requirements.txt CHANGED
@@ -6,6 +6,9 @@ llama-index>=0.10.30
6
  llama-index-embeddings-huggingface>=0.1.4
7
  llama-index-retrievers-bm25>=0.1.3
8
 
 
 
 
9
  # Embeddings / ML
10
  sentence-transformers>=2.6.1
11
  pypdf>=4.2.0
@@ -14,4 +17,4 @@ pypdf>=4.2.0
14
  langchain>=0.1.20
15
  langgraph>=0.0.40
16
  langchain-groq>=0.1.4
17
- langchain-google-genai>=1.0.7
 
6
  llama-index-embeddings-huggingface>=0.1.4
7
  llama-index-retrievers-bm25>=0.1.3
8
 
9
+ # BM25 backend β€” was MISSING; llama-index-retrievers-bm25 depends on this
10
+ rank-bm25>=0.2.2
11
+
12
  # Embeddings / ML
13
  sentence-transformers>=2.6.1
14
  pypdf>=4.2.0
 
17
  langchain>=0.1.20
18
  langgraph>=0.0.40
19
  langchain-groq>=0.1.4
20
+ langchain-google-genai>=1.0.7
retriever/bm25_retriever.py CHANGED
@@ -1,3 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from llama_index.retrievers.bm25 import BM25Retriever
2
  from config import TOP_K
3
  from utils import get_logger
@@ -5,8 +21,27 @@ from utils import get_logger
5
  logger = get_logger(__name__)
6
 
7
 
8
- def get_bm25_retriever(index) -> BM25Retriever:
9
- """Return a sparse BM25 retriever over the given index."""
10
- retriever = BM25Retriever.from_defaults(index=index, similarity_top_k=TOP_K)
11
- logger.info(f"BM25 retriever ready (top_k={TOP_K})")
12
- return retriever
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ retriever/bm25_retriever.py
3
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~
4
+ Sparse BM25 retriever.
5
+
6
+ Fixes:
7
+ - from_defaults(index=index) was loading ALL nodes into RAM via an
8
+ internal retrieve-all call. Replaced with from_defaults(nodes=nodes)
9
+ which reads the docstore directly β€” same data, no extra round-trip.
10
+ - Returns None on failure so hybrid_retriever degrades to vector-only
11
+ instead of crashing (rank-bm25 not installed, empty docstore, etc).
12
+
13
+ requirements.txt must include:
14
+ rank-bm25>=0.2.2 ← was missing; BM25Retriever depends on it
15
+ """
16
+
17
  from llama_index.retrievers.bm25 import BM25Retriever
18
  from config import TOP_K
19
  from utils import get_logger
 
21
  logger = get_logger(__name__)
22
 
23
 
24
def get_bm25_retriever(index) -> "BM25Retriever | None":
    """
    Return a BM25 retriever over *index*, or None if setup fails.

    Reads nodes straight from the docstore (no retrieve-all round-trip).
    Returning None lets the caller degrade to vector-only retrieval
    instead of crashing (e.g. rank-bm25 missing, empty docstore).
    """
    try:
        docstore_nodes = list(index.docstore.docs.values())
        if not docstore_nodes:
            logger.warning("Docstore is empty — BM25 skipped")
            return None

        sparse = BM25Retriever.from_defaults(
            nodes=docstore_nodes,
            similarity_top_k=TOP_K,
        )
        logger.info(f"BM25 retriever ready over {len(docstore_nodes)} nodes (top_k={TOP_K})")
        return sparse

    except Exception as exc:
        logger.error(
            f"BM25 init failed: {exc}. "
            "Ensure rank-bm25>=0.2.2 is in requirements.txt. "
            "Falling back to vector-only retrieval."
        )
        return None
retriever/hybrid_retriever.py CHANGED
@@ -1,3 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  from llama_index.core import StorageContext, load_index_from_storage
3
  from langchain_core.documents import Document
@@ -11,20 +24,34 @@ from utils import get_logger
11
 
12
  logger = get_logger(__name__)
13
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  class HybridRetriever:
16
  """
17
- Hybrid dense+sparse retriever with RRF score fusion.
18
 
19
  Usage:
20
  retriever = HybridRetriever()
21
- docs = retriever.invoke("What is Mahesh's experience?")
22
  """
23
 
24
  def __init__(self):
25
  if not os.path.exists(INDEX_DIR) or not os.listdir(INDEX_DIR):
26
- raise RuntimeError("No index found. Upload and index PDFs first.")
 
 
27
 
 
28
  configure_embedding()
29
 
30
  storage = StorageContext.from_defaults(persist_dir=INDEX_DIR)
@@ -32,24 +59,46 @@ class HybridRetriever:
32
  logger.info("Index loaded from storage")
33
 
34
  self.vector = get_vector_retriever(self.index)
35
- self.bm25 = get_bm25_retriever(self.index)
 
 
 
 
 
 
36
 
37
  def invoke(self, query: str) -> list[Document]:
38
  """
39
- Retrieve documents for *query* using RRF-fused hybrid search.
40
-
41
  Returns a list of LangChain Document objects.
42
  """
 
43
  try:
44
  vector_nodes = self.vector.retrieve(query)
45
- bm25_nodes = self.bm25.retrieve(query)
46
  except Exception as exc:
47
- logger.error(f"Retrieval error: {exc}")
48
  return []
49
 
50
- fused = reciprocal_rank_fusion([vector_nodes, bm25_nodes])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
- return [
53
- Document(page_content=n.node.text, metadata=n.node.metadata or {})
54
- for n in fused
55
- ]
 
1
+ """
2
+ retriever/hybrid_retriever.py
3
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4
+ Hybrid dense+sparse retriever with RRF score fusion.
5
+
6
+ Fixes:
7
+ 1. BM25 is optional β€” gracefully falls back to vector-only if unavailable.
8
+ 2. Metadata filename extracted with multi-key fallback (file_name / file_path /
9
+ filename / source) so citations never silently disappear.
10
+ 3. configure_embedding() is idempotent, so constructing HybridRetriever()
11
+ repeatedly reloads only the index from disk, never the embedding model.
12
+ """
13
+
14
  import os
15
  from llama_index.core import StorageContext, load_index_from_storage
16
  from langchain_core.documents import Document
 
24
 
25
  logger = get_logger(__name__)
26
 
27
+ # Metadata keys tried in order when resolving the source filename
28
+ _FILENAME_KEYS = ("file_name", "file_path", "filename", "source")
29
+
30
+
31
+ def _extract_filename(metadata: dict) -> str:
32
+ for key in _FILENAME_KEYS:
33
+ val = metadata.get(key)
34
+ if val:
35
+ return os.path.basename(str(val))
36
+ return "unknown"
37
+
38
 
39
class HybridRetriever:
    """
    Hybrid dense + sparse retriever with RRF fusion.

    Degrades to vector-only search when BM25 is unavailable.

    Usage:
        retriever = HybridRetriever()
        docs = retriever.invoke("What is X?")
    """

    def __init__(self):
        index_missing = not os.path.exists(INDEX_DIR) or not os.listdir(INDEX_DIR)
        if index_missing:
            raise RuntimeError(
                "No index found. Upload and index PDFs first."
            )

        # configure_embedding() is idempotent — safe to call every time
        configure_embedding()

        storage = StorageContext.from_defaults(persist_dir=INDEX_DIR)
        self.index = load_index_from_storage(storage)
        logger.info("Index loaded from storage")

        self.vector = get_vector_retriever(self.index)
        self.bm25 = get_bm25_retriever(self.index)  # may be None

        if self.bm25 is None:
            logger.warning(
                "Running in vector-only mode. "
                "Add rank-bm25>=0.2.2 to requirements.txt for hybrid search."
            )

    def invoke(self, query: str) -> list[Document]:
        """
        Retrieve documents for *query* using hybrid search (or vector-only).

        Returns a list of LangChain Document objects.
        """
        # Dense retrieval — if this fails there is nothing to return.
        try:
            dense_hits = self.vector.retrieve(query)
        except Exception as exc:
            logger.error(f"Vector retrieval error: {exc}")
            return []

        fused = dense_hits
        # Sparse retrieval + RRF fusion, only when BM25 is available.
        if self.bm25 is not None:
            try:
                sparse_hits = self.bm25.retrieve(query)
                fused = reciprocal_rank_fusion([dense_hits, sparse_hits])
            except Exception as exc:
                logger.warning(f"BM25 retrieval error ({exc}) — using vector only")
                fused = dense_hits

        documents = []
        for hit in fused:
            source_meta = hit.node.metadata or {}
            # Normalise the source filename so citations never disappear
            enriched = dict(source_meta)
            enriched["file_name"] = _extract_filename(source_meta)
            documents.append(
                Document(page_content=hit.node.text, metadata=enriched)
            )

        logger.info(f"Retrieved {len(documents)} doc(s) for: '{query[:80]}'")
        return documents

    # Alias for callers that use .retrieve()
    retrieve = invoke