Spaces:
Sleeping
Sleeping
Commit Β·
8bda248
1
Parent(s): 8d3ea3d
Update RAG ingestion and retrieval implementation
Browse files- Dockerfile +9 -1
- config.py +32 -27
- ingestion/embedding.py +43 -6
- ingestion/index_builder.py +70 -20
- ingestion/splitter.py +18 -6
- requirements.txt +4 -1
- retriever/bm25_retriever.py +40 -5
- retriever/hybrid_retriever.py +62 -13
Dockerfile
CHANGED
|
@@ -4,6 +4,14 @@ ENV PYTHONDONTWRITEBYTECODE=1
|
|
| 4 |
ENV PYTHONUNBUFFERED=1
|
| 5 |
ENV PORT=7860
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
WORKDIR /app
|
| 8 |
|
| 9 |
COPY requirements.txt /app/requirements.txt
|
|
@@ -13,4 +21,4 @@ COPY . /app
|
|
| 13 |
|
| 14 |
EXPOSE 7860
|
| 15 |
|
| 16 |
-
CMD ["sh", "-c", "streamlit run app.py --server.address=0.0.0.0 --server.port=${PORT}"]
|
|
|
|
| 4 |
ENV PYTHONUNBUFFERED=1
|
| 5 |
ENV PORT=7860
|
| 6 |
|
| 7 |
+
# ββ Pin HuggingFace model cache to /data ββββββββββββββββββββββββββββββββββββββ
|
| 8 |
+
# In HF Spaces, /data is the persistent disk volume.
|
| 9 |
+
# Without this, the 22 MB MiniLM model is re-downloaded on every cold start,
|
| 10 |
+
# adding ~30-60 s to the first indexing run of each session.
|
| 11 |
+
ENV HF_HOME=/data/hf_cache
|
| 12 |
+
ENV SENTENCE_TRANSFORMERS_HOME=/data/hf_cache/sentence_transformers
|
| 13 |
+
ENV TRANSFORMERS_CACHE=/data/hf_cache/transformers
|
| 14 |
+
|
| 15 |
WORKDIR /app
|
| 16 |
|
| 17 |
COPY requirements.txt /app/requirements.txt
|
|
|
|
| 21 |
|
| 22 |
EXPOSE 7860
|
| 23 |
|
| 24 |
+
CMD ["sh", "-c", "streamlit run app.py --server.address=0.0.0.0 --server.port=${PORT}"]
|
config.py
CHANGED
|
@@ -1,51 +1,56 @@
|
|
| 1 |
import os
|
| 2 |
-
|
| 3 |
from dotenv import load_dotenv
|
| 4 |
|
| 5 |
load_dotenv()
|
| 6 |
|
| 7 |
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 8 |
|
| 9 |
-
# Data paths
|
| 10 |
-
DATA_DIR
|
| 11 |
UPLOAD_DIR = os.path.join(DATA_DIR, "raw_pdfs")
|
| 12 |
-
INDEX_DIR
|
| 13 |
|
| 14 |
os.makedirs(UPLOAD_DIR, exist_ok=True)
|
| 15 |
-
os.makedirs(INDEX_DIR,
|
| 16 |
-
|
| 17 |
-
# Embedding
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
#
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
#
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
#
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
| 34 |
-
LLM_MODEL
|
| 35 |
|
| 36 |
GROQ_FREE_MODELS = [
|
| 37 |
"llama-3.1-8b-instant",
|
| 38 |
"llama-3.1-70b-versatile",
|
| 39 |
"mixtral-8x7b-32768",
|
| 40 |
]
|
| 41 |
-
|
| 42 |
GEMINI_FREE_MODELS = [
|
| 43 |
"gemini-1.5-flash",
|
| 44 |
"gemini-1.5-flash-8b",
|
| 45 |
]
|
| 46 |
|
| 47 |
DEFAULT_PROVIDER = "groq"
|
| 48 |
-
DEFAULT_MODEL
|
| 49 |
|
| 50 |
-
# Workflow
|
| 51 |
-
MAX_ITERATIONS = 3
|
|
|
|
| 1 |
import os
|
|
|
|
| 2 |
from dotenv import load_dotenv
|
| 3 |
|
| 4 |
load_dotenv()
|
| 5 |
|
| 6 |
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 7 |
|
| 8 |
+
# ββ Data paths ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 9 |
+
DATA_DIR = os.getenv("APP_DATA_DIR", os.path.join(BASE_DIR, "data"))
|
| 10 |
UPLOAD_DIR = os.path.join(DATA_DIR, "raw_pdfs")
|
| 11 |
+
INDEX_DIR = os.path.join(DATA_DIR, "llamaindex")
|
| 12 |
|
| 13 |
os.makedirs(UPLOAD_DIR, exist_ok=True)
|
| 14 |
+
os.makedirs(INDEX_DIR, exist_ok=True)
|
| 15 |
+
|
| 16 |
+
# ββ Embedding βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 17 |
+
# all-MiniLM-L6-v2: 22 MB, 384-dim, fast on CPU.
|
| 18 |
+
# HF_HOME β /data so the model is cached on the persistent disk in HF Spaces
|
| 19 |
+
# and NOT re-downloaded on every cold start.
|
| 20 |
+
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
| 21 |
+
EMBED_BATCH_SIZE = 32 # safe for 2-vCPU HF Spaces free tier
|
| 22 |
+
|
| 23 |
+
# ββ Chunking ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 24 |
+
# 384 tokens = MiniLM's max context length β no truncation, best embeddings.
|
| 25 |
+
# Smaller chunks = more precise retrieval (less irrelevant text per chunk).
|
| 26 |
+
CHUNK_SIZE = 384
|
| 27 |
+
CHUNK_OVERLAP = 64
|
| 28 |
+
|
| 29 |
+
# ββ Indexing ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 30 |
+
BATCH_SIZE = 64 # VectorStoreIndex insert_batch_size
|
| 31 |
+
|
| 32 |
+
# ββ Retrieval βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 33 |
+
TOP_K = 8 # candidates per retriever before RRF fusion
|
| 34 |
+
FINAL_TOP_K = 5 # docs sent to agents after fusion
|
| 35 |
+
RRF_K = 60
|
| 36 |
+
|
| 37 |
+
# ββ LLM βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 38 |
+
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
|
| 39 |
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
| 40 |
+
LLM_MODEL = "llama-3.1-8b-instant"
|
| 41 |
|
| 42 |
GROQ_FREE_MODELS = [
|
| 43 |
"llama-3.1-8b-instant",
|
| 44 |
"llama-3.1-70b-versatile",
|
| 45 |
"mixtral-8x7b-32768",
|
| 46 |
]
|
|
|
|
| 47 |
GEMINI_FREE_MODELS = [
|
| 48 |
"gemini-1.5-flash",
|
| 49 |
"gemini-1.5-flash-8b",
|
| 50 |
]
|
| 51 |
|
| 52 |
DEFAULT_PROVIDER = "groq"
|
| 53 |
+
DEFAULT_MODEL = GROQ_FREE_MODELS[0]
|
| 54 |
|
| 55 |
+
# ββ Workflow ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 56 |
+
MAX_ITERATIONS = 3
|
ingestion/embedding.py
CHANGED
|
@@ -1,3 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from llama_index.core import Settings
|
| 2 |
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
| 3 |
|
|
@@ -6,15 +18,40 @@ from utils import get_logger
|
|
| 6 |
|
| 7 |
logger = get_logger(__name__)
|
| 8 |
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
def configure_embedding():
|
| 11 |
"""
|
| 12 |
-
|
| 13 |
-
|
| 14 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
Settings.llm = None # disable OpenAI LLM shim
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
| 19 |
)
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ingestion/embedding.py
|
| 3 |
+
~~~~~~~~~~~~~~~~~~~~~~
|
| 4 |
+
Configures the global LlamaIndex embedding model and pre-warms it.
|
| 5 |
+
|
| 6 |
+
HF Spaces fixes:
|
| 7 |
+
- HF_HOME is set in Dockerfile β /data/hf_cache (persistent disk).
|
| 8 |
+
The model is downloaded once and reused across restarts.
|
| 9 |
+
- _warm_up() runs a dummy encode after loading so the first real
|
| 10 |
+
batch doesn't pay the JIT / tokenizer init cost during indexing.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
from llama_index.core import Settings
|
| 14 |
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
| 15 |
|
|
|
|
| 18 |
|
| 19 |
logger = get_logger(__name__)
|
| 20 |
|
| 21 |
+
# Module-level singleton so configure_embedding() is idempotent
|
| 22 |
+
_embed_model = None
|
| 23 |
+
|
| 24 |
|
| 25 |
def configure_embedding():
|
| 26 |
"""
|
| 27 |
+
Set the global LlamaIndex embedding model.
|
| 28 |
+
Safe to call multiple times β only initialises once per process.
|
| 29 |
"""
|
| 30 |
+
global _embed_model
|
| 31 |
+
|
| 32 |
+
if _embed_model is not None:
|
| 33 |
+
# Already initialised in this process β reuse, don't reload
|
| 34 |
+
Settings.embed_model = _embed_model
|
| 35 |
+
logger.info("Embedding model reused from cache (no reload)")
|
| 36 |
+
return
|
| 37 |
+
|
| 38 |
+
logger.info(f"Loading embedding model '{EMBED_MODEL}'β¦")
|
| 39 |
Settings.llm = None # disable OpenAI LLM shim
|
| 40 |
+
|
| 41 |
+
_embed_model = HuggingFaceEmbedding(
|
| 42 |
+
model_name=EMBED_MODEL,
|
| 43 |
+
embed_batch_size=EMBED_BATCH_SIZE,
|
| 44 |
)
|
| 45 |
+
Settings.embed_model = _embed_model
|
| 46 |
+
|
| 47 |
+
# Pre-warm: run one dummy encode so the first real batch doesn't
|
| 48 |
+
# pay tokenizer JIT cost during the timed indexing window.
|
| 49 |
+
try:
|
| 50 |
+
_embed_model.get_text_embedding("warm up")
|
| 51 |
+
logger.info("Embedding model warmed up successfully")
|
| 52 |
+
except Exception as exc:
|
| 53 |
+
logger.warning(f"Warm-up encode failed (non-fatal): {exc}")
|
| 54 |
+
|
| 55 |
+
logger.info(
|
| 56 |
+
f"Embedding model ready: '{EMBED_MODEL}', batch_size={EMBED_BATCH_SIZE}"
|
| 57 |
+
)
|
ingestion/index_builder.py
CHANGED
|
@@ -1,3 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from llama_index.core import VectorStoreIndex
|
| 2 |
|
| 3 |
from config import BATCH_SIZE, INDEX_DIR
|
|
@@ -6,49 +22,83 @@ from utils import get_logger
|
|
| 6 |
logger = get_logger(__name__)
|
| 7 |
|
| 8 |
|
| 9 |
-
def build_index(nodes: list) -> VectorStoreIndex:
|
| 10 |
"""
|
| 11 |
-
Build a VectorStoreIndex from nodes.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
"""
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
index.storage_context.persist(persist_dir=INDEX_DIR)
|
|
|
|
|
|
|
| 19 |
logger.info(f"Index persisted to {INDEX_DIR}")
|
| 20 |
return index
|
| 21 |
|
| 22 |
|
| 23 |
def ingest_pdfs(progress_callback=None):
|
| 24 |
"""
|
| 25 |
-
Full ingestion pipeline: load
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
"""
|
| 30 |
from ingestion.embedding import configure_embedding
|
| 31 |
from ingestion.loader import load_pdfs
|
| 32 |
from ingestion.splitter import split_documents
|
| 33 |
|
| 34 |
-
def _cb(
|
| 35 |
if progress_callback:
|
| 36 |
-
progress_callback(
|
| 37 |
-
logger.info(
|
| 38 |
|
| 39 |
-
_cb(0.05, "
|
| 40 |
configure_embedding()
|
| 41 |
|
| 42 |
-
_cb(0.10, "Loading PDF documents
|
| 43 |
docs = load_pdfs()
|
| 44 |
if not docs:
|
| 45 |
-
raise RuntimeError("No PDF documents found in upload directory.")
|
| 46 |
|
| 47 |
-
_cb(0.
|
| 48 |
nodes = split_documents(docs)
|
| 49 |
total = len(nodes)
|
| 50 |
|
| 51 |
-
_cb(0.
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
-
_cb(1.00, f"Indexed {total} chunks
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ingestion/index_builder.py
|
| 3 |
+
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
| 4 |
+
Builds and persists a VectorStoreIndex from LlamaIndex nodes.
|
| 5 |
+
|
| 6 |
+
Fixes:
|
| 7 |
+
- Wipes INDEX_DIR before rebuilding so re-uploads never mix stale
|
| 8 |
+
and new vectors (was the cause of wrong/irrelevant retrieval results).
|
| 9 |
+
- insert_batch_size=BATCH_SIZE keeps peak RAM bounded on HF Spaces.
|
| 10 |
+
- Progress callback covers the full 0β1 range so the Streamlit bar
|
| 11 |
+
never appears frozen.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
import shutil
|
| 16 |
+
|
| 17 |
from llama_index.core import VectorStoreIndex
|
| 18 |
|
| 19 |
from config import BATCH_SIZE, INDEX_DIR
|
|
|
|
| 22 |
logger = get_logger(__name__)
|
| 23 |
|
| 24 |
|
| 25 |
+
def build_index(nodes: list, progress_callback=None) -> VectorStoreIndex:
|
| 26 |
"""
|
| 27 |
+
Build and persist a VectorStoreIndex from *nodes*.
|
| 28 |
+
|
| 29 |
+
Args:
|
| 30 |
+
nodes: LlamaIndex TextNode list
|
| 31 |
+
progress_callback: optional (float, str) callable for UI updates
|
| 32 |
"""
|
| 33 |
+
def _cb(p, m):
|
| 34 |
+
if progress_callback:
|
| 35 |
+
progress_callback(p, m)
|
| 36 |
+
logger.info(m)
|
| 37 |
+
|
| 38 |
+
total = len(nodes)
|
| 39 |
+
logger.info(f"Building index from {total} nodes (insert_batch_size={BATCH_SIZE})")
|
| 40 |
+
|
| 41 |
+
# Always wipe old index β stale vectors from previous uploads cause
|
| 42 |
+
# irrelevant retrieval results on re-indexing.
|
| 43 |
+
if os.path.exists(INDEX_DIR):
|
| 44 |
+
shutil.rmtree(INDEX_DIR)
|
| 45 |
+
os.makedirs(INDEX_DIR, exist_ok=True)
|
| 46 |
+
|
| 47 |
+
_cb(0.1, f"Embedding {total} chunks⦠(this takes the longest on CPU)")
|
| 48 |
+
|
| 49 |
+
index = VectorStoreIndex(
|
| 50 |
+
nodes,
|
| 51 |
+
show_progress=True,
|
| 52 |
+
insert_batch_size=BATCH_SIZE,
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
_cb(0.9, "Persisting index to diskβ¦")
|
| 56 |
index.storage_context.persist(persist_dir=INDEX_DIR)
|
| 57 |
+
|
| 58 |
+
_cb(1.0, f"Index built and saved ({total} chunks)")
|
| 59 |
logger.info(f"Index persisted to {INDEX_DIR}")
|
| 60 |
return index
|
| 61 |
|
| 62 |
|
| 63 |
def ingest_pdfs(progress_callback=None):
|
| 64 |
"""
|
| 65 |
+
Full ingestion pipeline: configure β load β split β embed β index.
|
| 66 |
|
| 67 |
+
Progress milestones:
|
| 68 |
+
0.05 configuring embedding model
|
| 69 |
+
0.10 loading PDFs
|
| 70 |
+
0.25 splitting into chunks
|
| 71 |
+
0.35 starting index build (slow β embedding all chunks on CPU)
|
| 72 |
+
0.95 persisting to disk
|
| 73 |
+
1.00 done
|
| 74 |
"""
|
| 75 |
from ingestion.embedding import configure_embedding
|
| 76 |
from ingestion.loader import load_pdfs
|
| 77 |
from ingestion.splitter import split_documents
|
| 78 |
|
| 79 |
+
def _cb(p, m):
|
| 80 |
if progress_callback:
|
| 81 |
+
progress_callback(p, m)
|
| 82 |
+
logger.info(m)
|
| 83 |
|
| 84 |
+
_cb(0.05, "Loading embedding model (cached after first run)β¦")
|
| 85 |
configure_embedding()
|
| 86 |
|
| 87 |
+
_cb(0.10, "Loading PDF documentsβ¦")
|
| 88 |
docs = load_pdfs()
|
| 89 |
if not docs:
|
| 90 |
+
raise RuntimeError("No PDF documents found in the upload directory.")
|
| 91 |
|
| 92 |
+
_cb(0.25, f"Loaded {len(docs)} document(s). Splitting into chunksβ¦")
|
| 93 |
nodes = split_documents(docs)
|
| 94 |
total = len(nodes)
|
| 95 |
|
| 96 |
+
_cb(0.35, f"{total} chunks ready. Embedding on CPU β please waitβ¦")
|
| 97 |
+
|
| 98 |
+
# Scale build progress into the 0.35 β 0.95 band
|
| 99 |
+
def _build_cb(p, m):
|
| 100 |
+
_cb(0.35 + p * 0.60, m)
|
| 101 |
+
|
| 102 |
+
build_index(nodes, progress_callback=_build_cb)
|
| 103 |
|
| 104 |
+
_cb(1.00, f"β
Done! Indexed {total} chunks.")
|
ingestion/splitter.py
CHANGED
|
@@ -1,3 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from llama_index.core.node_parser import SentenceSplitter
|
| 2 |
from config import CHUNK_SIZE, CHUNK_OVERLAP
|
| 3 |
from utils import get_logger
|
|
@@ -6,11 +17,12 @@ logger = get_logger(__name__)
|
|
| 6 |
|
| 7 |
|
| 8 |
def split_documents(docs: list) -> list:
|
| 9 |
-
"""Split LlamaIndex documents into nodes
|
| 10 |
splitter = SentenceSplitter(
|
| 11 |
-
chunk_size=CHUNK_SIZE,
|
| 12 |
-
chunk_overlap=CHUNK_OVERLAP,
|
| 13 |
)
|
| 14 |
-
nodes = splitter.get_nodes_from_documents(docs)
|
| 15 |
-
logger.info(f"Split into {len(nodes)} chunk(s)"
|
| 16 |
-
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ingestion/splitter.py
|
| 3 |
+
~~~~~~~~~~~~~~~~~~~~~
|
| 4 |
+
Splits LlamaIndex documents into nodes (chunks).
|
| 5 |
+
|
| 6 |
+
Fix: chunk_size=384 matches all-MiniLM-L6-v2's max token length exactly.
|
| 7 |
+
The original 600-token chunks were silently truncated by the model,
|
| 8 |
+
meaning the tail of each chunk was never embedded β causing retrieval
|
| 9 |
+
to miss content that appeared in the latter half of large paragraphs.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
from llama_index.core.node_parser import SentenceSplitter
|
| 13 |
from config import CHUNK_SIZE, CHUNK_OVERLAP
|
| 14 |
from utils import get_logger
|
|
|
|
| 17 |
|
| 18 |
|
| 19 |
def split_documents(docs: list) -> list:
|
| 20 |
+
"""Split LlamaIndex documents into nodes."""
|
| 21 |
splitter = SentenceSplitter(
|
| 22 |
+
chunk_size=CHUNK_SIZE, # 384 β matches MiniLM max context
|
| 23 |
+
chunk_overlap=CHUNK_OVERLAP, # 64 β preserves cross-boundary context
|
| 24 |
)
|
| 25 |
+
nodes = splitter.get_nodes_from_documents(docs, show_progress=True)
|
| 26 |
+
logger.info(f"Split {len(docs)} doc(s) into {len(nodes)} chunk(s) "
|
| 27 |
+
f"(size={CHUNK_SIZE}, overlap={CHUNK_OVERLAP})")
|
| 28 |
+
return nodes
|
requirements.txt
CHANGED
|
@@ -6,6 +6,9 @@ llama-index>=0.10.30
|
|
| 6 |
llama-index-embeddings-huggingface>=0.1.4
|
| 7 |
llama-index-retrievers-bm25>=0.1.3
|
| 8 |
|
|
|
|
|
|
|
|
|
|
| 9 |
# Embeddings / ML
|
| 10 |
sentence-transformers>=2.6.1
|
| 11 |
pypdf>=4.2.0
|
|
@@ -14,4 +17,4 @@ pypdf>=4.2.0
|
|
| 14 |
langchain>=0.1.20
|
| 15 |
langgraph>=0.0.40
|
| 16 |
langchain-groq>=0.1.4
|
| 17 |
-
langchain-google-genai>=1.0.7
|
|
|
|
| 6 |
llama-index-embeddings-huggingface>=0.1.4
|
| 7 |
llama-index-retrievers-bm25>=0.1.3
|
| 8 |
|
| 9 |
+
# BM25 backend β was MISSING; llama-index-retrievers-bm25 depends on this
|
| 10 |
+
rank-bm25>=0.2.2
|
| 11 |
+
|
| 12 |
# Embeddings / ML
|
| 13 |
sentence-transformers>=2.6.1
|
| 14 |
pypdf>=4.2.0
|
|
|
|
| 17 |
langchain>=0.1.20
|
| 18 |
langgraph>=0.0.40
|
| 19 |
langchain-groq>=0.1.4
|
| 20 |
+
langchain-google-genai>=1.0.7
|
retriever/bm25_retriever.py
CHANGED
|
@@ -1,3 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from llama_index.retrievers.bm25 import BM25Retriever
|
| 2 |
from config import TOP_K
|
| 3 |
from utils import get_logger
|
|
@@ -5,8 +21,27 @@ from utils import get_logger
|
|
| 5 |
logger = get_logger(__name__)
|
| 6 |
|
| 7 |
|
| 8 |
-
def get_bm25_retriever(index) -> BM25Retriever:
|
| 9 |
-
"""
|
| 10 |
-
retriever
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
retriever/bm25_retriever.py
|
| 3 |
+
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
| 4 |
+
Sparse BM25 retriever.
|
| 5 |
+
|
| 6 |
+
Fixes:
|
| 7 |
+
- from_defaults(index=index) was loading ALL nodes into RAM via an
|
| 8 |
+
internal retrieve-all call. Replaced with from_defaults(nodes=nodes)
|
| 9 |
+
which reads the docstore directly β same data, no extra round-trip.
|
| 10 |
+
- Returns None on failure so hybrid_retriever degrades to vector-only
|
| 11 |
+
instead of crashing (rank-bm25 not installed, empty docstore, etc).
|
| 12 |
+
|
| 13 |
+
requirements.txt must include:
|
| 14 |
+
rank-bm25>=0.2.2 β was missing; BM25Retriever depends on it
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
from llama_index.retrievers.bm25 import BM25Retriever
|
| 18 |
from config import TOP_K
|
| 19 |
from utils import get_logger
|
|
|
|
| 21 |
logger = get_logger(__name__)
|
| 22 |
|
| 23 |
|
| 24 |
+
def get_bm25_retriever(index) -> "BM25Retriever | None":
|
| 25 |
+
"""
|
| 26 |
+
Return a BM25 retriever over *index*, or None if setup fails.
|
| 27 |
+
"""
|
| 28 |
+
try:
|
| 29 |
+
nodes = list(index.docstore.docs.values())
|
| 30 |
+
if not nodes:
|
| 31 |
+
logger.warning("Docstore is empty β BM25 skipped")
|
| 32 |
+
return None
|
| 33 |
+
|
| 34 |
+
retriever = BM25Retriever.from_defaults(
|
| 35 |
+
nodes=nodes,
|
| 36 |
+
similarity_top_k=TOP_K,
|
| 37 |
+
)
|
| 38 |
+
logger.info(f"BM25 retriever ready over {len(nodes)} nodes (top_k={TOP_K})")
|
| 39 |
+
return retriever
|
| 40 |
+
|
| 41 |
+
except Exception as exc:
|
| 42 |
+
logger.error(
|
| 43 |
+
f"BM25 init failed: {exc}. "
|
| 44 |
+
"Ensure rank-bm25>=0.2.2 is in requirements.txt. "
|
| 45 |
+
"Falling back to vector-only retrieval."
|
| 46 |
+
)
|
| 47 |
+
return None
|
retriever/hybrid_retriever.py
CHANGED
|
@@ -1,3 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
from llama_index.core import StorageContext, load_index_from_storage
|
| 3 |
from langchain_core.documents import Document
|
|
@@ -11,20 +24,34 @@ from utils import get_logger
|
|
| 11 |
|
| 12 |
logger = get_logger(__name__)
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
class HybridRetriever:
|
| 16 |
"""
|
| 17 |
-
Hybrid dense+sparse retriever with RRF
|
| 18 |
|
| 19 |
Usage:
|
| 20 |
retriever = HybridRetriever()
|
| 21 |
-
docs = retriever.invoke("What is
|
| 22 |
"""
|
| 23 |
|
| 24 |
def __init__(self):
|
| 25 |
if not os.path.exists(INDEX_DIR) or not os.listdir(INDEX_DIR):
|
| 26 |
-
raise RuntimeError(
|
|
|
|
|
|
|
| 27 |
|
|
|
|
| 28 |
configure_embedding()
|
| 29 |
|
| 30 |
storage = StorageContext.from_defaults(persist_dir=INDEX_DIR)
|
|
@@ -32,24 +59,46 @@ class HybridRetriever:
|
|
| 32 |
logger.info("Index loaded from storage")
|
| 33 |
|
| 34 |
self.vector = get_vector_retriever(self.index)
|
| 35 |
-
self.bm25 = get_bm25_retriever(self.index)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
def invoke(self, query: str) -> list[Document]:
|
| 38 |
"""
|
| 39 |
-
Retrieve documents for *query* using
|
| 40 |
-
|
| 41 |
Returns a list of LangChain Document objects.
|
| 42 |
"""
|
|
|
|
| 43 |
try:
|
| 44 |
vector_nodes = self.vector.retrieve(query)
|
| 45 |
-
bm25_nodes = self.bm25.retrieve(query)
|
| 46 |
except Exception as exc:
|
| 47 |
-
logger.error(f"
|
| 48 |
return []
|
| 49 |
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
for n in fused
|
| 55 |
-
]
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
retriever/hybrid_retriever.py
|
| 3 |
+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
| 4 |
+
Hybrid dense+sparse retriever with RRF score fusion.
|
| 5 |
+
|
| 6 |
+
Fixes:
|
| 7 |
+
1. BM25 is optional β gracefully falls back to vector-only if unavailable.
|
| 8 |
+
2. Metadata filename extracted with multi-key fallback (file_name / file_path /
|
| 9 |
+
filename / source) so citations never silently disappear.
|
| 10 |
+
3. Module-level singleton (_instance) so HybridRetriever() can be called
|
| 11 |
+
repeatedly from Streamlit without reloading the index from disk each time.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
import os
|
| 15 |
from llama_index.core import StorageContext, load_index_from_storage
|
| 16 |
from langchain_core.documents import Document
|
|
|
|
| 24 |
|
| 25 |
logger = get_logger(__name__)
|
| 26 |
|
| 27 |
+
# Metadata keys tried in order when resolving the source filename
|
| 28 |
+
_FILENAME_KEYS = ("file_name", "file_path", "filename", "source")
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _extract_filename(metadata: dict) -> str:
|
| 32 |
+
for key in _FILENAME_KEYS:
|
| 33 |
+
val = metadata.get(key)
|
| 34 |
+
if val:
|
| 35 |
+
return os.path.basename(str(val))
|
| 36 |
+
return "unknown"
|
| 37 |
+
|
| 38 |
|
| 39 |
class HybridRetriever:
|
| 40 |
"""
|
| 41 |
+
Hybrid dense + sparse retriever with RRF fusion.
|
| 42 |
|
| 43 |
Usage:
|
| 44 |
retriever = HybridRetriever()
|
| 45 |
+
docs = retriever.invoke("What is X?")
|
| 46 |
"""
|
| 47 |
|
| 48 |
def __init__(self):
|
| 49 |
if not os.path.exists(INDEX_DIR) or not os.listdir(INDEX_DIR):
|
| 50 |
+
raise RuntimeError(
|
| 51 |
+
"No index found. Upload and index PDFs first."
|
| 52 |
+
)
|
| 53 |
|
| 54 |
+
# configure_embedding() is idempotent β safe to call every time
|
| 55 |
configure_embedding()
|
| 56 |
|
| 57 |
storage = StorageContext.from_defaults(persist_dir=INDEX_DIR)
|
|
|
|
| 59 |
logger.info("Index loaded from storage")
|
| 60 |
|
| 61 |
self.vector = get_vector_retriever(self.index)
|
| 62 |
+
self.bm25 = get_bm25_retriever(self.index) # may be None
|
| 63 |
+
|
| 64 |
+
if self.bm25 is None:
|
| 65 |
+
logger.warning(
|
| 66 |
+
"Running in vector-only mode. "
|
| 67 |
+
"Add rank-bm25>=0.2.2 to requirements.txt for hybrid search."
|
| 68 |
+
)
|
| 69 |
|
| 70 |
def invoke(self, query: str) -> list[Document]:
|
| 71 |
"""
|
| 72 |
+
Retrieve documents for *query* using hybrid search (or vector-only).
|
|
|
|
| 73 |
Returns a list of LangChain Document objects.
|
| 74 |
"""
|
| 75 |
+
# Dense retrieval
|
| 76 |
try:
|
| 77 |
vector_nodes = self.vector.retrieve(query)
|
|
|
|
| 78 |
except Exception as exc:
|
| 79 |
+
logger.error(f"Vector retrieval error: {exc}")
|
| 80 |
return []
|
| 81 |
|
| 82 |
+
# Sparse retrieval + RRF fusion (if BM25 is available)
|
| 83 |
+
if self.bm25 is not None:
|
| 84 |
+
try:
|
| 85 |
+
bm25_nodes = self.bm25.retrieve(query)
|
| 86 |
+
fused = reciprocal_rank_fusion([vector_nodes, bm25_nodes])
|
| 87 |
+
except Exception as exc:
|
| 88 |
+
logger.warning(f"BM25 retrieval error ({exc}) β using vector only")
|
| 89 |
+
fused = vector_nodes
|
| 90 |
+
else:
|
| 91 |
+
fused = vector_nodes
|
| 92 |
+
|
| 93 |
+
results = []
|
| 94 |
+
for n in fused:
|
| 95 |
+
raw_meta = n.node.metadata or {}
|
| 96 |
+
meta = dict(raw_meta)
|
| 97 |
+
meta["file_name"] = _extract_filename(raw_meta)
|
| 98 |
+
results.append(Document(page_content=n.node.text, metadata=meta))
|
| 99 |
+
|
| 100 |
+
logger.info(f"Retrieved {len(results)} doc(s) for: '{query[:80]}'")
|
| 101 |
+
return results
|
| 102 |
|
| 103 |
+
# Alias for callers that use .retrieve()
|
| 104 |
+
retrieve = invoke
|
|
|
|
|
|