KnowYourRIght-Bot / src /retriever.py
menikev's picture
Update src/retriever.py
0811500 verified
#!/usr/bin/env python3
import os
import sys
from pathlib import Path
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
def import_ingest(project_root: Path):
    """Load ``src/ingest_documents.py`` under *project_root* and return its ``main``.

    The ingester is loaded from its file path with ``importlib.util`` under
    the module name ``ingest_documents``. Its top-level code is executed on
    every call, and any exception raised while executing it propagates to
    the caller.

    Args:
        project_root: Directory containing the ``src`` folder that holds
            ``ingest_documents.py``.

    Returns:
        The loaded module's ``main`` attribute, or ``None`` when the file is
        missing, no import spec/loader can be created for it, or the module
        defines no ``main``. A message is printed to stderr in the first two
        cases.
    """
    import importlib.util

    ingest_path = project_root / "src" / "ingest_documents.py"
    if not ingest_path.exists():
        print(f"ERROR: {ingest_path} not found. Add your ingester there.", file=sys.stderr)
        return None

    spec = importlib.util.spec_from_file_location("ingest_documents", str(ingest_path))
    # spec_from_file_location may return None and a spec may have no loader;
    # report that the same way as a missing file instead of crashing with
    # AttributeError further down.
    if spec is None or spec.loader is None:
        print(f"ERROR: cannot create an import spec for {ingest_path}.", file=sys.stderr)
        return None

    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)  # runs the ingester's top-level code
    return getattr(mod, "main", None)
def open_db(persist_dir: Path):
    """Open the ``legal_documents`` Chroma collection stored in *persist_dir*.

    Args:
        persist_dir: Directory handed to Chroma as its persist directory.

    Returns:
        A ``Chroma`` instance whose embedding function is a
        ``HuggingFaceEmbeddings`` model (``BAAI/bge-small-en``) configured
        with ``device="cpu"``.
    """
    # Build the embedding model first; it is passed to Chroma below.
    embedding_model = HuggingFaceEmbeddings(
        model_name="BAAI/bge-small-en",
        model_kwargs={"device": "cpu"},
    )
    store_options = {
        "persist_directory": str(persist_dir),
        "embedding_function": embedding_model,
        "collection_name": "legal_documents",
    }
    return Chroma(**store_options)
def _find_project_root(module_dir: Path) -> Path:
    """Return the directory under which ``src/ingest_documents.py`` exists.

    *module_dir* itself is tried first (the original lookup), then its
    parent. The parent matters when this module is stored inside ``src/``:
    in that case ``module_dir / "src" / "ingest_documents.py"`` would point
    at ``src/src/ingest_documents.py``. Falls back to *module_dir* when
    neither candidate has the file, so the caller still gets the usual
    "not found" error.
    """
    for candidate in (module_dir, module_dir.parent):
        if (candidate / "src" / "ingest_documents.py").exists():
            return candidate
    return module_dir


def _run_ingestion(project_root: Path) -> None:
    """Import the ingester's ``main`` and call it; raise if it cannot be loaded."""
    ingest_main = import_ingest(project_root)
    if ingest_main is None:
        raise RuntimeError("Cannot import ingester. Ensure src/ingest_documents.py exists.")
    ingest_main()


def get_retriever():
    """Return a retriever over the ``legal_documents`` collection.

    The vector store directory is taken from the ``VECTOR_DB_DIR``
    environment variable (default ``vector_db``; a relative value is
    resolved against the current working directory). If that directory is
    missing or empty, ingestion runs first. If the opened collection then
    reports zero documents, ingestion is run once more and the store is
    reopened.

    Returns:
        ``vectordb.as_retriever(search_kwargs={"k": 3})`` for the opened store.

    Raises:
        RuntimeError: The ingester's ``main`` could not be imported.
        ValueError: The collection is still empty after the forced rebuild.
    """
    project_root = _find_project_root(Path(__file__).resolve().parent)
    persist_dir = Path(os.getenv("VECTOR_DB_DIR", "vector_db"))

    needs_build = not persist_dir.exists() or not any(persist_dir.iterdir())
    if needs_build:
        print("⚡ vector_db missing/empty → running ingestion...")
        _run_ingestion(project_root)

    vectordb = open_db(persist_dir)
    # NOTE(review): relies on the private ``_collection`` attribute of the
    # Chroma wrapper to get a document count.
    count = vectordb._collection.count()
    print(f"Found {count} documents in collection 'legal_documents'")

    if count == 0:
        # A non-empty directory can still hold an empty collection; rebuild
        # once and reopen before giving up.
        print("⚠️ Collection empty after first load — forcing rebuild…")
        _run_ingestion(project_root)
        vectordb = open_db(persist_dir)
        count = vectordb._collection.count()
        print(f"[recheck] Found {count} documents in 'legal_documents'")
        if count == 0:
            raise ValueError(
                "Chroma collection is still empty. Likely causes:\n"
                " - No .txt files in ./data/processed or ./src/data/processed\n"
                " - Files committed to a different path than expected\n"
                " - Ingest produced zero chunks (empty content)"
            )

    return vectordb.as_retriever(search_kwargs={"k": 3})