#!/usr/bin/env python3
"""Open the persisted Chroma collection, running ingestion first if needed."""
import os
import sys
from pathlib import Path
from typing import Any, Callable, Optional

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma


def import_ingest(project_root: Path) -> Optional[Callable[..., Any]]:
    """Load ``src/ingest_documents.py`` from *project_root* and return its ``main``.

    Args:
        project_root: Directory that contains the ``src`` folder.

    Returns:
        The module's ``main`` attribute, or ``None`` when the file does not
        exist, no import spec/loader can be created for it, or the module
        defines no ``main``.
    """
    import importlib.util

    ingest_path = project_root / "src" / "ingest_documents.py"
    if not ingest_path.exists():
        print(f"ERROR: {ingest_path} not found. Add your ingester there.", file=sys.stderr)
        return None

    spec = importlib.util.spec_from_file_location("ingest_documents", str(ingest_path))
    # spec_from_file_location may return None, and a spec may carry no loader;
    # report that instead of failing with an AttributeError further down.
    if spec is None or spec.loader is None:
        print(f"ERROR: could not create an import spec for {ingest_path}.", file=sys.stderr)
        return None

    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return getattr(mod, "main", None)


def open_db(persist_dir: Path) -> Chroma:
    """Open the ``legal_documents`` Chroma collection stored in *persist_dir*.

    Embeddings are produced by the ``BAAI/bge-small-en`` HuggingFace model,
    configured with ``device="cpu"``.
    """
    embed = HuggingFaceEmbeddings(
        model_name="BAAI/bge-small-en",
        model_kwargs={"device": "cpu"},
    )
    return Chroma(
        persist_directory=str(persist_dir),
        embedding_function=embed,
        collection_name="legal_documents",
    )


def _run_ingest(project_root: Path) -> None:
    """Import the ingester under *project_root* and run it, or raise RuntimeError."""
    ingest_main = import_ingest(project_root)
    if ingest_main is None:
        raise RuntimeError("Cannot import ingester. Ensure src/ingest_documents.py exists.")
    ingest_main()


def _count_documents(vectordb: Chroma) -> int:
    """Return the number of entries in the collection behind *vectordb*."""
    # NOTE(review): `_collection` is a private attribute of the Chroma wrapper;
    # kept because the original code relied on it.
    return vectordb._collection.count()


def get_retriever():
    """Return a retriever over the ``legal_documents`` collection.

    The vector store directory is read from the ``VECTOR_DB_DIR`` environment
    variable (default ``"vector_db"``). If that directory is missing or empty,
    the ingester is run before the store is opened. If the opened collection
    reports zero documents, ingestion is run once more and the store is
    reopened.

    Returns:
        ``vectordb.as_retriever(search_kwargs={"k": 3})``.

    Raises:
        RuntimeError: The ingester's ``main`` could not be imported.
        ValueError: The collection is still empty after the forced rebuild.
    """
    project_root = Path(__file__).resolve().parent
    persist_dir = Path(os.getenv("VECTOR_DB_DIR", "vector_db"))

    needs_build = not persist_dir.exists() or not any(persist_dir.iterdir())
    if needs_build:
        print("⚡ vector_db missing/empty → running ingestion...")
        _run_ingest(project_root)

    vectordb = open_db(persist_dir)
    count = _count_documents(vectordb)
    print(f"Found {count} documents in collection 'legal_documents'")

    if count == 0:
        # The directory existed but held no documents: rebuild once and recheck.
        print("⚠️ Collection empty after first load — forcing rebuild…")
        _run_ingest(project_root)
        vectordb = open_db(persist_dir)
        count = _count_documents(vectordb)
        print(f"[recheck] Found {count} documents in 'legal_documents'")
        if count == 0:
            raise ValueError(
                "Chroma collection is still empty. Likely causes:\n"
                " - No .txt files in ./data/processed or ./src/data/processed\n"
                " - Files committed to a different path than expected\n"
                " - Ingest produced zero chunks (empty content)"
            )

    return vectordb.as_retriever(search_kwargs={"k": 3})