#!/usr/bin/env python3
import os
import sys
from pathlib import Path
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
def import_ingest(project_root: Path):
    """Load ``src/ingest_documents.py`` under *project_root* and return its ``main``.

    The script is loaded directly from its file path under the module name
    ``ingest_documents``; it does not have to be importable via ``sys.path``.

    Args:
        project_root: Directory that contains the ``src`` folder.

    Returns:
        The ``main`` callable defined by the ingester, or ``None`` when the
        script is missing (or is not a regular file), no import spec can be
        built for it, or it defines no callable ``main``. A diagnostic is
        written to stderr for each failure except a module that simply has
        no ``main`` attribute.

    Raises:
        Exception: Whatever the ingester module raises while it is executed
            at import time is propagated unchanged.
    """
    import importlib.util

    ingest_path = project_root / "src" / "ingest_documents.py"
    # is_file() instead of exists(): a directory with this name would pass
    # exists() and then fail inside exec_module with an unrelated OSError.
    if not ingest_path.is_file():
        print(f"ERROR: {ingest_path} not found. Add your ingester there.", file=sys.stderr)
        return None

    spec = importlib.util.spec_from_file_location("ingest_documents", str(ingest_path))
    # spec_from_file_location may return None, and a spec may carry no loader.
    if spec is None or spec.loader is None:
        print(f"ERROR: could not build an import spec for {ingest_path}.", file=sys.stderr)
        return None

    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)

    entry_point = getattr(mod, "main", None)
    # Callers invoke the result directly, so never hand back a non-callable.
    if entry_point is not None and not callable(entry_point):
        print(f"ERROR: 'main' in {ingest_path} is not callable.", file=sys.stderr)
        return None
    return entry_point
def open_db(persist_dir: Path):
    """Open the ``legal_documents`` Chroma collection persisted at *persist_dir*.

    Args:
        persist_dir: Directory passed to Chroma as its persistence location.

    Returns:
        A ``Chroma`` instance configured with a ``BAAI/bge-small-en``
        HuggingFace embedding function requested on the CPU device.
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name="BAAI/bge-small-en",
        model_kwargs={"device": "cpu"},
    )
    store_options = {
        "persist_directory": str(persist_dir),
        "embedding_function": embedding_model,
        "collection_name": "legal_documents",
    }
    return Chroma(**store_options)
def _run_ingestion(project_root: Path) -> None:
    """Import the project's ingester via ``import_ingest`` and execute it.

    Raises:
        RuntimeError: If ``import_ingest`` returns ``None``.
    """
    entry_point = import_ingest(project_root)
    if entry_point is None:
        raise RuntimeError("Cannot import ingester. Ensure src/ingest_documents.py exists.")
    entry_point()


def get_retriever():
    """Return a top-3 retriever over the ``legal_documents`` collection.

    The vector store directory comes from the ``VECTOR_DB_DIR`` environment
    variable (default ``vector_db``). Ingestion runs when that directory is
    missing or empty, and runs again if the opened collection reports zero
    documents.

    Returns:
        The retriever produced by ``as_retriever(search_kwargs={"k": 3})``.

    Raises:
        RuntimeError: If the ingester cannot be imported.
        ValueError: If the collection is still empty after the forced rebuild.
    """
    root = Path(__file__).resolve().parent
    db_dir = Path(os.getenv("VECTOR_DB_DIR", "vector_db"))

    # Short-circuits exactly like the original: iterdir() is only touched
    # when the directory exists.
    has_content = db_dir.exists() and any(db_dir.iterdir())
    if not has_content:
        print("⚡ vector_db missing/empty → running ingestion...")
        _run_ingestion(root)

    store = open_db(db_dir)
    doc_total = store._collection.count()
    print(f"Found {doc_total} documents in collection 'legal_documents'")

    if doc_total == 0:
        print("⚠️ Collection empty after first load — forcing rebuild…")
        _run_ingestion(root)

        # Re-open the store so the count reflects the rebuild.
        store = open_db(db_dir)
        doc_total = store._collection.count()
        print(f"[recheck] Found {doc_total} documents in 'legal_documents'")

        if doc_total == 0:
            raise ValueError(
                "Chroma collection is still empty. Likely causes:\n"
                " - No .txt files in ./data/processed or ./src/data/processed\n"
                " - Files committed to a different path than expected\n"
                " - Ingest produced zero chunks (empty content)"
            )

    return store.as_retriever(search_kwargs={"k": 3})