Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| import os | |
| import sys | |
| from pathlib import Path | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from langchain_chroma import Chroma | |
def import_ingest(project_root: Path):
    """Load ``src/ingest_documents.py`` under *project_root* and return its ``main``.

    The module is loaded directly from its file path, so ``src`` does not need
    to be an importable package.

    Args:
        project_root: Directory that contains the ``src`` folder.

    Returns:
        The module's callable ``main`` attribute, or ``None`` when the file is
        missing, cannot be loaded as a module, or does not define a callable
        ``main``. Each ``None`` case is reported on stderr.

    Exceptions raised while executing the ingester module itself propagate
    to the caller.
    """
    import importlib.util

    ingest_path = project_root / "src" / "ingest_documents.py"
    if not ingest_path.exists():
        print(f"ERROR: {ingest_path} not found. Add your ingester there.", file=sys.stderr)
        return None

    spec = importlib.util.spec_from_file_location("ingest_documents", str(ingest_path))
    # spec_from_file_location may return None, and a spec may carry no loader;
    # guard both instead of failing with an AttributeError further down.
    if spec is None or spec.loader is None:
        print(f"ERROR: cannot load {ingest_path} as a Python module.", file=sys.stderr)
        return None

    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)

    entry_point = getattr(mod, "main", None)
    if not callable(entry_point):
        # Callers invoke the result directly, so only hand back something callable.
        print(f"ERROR: {ingest_path} does not define a callable main().", file=sys.stderr)
        return None
    return entry_point
def open_db(persist_dir: Path):
    """Open the ``legal_documents`` Chroma collection stored at *persist_dir*.

    Embeddings are produced by the ``BAAI/bge-small-en`` HuggingFace model,
    configured to run on the CPU.

    Args:
        persist_dir: Directory passed to Chroma as its persistence location.

    Returns:
        The ``Chroma`` vector store instance.
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name="BAAI/bge-small-en",
        model_kwargs={"device": "cpu"},
    )
    store_options = {
        "persist_directory": str(persist_dir),
        "embedding_function": embedding_model,
        "collection_name": "legal_documents",
    }
    return Chroma(**store_options)
def _run_ingest(project_root: Path) -> None:
    """Load the project's ingester and run it; raise if it cannot be loaded."""
    ingest_main = import_ingest(project_root)
    if ingest_main is None:
        raise RuntimeError("Cannot import ingester. Ensure src/ingest_documents.py exists.")
    ingest_main()


def get_retriever():
    """Return a top-3 retriever over the ``legal_documents`` collection.

    The vector store directory comes from the ``VECTOR_DB_DIR`` environment
    variable (default ``vector_db``). When that directory is missing or empty,
    the ingester found next to this file is run first. When the directory has
    content but the collection holds no documents, one rebuild is forced.

    Returns:
        The retriever produced by ``as_retriever(search_kwargs={"k": 3})``.

    Raises:
        RuntimeError: The ingester could not be loaded.
        ValueError: The collection is still empty after ingestion.
    """
    project_root = Path(__file__).resolve().parent
    # NOTE(review): a relative VECTOR_DB_DIR is resolved against the current
    # working directory, not project_root — confirm this matches the ingester.
    persist_dir = Path(os.getenv("VECTOR_DB_DIR", "vector_db"))

    needs_build = not persist_dir.exists() or not any(persist_dir.iterdir())
    if needs_build:
        print("⚡ vector_db missing/empty → running ingestion...")
        _run_ingest(project_root)

    vectordb = open_db(persist_dir)
    count = vectordb._collection.count()
    print(f"Found {count} documents in collection 'legal_documents'")

    # Only force a rebuild when ingestion has not already run in this call;
    # repeating an ingestion that just produced nothing would only double the
    # work before reaching the same error below.
    if count == 0 and not needs_build:
        print("⚠️ Collection empty after first load — forcing rebuild…")
        _run_ingest(project_root)
        vectordb = open_db(persist_dir)
        count = vectordb._collection.count()
        print(f"[recheck] Found {count} documents in 'legal_documents'")

    if count == 0:
        raise ValueError(
            "Chroma collection is still empty. Likely causes:\n"
            " - No .txt files in ./data/processed or ./src/data/processed\n"
            " - Files committed to a different path than expected\n"
            " - Ingest produced zero chunks (empty content)"
        )
    return vectordb.as_retriever(search_kwargs={"k": 3})