Spaces:

menikev
/

KnowYourRIght-Bot

Sleeping

App Files Files Community

KnowYourRIght-Bot / src /retriever.py

menikev

Update src/retriever.py

0811500 verified 5 months ago

raw

history blame contribute delete

2.45 kB

	#!/usr/bin/env python3
	import os
	import sys
	from pathlib import Path
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from langchain_chroma import Chroma


	def import_ingest(project_root: Path):
	    """Load the ingestion script from disk and return its ``main`` attribute.

	    The script is looked up at ``<project_root>/src/ingest_documents.py``
	    first. If that file is absent, ``<project_root>/ingest_documents.py`` is
	    tried so that a caller passing the ``src`` directory itself still works.

	    Args:
	        project_root: Directory used as the base of the lookup.

	    Returns:
	        The module's ``main`` attribute, or ``None`` when the script cannot be
	        found, cannot be turned into an import spec, or defines no ``main``.
	        A "not found" or "cannot load" message is written to stderr in the
	        first two cases.
	    """
	    import importlib.util

	    primary = project_root / "src" / "ingest_documents.py"
	    fallback = project_root / "ingest_documents.py"
	    ingest_path = next((p for p in (primary, fallback) if p.is_file()), None)
	    if ingest_path is None:
	        # Report the canonical location so the hint stays actionable.
	        print(f"ERROR: {primary} not found. Add your ingester there.", file=sys.stderr)
	        return None
	    spec = importlib.util.spec_from_file_location("ingest_documents", str(ingest_path))
	    # spec_from_file_location may return None, and a spec may lack a loader;
	    # fail the same way as a missing file instead of raising AttributeError.
	    if spec is None or spec.loader is None:
	        print(f"ERROR: cannot load {ingest_path} as a Python module.", file=sys.stderr)
	        return None
	    mod = importlib.util.module_from_spec(spec)
	    spec.loader.exec_module(mod)
	    return getattr(mod, "main", None)


	def open_db(persist_dir: Path):
	    """Open the ``legal_documents`` Chroma collection stored in *persist_dir*.

	    Args:
	        persist_dir: Directory passed to Chroma as its persistence location.

	    Returns:
	        A ``Chroma`` instance using ``BAAI/bge-small-en`` embeddings on CPU.
	    """
	    embedding_model = HuggingFaceEmbeddings(
	        model_name="BAAI/bge-small-en",
	        model_kwargs={"device": "cpu"},
	    )
	    chroma_options = {
	        "persist_directory": str(persist_dir),
	        "embedding_function": embedding_model,
	        "collection_name": "legal_documents",
	    }
	    return Chroma(**chroma_options)


	def get_retriever():
	    """Return a top-3 retriever over the ``legal_documents`` collection.

	    The vector store directory comes from the ``VECTOR_DB_DIR`` environment
	    variable (default ``vector_db``). Ingestion is run when that directory
	    is missing or empty, and once more when an existing store turns out to
	    hold zero documents.

	    Returns:
	        The result of ``vectordb.as_retriever(search_kwargs={"k": 3})``.

	    Raises:
	        RuntimeError: If the ingestion script cannot be imported.
	        ValueError: If the collection is still empty after ingestion.
	    """
	    project_root = _find_project_root(Path(__file__).resolve().parent)
	    persist_dir = Path(os.getenv("VECTOR_DB_DIR", "vector_db"))

	    needs_build = not persist_dir.exists() or not any(persist_dir.iterdir())
	    if needs_build:
	        print("⚡ vector_db missing/empty → running ingestion...")
	        _run_ingestion(project_root)

	    vectordb = open_db(persist_dir)
	    count = vectordb._collection.count()
	    print(f"Found {count} documents in collection 'legal_documents'")

	    # Only force a rebuild when ingestion has not already run in this call;
	    # repeating an ingestion that just produced nothing would only redo the
	    # same work before reaching the error below.
	    if count == 0 and not needs_build:
	        print("⚠️ Collection empty after first load — forcing rebuild…")
	        _run_ingestion(project_root)
	        vectordb = open_db(persist_dir)
	        count = vectordb._collection.count()
	        print(f"[recheck] Found {count} documents in 'legal_documents'")

	    if count == 0:
	        raise ValueError(
	            "Chroma collection is still empty. Likely causes:\n"
	            " - No .txt files in ./data/processed or ./src/data/processed\n"
	            " - Files committed to a different path than expected\n"
	            " - Ingest produced zero chunks (empty content)"
	        )

	    return vectordb.as_retriever(search_kwargs={"k": 3})


	def _find_project_root(module_dir: Path) -> Path:
	    """Pick the directory that contains ``src/ingest_documents.py``.

	    Checks *module_dir* and then its parent, because this module may itself
	    live inside ``src``. Falls back to *module_dir* when neither matches.
	    """
	    for candidate in (module_dir, module_dir.parent):
	        if (candidate / "src" / "ingest_documents.py").exists():
	            return candidate
	    return module_dir


	def _run_ingestion(project_root: Path) -> None:
	    """Import the ingester under *project_root* and run it, or raise RuntimeError."""
	    ingest_main = import_ingest(project_root)
	    if ingest_main is None:
	        raise RuntimeError("Cannot import ingester. Ensure src/ingest_documents.py exists.")
	    ingest_main()