File size: 2,446 Bytes
3bc55f7
0811500
3bc55f7
 
2bfcf6d
3bc55f7
 
 
f44f221
3bc55f7
 
 
 
 
 
 
 
 
 
 
f44f221
3bc55f7
 
 
 
 
 
2bfcf6d
 
 
f44f221
0811500
 
3bc55f7
 
 
f44f221
3bc55f7
 
 
 
f44f221
3bc55f7
 
 
 
 
f44f221
3bc55f7
 
 
f44f221
3bc55f7
 
 
 
 
 
 
 
 
 
 
0811500
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env python3
import os
import sys
from pathlib import Path
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma


def import_ingest(project_root: Path):
    """Load ``src/ingest_documents.py`` under *project_root* and return its ``main``.

    The module is loaded directly from its file path, so ``src`` does not have
    to be a package or be present on ``sys.path``.

    Args:
        project_root: Directory containing ``src/ingest_documents.py``.

    Returns:
        The ``main`` attribute of the loaded module, or ``None`` when the file
        is missing, no import spec/loader can be built for it, or the module
        does not define ``main``.

    Raises:
        Exception: Anything raised by the ingester's top-level code while the
            module is being executed is propagated unchanged.
    """
    import importlib.util

    module_name = "ingest_documents"
    ingest_path = project_root / "src" / "ingest_documents.py"
    if not ingest_path.exists():
        print(f"ERROR: {ingest_path} not found. Add your ingester there.", file=sys.stderr)
        return None

    spec = importlib.util.spec_from_file_location(module_name, str(ingest_path))
    # spec_from_file_location may return None (or a spec without a loader);
    # report that the same way as a missing file instead of crashing with
    # an AttributeError further down.
    if spec is None or spec.loader is None:
        print(f"ERROR: cannot build an import spec for {ingest_path}.", file=sys.stderr)
        return None

    mod = importlib.util.module_from_spec(spec)
    # Register the module before executing it (the importlib recipe for
    # importing a source file directly), so code in the ingester that looks
    # itself up through sys.modules[__name__] works.
    previous = sys.modules.get(module_name)
    sys.modules[module_name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        # Never leave a half-initialised module registered.
        if previous is None:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous
        raise
    return getattr(mod, "main", None)


def open_db(persist_dir: Path):
    """Open the ``legal_documents`` Chroma collection stored in *persist_dir*.

    Args:
        persist_dir: Directory passed to Chroma as its persist directory.

    Returns:
        A ``Chroma`` instance whose embedding function is a CPU-bound
        ``HuggingFaceEmbeddings`` using the ``BAAI/bge-small-en`` model.
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name="BAAI/bge-small-en",
        model_kwargs={"device": "cpu"},
    )
    store_options = {
        "collection_name": "legal_documents",
        "embedding_function": embedding_model,
        "persist_directory": str(persist_dir),
    }
    return Chroma(**store_options)


def _run_ingestion(project_root: Path) -> None:
    """Load the ingester found under *project_root* and run its ``main``."""
    ingest_main = import_ingest(project_root)
    if ingest_main is None:
        raise RuntimeError("Cannot import ingester. Ensure src/ingest_documents.py exists.")
    ingest_main()


def get_retriever():
    """Return a retriever over the ``legal_documents`` collection, building it if needed.

    The persist directory comes from the ``VECTOR_DB_DIR`` environment variable
    (default ``vector_db``). A relative value is used as-is, i.e. relative to
    the current working directory, whereas the ingester is looked up next to
    this file.

    Ingestion runs at most once per call: up front when the persist directory
    is missing or empty, otherwise as a forced rebuild when the directory has
    content but the collection reports zero documents.

    Returns:
        The result of ``vectordb.as_retriever(search_kwargs={"k": 3})``.

    Raises:
        RuntimeError: If the ingester cannot be loaded or exposes no ``main``.
        ValueError: If the collection is still empty after ingestion.
    """
    project_root = Path(__file__).resolve().parent
    persist_dir = Path(os.getenv("VECTOR_DB_DIR", "vector_db"))

    already_ingested = False
    if not persist_dir.exists() or not any(persist_dir.iterdir()):
        print("⚡ vector_db missing/empty → running ingestion...")
        _run_ingestion(project_root)
        already_ingested = True

    vectordb = open_db(persist_dir)
    # NOTE: `_collection` is a private attribute of the Chroma wrapper; it is
    # used here only to read the document count.
    count = vectordb._collection.count()
    print(f"Found {count} documents in collection 'legal_documents'")

    # Rebuild only when ingestion has not just run: repeating a full ingestion
    # that produced nothing a moment ago would only delay the error below.
    if count == 0 and not already_ingested:
        print("⚠️ Collection empty after first load — forcing rebuild…")
        _run_ingestion(project_root)
        vectordb = open_db(persist_dir)
        count = vectordb._collection.count()
        print(f"[recheck] Found {count} documents in 'legal_documents'")

    if count == 0:
        raise ValueError(
            "Chroma collection is still empty. Likely causes:\n"
            " - No .txt files in ./data/processed or ./src/data/processed\n"
            " - Files committed to a different path than expected\n"
            " - Ingest produced zero chunks (empty content)"
        )

    return vectordb.as_retriever(search_kwargs={"k": 3})