Spaces:
Sleeping
Sleeping
Update src/retriever.py
Browse files
- src/retriever.py +4 -13
src/retriever.py
CHANGED
|
@@ -1,7 +1,5 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
import os # ← ADD THIS LINE
|
| 5 |
import sys
|
| 6 |
from pathlib import Path
|
| 7 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
|
@@ -9,7 +7,6 @@ from langchain_chroma import Chroma
|
|
| 9 |
|
| 10 |
|
| 11 |
def import_ingest(project_root: Path):
|
| 12 |
-
"""Import src/ingest_documents.py even if src isn't a package."""
|
| 13 |
import importlib.util
|
| 14 |
ingest_path = project_root / "src" / "ingest_documents.py"
|
| 15 |
if not ingest_path.exists():
|
|
@@ -31,13 +28,9 @@ def open_db(persist_dir: Path):
|
|
| 31 |
|
| 32 |
|
| 33 |
def get_retriever():
|
| 34 |
-
# Fix: Get project_root for import_ingest function
|
| 35 |
project_root = Path(__file__).resolve().parent
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
embed = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en", model_kwargs={"device": "cpu"})
|
| 39 |
-
|
| 40 |
-
# If DB missing or clearly empty → build
|
| 41 |
needs_build = not persist_dir.exists() or not any(persist_dir.iterdir())
|
| 42 |
if needs_build:
|
| 43 |
print("⚡ vector_db missing/empty → running ingestion...")
|
|
@@ -46,12 +39,10 @@ def get_retriever():
|
|
| 46 |
raise RuntimeError("Cannot import ingester. Ensure src/ingest_documents.py exists.")
|
| 47 |
ingest_main()
|
| 48 |
|
| 49 |
-
# Open DB
|
| 50 |
vectordb = open_db(persist_dir)
|
| 51 |
count = vectordb._collection.count()
|
| 52 |
print(f"Found {count} documents in collection 'legal_documents'")
|
| 53 |
|
| 54 |
-
# If still zero, try a forced rebuild once more (covers "wrong folder" cases)
|
| 55 |
if count == 0:
|
| 56 |
print("⚠️ Collection empty after first load — forcing rebuild…")
|
| 57 |
ingest_main = import_ingest(project_root)
|
|
@@ -70,4 +61,4 @@ def get_retriever():
|
|
| 70 |
" - Ingest produced zero chunks (empty content)"
|
| 71 |
)
|
| 72 |
|
| 73 |
-
return vectordb.as_retriever(search_kwargs={"k":
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
+
import os
|
|
|
|
|
|
|
| 3 |
import sys
|
| 4 |
from pathlib import Path
|
| 5 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
def import_ingest(project_root: Path):
|
|
|
|
| 10 |
import importlib.util
|
| 11 |
ingest_path = project_root / "src" / "ingest_documents.py"
|
| 12 |
if not ingest_path.exists():
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
def get_retriever():
|
|
|
|
| 31 |
project_root = Path(__file__).resolve().parent
|
| 32 |
+
persist_dir = Path(os.getenv("VECTOR_DB_DIR", "vector_db"))
|
| 33 |
+
|
|
|
|
|
|
|
|
|
|
| 34 |
needs_build = not persist_dir.exists() or not any(persist_dir.iterdir())
|
| 35 |
if needs_build:
|
| 36 |
print("⚡ vector_db missing/empty → running ingestion...")
|
|
|
|
| 39 |
raise RuntimeError("Cannot import ingester. Ensure src/ingest_documents.py exists.")
|
| 40 |
ingest_main()
|
| 41 |
|
|
|
|
| 42 |
vectordb = open_db(persist_dir)
|
| 43 |
count = vectordb._collection.count()
|
| 44 |
print(f"Found {count} documents in collection 'legal_documents'")
|
| 45 |
|
|
|
|
| 46 |
if count == 0:
|
| 47 |
print("⚠️ Collection empty after first load — forcing rebuild…")
|
| 48 |
ingest_main = import_ingest(project_root)
|
|
|
|
| 61 |
" - Ingest produced zero chunks (empty content)"
|
| 62 |
)
|
| 63 |
|
| 64 |
+
return vectordb.as_retriever(search_kwargs={"k": 3})
|