menikev committed on
Commit
0811500
·
verified ·
1 Parent(s): 62d7114

Update src/retriever.py

Browse files
Files changed (1) hide show
  1. src/retriever.py +4 -13
src/retriever.py CHANGED
@@ -1,7 +1,5 @@
1
  #!/usr/bin/env python3
2
- """Retriever with robust, Spaces-friendly auto-build."""
3
-
4
- import os # ← ADD THIS LINE
5
  import sys
6
  from pathlib import Path
7
  from langchain_community.embeddings import HuggingFaceEmbeddings
@@ -9,7 +7,6 @@ from langchain_chroma import Chroma
9
 
10
 
11
  def import_ingest(project_root: Path):
12
- """Import src/ingest_documents.py even if src isn't a package."""
13
  import importlib.util
14
  ingest_path = project_root / "src" / "ingest_documents.py"
15
  if not ingest_path.exists():
@@ -31,13 +28,9 @@ def open_db(persist_dir: Path):
31
 
32
 
33
  def get_retriever():
34
- # Fix: Get project_root for import_ingest function
35
  project_root = Path(__file__).resolve().parent
36
-
37
- persist_dir = Path(os.getenv("VECTOR_DB_DIR", "/home/user/.cache/vector_db"))
38
- embed = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en", model_kwargs={"device": "cpu"})
39
-
40
- # If DB missing or clearly empty → build
41
  needs_build = not persist_dir.exists() or not any(persist_dir.iterdir())
42
  if needs_build:
43
  print("⚡ vector_db missing/empty → running ingestion...")
@@ -46,12 +39,10 @@ def get_retriever():
46
  raise RuntimeError("Cannot import ingester. Ensure src/ingest_documents.py exists.")
47
  ingest_main()
48
 
49
- # Open DB
50
  vectordb = open_db(persist_dir)
51
  count = vectordb._collection.count()
52
  print(f"Found {count} documents in collection 'legal_documents'")
53
 
54
- # If still zero, try a forced rebuild once more (covers "wrong folder" cases)
55
  if count == 0:
56
  print("⚠️ Collection empty after first load — forcing rebuild…")
57
  ingest_main = import_ingest(project_root)
@@ -70,4 +61,4 @@ def get_retriever():
70
  " - Ingest produced zero chunks (empty content)"
71
  )
72
 
73
- return vectordb.as_retriever(search_kwargs={"k": 5})
 
1
  #!/usr/bin/env python3
2
+ import os
 
 
3
  import sys
4
  from pathlib import Path
5
  from langchain_community.embeddings import HuggingFaceEmbeddings
 
7
 
8
 
9
  def import_ingest(project_root: Path):
 
10
  import importlib.util
11
  ingest_path = project_root / "src" / "ingest_documents.py"
12
  if not ingest_path.exists():
 
28
 
29
 
30
  def get_retriever():
 
31
  project_root = Path(__file__).resolve().parent
32
+ persist_dir = Path(os.getenv("VECTOR_DB_DIR", "vector_db"))
33
+
 
 
 
34
  needs_build = not persist_dir.exists() or not any(persist_dir.iterdir())
35
  if needs_build:
36
  print("⚡ vector_db missing/empty → running ingestion...")
 
39
  raise RuntimeError("Cannot import ingester. Ensure src/ingest_documents.py exists.")
40
  ingest_main()
41
 
 
42
  vectordb = open_db(persist_dir)
43
  count = vectordb._collection.count()
44
  print(f"Found {count} documents in collection 'legal_documents'")
45
 
 
46
  if count == 0:
47
  print("⚠️ Collection empty after first load — forcing rebuild…")
48
  ingest_main = import_ingest(project_root)
 
61
  " - Ingest produced zero chunks (empty content)"
62
  )
63
 
64
+ return vectordb.as_retriever(search_kwargs={"k": 3})