Oviya committed on
Commit
5687457
·
1 Parent(s): d2e4e52
Files changed (1) hide show
  1. ragg/ingest_all.py +19 -9
ragg/ingest_all.py CHANGED
@@ -5,30 +5,37 @@ from langchain_community.vectorstores import Chroma
5
 
6
  # Allow running both as a module and as a script
7
  if __package__ in (None, ""):
8
- # When you run: python ragg/ingest_all.py
9
  sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
10
  from ragg.rag_backend import ingest_pdfs_from_folder, get_embeddings
11
  import ragg.rag_backend as rag_backend
12
  else:
13
- # When you run: python -m ragg.ingest_all (recommended)
14
  from ragg.rag_backend import ingest_pdfs_from_folder, get_embeddings
15
  import ragg.rag_backend as rag_backend
16
 
17
  IS_PROD = os.getenv("ENV", "dev").lower() == "prod"
18
  HERE = os.path.dirname(__file__)
19
- PDFS_ROOT = os.path.abspath(os.path.join(HERE, "..", "pdfs"))
20
- # Hugging Face shows cwd as /app → persist under /app/chroma
21
- CHROMA_BASE = "/app/chroma" if IS_PROD else "chroma"
 
 
 
 
 
 
22
 
23
  def ingest_all_levels():
24
  """
25
- Ingest PDFs from ../pdfs/{low,mid,high} into separate Chroma DBs.
 
26
  """
27
  print("\n🚀 Starting ingestion\n")
28
  print(f"📂 PDFs root: {PDFS_ROOT}")
29
  print(f"💾 Chroma base: {CHROMA_BASE} (ENV={'prod' if IS_PROD else 'dev'})\n")
30
 
31
- for name in ["low", "mid", "high"]:
 
 
32
  folder_path = os.path.join(PDFS_ROOT, name)
33
  if not os.path.isdir(folder_path):
34
  print(f"⚠️ Skip {name}: {folder_path} not found")
@@ -37,9 +44,9 @@ def ingest_all_levels():
37
  chroma_dir = os.path.join(CHROMA_BASE, name)
38
  os.makedirs(chroma_dir, exist_ok=True)
39
 
40
- # Monkey-patch vectorstore factory used by your backend for this run
41
  def get_vectorstore_for_level():
42
- print(f"🔹 Chroma at: {chroma_dir}")
43
  return Chroma(
44
  persist_directory=chroma_dir,
45
  embedding_function=get_embeddings()
@@ -48,6 +55,7 @@ def ingest_all_levels():
48
  rag_backend.get_vectorstore = get_vectorstore_for_level
49
 
50
  try:
 
51
  res = ingest_pdfs_from_folder(
52
  folder_path, subject="English", grade="5", chapter=name
53
  )
@@ -58,5 +66,7 @@ def ingest_all_levels():
58
  print(f"❌ {name}: {e}")
59
  print(traceback.format_exc())
60
 
 
 
61
  if __name__ == "__main__":
62
  ingest_all_levels()
 
5
 
6
  # Allow running both as a module and as a script
7
  if __package__ in (None, ""):
 
8
  sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
9
  from ragg.rag_backend import ingest_pdfs_from_folder, get_embeddings
10
  import ragg.rag_backend as rag_backend
11
  else:
 
12
  from ragg.rag_backend import ingest_pdfs_from_folder, get_embeddings
13
  import ragg.rag_backend as rag_backend
14
 
15
  IS_PROD = os.getenv("ENV", "dev").lower() == "prod"
16
  HERE = os.path.dirname(__file__)
17
+
18
+ # ✅ Point to the actual PDF location inside the Space
19
+ PDFS_ROOT = os.path.abspath(os.path.join(HERE, "pdfs")) # /app/ragg/pdfs (fallback)
20
+ if not os.path.isdir(PDFS_ROOT):
21
+ # Hugging Face uses /app as working dir → /app/pdfs/*
22
+ PDFS_ROOT = os.path.abspath(os.path.join(HERE, "..", "pdfs"))
23
+
24
+ # ✅ Always persist Chroma under /data/chroma (mounted persistent dir)
25
+ CHROMA_BASE = os.getenv("CHROMA_ROOT", "/data/chroma")
26
 
27
  def ingest_all_levels():
28
  """
29
+ Ingest PDFs from ./pdfs/{low,mid,high} into separate Chroma DBs.
30
+ Works both locally and on Hugging Face.
31
  """
32
  print("\n🚀 Starting ingestion\n")
33
  print(f"📂 PDFs root: {PDFS_ROOT}")
34
  print(f"💾 Chroma base: {CHROMA_BASE} (ENV={'prod' if IS_PROD else 'dev'})\n")
35
 
36
+ levels = ["low", "mid", "high"]
37
+
38
+ for name in levels:
39
  folder_path = os.path.join(PDFS_ROOT, name)
40
  if not os.path.isdir(folder_path):
41
  print(f"⚠️ Skip {name}: {folder_path} not found")
 
44
  chroma_dir = os.path.join(CHROMA_BASE, name)
45
  os.makedirs(chroma_dir, exist_ok=True)
46
 
47
+ # Rebind vectorstore factory used by your backend for this run
48
  def get_vectorstore_for_level():
49
+ print(f"🔹 Using Chroma at: {chroma_dir}")
50
  return Chroma(
51
  persist_directory=chroma_dir,
52
  embedding_function=get_embeddings()
 
55
  rag_backend.get_vectorstore = get_vectorstore_for_level
56
 
57
  try:
58
+ print(f"➑️ Ingesting {folder_path}")
59
  res = ingest_pdfs_from_folder(
60
  folder_path, subject="English", grade="5", chapter=name
61
  )
 
66
  print(f"❌ {name}: {e}")
67
  print(traceback.format_exc())
68
 
69
+ print("🎯 Ingestion complete.\n")
70
+
71
  if __name__ == "__main__":
72
  ingest_all_levels()