Oviya committed on
Commit
428d9cf
·
1 Parent(s): 76b0dda
Files changed (1) hide show
  1. ragg/ingest_all.py +41 -33
ragg/ingest_all.py CHANGED
@@ -1,54 +1,62 @@
1
- # ingest_all.py
2
  import os
3
- from rag_backend import ingest_pdfs_from_folder, get_embeddings
4
- # from ragg.rag_backend import ingest_pdfs_from_folder, get_embeddings
5
  from langchain_community.vectorstores import Chroma
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  def ingest_all_levels():
8
  """
9
- Ingest all level-based PDFs (low, mid, high) into separate Chroma vector databases.
10
- Each folder (../pdfs/low, ../pdfs/mid, ../pdfs/high) should contain its own PDFs.
11
  """
12
- pdf_sets = ["low", "mid", "high"]
13
- print("\n🚀 Starting ingestion for all PDF levels...\n")
14
-
15
- for name in pdf_sets:
16
- folder_path = os.path.join("..", "pdfs", name)
17
- if not os.path.exists(folder_path):
18
- print(f"⚠️ Skipping '{name}' — folder not found at {folder_path}")
 
19
  continue
20
 
21
- print(f"📘 Ingesting PDF set: {name}")
22
-
23
- # ✅ Prepare a dedicated Chroma folder for this level
24
- # chroma_dir = os.path.join("chroma", name)
25
- chroma_dir = os.path.join("/home/user/chroma", name) if os.getenv("ENV") == "prod" else os.path.join("chroma", name)
26
  os.makedirs(chroma_dir, exist_ok=True)
27
 
28
- # ✅ Monkey patch: temporarily override get_vectorstore() for this ingestion
29
  def get_vectorstore_for_level():
30
- print(f"🔹 Initializing Chroma vectorstore at: {chroma_dir}")
31
  return Chroma(
32
  persist_directory=chroma_dir,
33
  embedding_function=get_embeddings()
34
  )
35
- # Print number of documents in the vector store
36
- print(f"📦 Number of documents in {name} Chroma store: {len(vectorstore)}")
37
- return vectorstore
38
-
39
 
40
- # ✅ Temporarily replace the function used in rag_backend
41
- import rag_backend as rag_backend
42
  rag_backend.get_vectorstore = get_vectorstore_for_level
43
 
44
- # ✅ Ingest PDFs for this level
45
- result = ingest_pdfs_from_folder(folder_path, subject="English", grade="5", chapter=name)
46
- print(f"✅ Done for '{name}': {result}")
47
- print(f"📦 Stored in: {chroma_dir}\n")
48
- # ✅ After ingestion, print chunks from the Chroma vector store
49
-
50
- print("🎯 All available PDFs processed successfully.\n")
51
-
 
 
52
 
53
  if __name__ == "__main__":
54
  ingest_all_levels()
 
1
+ # ragg/ingest_all.py
2
  import os
3
+ import sys
 
4
  from langchain_community.vectorstores import Chroma
5
 
6
+ # Allow running both as a module and as a script
7
+ if __package__ in (None, ""):
8
+ # When you run: python ragg/ingest_all.py
9
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
10
+ from ragg.rag_backend import ingest_pdfs_from_folder, get_embeddings
11
+ import ragg.rag_backend as rag_backend
12
+ else:
13
+ # When you run: python -m ragg.ingest_all (recommended)
14
+ from ragg.rag_backend import ingest_pdfs_from_folder, get_embeddings
15
+ import ragg.rag_backend as rag_backend
16
+
17
+ IS_PROD = os.getenv("ENV", "dev").lower() == "prod"
18
+ HERE = os.path.dirname(__file__)
19
+ PDFS_ROOT = os.path.abspath(os.path.join(HERE, "..", "pdfs"))
20
+ # Hugging Face shows cwd as /app → persist under /app/chroma
21
+ CHROMA_BASE = "/app/chroma" if IS_PROD else "chroma"
22
+
23
  def ingest_all_levels():
24
  """
25
+ Ingest PDFs from ../pdfs/{low,mid,high} into separate Chroma DBs.
 
26
  """
27
+ print("\n🚀 Starting ingestion\n")
28
+ print(f"📂 PDFs root: {PDFS_ROOT}")
29
+ print(f"💾 Chroma base: {CHROMA_BASE} (ENV={'prod' if IS_PROD else 'dev'})\n")
30
+
31
+ for name in ["low", "mid", "high"]:
32
+ folder_path = os.path.join(PDFS_ROOT, name)
33
+ if not os.path.isdir(folder_path):
34
+ print(f"⚠️ Skip {name}: {folder_path} not found")
35
  continue
36
 
37
+ chroma_dir = os.path.join(CHROMA_BASE, name)
 
 
 
 
38
  os.makedirs(chroma_dir, exist_ok=True)
39
 
40
+ # Monkey-patch vectorstore factory used by your backend for this run
41
  def get_vectorstore_for_level():
42
+ print(f"🔹 Chroma at: {chroma_dir}")
43
  return Chroma(
44
  persist_directory=chroma_dir,
45
  embedding_function=get_embeddings()
46
  )
 
 
 
 
47
 
 
 
48
  rag_backend.get_vectorstore = get_vectorstore_for_level
49
 
50
+ try:
51
+ res = ingest_pdfs_from_folder(
52
+ folder_path, subject="English", grade="5", chapter=name
53
+ )
54
+ print(f"✅ {name}: {res}")
55
+ print(f"📦 Stored in: {chroma_dir}\n")
56
+ except Exception as e:
57
+ import traceback
58
+ print(f"❌ {name}: {e}")
59
+ print(traceback.format_exc())
60
 
61
  if __name__ == "__main__":
62
  ingest_all_levels()