Oviya committed on
Commit
2a1e508
·
1 Parent(s): 5687457
Files changed (2) hide show
  1. ragg/ingest_all.py +41 -31
  2. ragg/rag_backend.py +40 -7
ragg/ingest_all.py CHANGED
@@ -1,9 +1,10 @@
1
  # ragg/ingest_all.py
2
  import os
3
  import sys
 
4
  from langchain_community.vectorstores import Chroma
5
 
6
- # Allow running both as a module and as a script
7
  if __package__ in (None, ""):
8
  sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
9
  from ragg.rag_backend import ingest_pdfs_from_folder, get_embeddings
@@ -12,58 +13,67 @@ else:
12
  from ragg.rag_backend import ingest_pdfs_from_folder, get_embeddings
13
  import ragg.rag_backend as rag_backend
14
 
15
- IS_PROD = os.getenv("ENV", "dev").lower() == "prod"
16
- HERE = os.path.dirname(__file__)
 
17
 
18
- # ✅ Point to the actual PDF location inside the Space
19
- PDFS_ROOT = os.path.abspath(os.path.join(HERE, "pdfs")) # /app/ragg/pdfs (fallback)
20
- if not os.path.isdir(PDFS_ROOT):
21
- # Hugging Face uses /app as working dir → /app/pdfs/*
22
- PDFS_ROOT = os.path.abspath(os.path.join(HERE, "..", "pdfs"))
23
 
24
- # ✅ Always persist Chroma under /data/chroma (mounted persistent dir)
25
- CHROMA_BASE = os.getenv("CHROMA_ROOT", "/data/chroma")
 
26
 
27
  def ingest_all_levels():
28
- """
29
- Ingest PDFs from ./pdfs/{low,mid,high} into separate Chroma DBs.
30
- Works both locally and on Hugging Face.
31
- """
32
- print("\n🚀 Starting ingestion\n")
33
  print(f"📂 PDFs root: {PDFS_ROOT}")
34
- print(f"💾 Chroma base: {CHROMA_BASE} (ENV={'prod' if IS_PROD else 'dev'})\n")
35
 
36
- levels = ["low", "mid", "high"]
37
-
38
- for name in levels:
39
- folder_path = os.path.join(PDFS_ROOT, name)
40
- if not os.path.isdir(folder_path):
41
- print(f"⚠️ Skip {name}: {folder_path} not found")
42
  continue
43
 
44
- chroma_dir = os.path.join(CHROMA_BASE, name)
45
- os.makedirs(chroma_dir, exist_ok=True)
 
46
 
47
- # Rebind vectorstore factory used by your backend for this run
48
  def get_vectorstore_for_level():
49
  print(f"🔹 Using Chroma at: {chroma_dir}")
50
- return Chroma(
51
- persist_directory=chroma_dir,
52
- embedding_function=get_embeddings()
 
 
53
  )
 
54
 
55
  rag_backend.get_vectorstore = get_vectorstore_for_level
56
 
57
  try:
58
  print(f"➡️ Ingesting {folder_path}")
59
  res = ingest_pdfs_from_folder(
60
- folder_path, subject="English", grade="5", chapter=name
 
 
 
61
  )
62
- print(f"✅ {name}: {res}")
 
 
 
 
 
 
63
  print(f"📦 Stored in: {chroma_dir}\n")
 
64
  except Exception as e:
65
  import traceback
66
- print(f"❌ {name}: {e}")
67
  print(traceback.format_exc())
68
 
69
  print("🎯 Ingestion complete.\n")
 
1
  # ragg/ingest_all.py
2
  import os
3
  import sys
4
+ from pathlib import Path
5
  from langchain_community.vectorstores import Chroma
6
 
7
+ # Support both module and direct runs
8
  if __package__ in (None, ""):
9
  sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
10
  from ragg.rag_backend import ingest_pdfs_from_folder, get_embeddings
 
13
  from ragg.rag_backend import ingest_pdfs_from_folder, get_embeddings
14
  import ragg.rag_backend as rag_backend
15
 
16
+ # Detect environment
17
+ IS_HF = bool(os.getenv("HF_HOME") or os.getenv("SPACE_ID"))
18
+ HERE = Path(__file__).resolve().parent
19
 
20
+ # PDF root auto-detect
21
+ PDFS_ROOT = (HERE / "pdfs")
22
+ if not PDFS_ROOT.is_dir():
23
+ PDFS_ROOT = (HERE.parent / "pdfs") # Works for /app/pdfs/*
 
24
 
25
+ # Chroma root auto-detect
26
+ CHROMA_BASE = Path(os.getenv("CHROMA_ROOT") or ("/data/chroma" if IS_HF else "./chroma"))
27
+ CHROMA_BASE.mkdir(parents=True, exist_ok=True)
28
 
29
  def ingest_all_levels():
30
+ print("\n🚀 Starting ingestion")
 
 
 
 
31
  print(f"📂 PDFs root: {PDFS_ROOT}")
32
+ print(f"💾 Chroma base: {CHROMA_BASE}\n")
33
 
34
+ for level in ["low", "mid", "high"]:
35
+ folder_path = PDFS_ROOT / level
36
+ if not folder_path.is_dir():
37
+ print(f"⚠️ Skip {level}: {folder_path} not found")
 
 
38
  continue
39
 
40
+ chroma_dir = CHROMA_BASE / level
41
+ chroma_dir.mkdir(parents=True, exist_ok=True)
42
+ collection_name = f"pylearn_{level}"
43
 
44
+ # Monkey-patch vectorstore factory for this run
45
  def get_vectorstore_for_level():
46
  print(f"🔹 Using Chroma at: {chroma_dir}")
47
+ print(f"🔹 Collection: {collection_name}")
48
+ vs = Chroma(
49
+ collection_name=collection_name,
50
+ persist_directory=str(chroma_dir),
51
+ embedding_function=get_embeddings(),
52
  )
53
+ return vs
54
 
55
  rag_backend.get_vectorstore = get_vectorstore_for_level
56
 
57
  try:
58
  print(f"➡️ Ingesting {folder_path}")
59
  res = ingest_pdfs_from_folder(
60
+ folder_path=str(folder_path),
61
+ subject="English",
62
+ grade="5",
63
+ chapter=level,
64
  )
65
+ # Persist the vectorstore to disk
66
+ vs = rag_backend.get_vectorstore()
67
+ if hasattr(vs, "persist"):
68
+ vs.persist()
69
+ print("πŸ“ Called persist()")
70
+
71
+ print(f"✅ {level}: {res}")
72
  print(f"📦 Stored in: {chroma_dir}\n")
73
+
74
  except Exception as e:
75
  import traceback
76
+ print(f"❌ {level}: {e}")
77
  print(traceback.format_exc())
78
 
79
  print("🎯 Ingestion complete.\n")
ragg/rag_backend.py CHANGED
@@ -83,15 +83,47 @@ def _vs_count_safe(vs) -> Optional[int]:
83
  except Exception:
84
  return None
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  def get_vectorstore():
 
 
 
 
 
 
87
  global _vectorstore
88
- if _vectorstore is None:
89
- os.makedirs(CHROMA_DIR, exist_ok=True)
90
- print(f"🔹 Loading Chroma vectorstore at: {CHROMA_DIR}")
91
- _vectorstore = Chroma(
92
- persist_directory=CHROMA_DIR,
93
- embedding_function=get_embeddings()
94
- )
 
 
 
 
 
 
 
 
 
 
95
  cnt = _vs_count_safe(_vectorstore)
96
  if cnt is not None:
97
  print(f"📦 Vectorstore currently has ~{cnt} chunks.")
@@ -99,6 +131,7 @@ def get_vectorstore():
99
  print("📦 Vectorstore count not available (skipping).")
100
  return _vectorstore
101
 
 
102
  # ---------------- Text Splitter ---------------- #
103
  def chunk_docs(docs: List[Document], chunk_size=1200, chunk_overlap=150) -> List[Document]:
104
  splitter = RecursiveCharacterTextSplitter(
 
83
  except Exception:
84
  return None
85
 
86
+ # def get_vectorstore():
87
+ # global _vectorstore
88
+ # if _vectorstore is None:
89
+ # os.makedirs(CHROMA_DIR, exist_ok=True)
90
+ # print(f"🔹 Loading Chroma vectorstore at: {CHROMA_DIR}")
91
+ # _vectorstore = Chroma(
92
+ # persist_directory=CHROMA_DIR,
93
+ # embedding_function=get_embeddings()
94
+ # )
95
+ # cnt = _vs_count_safe(_vectorstore)
96
+ # if cnt is not None:
97
+ # print(f"📦 Vectorstore currently has ~{cnt} chunks.")
98
+ # else:
99
+ # print("📦 Vectorstore count not available (skipping).")
100
+ # return _vectorstore
101
+
102
  def get_vectorstore():
103
+ """
104
+ Returns a Chroma vectorstore that works in both local and Hugging Face environments.
105
+ - Uses CHROMA_DIR if defined (e.g., /data/chroma/low)
106
+ - Defaults to ./chroma when running locally
107
+ - Monkey-patching from ingest_all.py overrides this function
108
+ """
109
  global _vectorstore
110
+ if _vectorstore is not None:
111
+ return _vectorstore
112
+
113
+ # Detect environment (Hugging Face vs Local)
114
+ hf_env = bool(os.getenv("HF_HOME") or os.getenv("SPACE_ID"))
115
+ default_chroma = "/data/chroma" if hf_env else "./chroma"
116
+
117
+ # Final resolved path (can be overridden via env variable)
118
+ chroma_dir = os.getenv("CHROMA_DIR", default_chroma)
119
+ os.makedirs(chroma_dir, exist_ok=True)
120
+
121
+ print(f"🔹 Loading Chroma vectorstore at: {chroma_dir}")
122
+ _vectorstore = Chroma(
123
+ persist_directory=chroma_dir,
124
+ embedding_function=get_embeddings()
125
+ )
126
+
127
  cnt = _vs_count_safe(_vectorstore)
128
  if cnt is not None:
129
  print(f"📦 Vectorstore currently has ~{cnt} chunks.")
 
131
  print("📦 Vectorstore count not available (skipping).")
132
  return _vectorstore
133
 
134
+
135
  # ---------------- Text Splitter ---------------- #
136
  def chunk_docs(docs: List[Document], chunk_size=1200, chunk_overlap=150) -> List[Document]:
137
  splitter = RecursiveCharacterTextSplitter(