MaheshLEO4 committed on
Commit
273e15c
·
1 Parent(s): 07b3441

Remove collection directories and simplify upload flow

Browse files
app.py CHANGED
@@ -1,12 +1,10 @@
1
  import os
2
- import shutil
3
  import streamlit as st
4
 
5
  from ingestion import ingest_pdfs
6
  from retriever import HybridRetriever
7
  from graph import AgentWorkflow
8
  from config import (
9
- COLLECTIONS_DIR,
10
  get_upload_dir,
11
  get_index_dir,
12
  GROQ_FREE_MODELS,
@@ -15,7 +13,7 @@ from config import (
15
  DEFAULT_MODEL,
16
  )
17
 
18
- st.set_page_config(page_title="Multi-Agent RAG", layout="wide")
19
 
20
  st.markdown(
21
  """
@@ -81,27 +79,16 @@ st.markdown(
81
 
82
  st.markdown('<div class="hero-title">Multi-Agent Hybrid RAG</div>', unsafe_allow_html=True)
83
  st.markdown(
84
- '<p class="hero-subtitle">Create collections, index PDFs, and chat with scoped answers.</p>',
85
  unsafe_allow_html=True,
86
  )
87
 
88
- def get_all_collections():
89
- if not os.path.exists(COLLECTIONS_DIR):
90
- return ["default"]
91
- cols = [d for d in os.listdir(COLLECTIONS_DIR) if os.path.isdir(os.path.join(COLLECTIONS_DIR, d))]
92
- if "default" not in cols:
93
- cols.append("default")
94
- return sorted(list(set(cols)))
95
-
96
- os.makedirs(COLLECTIONS_DIR, exist_ok=True)
97
-
98
  defaults = {
99
  "chat_history": [],
100
  "conversation_history": [],
101
  "retriever": None,
102
  "model_provider": DEFAULT_PROVIDER,
103
  "model_name": DEFAULT_MODEL,
104
- "active_collection": "default"
105
  }
106
  for key, val in defaults.items():
107
  if key not in st.session_state:
@@ -126,50 +113,12 @@ with st.sidebar:
126
  st.session_state.model_provider = model_provider
127
  st.session_state.model_name = model_name
128
 
129
- st.divider()
130
- st.subheader("Collections")
131
- all_collections = get_all_collections()
132
-
133
- selected_col = st.selectbox(
134
- "Current Collection",
135
- all_collections,
136
- index=all_collections.index(st.session_state.active_collection) if st.session_state.active_collection in all_collections else 0
137
- )
138
-
139
- # If collection changed:
140
- if selected_col != st.session_state.active_collection:
141
- st.session_state.active_collection = selected_col
142
- st.session_state.retriever = None
143
- st.session_state.chat_history = []
144
- st.session_state.conversation_history = []
145
- st.rerun()
146
-
147
- c_new = st.text_input("New Collection Name")
148
- if st.button("Create Collection"):
149
- if c_new and c_new.strip() not in all_collections:
150
- get_upload_dir(c_new.strip())
151
- st.session_state.active_collection = c_new.strip()
152
- st.session_state.retriever = None
153
- st.session_state.chat_history = []
154
- st.session_state.conversation_history = []
155
- st.rerun()
156
-
157
- if selected_col != "default":
158
- if st.button(f"Delete '{selected_col}'"):
159
- shutil.rmtree(os.path.join(COLLECTIONS_DIR, selected_col))
160
- st.session_state.active_collection = "default"
161
- st.session_state.retriever = None
162
- st.session_state.chat_history = []
163
- st.session_state.conversation_history = []
164
- st.rerun()
165
-
166
- current_col = st.session_state.active_collection
167
- upload_dir = get_upload_dir(current_col)
168
- index_dir = get_index_dir(current_col)
169
 
170
- # Collection Manager
171
- st.markdown("<div class='section-title'>Collection files</div>", unsafe_allow_html=True)
172
- st.markdown(f"### {current_col} <span class='chip'>upload folder</span>", unsafe_allow_html=True)
173
  col_files = [f for f in os.listdir(upload_dir) if f.lower().endswith(".pdf")]
174
 
175
  if col_files:
@@ -190,11 +139,11 @@ if col_files:
190
  st.session_state.retriever = None
191
  st.rerun()
192
  else:
193
- st.info("No documents in this collection.")
194
 
195
  st.markdown("<div class='section-title'>Add documents</div>", unsafe_allow_html=True)
196
  uploaded_files = st.file_uploader(
197
- f"Add PDFs to '{current_col}'",
198
  type=["pdf"],
199
  accept_multiple_files=True,
200
  )
@@ -208,25 +157,24 @@ if uploaded_files:
208
  fh.write(f.getbuffer())
209
  saved_any = True
210
  if saved_any:
211
- st.success("Files uploaded! Click 'Index Collection' to apply changes.")
212
  st.rerun()
213
 
214
  colbase_has_pdf = len(os.listdir(upload_dir)) > 0
215
  index_exists = os.path.exists(index_dir) and len(os.listdir(index_dir)) > 0
216
 
217
  if colbase_has_pdf:
218
- if st.button("Index / Re-index Collection", type="primary"):
219
  progress_bar = st.progress(0)
220
  status_text = st.empty()
221
  try:
222
  ingest_pdfs(
223
- collection_name=current_col,
224
  progress_callback=lambda p, m: (progress_bar.progress(p), status_text.text(m))
225
  )
226
  st.session_state.retriever = None
227
  progress_bar.empty()
228
  status_text.empty()
229
- st.success("Collection indexed! You can now ask questions.")
230
  st.rerun()
231
  except Exception as exc:
232
  progress_bar.empty(); status_text.empty()
@@ -245,17 +193,17 @@ for msg in st.session_state.chat_history:
245
  with st.expander("Verification Report", expanded=False):
246
  st.markdown(msg["verification"])
247
 
248
- question = st.chat_input(f"Ask about '{current_col}'...")
249
 
250
  if question:
251
  if not index_exists:
252
- st.warning("Please index the collection first before asking questions.")
253
  st.stop()
254
 
255
  if st.session_state.retriever is None:
256
  with st.spinner("Loading retriever..."):
257
  try:
258
- st.session_state.retriever = HybridRetriever(collection_name=current_col)
259
  except Exception as e:
260
  st.error(str(e))
261
  st.stop()
 
1
  import os
 
2
  import streamlit as st
3
 
4
  from ingestion import ingest_pdfs
5
  from retriever import HybridRetriever
6
  from graph import AgentWorkflow
7
  from config import (
 
8
  get_upload_dir,
9
  get_index_dir,
10
  GROQ_FREE_MODELS,
 
13
  DEFAULT_MODEL,
14
  )
15
 
16
+ st.set_page_config(page_title="Docchat", layout="wide")
17
 
18
  st.markdown(
19
  """
 
79
 
80
  st.markdown('<div class="hero-title">Multi-Agent Hybrid RAG</div>', unsafe_allow_html=True)
81
  st.markdown(
82
+ '<p class="hero-subtitle">Upload PDFs, index them, and chat with grounded answers.</p>',
83
  unsafe_allow_html=True,
84
  )
85
 
 
 
 
 
 
 
 
 
 
 
86
  defaults = {
87
  "chat_history": [],
88
  "conversation_history": [],
89
  "retriever": None,
90
  "model_provider": DEFAULT_PROVIDER,
91
  "model_name": DEFAULT_MODEL,
 
92
  }
93
  for key, val in defaults.items():
94
  if key not in st.session_state:
 
113
  st.session_state.model_provider = model_provider
114
  st.session_state.model_name = model_name
115
 
116
+ upload_dir = get_upload_dir()
117
+ index_dir = get_index_dir()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
+ # Upload Manager
120
+ st.markdown("<div class='section-title'>Upload files</div>", unsafe_allow_html=True)
121
+ st.markdown("### Upload folder <span class='chip'>shared</span>", unsafe_allow_html=True)
122
  col_files = [f for f in os.listdir(upload_dir) if f.lower().endswith(".pdf")]
123
 
124
  if col_files:
 
139
  st.session_state.retriever = None
140
  st.rerun()
141
  else:
142
+ st.info("No documents in the upload folder.")
143
 
144
  st.markdown("<div class='section-title'>Add documents</div>", unsafe_allow_html=True)
145
  uploaded_files = st.file_uploader(
146
+ "Add PDFs",
147
  type=["pdf"],
148
  accept_multiple_files=True,
149
  )
 
157
  fh.write(f.getbuffer())
158
  saved_any = True
159
  if saved_any:
160
+ st.success("Files uploaded! Click 'Index PDFs' to apply changes.")
161
  st.rerun()
162
 
163
  colbase_has_pdf = len(os.listdir(upload_dir)) > 0
164
  index_exists = os.path.exists(index_dir) and len(os.listdir(index_dir)) > 0
165
 
166
  if colbase_has_pdf:
167
+ if st.button("Index PDFs", type="primary"):
168
  progress_bar = st.progress(0)
169
  status_text = st.empty()
170
  try:
171
  ingest_pdfs(
 
172
  progress_callback=lambda p, m: (progress_bar.progress(p), status_text.text(m))
173
  )
174
  st.session_state.retriever = None
175
  progress_bar.empty()
176
  status_text.empty()
177
+ st.success("Index ready! You can now ask questions.")
178
  st.rerun()
179
  except Exception as exc:
180
  progress_bar.empty(); status_text.empty()
 
193
  with st.expander("Verification Report", expanded=False):
194
  st.markdown(msg["verification"])
195
 
196
+ question = st.chat_input("Ask about your PDFs...")
197
 
198
  if question:
199
  if not index_exists:
200
+ st.warning("Please index the PDFs first before asking questions.")
201
  st.stop()
202
 
203
  if st.session_state.retriever is None:
204
  with st.spinner("Loading retriever..."):
205
  try:
206
+ st.session_state.retriever = HybridRetriever()
207
  except Exception as e:
208
  st.error(str(e))
209
  st.stop()
config.py CHANGED
@@ -22,20 +22,14 @@ def _select_data_dir() -> str:
22
 
23
 
24
  DATA_DIR = _select_data_dir()
25
- COLLECTIONS_DIR = os.path.join(DATA_DIR, "collections")
26
 
27
- os.makedirs(COLLECTIONS_DIR, exist_ok=True)
28
-
29
- def get_collection_dir(collection_name: str) -> str:
30
- return os.path.join(COLLECTIONS_DIR, collection_name)
31
-
32
- def get_upload_dir(collection_name: str) -> str:
33
- path = os.path.join(get_collection_dir(collection_name), "raw_pdfs")
34
  os.makedirs(path, exist_ok=True)
35
  return path
36
 
37
- def get_index_dir(collection_name: str) -> str:
38
- path = os.path.join(get_collection_dir(collection_name), "llamaindex")
39
  os.makedirs(path, exist_ok=True)
40
  return path
41
 
 
22
 
23
 
24
  DATA_DIR = _select_data_dir()
 
25
 
26
+ def get_upload_dir() -> str:
27
+ path = os.path.join(DATA_DIR, "raw_pdfs")
 
 
 
 
 
28
  os.makedirs(path, exist_ok=True)
29
  return path
30
 
31
+ def get_index_dir() -> str:
32
+ path = os.path.join(DATA_DIR, "llamaindex")
33
  os.makedirs(path, exist_ok=True)
34
  return path
35
 
ingestion/index_builder.py CHANGED
@@ -6,13 +6,13 @@ from utils import get_logger
6
 
7
  logger = get_logger(__name__)
8
 
9
- def build_index(nodes: list, collection_name: str, progress_callback=None) -> VectorStoreIndex:
10
  def _cb(p, m):
11
  if progress_callback:
12
  progress_callback(p, m)
13
  logger.info(m)
14
 
15
- index_dir = get_index_dir(collection_name)
16
  total = len(nodes)
17
  logger.info(f"Building index from {total} nodes")
18
 
@@ -34,7 +34,7 @@ def build_index(nodes: list, collection_name: str, progress_callback=None) -> Ve
34
  logger.info(f"Index persisted to {index_dir}")
35
  return index
36
 
37
- def ingest_pdfs(collection_name: str, progress_callback=None):
38
  from ingestion.embedding import configure_embedding
39
  from ingestion.loader import load_pdfs
40
  from ingestion.splitter import split_documents
@@ -48,7 +48,7 @@ def ingest_pdfs(collection_name: str, progress_callback=None):
48
  configure_embedding()
49
 
50
  _cb(0.10, "Loading PDF documents...")
51
- docs = load_pdfs(collection_name)
52
 
53
  _cb(0.25, f"Loaded {len(docs)} pages(s). Splitting into chunks...")
54
  nodes = split_documents(docs)
@@ -59,5 +59,5 @@ def ingest_pdfs(collection_name: str, progress_callback=None):
59
  def _build_cb(p, m):
60
  _cb(0.35 + p * 0.60, m)
61
 
62
- build_index(nodes, collection_name, progress_callback=_build_cb)
63
  _cb(1.00, f"Done! Indexed {total} chunks.")
 
6
 
7
  logger = get_logger(__name__)
8
 
9
+ def build_index(nodes: list, progress_callback=None) -> VectorStoreIndex:
10
  def _cb(p, m):
11
  if progress_callback:
12
  progress_callback(p, m)
13
  logger.info(m)
14
 
15
+ index_dir = get_index_dir()
16
  total = len(nodes)
17
  logger.info(f"Building index from {total} nodes")
18
 
 
34
  logger.info(f"Index persisted to {index_dir}")
35
  return index
36
 
37
+ def ingest_pdfs(progress_callback=None):
38
  from ingestion.embedding import configure_embedding
39
  from ingestion.loader import load_pdfs
40
  from ingestion.splitter import split_documents
 
48
  configure_embedding()
49
 
50
  _cb(0.10, "Loading PDF documents...")
51
+ docs = load_pdfs()
52
 
53
  _cb(0.25, f"Loaded {len(docs)} pages(s). Splitting into chunks...")
54
  nodes = split_documents(docs)
 
59
  def _build_cb(p, m):
60
  _cb(0.35 + p * 0.60, m)
61
 
62
+ build_index(nodes, progress_callback=_build_cb)
63
  _cb(1.00, f"Done! Indexed {total} chunks.")
ingestion/loader.py CHANGED
@@ -35,10 +35,10 @@ def _dedupe_lines(text: str) -> str:
35
  return "\n".join(deduped)
36
 
37
 
38
- def load_pdfs(collection_name: str) -> list:
39
- """Load PDFs from a collection using PyMuPDF; fall back to SimpleDirectoryReader if needed."""
40
  docs = []
41
- upload_dir = get_upload_dir(collection_name)
42
  pdf_files = [f for f in os.listdir(upload_dir) if f.lower().endswith(".pdf")]
43
  errors = []
44
 
@@ -102,7 +102,7 @@ def load_pdfs(collection_name: str) -> list:
102
 
103
  if not docs:
104
  if not pdf_files:
105
- raise RuntimeError("No PDF files found in the collection upload folder.")
106
  if errors:
107
  raise RuntimeError(
108
  "PDFs were found but no extractable text was produced. "
 
35
  return "\n".join(deduped)
36
 
37
 
38
+ def load_pdfs() -> list:
39
+ """Load PDFs from the shared upload folder using PyMuPDF; fall back to SimpleDirectoryReader if needed."""
40
  docs = []
41
+ upload_dir = get_upload_dir()
42
  pdf_files = [f for f in os.listdir(upload_dir) if f.lower().endswith(".pdf")]
43
  errors = []
44
 
 
102
 
103
  if not docs:
104
  if not pdf_files:
105
+ raise RuntimeError("No PDF files found in the upload folder.")
106
  if errors:
107
  raise RuntimeError(
108
  "PDFs were found but no extractable text was produced. "
retriever/hybrid_retriever.py CHANGED
@@ -21,11 +21,11 @@ def _extract_filename(metadata: dict) -> str:
21
  return "unknown"
22
 
23
  class HybridRetriever:
24
- def __init__(self, collection_name: str):
25
- index_dir = get_index_dir(collection_name)
26
  if not os.path.exists(index_dir) or not os.listdir(index_dir):
27
  raise RuntimeError(
28
- f"No index found for collection '{collection_name}'. Upload and index PDFs first."
29
  )
30
 
31
  configure_embedding()
 
21
  return "unknown"
22
 
23
  class HybridRetriever:
24
+ def __init__(self):
25
+ index_dir = get_index_dir()
26
  if not os.path.exists(index_dir) or not os.listdir(index_dir):
27
  raise RuntimeError(
28
+ "No index found. Upload and index PDFs first."
29
  )
30
 
31
  configure_embedding()