Spaces:

Maheshmahi04
/

DocChat

Sleeping

App Files Files Community

MaheshLEO4 committed on Mar 23

Commit

273e15c

1 Parent(s): 07b3441

Remove collection directories and simplify upload flow

Browse files

Files changed (5) hide show

app.py +15 -67
config.py +4 -10
ingestion/index_builder.py +5 -5
ingestion/loader.py +4 -4
retriever/hybrid_retriever.py +3 -3

app.py CHANGED Viewed

@@ -1,12 +1,10 @@
 import os
-import shutil
 import streamlit as st
 from ingestion import ingest_pdfs
 from retriever import HybridRetriever
 from graph import AgentWorkflow
 from config import (
-    COLLECTIONS_DIR,
     get_upload_dir,
     get_index_dir,
     GROQ_FREE_MODELS,
@@ -15,7 +13,7 @@ from config import (
     DEFAULT_MODEL,
 )
-st.set_page_config(page_title="Multi-Agent RAG", layout="wide")
 st.markdown(
     """
@@ -81,27 +79,16 @@ st.markdown(
 st.markdown('<div class="hero-title">Multi-Agent Hybrid RAG</div>', unsafe_allow_html=True)
 st.markdown(
-    '<p class="hero-subtitle">Create collections, index PDFs, and chat with scoped answers.</p>',
     unsafe_allow_html=True,
 )
-def get_all_collections():
-    if not os.path.exists(COLLECTIONS_DIR):
-        return ["default"]
-    cols = [d for d in os.listdir(COLLECTIONS_DIR) if os.path.isdir(os.path.join(COLLECTIONS_DIR, d))]
-    if "default" not in cols:
-        cols.append("default")
-    return sorted(list(set(cols)))
-os.makedirs(COLLECTIONS_DIR, exist_ok=True)
 defaults = {
     "chat_history":          [],
     "conversation_history":  [],
     "retriever":             None,
     "model_provider":        DEFAULT_PROVIDER,
     "model_name":            DEFAULT_MODEL,
-    "active_collection":     "default"
 }
 for key, val in defaults.items():
     if key not in st.session_state:
@@ -126,50 +113,12 @@ with st.sidebar:
     st.session_state.model_provider = model_provider
     st.session_state.model_name = model_name
-    st.divider()
-    st.subheader("Collections")
-    all_collections = get_all_collections()
-    selected_col = st.selectbox(
-        "Current Collection",
-        all_collections,
-        index=all_collections.index(st.session_state.active_collection) if st.session_state.active_collection in all_collections else 0
-    )
-    # If collection changed:
-    if selected_col != st.session_state.active_collection:
-        st.session_state.active_collection = selected_col
-        st.session_state.retriever = None
-        st.session_state.chat_history = []
-        st.session_state.conversation_history = []
-        st.rerun()
-    c_new = st.text_input("New Collection Name")
-    if st.button("Create Collection"):
-        if c_new and c_new.strip() not in all_collections:
-            get_upload_dir(c_new.strip())
-            st.session_state.active_collection = c_new.strip()
-            st.session_state.retriever = None
-            st.session_state.chat_history = []
-            st.session_state.conversation_history = []
-            st.rerun()
-    if selected_col != "default":
-        if st.button(f"Delete '{selected_col}'"):
-            shutil.rmtree(os.path.join(COLLECTIONS_DIR, selected_col))
-            st.session_state.active_collection = "default"
-            st.session_state.retriever = None
-            st.session_state.chat_history = []
-            st.session_state.conversation_history = []
-            st.rerun()
-current_col = st.session_state.active_collection
-upload_dir = get_upload_dir(current_col)
-index_dir = get_index_dir(current_col)
-# Collection Manager
-st.markdown("<div class='section-title'>Collection files</div>", unsafe_allow_html=True)
-st.markdown(f"### {current_col} <span class='chip'>upload folder</span>", unsafe_allow_html=True)
 col_files = [f for f in os.listdir(upload_dir) if f.lower().endswith(".pdf")]
 if col_files:
@@ -190,11 +139,11 @@ if col_files:
             st.session_state.retriever = None
             st.rerun()
 else:
-    st.info("No documents in this collection.")
 st.markdown("<div class='section-title'>Add documents</div>", unsafe_allow_html=True)
 uploaded_files = st.file_uploader(
-    f"Add PDFs to '{current_col}'",
     type=["pdf"],
     accept_multiple_files=True,
 )
@@ -208,25 +157,24 @@ if uploaded_files:
                 fh.write(f.getbuffer())
             saved_any = True
     if saved_any:
-        st.success("Files uploaded! Click 'Index Collection' to apply changes.")
         st.rerun()
 colbase_has_pdf = len(os.listdir(upload_dir)) > 0
 index_exists = os.path.exists(index_dir) and len(os.listdir(index_dir)) > 0
 if colbase_has_pdf:
-    if st.button("Index / Re-index Collection", type="primary"):
         progress_bar = st.progress(0)
         status_text  = st.empty()
         try:
             ingest_pdfs(
-                collection_name=current_col,
                 progress_callback=lambda p, m: (progress_bar.progress(p), status_text.text(m))
             )
             st.session_state.retriever = None
             progress_bar.empty()
             status_text.empty()
-            st.success("Collection indexed! You can now ask questions.")
             st.rerun()
         except Exception as exc:
             progress_bar.empty(); status_text.empty()
@@ -245,17 +193,17 @@ for msg in st.session_state.chat_history:
         with st.expander("Verification Report", expanded=False):
             st.markdown(msg["verification"])
-question = st.chat_input(f"Ask about '{current_col}'...")
 if question:
     if not index_exists:
-        st.warning("Please index the collection first before asking questions.")
         st.stop()
     if st.session_state.retriever is None:
         with st.spinner("Loading retriever..."):
             try:
-                st.session_state.retriever = HybridRetriever(collection_name=current_col)
             except Exception as e:
                 st.error(str(e))
                 st.stop()

 import os
 import streamlit as st
 from ingestion import ingest_pdfs
 from retriever import HybridRetriever
 from graph import AgentWorkflow
 from config import (
     get_upload_dir,
     get_index_dir,
     GROQ_FREE_MODELS,
     DEFAULT_MODEL,
 )
+st.set_page_config(page_title="Docchat", layout="wide")
 st.markdown(
     """
 st.markdown('<div class="hero-title">Multi-Agent Hybrid RAG</div>', unsafe_allow_html=True)
 st.markdown(
+    '<p class="hero-subtitle">Upload PDFs, index them, and chat with grounded answers.</p>',
     unsafe_allow_html=True,
 )
 defaults = {
     "chat_history":          [],
     "conversation_history":  [],
     "retriever":             None,
     "model_provider":        DEFAULT_PROVIDER,
     "model_name":            DEFAULT_MODEL,
 }
 for key, val in defaults.items():
     if key not in st.session_state:
     st.session_state.model_provider = model_provider
     st.session_state.model_name = model_name
+upload_dir = get_upload_dir()
+index_dir = get_index_dir()
+# Upload Manager
+st.markdown("<div class='section-title'>Upload files</div>", unsafe_allow_html=True)
+st.markdown("### Upload folder <span class='chip'>shared</span>", unsafe_allow_html=True)
 col_files = [f for f in os.listdir(upload_dir) if f.lower().endswith(".pdf")]
 if col_files:
             st.session_state.retriever = None
             st.rerun()
 else:
+    st.info("No documents in the upload folder.")
 st.markdown("<div class='section-title'>Add documents</div>", unsafe_allow_html=True)
 uploaded_files = st.file_uploader(
+    "Add PDFs",
     type=["pdf"],
     accept_multiple_files=True,
 )
                 fh.write(f.getbuffer())
             saved_any = True
     if saved_any:
+        st.success("Files uploaded! Click 'Index PDFs' to apply changes.")
         st.rerun()
 colbase_has_pdf = len(os.listdir(upload_dir)) > 0
 index_exists = os.path.exists(index_dir) and len(os.listdir(index_dir)) > 0
 if colbase_has_pdf:
+    if st.button("Index PDFs", type="primary"):
         progress_bar = st.progress(0)
         status_text  = st.empty()
         try:
             ingest_pdfs(
                 progress_callback=lambda p, m: (progress_bar.progress(p), status_text.text(m))
             )
             st.session_state.retriever = None
             progress_bar.empty()
             status_text.empty()
+            st.success("Index ready! You can now ask questions.")
             st.rerun()
         except Exception as exc:
             progress_bar.empty(); status_text.empty()
         with st.expander("Verification Report", expanded=False):
             st.markdown(msg["verification"])
+question = st.chat_input("Ask about your PDFs...")
 if question:
     if not index_exists:
+        st.warning("Please index the PDFs first before asking questions.")
         st.stop()
     if st.session_state.retriever is None:
         with st.spinner("Loading retriever..."):
             try:
+                st.session_state.retriever = HybridRetriever()
             except Exception as e:
                 st.error(str(e))
                 st.stop()

config.py CHANGED Viewed

@@ -22,20 +22,14 @@ def _select_data_dir() -> str:
 DATA_DIR = _select_data_dir()
-COLLECTIONS_DIR = os.path.join(DATA_DIR, "collections")
-os.makedirs(COLLECTIONS_DIR, exist_ok=True)
-def get_collection_dir(collection_name: str) -> str:
-    return os.path.join(COLLECTIONS_DIR, collection_name)
-def get_upload_dir(collection_name: str) -> str:
-    path = os.path.join(get_collection_dir(collection_name), "raw_pdfs")
     os.makedirs(path, exist_ok=True)
     return path
-def get_index_dir(collection_name: str) -> str:
-    path = os.path.join(get_collection_dir(collection_name), "llamaindex")
     os.makedirs(path, exist_ok=True)
     return path

 DATA_DIR = _select_data_dir()
+def get_upload_dir() -> str:
+    path = os.path.join(DATA_DIR, "raw_pdfs")
     os.makedirs(path, exist_ok=True)
     return path
+def get_index_dir() -> str:
+    path = os.path.join(DATA_DIR, "llamaindex")
     os.makedirs(path, exist_ok=True)
     return path

ingestion/index_builder.py CHANGED Viewed

@@ -6,13 +6,13 @@ from utils import get_logger
 logger = get_logger(__name__)
-def build_index(nodes: list, collection_name: str, progress_callback=None) -> VectorStoreIndex:
     def _cb(p, m):
         if progress_callback:
             progress_callback(p, m)
         logger.info(m)
-    index_dir = get_index_dir(collection_name)
     total = len(nodes)
     logger.info(f"Building index from {total} nodes")
@@ -34,7 +34,7 @@ def build_index(nodes: list, collection_name: str, progress_callback=None) -> Ve
     logger.info(f"Index persisted to {index_dir}")
     return index
-def ingest_pdfs(collection_name: str, progress_callback=None):
     from ingestion.embedding import configure_embedding
     from ingestion.loader import load_pdfs
     from ingestion.splitter import split_documents
@@ -48,7 +48,7 @@ def ingest_pdfs(collection_name: str, progress_callback=None):
     configure_embedding()
     _cb(0.10, "Loading PDF documents...")
-    docs = load_pdfs(collection_name)
     _cb(0.25, f"Loaded {len(docs)} pages(s). Splitting into chunks...")
     nodes = split_documents(docs)
@@ -59,5 +59,5 @@ def ingest_pdfs(collection_name: str, progress_callback=None):
     def _build_cb(p, m):
         _cb(0.35 + p * 0.60, m)
-    build_index(nodes, collection_name, progress_callback=_build_cb)
     _cb(1.00, f"Done! Indexed {total} chunks.")

 logger = get_logger(__name__)
+def build_index(nodes: list, progress_callback=None) -> VectorStoreIndex:
     def _cb(p, m):
         if progress_callback:
             progress_callback(p, m)
         logger.info(m)
+    index_dir = get_index_dir()
     total = len(nodes)
     logger.info(f"Building index from {total} nodes")
     logger.info(f"Index persisted to {index_dir}")
     return index
+def ingest_pdfs(progress_callback=None):
     from ingestion.embedding import configure_embedding
     from ingestion.loader import load_pdfs
     from ingestion.splitter import split_documents
     configure_embedding()
     _cb(0.10, "Loading PDF documents...")
+    docs = load_pdfs()
     _cb(0.25, f"Loaded {len(docs)} pages(s). Splitting into chunks...")
     nodes = split_documents(docs)
     def _build_cb(p, m):
         _cb(0.35 + p * 0.60, m)
+    build_index(nodes, progress_callback=_build_cb)
     _cb(1.00, f"Done! Indexed {total} chunks.")

ingestion/loader.py CHANGED Viewed

@@ -35,10 +35,10 @@ def _dedupe_lines(text: str) -> str:
     return "\n".join(deduped)
-def load_pdfs(collection_name: str) -> list:
-    """Load PDFs from a collection using PyMuPDF; fall back to SimpleDirectoryReader if needed."""
     docs = []
-    upload_dir = get_upload_dir(collection_name)
     pdf_files = [f for f in os.listdir(upload_dir) if f.lower().endswith(".pdf")]
     errors = []
@@ -102,7 +102,7 @@ def load_pdfs(collection_name: str) -> list:
     if not docs:
         if not pdf_files:
-            raise RuntimeError("No PDF files found in the collection upload folder.")
         if errors:
             raise RuntimeError(
                 "PDFs were found but no extractable text was produced. "

     return "\n".join(deduped)
+def load_pdfs() -> list:
+    """Load PDFs from the shared upload folder using PyMuPDF; fall back to SimpleDirectoryReader if needed."""
     docs = []
+    upload_dir = get_upload_dir()
     pdf_files = [f for f in os.listdir(upload_dir) if f.lower().endswith(".pdf")]
     errors = []
     if not docs:
         if not pdf_files:
+            raise RuntimeError("No PDF files found in the upload folder.")
         if errors:
             raise RuntimeError(
                 "PDFs were found but no extractable text was produced. "

retriever/hybrid_retriever.py CHANGED Viewed

@@ -21,11 +21,11 @@ def _extract_filename(metadata: dict) -> str:
     return "unknown"
 class HybridRetriever:
-    def __init__(self, collection_name: str):
-        index_dir = get_index_dir(collection_name)
         if not os.path.exists(index_dir) or not os.listdir(index_dir):
             raise RuntimeError(
-                f"No index found for collection '{collection_name}'. Upload and index PDFs first."
             )
         configure_embedding()

     return "unknown"
 class HybridRetriever:
+    def __init__(self):
+        index_dir = get_index_dir()
         if not os.path.exists(index_dir) or not os.listdir(index_dir):
             raise RuntimeError(
+                "No index found. Upload and index PDFs first."
             )
         configure_embedding()