Spaces:
Sleeping
Sleeping
Commit ·
273e15c
1
Parent(s): 07b3441
Remove collection directories and simplify upload flow
Browse files
- app.py +15 -67
- config.py +4 -10
- ingestion/index_builder.py +5 -5
- ingestion/loader.py +4 -4
- retriever/hybrid_retriever.py +3 -3
app.py
CHANGED
|
@@ -1,12 +1,10 @@
|
|
| 1 |
import os
|
| 2 |
-
import shutil
|
| 3 |
import streamlit as st
|
| 4 |
|
| 5 |
from ingestion import ingest_pdfs
|
| 6 |
from retriever import HybridRetriever
|
| 7 |
from graph import AgentWorkflow
|
| 8 |
from config import (
|
| 9 |
-
COLLECTIONS_DIR,
|
| 10 |
get_upload_dir,
|
| 11 |
get_index_dir,
|
| 12 |
GROQ_FREE_MODELS,
|
|
@@ -15,7 +13,7 @@ from config import (
|
|
| 15 |
DEFAULT_MODEL,
|
| 16 |
)
|
| 17 |
|
| 18 |
-
st.set_page_config(page_title="
|
| 19 |
|
| 20 |
st.markdown(
|
| 21 |
"""
|
|
@@ -81,27 +79,16 @@ st.markdown(
|
|
| 81 |
|
| 82 |
st.markdown('<div class="hero-title">Multi-Agent Hybrid RAG</div>', unsafe_allow_html=True)
|
| 83 |
st.markdown(
|
| 84 |
-
'<p class="hero-subtitle">
|
| 85 |
unsafe_allow_html=True,
|
| 86 |
)
|
| 87 |
|
| 88 |
-
def get_all_collections():
|
| 89 |
-
if not os.path.exists(COLLECTIONS_DIR):
|
| 90 |
-
return ["default"]
|
| 91 |
-
cols = [d for d in os.listdir(COLLECTIONS_DIR) if os.path.isdir(os.path.join(COLLECTIONS_DIR, d))]
|
| 92 |
-
if "default" not in cols:
|
| 93 |
-
cols.append("default")
|
| 94 |
-
return sorted(list(set(cols)))
|
| 95 |
-
|
| 96 |
-
os.makedirs(COLLECTIONS_DIR, exist_ok=True)
|
| 97 |
-
|
| 98 |
defaults = {
|
| 99 |
"chat_history": [],
|
| 100 |
"conversation_history": [],
|
| 101 |
"retriever": None,
|
| 102 |
"model_provider": DEFAULT_PROVIDER,
|
| 103 |
"model_name": DEFAULT_MODEL,
|
| 104 |
-
"active_collection": "default"
|
| 105 |
}
|
| 106 |
for key, val in defaults.items():
|
| 107 |
if key not in st.session_state:
|
|
@@ -126,50 +113,12 @@ with st.sidebar:
|
|
| 126 |
st.session_state.model_provider = model_provider
|
| 127 |
st.session_state.model_name = model_name
|
| 128 |
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
all_collections = get_all_collections()
|
| 132 |
-
|
| 133 |
-
selected_col = st.selectbox(
|
| 134 |
-
"Current Collection",
|
| 135 |
-
all_collections,
|
| 136 |
-
index=all_collections.index(st.session_state.active_collection) if st.session_state.active_collection in all_collections else 0
|
| 137 |
-
)
|
| 138 |
-
|
| 139 |
-
# If collection changed:
|
| 140 |
-
if selected_col != st.session_state.active_collection:
|
| 141 |
-
st.session_state.active_collection = selected_col
|
| 142 |
-
st.session_state.retriever = None
|
| 143 |
-
st.session_state.chat_history = []
|
| 144 |
-
st.session_state.conversation_history = []
|
| 145 |
-
st.rerun()
|
| 146 |
-
|
| 147 |
-
c_new = st.text_input("New Collection Name")
|
| 148 |
-
if st.button("Create Collection"):
|
| 149 |
-
if c_new and c_new.strip() not in all_collections:
|
| 150 |
-
get_upload_dir(c_new.strip())
|
| 151 |
-
st.session_state.active_collection = c_new.strip()
|
| 152 |
-
st.session_state.retriever = None
|
| 153 |
-
st.session_state.chat_history = []
|
| 154 |
-
st.session_state.conversation_history = []
|
| 155 |
-
st.rerun()
|
| 156 |
-
|
| 157 |
-
if selected_col != "default":
|
| 158 |
-
if st.button(f"Delete '{selected_col}'"):
|
| 159 |
-
shutil.rmtree(os.path.join(COLLECTIONS_DIR, selected_col))
|
| 160 |
-
st.session_state.active_collection = "default"
|
| 161 |
-
st.session_state.retriever = None
|
| 162 |
-
st.session_state.chat_history = []
|
| 163 |
-
st.session_state.conversation_history = []
|
| 164 |
-
st.rerun()
|
| 165 |
-
|
| 166 |
-
current_col = st.session_state.active_collection
|
| 167 |
-
upload_dir = get_upload_dir(current_col)
|
| 168 |
-
index_dir = get_index_dir(current_col)
|
| 169 |
|
| 170 |
-
#
|
| 171 |
-
st.markdown("<div class='section-title'>
|
| 172 |
-
st.markdown(
|
| 173 |
col_files = [f for f in os.listdir(upload_dir) if f.lower().endswith(".pdf")]
|
| 174 |
|
| 175 |
if col_files:
|
|
@@ -190,11 +139,11 @@ if col_files:
|
|
| 190 |
st.session_state.retriever = None
|
| 191 |
st.rerun()
|
| 192 |
else:
|
| 193 |
-
st.info("No documents in
|
| 194 |
|
| 195 |
st.markdown("<div class='section-title'>Add documents</div>", unsafe_allow_html=True)
|
| 196 |
uploaded_files = st.file_uploader(
|
| 197 |
-
|
| 198 |
type=["pdf"],
|
| 199 |
accept_multiple_files=True,
|
| 200 |
)
|
|
@@ -208,25 +157,24 @@ if uploaded_files:
|
|
| 208 |
fh.write(f.getbuffer())
|
| 209 |
saved_any = True
|
| 210 |
if saved_any:
|
| 211 |
-
st.success("Files uploaded! Click 'Index
|
| 212 |
st.rerun()
|
| 213 |
|
| 214 |
colbase_has_pdf = len(os.listdir(upload_dir)) > 0
|
| 215 |
index_exists = os.path.exists(index_dir) and len(os.listdir(index_dir)) > 0
|
| 216 |
|
| 217 |
if colbase_has_pdf:
|
| 218 |
-
if st.button("Index
|
| 219 |
progress_bar = st.progress(0)
|
| 220 |
status_text = st.empty()
|
| 221 |
try:
|
| 222 |
ingest_pdfs(
|
| 223 |
-
collection_name=current_col,
|
| 224 |
progress_callback=lambda p, m: (progress_bar.progress(p), status_text.text(m))
|
| 225 |
)
|
| 226 |
st.session_state.retriever = None
|
| 227 |
progress_bar.empty()
|
| 228 |
status_text.empty()
|
| 229 |
-
st.success("
|
| 230 |
st.rerun()
|
| 231 |
except Exception as exc:
|
| 232 |
progress_bar.empty(); status_text.empty()
|
|
@@ -245,17 +193,17 @@ for msg in st.session_state.chat_history:
|
|
| 245 |
with st.expander("Verification Report", expanded=False):
|
| 246 |
st.markdown(msg["verification"])
|
| 247 |
|
| 248 |
-
question = st.chat_input(
|
| 249 |
|
| 250 |
if question:
|
| 251 |
if not index_exists:
|
| 252 |
-
st.warning("Please index the
|
| 253 |
st.stop()
|
| 254 |
|
| 255 |
if st.session_state.retriever is None:
|
| 256 |
with st.spinner("Loading retriever..."):
|
| 257 |
try:
|
| 258 |
-
st.session_state.retriever = HybridRetriever(
|
| 259 |
except Exception as e:
|
| 260 |
st.error(str(e))
|
| 261 |
st.stop()
|
|
|
|
| 1 |
import os
|
|
|
|
| 2 |
import streamlit as st
|
| 3 |
|
| 4 |
from ingestion import ingest_pdfs
|
| 5 |
from retriever import HybridRetriever
|
| 6 |
from graph import AgentWorkflow
|
| 7 |
from config import (
|
|
|
|
| 8 |
get_upload_dir,
|
| 9 |
get_index_dir,
|
| 10 |
GROQ_FREE_MODELS,
|
|
|
|
| 13 |
DEFAULT_MODEL,
|
| 14 |
)
|
| 15 |
|
| 16 |
+
st.set_page_config(page_title="Docchat", layout="wide")
|
| 17 |
|
| 18 |
st.markdown(
|
| 19 |
"""
|
|
|
|
| 79 |
|
| 80 |
st.markdown('<div class="hero-title">Multi-Agent Hybrid RAG</div>', unsafe_allow_html=True)
|
| 81 |
st.markdown(
|
| 82 |
+
'<p class="hero-subtitle">Upload PDFs, index them, and chat with grounded answers.</p>',
|
| 83 |
unsafe_allow_html=True,
|
| 84 |
)
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
defaults = {
|
| 87 |
"chat_history": [],
|
| 88 |
"conversation_history": [],
|
| 89 |
"retriever": None,
|
| 90 |
"model_provider": DEFAULT_PROVIDER,
|
| 91 |
"model_name": DEFAULT_MODEL,
|
|
|
|
| 92 |
}
|
| 93 |
for key, val in defaults.items():
|
| 94 |
if key not in st.session_state:
|
|
|
|
| 113 |
st.session_state.model_provider = model_provider
|
| 114 |
st.session_state.model_name = model_name
|
| 115 |
|
| 116 |
+
upload_dir = get_upload_dir()
|
| 117 |
+
index_dir = get_index_dir()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
+
# Upload Manager
|
| 120 |
+
st.markdown("<div class='section-title'>Upload files</div>", unsafe_allow_html=True)
|
| 121 |
+
st.markdown("### Upload folder <span class='chip'>shared</span>", unsafe_allow_html=True)
|
| 122 |
col_files = [f for f in os.listdir(upload_dir) if f.lower().endswith(".pdf")]
|
| 123 |
|
| 124 |
if col_files:
|
|
|
|
| 139 |
st.session_state.retriever = None
|
| 140 |
st.rerun()
|
| 141 |
else:
|
| 142 |
+
st.info("No documents in the upload folder.")
|
| 143 |
|
| 144 |
st.markdown("<div class='section-title'>Add documents</div>", unsafe_allow_html=True)
|
| 145 |
uploaded_files = st.file_uploader(
|
| 146 |
+
"Add PDFs",
|
| 147 |
type=["pdf"],
|
| 148 |
accept_multiple_files=True,
|
| 149 |
)
|
|
|
|
| 157 |
fh.write(f.getbuffer())
|
| 158 |
saved_any = True
|
| 159 |
if saved_any:
|
| 160 |
+
st.success("Files uploaded! Click 'Index PDFs' to apply changes.")
|
| 161 |
st.rerun()
|
| 162 |
|
| 163 |
colbase_has_pdf = len(os.listdir(upload_dir)) > 0
|
| 164 |
index_exists = os.path.exists(index_dir) and len(os.listdir(index_dir)) > 0
|
| 165 |
|
| 166 |
if colbase_has_pdf:
|
| 167 |
+
if st.button("Index PDFs", type="primary"):
|
| 168 |
progress_bar = st.progress(0)
|
| 169 |
status_text = st.empty()
|
| 170 |
try:
|
| 171 |
ingest_pdfs(
|
|
|
|
| 172 |
progress_callback=lambda p, m: (progress_bar.progress(p), status_text.text(m))
|
| 173 |
)
|
| 174 |
st.session_state.retriever = None
|
| 175 |
progress_bar.empty()
|
| 176 |
status_text.empty()
|
| 177 |
+
st.success("Index ready! You can now ask questions.")
|
| 178 |
st.rerun()
|
| 179 |
except Exception as exc:
|
| 180 |
progress_bar.empty(); status_text.empty()
|
|
|
|
| 193 |
with st.expander("Verification Report", expanded=False):
|
| 194 |
st.markdown(msg["verification"])
|
| 195 |
|
| 196 |
+
question = st.chat_input("Ask about your PDFs...")
|
| 197 |
|
| 198 |
if question:
|
| 199 |
if not index_exists:
|
| 200 |
+
st.warning("Please index the PDFs first before asking questions.")
|
| 201 |
st.stop()
|
| 202 |
|
| 203 |
if st.session_state.retriever is None:
|
| 204 |
with st.spinner("Loading retriever..."):
|
| 205 |
try:
|
| 206 |
+
st.session_state.retriever = HybridRetriever()
|
| 207 |
except Exception as e:
|
| 208 |
st.error(str(e))
|
| 209 |
st.stop()
|
config.py
CHANGED
|
@@ -22,20 +22,14 @@ def _select_data_dir() -> str:
|
|
| 22 |
|
| 23 |
|
| 24 |
DATA_DIR = _select_data_dir()
|
| 25 |
-
COLLECTIONS_DIR = os.path.join(DATA_DIR, "collections")
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
def get_collection_dir(collection_name: str) -> str:
|
| 30 |
-
return os.path.join(COLLECTIONS_DIR, collection_name)
|
| 31 |
-
|
| 32 |
-
def get_upload_dir(collection_name: str) -> str:
|
| 33 |
-
path = os.path.join(get_collection_dir(collection_name), "raw_pdfs")
|
| 34 |
os.makedirs(path, exist_ok=True)
|
| 35 |
return path
|
| 36 |
|
| 37 |
-
def get_index_dir(
|
| 38 |
-
path = os.path.join(
|
| 39 |
os.makedirs(path, exist_ok=True)
|
| 40 |
return path
|
| 41 |
|
|
|
|
| 22 |
|
| 23 |
|
| 24 |
DATA_DIR = _select_data_dir()
|
|
|
|
| 25 |
|
| 26 |
+
def get_upload_dir() -> str:
|
| 27 |
+
path = os.path.join(DATA_DIR, "raw_pdfs")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
os.makedirs(path, exist_ok=True)
|
| 29 |
return path
|
| 30 |
|
| 31 |
+
def get_index_dir() -> str:
|
| 32 |
+
path = os.path.join(DATA_DIR, "llamaindex")
|
| 33 |
os.makedirs(path, exist_ok=True)
|
| 34 |
return path
|
| 35 |
|
ingestion/index_builder.py
CHANGED
|
@@ -6,13 +6,13 @@ from utils import get_logger
|
|
| 6 |
|
| 7 |
logger = get_logger(__name__)
|
| 8 |
|
| 9 |
-
def build_index(nodes: list,
|
| 10 |
def _cb(p, m):
|
| 11 |
if progress_callback:
|
| 12 |
progress_callback(p, m)
|
| 13 |
logger.info(m)
|
| 14 |
|
| 15 |
-
index_dir = get_index_dir(
|
| 16 |
total = len(nodes)
|
| 17 |
logger.info(f"Building index from {total} nodes")
|
| 18 |
|
|
@@ -34,7 +34,7 @@ def build_index(nodes: list, collection_name: str, progress_callback=None) -> Ve
|
|
| 34 |
logger.info(f"Index persisted to {index_dir}")
|
| 35 |
return index
|
| 36 |
|
| 37 |
-
def ingest_pdfs(
|
| 38 |
from ingestion.embedding import configure_embedding
|
| 39 |
from ingestion.loader import load_pdfs
|
| 40 |
from ingestion.splitter import split_documents
|
|
@@ -48,7 +48,7 @@ def ingest_pdfs(collection_name: str, progress_callback=None):
|
|
| 48 |
configure_embedding()
|
| 49 |
|
| 50 |
_cb(0.10, "Loading PDF documents...")
|
| 51 |
-
docs = load_pdfs(
|
| 52 |
|
| 53 |
_cb(0.25, f"Loaded {len(docs)} pages(s). Splitting into chunks...")
|
| 54 |
nodes = split_documents(docs)
|
|
@@ -59,5 +59,5 @@ def ingest_pdfs(collection_name: str, progress_callback=None):
|
|
| 59 |
def _build_cb(p, m):
|
| 60 |
_cb(0.35 + p * 0.60, m)
|
| 61 |
|
| 62 |
-
build_index(nodes,
|
| 63 |
_cb(1.00, f"Done! Indexed {total} chunks.")
|
|
|
|
| 6 |
|
| 7 |
logger = get_logger(__name__)
|
| 8 |
|
| 9 |
+
def build_index(nodes: list, progress_callback=None) -> VectorStoreIndex:
|
| 10 |
def _cb(p, m):
|
| 11 |
if progress_callback:
|
| 12 |
progress_callback(p, m)
|
| 13 |
logger.info(m)
|
| 14 |
|
| 15 |
+
index_dir = get_index_dir()
|
| 16 |
total = len(nodes)
|
| 17 |
logger.info(f"Building index from {total} nodes")
|
| 18 |
|
|
|
|
| 34 |
logger.info(f"Index persisted to {index_dir}")
|
| 35 |
return index
|
| 36 |
|
| 37 |
+
def ingest_pdfs(progress_callback=None):
|
| 38 |
from ingestion.embedding import configure_embedding
|
| 39 |
from ingestion.loader import load_pdfs
|
| 40 |
from ingestion.splitter import split_documents
|
|
|
|
| 48 |
configure_embedding()
|
| 49 |
|
| 50 |
_cb(0.10, "Loading PDF documents...")
|
| 51 |
+
docs = load_pdfs()
|
| 52 |
|
| 53 |
_cb(0.25, f"Loaded {len(docs)} pages(s). Splitting into chunks...")
|
| 54 |
nodes = split_documents(docs)
|
|
|
|
| 59 |
def _build_cb(p, m):
|
| 60 |
_cb(0.35 + p * 0.60, m)
|
| 61 |
|
| 62 |
+
build_index(nodes, progress_callback=_build_cb)
|
| 63 |
_cb(1.00, f"Done! Indexed {total} chunks.")
|
ingestion/loader.py
CHANGED
|
@@ -35,10 +35,10 @@ def _dedupe_lines(text: str) -> str:
|
|
| 35 |
return "\n".join(deduped)
|
| 36 |
|
| 37 |
|
| 38 |
-
def load_pdfs(
|
| 39 |
-
"""Load PDFs from
|
| 40 |
docs = []
|
| 41 |
-
upload_dir = get_upload_dir(
|
| 42 |
pdf_files = [f for f in os.listdir(upload_dir) if f.lower().endswith(".pdf")]
|
| 43 |
errors = []
|
| 44 |
|
|
@@ -102,7 +102,7 @@ def load_pdfs(collection_name: str) -> list:
|
|
| 102 |
|
| 103 |
if not docs:
|
| 104 |
if not pdf_files:
|
| 105 |
-
raise RuntimeError("No PDF files found in the
|
| 106 |
if errors:
|
| 107 |
raise RuntimeError(
|
| 108 |
"PDFs were found but no extractable text was produced. "
|
|
|
|
| 35 |
return "\n".join(deduped)
|
| 36 |
|
| 37 |
|
| 38 |
+
def load_pdfs() -> list:
|
| 39 |
+
"""Load PDFs from the shared upload folder using PyMuPDF; fall back to SimpleDirectoryReader if needed."""
|
| 40 |
docs = []
|
| 41 |
+
upload_dir = get_upload_dir()
|
| 42 |
pdf_files = [f for f in os.listdir(upload_dir) if f.lower().endswith(".pdf")]
|
| 43 |
errors = []
|
| 44 |
|
|
|
|
| 102 |
|
| 103 |
if not docs:
|
| 104 |
if not pdf_files:
|
| 105 |
+
raise RuntimeError("No PDF files found in the upload folder.")
|
| 106 |
if errors:
|
| 107 |
raise RuntimeError(
|
| 108 |
"PDFs were found but no extractable text was produced. "
|
retriever/hybrid_retriever.py
CHANGED
|
@@ -21,11 +21,11 @@ def _extract_filename(metadata: dict) -> str:
|
|
| 21 |
return "unknown"
|
| 22 |
|
| 23 |
class HybridRetriever:
|
| 24 |
-
def __init__(self
|
| 25 |
-
index_dir = get_index_dir(
|
| 26 |
if not os.path.exists(index_dir) or not os.listdir(index_dir):
|
| 27 |
raise RuntimeError(
|
| 28 |
-
|
| 29 |
)
|
| 30 |
|
| 31 |
configure_embedding()
|
|
|
|
| 21 |
return "unknown"
|
| 22 |
|
| 23 |
class HybridRetriever:
|
| 24 |
+
def __init__(self):
|
| 25 |
+
index_dir = get_index_dir()
|
| 26 |
if not os.path.exists(index_dir) or not os.listdir(index_dir):
|
| 27 |
raise RuntimeError(
|
| 28 |
+
"No index found. Upload and index PDFs first."
|
| 29 |
)
|
| 30 |
|
| 31 |
configure_embedding()
|