|
|
""" |
|
|
Streamlit UI for the RAG system. |
|
|
""" |
|
|
|
|
|
import os |
|
|
import streamlit as st |
|
|
import tempfile |
|
|
import logging |
|
|
from dotenv import load_dotenv |
|
|
|
|
|
|
|
|
# Load environment variables (API keys, model paths, ...) from a local .env
# file BEFORE importing project modules that may read os.environ at import time.
load_dotenv()

from config import get_logging_config

import logging.config

# Apply the project-wide logging configuration (dictConfig schema from config.py).
logging.config.dictConfig(get_logging_config())

# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
# Page chrome — Streamlit requires this to be the first st.* call in the script.
st.set_page_config(
    page_title="RAG Document QA System",
    page_icon="π",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Seed per-session state with defaults, but never clobber values that
# survived an earlier rerun of this session.
_SESSION_DEFAULTS = {"document_count": 0, "initialized": False}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
|
|
|
|
|
|
|
|
@st.cache_resource
def initialize_rag_engine():
    """Build and cache the RAG engine for the lifetime of the process.

    Imports are deferred into the function body so heavy model/DB setup
    only runs when the cached resource is first created.

    Returns:
        The engine produced by ``create_rag_engine``, wired to a freshly
        created embedding model and a vector database sized to match the
        embedder's output dimension.
    """
    from embedding.model import create_embedding_model
    from storage.vector_db import create_vector_database
    from rag.engine import create_rag_engine

    embedder = create_embedding_model()
    store = create_vector_database(dimension=embedder.dimension)
    engine = create_rag_engine(embedder=embedder, vector_db=store)

    # NOTE(review): this flag is set inside a cached function, so a new
    # browser session that reuses the cached engine will not re-run this
    # line — confirm nothing depends on `initialized` across sessions.
    st.session_state.initialized = True
    return engine
|
|
|
|
|
|
|
|
@st.cache_resource
def initialize_document_processor():
    """Create and cache a single DocumentProcessor for the whole process."""
    from document.processor import DocumentProcessor

    processor = DocumentProcessor()
    return processor
|
|
|
|
|
|
|
|
def main():
    """Main Streamlit application.

    Renders the whole page top-to-bottom on every rerun: sidebar controls
    (upload, chunking/search settings, store management), then either a
    "get started" panel with a sample-text box (empty store) or the
    question/answer panel (documents present).
    """

    # Both factories are @st.cache_resource-backed, so these are cheap
    # lookups after the first run.
    rag_engine = initialize_rag_engine()
    doc_processor = initialize_document_processor()

    # Refresh the displayed count from the store on every rerun.
    st.session_state.document_count = rag_engine.count_documents()

    st.sidebar.title("π RAG Document QA")

    # --- Sidebar: document upload -------------------------------------
    st.sidebar.header("Upload Documents")
    uploaded_file = st.sidebar.file_uploader(
        "Choose a document file (PDF, TXT, DOCX)",
        type=["pdf", "txt", "md", "docx"]
    )

    # --- Sidebar: chunking parameters ---------------------------------
    st.sidebar.subheader("Document Settings")
    chunk_size = st.sidebar.slider(
        "Chunk Size",
        min_value=100,
        max_value=2000,
        value=1000,
        step=100,
        help="Size of text chunks in characters"
    )
    chunk_overlap = st.sidebar.slider(
        "Chunk Overlap",
        min_value=0,
        max_value=500,
        value=200,
        step=50,
        help="Overlap between chunks in characters"
    )

    # --- Sidebar: retrieval parameters --------------------------------
    st.sidebar.header("Search Settings")
    top_k = st.sidebar.slider(
        "Results to Return",
        min_value=1,
        max_value=10,
        value=3,
        help="Number of document chunks to retrieve"
    )
    search_type = st.sidebar.selectbox(
        "Search Type",
        options=["hybrid", "semantic", "keyword"],
        index=0,
        help="Type of search to perform"
    )

    # --- Sidebar: store status / management ---------------------------
    st.sidebar.header("Document Store")
    st.sidebar.metric("Documents Stored", st.session_state.document_count)

    if st.sidebar.button("Clear All Documents"):
        rag_engine.clear_documents()
        st.session_state.document_count = 0
        st.sidebar.success("Document store cleared!")
        # NOTE(review): st.experimental_rerun() was removed in newer
        # Streamlit releases in favor of st.rerun() — confirm the pinned
        # Streamlit version before upgrading.
        st.experimental_rerun()

    # --- Ingest an uploaded file --------------------------------------
    if uploaded_file is not None:
        with st.sidebar.expander("Upload Status", expanded=True):
            with st.spinner('Processing document...'):

                # Spool the upload to disk (delete=False so the path stays
                # valid outside the `with`); keep the original extension so
                # the processor can dispatch on file type.
                with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp_file:
                    tmp_file.write(uploaded_file.getvalue())
                    tmp_file_path = tmp_file.name

                try:
                    # Push the current slider values onto the (cached,
                    # shared) processor before chunking.
                    doc_processor.chunk_size = chunk_size
                    doc_processor.chunk_overlap = chunk_overlap

                    chunks, chunk_metadata = doc_processor.process_file(
                        tmp_file_path,
                        metadata={"filename": uploaded_file.name, "source": "UI upload"}
                    )

                    if not chunks:
                        st.sidebar.error("No text could be extracted from the document.")
                    else:
                        doc_ids = rag_engine.add_documents(chunks, chunk_metadata)

                        # Re-read the count from the store rather than
                        # adding len(chunks) locally.
                        st.session_state.document_count = rag_engine.count_documents()

                        st.sidebar.success(f"Added {len(chunks)} document chunks!")
                except Exception as e:
                    st.sidebar.error(f"Error processing document: {str(e)}")
                finally:
                    # Always remove the temp file, even on failure.
                    os.unlink(tmp_file_path)

    # --- Main panel ---------------------------------------------------
    st.title("π Document Query System")

    if st.session_state.document_count == 0:
        # Empty store: onboarding hint plus a quick way to seed content.
        st.info("π Please upload documents using the sidebar to get started.")

        st.subheader("Sample Text")
        sample_text = st.text_area(
            "Or try adding some sample text directly:",
            height=200
        )

        if sample_text and st.button("Add Sample Text"):
            with st.spinner('Processing text...'):
                # NOTE(review): _chunk_text is a private method of
                # DocumentProcessor — consider exposing a public
                # text-chunking entry point instead.
                chunks = doc_processor._chunk_text(sample_text, chunk_size, chunk_overlap)

                # One metadata dict per chunk, indexed in order.
                chunk_metadata = [
                    {"source": "Sample text", "chunk_id": i, "total_chunks": len(chunks)}
                    for i in range(len(chunks))
                ]

                doc_ids = rag_engine.add_documents(chunks, chunk_metadata)

                st.session_state.document_count = rag_engine.count_documents()

                st.success(f"Added {len(chunks)} text chunks!")
                # NOTE(review): deprecated in newer Streamlit; see above.
                st.experimental_rerun()
    else:
        # Documents present: question/answer flow.
        st.subheader("Ask a Question")
        question = st.text_input("Enter your question:")

        if question:
            with st.spinner('Searching for answer...'):
                try:
                    result = rag_engine.generate_response(
                        query=question,
                        top_k=top_k,
                        search_type=search_type
                    )

                    st.markdown("### Answer")
                    st.write(result["response"])

                    # Show each retrieved chunk with its score and origin.
                    st.markdown("### Sources")
                    for i, doc in enumerate(result["retrieved_documents"]):
                        with st.expander(f"Source {i+1} (Score: {doc['score']:.2f})"):
                            st.markdown(f"**Source:** {doc['metadata'].get('source', 'Unknown')}")
                            st.text(doc["text"])
                except Exception as e:
                    st.error(f"Error generating response: {str(e)}")

    # --- Sidebar footer -----------------------------------------------
    st.sidebar.markdown("---")
    st.sidebar.info(
        "This application allows you to upload documents and ask questions about their content. "
        "The system uses embedding models for semantic search and retrieval."
    )
|
|
|
|
|
|
|
|
# Script entry point (run via `streamlit run <this file>`).
if __name__ == "__main__":
    main()
|
|
|