# pdf / streamlit-app.py
# navid72m's picture
# Upload 9 files
# 43efcb9 verified
"""
Streamlit UI for the RAG system.
"""
import os
import streamlit as st
import tempfile
import logging
from dotenv import load_dotenv
# Load environment variables before anything below reads them.
# NOTE(review): presumably `config` consults environment variables when it is
# imported or when get_logging_config() runs, which would explain why these
# imports sit below load_dotenv() instead of at the top of the file — confirm
# before reordering.
load_dotenv()
# Configure logging from the dict returned by the project's config module
from config import get_logging_config
import logging.config
logging.config.dictConfig(get_logging_config())
# Module-level logger named after this module
logger = logging.getLogger(__name__)
# Set page config (title, icon, layout and sidebar state for the browser tab).
# The icon is the books emoji; it was previously stored as mis-decoded bytes.
st.set_page_config(
    page_title="RAG Document QA System",
    page_icon="📚",
    layout="wide",
    initial_sidebar_state="expanded",
)
# Seed per-session defaults; keys that already exist (from an earlier rerun of
# this script in the same session) are left untouched.
for state_key, default_value in (("document_count", 0), ("initialized", False)):
    if state_key not in st.session_state:
        st.session_state[state_key] = default_value
# Initialize RAG engine
@st.cache_resource
def _build_rag_engine():
    """Build the RAG engine once and let Streamlit cache the instance.

    Creates the embedding model, a vector database sized to the model's
    ``dimension``, and a RAG engine wired to both. Project imports are done
    lazily inside the function, as in the original implementation.
    """
    from embedding.model import create_embedding_model
    from storage.vector_db import create_vector_database
    from rag.engine import create_rag_engine

    embedding_model = create_embedding_model()
    vector_db = create_vector_database(dimension=embedding_model.dimension)
    return create_rag_engine(
        embedder=embedding_model,
        vector_db=vector_db,
    )


def initialize_rag_engine():
    """Initialize RAG engine.

    Returns the cached engine and marks the current session as initialized.

    The session flag is set here rather than inside the cached builder: a
    cached function body only runs on a cache miss, so a flag written there
    would be set for the first session only and stay ``False`` for every
    session that is served from the cache.
    """
    rag_engine = _build_rag_engine()
    st.session_state.initialized = True
    return rag_engine


# Preserve the cache-clearing hook callers had when this function itself
# carried the @st.cache_resource decorator.
initialize_rag_engine.clear = _build_rag_engine.clear
# Initialize document processor
@st.cache_resource
def initialize_document_processor():
    """Create the document processor; Streamlit caches the returned instance.

    The project import is deferred until the first call.
    """
    from document.processor import DocumentProcessor

    processor = DocumentProcessor()
    return processor
# Main application
# Session-state key remembering which upload has already been ingested.
_LAST_UPLOAD_STATE_KEY = "last_processed_upload"


def _rerun():
    """Restart the script run using whichever rerun API this Streamlit offers."""
    # Newer Streamlit exposes st.rerun; older releases only have
    # st.experimental_rerun, so fall back to it when st.rerun is missing.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()


def _ingest_uploaded_file(uploaded_file, doc_processor, rag_engine,
                          chunk_size, chunk_overlap):
    """Chunk one uploaded file, add it to the RAG engine, report in the sidebar.

    The upload is written to a temporary file (keeping its extension) because
    ``doc_processor.process_file`` is called with a filesystem path. Failures
    are logged and shown in the sidebar instead of being raised; the temporary
    file is removed in every case.
    """
    tmp_file_path = None
    try:
        suffix = os.path.splitext(uploaded_file.name)[1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
            # Record the path before writing so a failed write is still cleaned up.
            tmp_file_path = tmp_file.name
            tmp_file.write(uploaded_file.getvalue())

        doc_processor.chunk_size = chunk_size
        doc_processor.chunk_overlap = chunk_overlap
        chunks, chunk_metadata = doc_processor.process_file(
            tmp_file_path,
            metadata={"filename": uploaded_file.name, "source": "UI upload"},
        )
        if not chunks:
            st.sidebar.error("No text could be extracted from the document.")
            return

        rag_engine.add_documents(chunks, chunk_metadata)
        st.session_state.document_count = rag_engine.count_documents()
        st.sidebar.success(f"Added {len(chunks)} document chunks!")
    except Exception as e:  # UI boundary: report the failure, keep the app alive
        logger.exception("Error processing uploaded document %s", uploaded_file.name)
        st.sidebar.error(f"Error processing document: {str(e)}")
    finally:
        if tmp_file_path is not None:
            try:
                os.unlink(tmp_file_path)
            except OSError:
                logger.warning("Could not remove temporary file %s", tmp_file_path)


def main():
    """Main Streamlit application.

    Renders the sidebar (document upload, chunking and search settings,
    document-store controls) and the main pane, which offers a sample-text
    box while the store is empty and a question box otherwise.

    Streamlit re-executes this function on every widget interaction, so an
    uploaded file is ingested only once per upload: its identity is kept in
    session state and compared on each run. Without that guard the same file
    would be added again on every rerun, including the rerun triggered by
    "Clear All Documents".
    """
    # Initialize components
    rag_engine = initialize_rag_engine()
    doc_processor = initialize_document_processor()

    # Update document count
    st.session_state.document_count = rag_engine.count_documents()

    # Sidebar
    st.sidebar.title("📚 RAG Document QA")

    # Document upload
    st.sidebar.header("Upload Documents")
    uploaded_file = st.sidebar.file_uploader(
        "Choose a document file (PDF, TXT, MD, DOCX)",
        type=["pdf", "txt", "md", "docx"],
    )

    # Upload settings
    st.sidebar.subheader("Document Settings")
    chunk_size = st.sidebar.slider(
        "Chunk Size",
        min_value=100,
        max_value=2000,
        value=1000,
        step=100,
        help="Size of text chunks in characters",
    )
    chunk_overlap = st.sidebar.slider(
        "Chunk Overlap",
        min_value=0,
        max_value=500,
        value=200,
        step=50,
        help="Overlap between chunks in characters",
    )

    # Search settings
    st.sidebar.header("Search Settings")
    top_k = st.sidebar.slider(
        "Results to Return",
        min_value=1,
        max_value=10,
        value=3,
        help="Number of document chunks to retrieve",
    )
    search_type = st.sidebar.selectbox(
        "Search Type",
        options=["hybrid", "semantic", "keyword"],
        index=0,
        help="Type of search to perform",
    )

    # Document info
    st.sidebar.header("Document Store")
    # Reserve the metric's position now and fill it after the upload has been
    # handled, so the number reflects documents added during this run.
    count_slot = st.sidebar.empty()
    if st.sidebar.button("Clear All Documents"):
        rag_engine.clear_documents()
        st.session_state.document_count = 0
        st.sidebar.success("Document store cleared!")
        _rerun()

    # Process uploaded file (once per upload, not once per rerun)
    if uploaded_file is None:
        # Forget the last upload so the same file can be uploaded again later.
        st.session_state[_LAST_UPLOAD_STATE_KEY] = None
    else:
        upload_id = getattr(uploaded_file, "file_id", None) or (
            uploaded_file.name,
            uploaded_file.size,
        )
        if st.session_state.get(_LAST_UPLOAD_STATE_KEY) != upload_id:
            # Mark first: a failing file is reported once, not retried on every rerun.
            st.session_state[_LAST_UPLOAD_STATE_KEY] = upload_id
            with st.sidebar.expander("Upload Status", expanded=True):
                with st.spinner('Processing document...'):
                    _ingest_uploaded_file(
                        uploaded_file,
                        doc_processor,
                        rag_engine,
                        chunk_size,
                        chunk_overlap,
                    )

    count_slot.metric("Documents Stored", st.session_state.document_count)

    # Main content
    st.title("📚 Document Query System")
    if st.session_state.document_count == 0:
        st.info("👈 Please upload documents using the sidebar to get started.")

        # Sample documents
        st.subheader("Sample Text")
        sample_text = st.text_area(
            "Or try adding some sample text directly:",
            height=200,
        )
        if sample_text and st.button("Add Sample Text"):
            with st.spinner('Processing text...'):
                # Chunk the text
                chunks = doc_processor._chunk_text(sample_text, chunk_size, chunk_overlap)
                if not chunks:
                    # Nothing to index (e.g. whitespace-only input).
                    st.warning("No text could be extracted from the sample.")
                else:
                    # Create metadata
                    total_chunks = len(chunks)
                    chunk_metadata = [
                        {"source": "Sample text", "chunk_id": i, "total_chunks": total_chunks}
                        for i in range(total_chunks)
                    ]
                    # Add to RAG engine
                    rag_engine.add_documents(chunks, chunk_metadata)
                    # Update document count
                    st.session_state.document_count = rag_engine.count_documents()
                    st.success(f"Added {total_chunks} text chunks!")
                    _rerun()
    else:
        # Question answering
        st.subheader("Ask a Question")
        question = st.text_input("Enter your question:")
        if question:
            with st.spinner('Searching for answer...'):
                try:
                    # Generate response
                    result = rag_engine.generate_response(
                        query=question,
                        top_k=top_k,
                        search_type=search_type,
                    )
                    # Display response
                    st.markdown("### Answer")
                    st.write(result["response"])
                    # Display sources
                    st.markdown("### Sources")
                    for i, doc in enumerate(result["retrieved_documents"]):
                        with st.expander(f"Source {i+1} (Score: {doc['score']:.2f})"):
                            st.markdown(f"**Source:** {doc['metadata'].get('source', 'Unknown')}")
                            st.text(doc["text"])
                except Exception as e:  # UI boundary: show the error, keep the app alive
                    logger.exception("Error generating response")
                    st.error(f"Error generating response: {str(e)}")

    # About section
    st.sidebar.markdown("---")
    st.sidebar.info(
        "This application allows you to upload documents and ask questions about their content. "
        "The system uses embedding models for semantic search and retrieval."
    )
# Run the application
if __name__ == "__main__":
main()