# nerala_ai_backend / streamlit.py
# Nde Dilan
# Add application file
# d4e21df
import streamlit as st
import os
from pathlib import Path
import time
from main import PDFProcessor, SecurityException
# Configure page: browser-tab title, icon, and a wide content layout.
page_settings = {
    "page_title": "PDF Query Engine",
    "page_icon": "📚",
    "layout": "wide",
}
st.set_page_config(**page_settings)
# Initialize processor
@st.cache_resource
def get_processor():
    """Create the PDFProcessor instance used by this app.

    The function is decorated with ``st.cache_resource``, so Streamlit is
    expected to hand back the same instance instead of rebuilding it each
    time the script reruns.

    Returns:
        A ``PDFProcessor`` constructed with its default arguments.
    """
    pdf_processor = PDFProcessor()
    return pdf_processor


# Module-level handle used by the upload, processing and query steps below.
processor = get_processor()
# Create upload directory if it doesn't exist (relative to the working directory).
upload_dir = Path("./uploads")
upload_dir.mkdir(exist_ok=True)

# Title and description shown at the top of the main page.
app_description = """
This application allows you to extract information from PDF documents using natural language queries.
Upload a PDF, wait for it to be processed, then ask questions about its content!
"""
st.title("PDF Query Engine 🔍")
st.markdown(app_description)
# Sidebar: static "About" and "Instructions" panels.
about_text = """
This tool uses natural language processing to extract and query information from PDFs.
**Features:**
- Extract text from PDFs
- Process into semantic chunks
- Query using natural language
- Get relevant context from the document
"""
instructions_text = """
1. Upload a PDF file (max 26MB)
2. Wait for processing to complete
3. Type your question in the query box
4. Review the results
"""

# Object notation (st.sidebar.<element>) places each element in the sidebar.
st.sidebar.header("About")
st.sidebar.info(about_text)
st.sidebar.header("Instructions")
st.sidebar.markdown(instructions_text)
# File uploader
uploaded_file = st.file_uploader("Upload a PDF document", type=["pdf"])

# Everything below (save, process, query) only runs once a file is uploaded.
if uploaded_file is not None:
    # The file name is supplied by the client and must not be trusted: keep
    # only its final path component so a name such as "../../x.pdf" cannot
    # make the write below land outside upload_dir.
    safe_name = Path(uploaded_file.name).name
    if safe_name in {"", ".", ".."}:
        safe_name = "upload.pdf"
    temp_file_path = os.path.join(upload_dir, safe_name)

    # Save the upload to disk; the processor is given a path, not a buffer.
    # NOTE(review): the sidebar advertises a 26MB limit but nothing here
    # enforces it — presumably PDFProcessor does (SecurityException); confirm.
    with open(temp_file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # This script treats the existence of <db_directory>/<file hash> as
    # "already processed".
    file_hash = processor.get_file_hash(temp_file_path)
    persist_directory = os.path.join(processor.config["db_directory"], file_hash)
    already_processed = os.path.exists(persist_directory)

    # Display file info (left) and the processing state/action (right).
    col1, col2 = st.columns(2)
    with col1:
        st.success(f"File uploaded: {uploaded_file.name}")
        file_size = os.path.getsize(temp_file_path) / (1024 * 1024)  # bytes -> MB
        st.info(f"File size: {file_size:.2f} MB")
    with col2:
        if already_processed:
            st.info("This file has already been processed and is ready for querying.")
            process_button = st.button("Re-process file")
        else:
            st.warning("This file needs to be processed before querying.")
            process_button = st.button("Process file")

    # Process the file when the button is clicked.
    if process_button:
        try:
            with st.spinner("Processing PDF... This may take a minute."):
                vector_store = processor.process_file(temp_file_path)
        except SecurityException as e:
            st.error(f"Security error: {e}")
        except Exception as e:
            # UI boundary: surface any processing failure instead of crashing the app.
            st.error(f"Error processing file: {e}")
        else:
            if vector_store:
                st.success("PDF processed successfully! You can now query the document.")
            else:
                st.error("Failed to process PDF. The file might be empty or corrupted.")

    # Query interface
    st.header("Ask questions about the document")

    # Re-check rather than reuse already_processed: presumably the processing
    # step above creates persist_directory on success.
    can_query = os.path.exists(persist_directory)
    if can_query:
        query = st.text_input("Enter your question:")
        k_value = st.slider("Number of results to return", min_value=1, max_value=10, value=3)
        if st.button("Search"):
            question = query.strip()
            if not question:
                # Give feedback instead of silently ignoring an empty search.
                st.warning("Please enter a question before searching.")
            else:
                with st.spinner("Searching for answers..."):
                    try:
                        results = processor.query_document(temp_file_path, question, k=k_value)
                        if not results:
                            st.info("No relevant information found. Try rephrasing your question.")
                        else:
                            st.subheader("Search Results")
                            for i, doc in enumerate(results):
                                with st.expander(f"Result {i+1}"):
                                    st.markdown(doc.page_content)
                    except Exception as e:
                        # UI boundary: report query failures to the user.
                        st.error(f"Error during query: {e}")
    else:
        st.info("Please process the document before querying.")
# Add footer: a horizontal rule followed by the credit line.
for footer_line in ("---", "PDF Query Engine | Built with Streamlit and LangChain"):
    st.markdown(footer_line)