"""Streamlit front end for the PDF Query Engine.

Lets the user upload a PDF, hand it to ``PDFProcessor`` for processing, and
then run natural-language queries against the processed document.
"""

import os
from pathlib import Path
import time

import streamlit as st

from main import PDFProcessor, SecurityException

# Static UI copy, kept at module level so the markdown list structure is easy
# to read and is not affected by the indentation of the surrounding code.
INTRO_TEXT = """
This application allows you to extract information from PDF documents using natural language queries.
Upload a PDF, wait for it to be processed, then ask questions about its content!
"""

ABOUT_TEXT = """
This tool uses natural language processing to extract and query information from PDFs.

**Features:**
- Extract text from PDFs
- Process into semantic chunks
- Query using natural language
- Get relevant context from the document
"""

INSTRUCTIONS_TEXT = """
1. Upload a PDF file (max 26MB)
2. Wait for processing to complete
3. Type your question in the query box
4. Review the results
"""

# Configure page
st.set_page_config(
    page_title="PDF Query Engine",
    page_icon="📚",
    layout="wide",
)


# Initialize processor
@st.cache_resource
def get_processor():
    """Return the PDFProcessor instance, cached via ``st.cache_resource``."""
    return PDFProcessor()


processor = get_processor()

# Create upload directory if it doesn't exist
upload_dir = Path("./uploads")
upload_dir.mkdir(exist_ok=True)

# Title and description
st.title("PDF Query Engine 🔍")
st.markdown(INTRO_TEXT)

# Sidebar
with st.sidebar:
    st.header("About")
    st.info(ABOUT_TEXT)

    st.header("Instructions")
    st.markdown(INSTRUCTIONS_TEXT)

# File uploader
uploaded_file = st.file_uploader("Upload a PDF document", type=["pdf"])

# Process the uploaded file
if uploaded_file is not None:
    # The file name is supplied by the client, so keep only its final path
    # component (treating both "/" and "\\" as separators) to make sure the
    # file is always written inside the upload directory.
    safe_name = Path(uploaded_file.name.replace("\\", "/")).name or "uploaded.pdf"

    # Save the uploaded file temporarily
    temp_file_path = os.path.join(upload_dir, safe_name)
    with open(temp_file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # Check if file has already been processed: a directory named after the
    # file hash under the processor's configured db directory marks it as done.
    file_hash = processor.get_file_hash(temp_file_path)
    persist_directory = os.path.join(processor.config["db_directory"], file_hash)
    already_processed = os.path.exists(persist_directory)

    # Display file info
    col1, col2 = st.columns(2)
    with col1:
        st.success(f"File uploaded: {uploaded_file.name}")
        # Show file size
        file_size = os.path.getsize(temp_file_path) / (1024 * 1024)  # Convert to MB
        st.info(f"File size: {file_size:.2f} MB")

    with col2:
        if already_processed:
            st.info("This file has already been processed and is ready for querying.")
            process_button = st.button("Re-process file")
        else:
            st.warning("This file needs to be processed before querying.")
            process_button = st.button("Process file")

    # Process the file when button is clicked
    if process_button:
        try:
            with st.spinner("Processing PDF... This may take a minute."):
                # Process file
                vector_store = processor.process_file(temp_file_path)

            if vector_store:
                st.success("PDF processed successfully! You can now query the document.")
            else:
                st.error("Failed to process PDF. The file might be empty or corrupted.")
        except SecurityException as e:
            st.error(f"Security error: {e}")
        except Exception as e:
            # Top-level UI boundary: report any failure to the user instead of
            # letting the script crash.
            st.error(f"Error processing file: {e}")

    # Query interface (inside the upload branch because it depends on
    # persist_directory and temp_file_path)
    st.header("Ask questions about the document")

    # Re-check on disk rather than reusing already_processed, since processing
    # may have just been triggered above during this same run.
    can_query = os.path.exists(persist_directory)

    if can_query:
        query = st.text_input("Enter your question:")
        k_value = st.slider("Number of results to return", min_value=1, max_value=10, value=3)

        if st.button("Search") and query:
            with st.spinner("Searching for answers..."):
                try:
                    results = processor.query_document(temp_file_path, query, k=k_value)

                    if not results:
                        st.info("No relevant information found. Try rephrasing your question.")
                    else:
                        st.subheader("Search Results")
                        for i, doc in enumerate(results):
                            with st.expander(f"Result {i+1}"):
                                st.markdown(doc.page_content)
                except Exception as e:
                    st.error(f"Error during query: {e}")
    else:
        st.info("Please process the document before querying.")

# Add footer
st.markdown("---")
st.markdown("PDF Query Engine | Built with Streamlit and LangChain")