"""Streamlit chat assistant that answers questions about an uploaded PDF.

Pipeline: extract paragraph-level text (with page/paragraph metadata) via
pdfplumber, run an extractive QA model (roberta-base-squad2) over the full
text, then map the answer span back to its source paragraph so the UI can
cite page/paragraph/confidence.
"""

import re

import pdfplumber
import streamlit as st
import torch  # NOTE(review): unused here, but transformers may need it at runtime
from PyPDF2 import PdfReader  # NOTE(review): unused — pdfplumber does extraction; confirm before removing
from transformers import pipeline

# Page-level configuration must run before any other Streamlit call.
st.set_page_config(
    page_title="PDF AI Chat",
    page_icon="📚",
    layout="wide",
)

# NOTE(review): the original custom CSS string was lost when this file was
# pasted (the st.markdown call had an empty body). This restores a minimal
# style sheet matching the CSS classes used in the chat rendering below —
# confirm against the original design.
st.markdown(
    """
    <style>
    .chat-container { padding: 1rem 0; }
    .user-message {
        background-color: #e8f0fe;
        border-radius: 10px;
        padding: 0.6rem 1rem;
        margin: 0.4rem 0;
        text-align: right;
    }
    .bot-message {
        background-color: #f1f3f4;
        border-radius: 10px;
        padding: 0.6rem 1rem;
        margin: 0.4rem 0;
    }
    .source-info {
        font-size: 0.8rem;
        color: #5f6368;
        margin-top: 0.3rem;
    }
    </style>
    """,
    unsafe_allow_html=True,
)

# Initialize session state so reruns keep chat history and parsed PDF text.
if 'messages' not in st.session_state:
    st.session_state.messages = []
if 'text_data' not in st.session_state:
    st.session_state.text_data = None


@st.cache_resource
def load_model():
    """Load and cache the extractive question-answering pipeline.

    Cached with st.cache_resource so the model is downloaded/initialized
    only once per server process, not on every rerun.
    """
    return pipeline(
        "question-answering",
        model="deepset/roberta-base-squad2",
        tokenizer="deepset/roberta-base-squad2",
    )


def extract_text_with_metadata(pdf_file):
    """Extract paragraph-level text from *pdf_file* with location metadata.

    Returns a list of dicts, one per non-empty paragraph, each with keys
    'text', 'page' (1-based), 'paragraph' (1-based within the page), and
    'context' (same as 'text'; kept for downstream display).
    """
    text_data = []
    with pdfplumber.open(pdf_file) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            text = page.extract_text()
            if not text:
                continue  # skip image-only / empty pages
            # Paragraph split on blank lines; heuristic, depends on the PDF's
            # text layout as reconstructed by pdfplumber.
            for para_num, paragraph in enumerate(text.split('\n\n'), 1):
                cleaned = paragraph.strip()
                if cleaned:
                    text_data.append({
                        'text': cleaned,
                        'page': page_num,
                        'paragraph': para_num,
                        'context': cleaned,
                    })
    return text_data


def find_answer(question, text_data, qa_model):
    """Answer *question* from the extracted *text_data* using *qa_model*.

    Runs the QA model over the concatenated document text, then locates the
    paragraph containing the answer span so the caller can cite its source.
    Returns a dict with 'answer', 'confidence', 'page', 'paragraph' and
    'context', or None on failure / empty document.
    """
    # Guard: an empty extraction would make the fallback below raise IndexError.
    if not text_data:
        st.error("No text could be extracted from the document.")
        return None

    # Combine all paragraphs into one context for the model.
    full_text = ' '.join(item['text'] for item in text_data)

    try:
        result = qa_model(question=question, context=full_text)
        answer_text = result['answer']

        # Map the answer back to its source paragraph (first match wins).
        for item in text_data:
            if answer_text in item['text']:
                return {
                    'answer': answer_text,
                    'confidence': result['score'],
                    'page': item['page'],
                    'paragraph': item['paragraph'],
                    'context': item['text'],
                }

        # Answer spans a paragraph boundary (join above inserted spaces):
        # fall back to citing the first paragraph.
        return {
            'answer': answer_text,
            'confidence': result['score'],
            'page': 1,
            'paragraph': 1,
            'context': text_data[0]['text'],
        }
    except Exception as e:
        # Surface model/tokenization failures in the UI rather than crashing.
        st.error(f"Error finding answer: {str(e)}")
        return None


def main():
    """Render the app: upload a PDF, then chat about its contents."""
    st.title("📚 PDF Chat Assistant")

    try:
        qa_model = load_model()
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        return

    # File upload — process once and cache the extraction in session state.
    pdf_file = st.file_uploader("Upload PDF Document", type=['pdf'])
    if pdf_file and not st.session_state.text_data:
        with st.spinner("Processing PDF..."):
            try:
                st.session_state.text_data = extract_text_with_metadata(pdf_file)
                st.success("PDF processed successfully! You can now ask questions below.")
            except Exception as e:
                st.error(f"Error processing PDF: {str(e)}")
                return

    if st.session_state.text_data:
        # Chat history. NOTE(review): the original HTML wrappers were lost in
        # the paste; these divs are reconstructed to match the CSS classes above.
        st.markdown('<div class="chat-container">', unsafe_allow_html=True)
        for message in st.session_state.messages:
            if message["role"] == "user":
                st.markdown(
                    f'<div class="user-message">{message["content"]}</div>',
                    unsafe_allow_html=True,
                )
            else:
                st.markdown(
                    f"""<div class="bot-message">{message["content"]}
<div class="source-info">Source: Page {message["metadata"]["page"]}, Paragraph {message["metadata"]["paragraph"]} (Confidence: {message["metadata"]["confidence"]:.1%})</div>
</div>""",
                    unsafe_allow_html=True,
                )
        st.markdown('</div>', unsafe_allow_html=True)

        # Chat input.
        with st.container():
            st.markdown('<div class="chat-input">', unsafe_allow_html=True)
            question = st.text_input("Ask a question about the document:", key="question_input")
            st.markdown('</div>', unsafe_allow_html=True)

        if question:
            # BUG FIX: st.text_input keeps its value across st.rerun(), so the
            # same question was re-appended (and re-answered) on every rerun.
            # Only process a question that differs from the last one asked.
            last_user_question = next(
                (m["content"] for m in reversed(st.session_state.messages)
                 if m["role"] == "user"),
                None,
            )
            if question != last_user_question:
                st.session_state.messages.append({"role": "user", "content": question})

                with st.spinner("Finding answer..."):
                    answer = find_answer(question, st.session_state.text_data, qa_model)

                if answer:
                    st.session_state.messages.append({
                        "role": "assistant",
                        "content": answer["answer"],
                        "metadata": {
                            "page": answer["page"],
                            "paragraph": answer["paragraph"],
                            "confidence": answer["confidence"],
                            "context": answer["context"],
                        },
                    })
                # Rerun so the new messages render in the history above.
                st.rerun()
    else:
        st.markdown("""
### Instructions:
1. Upload a PDF document using the file uploader above
2. Wait for the document to be processed
3. Use the chat interface to ask questions
4. Get answers with source information

### Features:
- Chat-like interface
- Source tracking
- Context preservation
- Multiple questions support
""")


if __name__ == "__main__":
    main()