"""Streamlit app: chat-style question answering over an uploaded PDF.

Text is extracted per page/paragraph with pdfplumber, and questions are
answered by an extractive-QA RoBERTa model (deepset/roberta-base-squad2)
via the Hugging Face `pipeline` API.

NOTE(review): this file arrived with all newlines collapsed into spaces
(a whitespace-mangled paste), making it a SyntaxError. The code below is
a reformatted reconstruction of the same statements; runtime strings are
preserved byte-for-byte.
"""

import re

import pdfplumber
import streamlit as st
import torch
from PyPDF2 import PdfReader
from transformers import pipeline

# Set page config -- must be the first Streamlit call in the script.
st.set_page_config(
    page_title="PDF AI Chat",
    page_icon="📚",
    layout="wide",
)

# Custom CSS for better chat interface.
# NOTE(review): the CSS payload is empty (a single space) in the visible
# source -- presumably stripped during the paste; confirm against the
# original file.
st.markdown(""" """, unsafe_allow_html=True)

# Initialize session state: chat history and the parsed PDF text.
if 'messages' not in st.session_state:
    st.session_state.messages = []
if 'text_data' not in st.session_state:
    st.session_state.text_data = None


@st.cache_resource
def load_model():
    """Load and cache the extractive question-answering pipeline."""
    return pipeline(
        "question-answering",
        model="deepset/roberta-base-squad2",
        tokenizer="deepset/roberta-base-squad2",
    )


def extract_text_with_metadata(pdf_file):
    """Extract text from *pdf_file*, tagged with its source location.

    Returns a list of dicts with keys 'text', 'page', 'paragraph',
    'context' (1-based page/paragraph indices; 'context' duplicates
    'text'). Pages with no extractable text are skipped.
    """
    text_data = []
    with pdfplumber.open(pdf_file) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            text = page.extract_text()
            if text:
                # Treat blank-line-separated runs as paragraphs.
                paragraphs = text.split('\n\n')
                for para_num, paragraph in enumerate(paragraphs, 1):
                    if paragraph.strip():
                        text_data.append({
                            'text': paragraph.strip(),
                            'page': page_num,
                            'paragraph': para_num,
                            'context': paragraph.strip(),
                        })
    return text_data


def find_answer(question, text_data, qa_model):
    """Answer *question* from the extracted PDF text, with a citation.

    Runs extractive QA over the concatenation of all paragraphs, then
    scans *text_data* for the paragraph containing the answer span so a
    page/paragraph citation can be reported.

    Returns a dict with keys 'answer', 'confidence', 'page',
    'paragraph', 'context', or None if the model call fails (the error
    is surfaced via st.error).
    """
    # Combine all text for context.
    # NOTE(review): roberta-base-squad2 has a bounded input length; the
    # pipeline handles long contexts internally, but very large PDFs may
    # degrade answer quality -- confirm acceptable for expected inputs.
    full_text = ' '.join(item['text'] for item in text_data)

    try:
        # Get answer from model.
        result = qa_model(question=question, context=full_text)
        answer_text = result['answer']

        # Find the source paragraph for the citation.
        for item in text_data:
            if answer_text in item['text']:
                return {
                    'answer': answer_text,
                    'confidence': result['score'],
                    'page': item['page'],
                    'paragraph': item['paragraph'],
                    'context': item['text'],
                }

        # Exact paragraph not found (e.g. the answer spans a paragraph
        # boundary): fall back to citing the first paragraph.
        return {
            'answer': answer_text,
            'confidence': result['score'],
            'page': 1,
            'paragraph': 1,
            'context': text_data[0]['text'],
        }
    except Exception as e:
        st.error(f"Error finding answer: {str(e)}")
        return None


def main():
    """Render the app: model load, PDF upload, and the chat interface."""
    st.title("📚 PDF Chat Assistant")

    try:
        qa_model = load_model()
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        return

    # File upload
    pdf_file = st.file_uploader("Upload PDF Document", type=['pdf'])

    # Only process once per session; reuse cached text afterwards.
    if pdf_file and not st.session_state.text_data:
        with st.spinner("Processing PDF..."):
            try:
                st.session_state.text_data = extract_text_with_metadata(pdf_file)
                st.success("PDF processed successfully! You can now ask questions below.")
            except Exception as e:
                st.error(f"Error processing PDF: {str(e)}")
                return

    # Display chat interface if PDF is processed
    if st.session_state.text_data:
        # Chat history
        # NOTE(review): the source chunk is truncated mid-statement here
        # ("st.markdown('"). Minimal completion below, inferred from the
        # app's CSS-container pattern -- restore the original chat-history
        # markup from the full file.
        st.markdown('<div class="chat-container">', unsafe_allow_html=True)