# sidbhasin's picture
# Update app.py
# 3f21fcc verified
import html
import re

import pdfplumber
import streamlit as st
import torch
from PyPDF2 import PdfReader
from transformers import pipeline
# Set page config: browser-tab title and icon, full-width layout.
# NOTE(review): Streamlit has historically required this to be the first
# Streamlit command executed on the page — keep it above any other st.* call.
st.set_page_config(
    page_title="PDF AI Chat",
    # The icon was stored as mojibake ("πŸ“š"); it is the books emoji.
    page_icon="📚",
    layout="wide",
)
# Custom CSS for the chat interface: message bubbles for the user and the
# assistant, the small "source" footer under each answer, and a bar pinned
# to the bottom of the viewport for the question input.
_CHAT_STYLESHEET = """
<style>
.chat-container {
border-radius: 10px;
margin-bottom: 20px;
padding: 20px;
}
.user-message {
background-color: #e6f3ff;
padding: 15px;
border-radius: 10px;
margin: 10px 0;
text-align: right;
}
.assistant-message {
background-color: #f0f2f6;
padding: 15px;
border-radius: 10px;
margin: 10px 0;
}
.source-info {
font-size: 0.8em;
color: #666;
margin-top: 5px;
padding-top: 5px;
border-top: 1px solid #ddd;
}
.chat-input {
position: fixed;
bottom: 0;
left: 0;
right: 0;
padding: 20px;
background: white;
border-top: 1px solid #ddd;
}
.main {
margin-bottom: 100px; /* Space for fixed chat input */
}
</style>
"""
# Raw HTML is required so the <style> element reaches the page unescaped.
st.markdown(_CHAT_STYLESHEET, unsafe_allow_html=True)
# Initialize session state on the first run only: "messages" holds the chat
# history and "text_data" holds the paragraphs extracted from the uploaded
# PDF (None until a document has been processed).
for _state_key, _initial_value in (("messages", []), ("text_data", None)):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value
@st.cache_resource
def load_model():
    """Build the question-answering pipeline used to answer chat questions.

    The function is wrapped in ``st.cache_resource``, so the pipeline object
    is created through Streamlit's resource cache rather than on every call.

    Returns:
        A ``transformers`` ``"question-answering"`` pipeline whose model and
        tokenizer are both ``deepset/roberta-base-squad2``.
    """
    checkpoint = "deepset/roberta-base-squad2"
    return pipeline(
        "question-answering",
        model=checkpoint,
        tokenizer=checkpoint,
    )
def extract_text_with_metadata(pdf_file):
    """Split a PDF into paragraphs tagged with their page and position.

    Each page's extracted text is split on blank lines (``'\\n\\n'``); chunks
    that are empty after stripping are dropped, but still consume a paragraph
    number, so numbering reflects the position within the raw split.

    Args:
        pdf_file: Anything ``pdfplumber.open`` accepts (a path or a
            file-like object such as a Streamlit upload).

    Returns:
        A list of dicts with keys ``'text'``, ``'page'`` (1-based),
        ``'paragraph'`` (1-based within the page) and ``'context'`` (the same
        stripped text as ``'text'``). Pages without extractable text
        contribute nothing.
    """
    records = []
    with pdfplumber.open(pdf_file) as document:
        for page_index, pdf_page in enumerate(document.pages, start=1):
            page_text = pdf_page.extract_text()
            if not page_text:
                # Nothing extractable on this page (None or empty string).
                continue
            chunks = page_text.split('\n\n')
            for chunk_index, chunk in enumerate(chunks, start=1):
                cleaned = chunk.strip()
                if not cleaned:
                    continue
                records.append({
                    'text': cleaned,
                    'page': page_index,
                    'paragraph': chunk_index,
                    'context': cleaned,
                })
    return records
def _paragraph_at_offset(text_data, offset):
    """Return the entry whose text covers ``offset`` in the space-joined context."""
    cursor = 0
    for item in text_data:
        end = cursor + len(item['text'])
        # Index ``end`` is the single-space separator that follows this
        # paragraph in the joined context; attribute it to this paragraph.
        if cursor <= offset <= end:
            return item
        cursor = end + 1
    return None


def find_answer(question, text_data, qa_model):
    """Answer ``question`` from the document and report where the answer came from.

    All paragraph texts are joined with single spaces into one context and
    passed to ``qa_model``. The source paragraph is resolved in this order:

    1. If the model result carries an integer ``'start'`` offset and the
       context at that offset really begins with the answer, the paragraph
       covering that offset is used. This keeps a phrase that occurs in
       several paragraphs from being credited to the first occurrence.
    2. Otherwise, the first paragraph that contains the answer text.
    3. Otherwise (e.g. the answer spans two paragraphs), the first paragraph
       of the document, reported with that paragraph's own page and
       paragraph numbers.

    Args:
        question: The user's question.
        text_data: List of dicts with ``'text'``, ``'page'`` and
            ``'paragraph'`` keys, as produced by the PDF extraction step.
        qa_model: Callable accepting ``question=`` and ``context=`` keyword
            arguments and returning a mapping with ``'answer'`` and
            ``'score'`` (and optionally ``'start'``).

    Returns:
        A dict with ``'answer'``, ``'confidence'``, ``'page'``,
        ``'paragraph'`` and ``'context'``; or ``None`` when there is no text
        to search or the model call fails (an error is shown via
        ``st.error`` in both cases).
    """
    if not text_data:
        st.error("Error finding answer: no text available to search")
        return None

    # Combine all text for context
    full_text = ' '.join(item['text'] for item in text_data)
    try:
        result = qa_model(question=question, context=full_text)
        answer_text = result['answer']

        source = None
        start = result.get('start')
        offset_is_trustworthy = (
            isinstance(start, int)
            and start >= 0
            and bool(answer_text)
            and full_text[start:start + len(answer_text)] == answer_text
        )
        if offset_is_trustworthy:
            source = _paragraph_at_offset(text_data, start)
        if source is None:
            # No usable offset: fall back to a substring search, then to the
            # first paragraph when the answer is not inside any single one.
            source = next(
                (item for item in text_data if answer_text in item['text']),
                text_data[0],
            )

        return {
            'answer': answer_text,
            'confidence': result['score'],
            'page': source['page'],
            'paragraph': source['paragraph'],
            'context': source['text'],
        }
    except Exception as e:
        # UI boundary: surface any model/pipeline failure instead of crashing.
        st.error(f"Error finding answer: {str(e)}")
        return None
def main():
    """Render the PDF chat page: upload, chat history, and question form.

    Flow of one script run:

    1. Load the QA model; stop with an error message if that fails.
    2. If a PDF is uploaded and differs (by name and size) from the one
       already processed in this session, extract its paragraphs, store them
       in ``st.session_state.text_data`` and start a fresh chat history.
    3. If extracted text is available, render the chat history and a
       question form; otherwise render usage instructions.
    4. On form submission, append the question, ask ``find_answer`` and, when
       an answer comes back, append it and rerun so the history shows it.
    """
    st.title("📚 PDF Chat Assistant")
    try:
        qa_model = load_model()
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        return

    # File upload
    pdf_file = st.file_uploader("Upload PDF Document", type=['pdf'])
    if pdf_file is not None:
        # Track which upload was processed so that (a) a different PDF
        # replaces the old one and (b) a PDF with no extractable text is not
        # re-parsed on every rerun.
        pdf_id = (pdf_file.name, pdf_file.size)
        if st.session_state.get("pdf_id") != pdf_id:
            with st.spinner("Processing PDF..."):
                try:
                    text_data = extract_text_with_metadata(pdf_file)
                except Exception as e:
                    st.error(f"Error processing PDF: {str(e)}")
                    return
            st.session_state.text_data = text_data
            st.session_state.pdf_id = pdf_id
            # Earlier answers cite pages of the previous document.
            st.session_state.messages = []
            if text_data:
                st.success("PDF processed successfully! You can now ask questions below.")
            else:
                st.warning("No extractable text was found in this PDF.")

    # Display chat interface if PDF is processed
    if st.session_state.text_data:
        # Chat history
        st.markdown('<div class="chat-container">', unsafe_allow_html=True)
        for message in st.session_state.messages:
            # Questions are user input and answers are copied from the PDF;
            # escape both before embedding them in raw HTML.
            content = html.escape(str(message["content"]))
            if message["role"] == "user":
                st.markdown(f'<div class="user-message">{content}</div>',
                            unsafe_allow_html=True)
            else:
                meta = message["metadata"]
                st.markdown(f"""
<div class="assistant-message">
<div>{content}</div>
<div class="source-info">
Source: Page {meta['page']},
Paragraph {meta['paragraph']}
(Confidence: {meta['confidence']:.1%})
</div>
</div>
""", unsafe_allow_html=True)
        st.markdown('</div>', unsafe_allow_html=True)

        # Chat input. A form is used so the question is handled exactly once:
        # the submit flag is only True on the submitting run and the field is
        # cleared afterwards. A bare keyed text_input keeps its value across
        # st.rerun(), which re-answered the same question in an endless loop.
        with st.form("chat_form", clear_on_submit=True):
            st.markdown('<div class="chat-input">', unsafe_allow_html=True)
            question = st.text_input("Ask a question about the document:", key="question_input")
            st.markdown('</div>', unsafe_allow_html=True)
            submitted = st.form_submit_button("Send")

        question = question.strip() if question else ""
        if submitted and question:
            # Add user question to chat history
            st.session_state.messages.append({"role": "user", "content": question})
            # Get answer
            with st.spinner("Finding answer..."):
                answer = find_answer(question, st.session_state.text_data, qa_model)
            if answer:
                # Add assistant response to chat history
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": answer["answer"],
                    "metadata": {
                        "page": answer["page"],
                        "paragraph": answer["paragraph"],
                        "confidence": answer["confidence"],
                        "context": answer["context"],
                    },
                })
                # Rerun to update chat display
                st.rerun()
    else:
        st.markdown("""
### Instructions:
1. Upload a PDF document using the file uploader above
2. Wait for the document to be processed
3. Use the chat interface to ask questions
4. Get answers with source information
### Features:
- Chat-like interface
- Source tracking
- Context preservation
- Multiple questions support
""")
# Entry point: build the page only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()