|
|
import streamlit as st |
|
|
from transformers import pipeline |
|
|
import pdfplumber |
|
|
import torch |
|
|
from PyPDF2 import PdfReader |
|
|
import re |
|
|
|
|
|
|
|
|
# Global page chrome: browser-tab title/icon and a wide content layout.
# Must run before any other Streamlit call in the script.
st.set_page_config(page_title="PDF AI Chat", page_icon="π", layout="wide")
|
|
|
|
|
|
|
|
st.markdown(""" |
|
|
<style> |
|
|
.chat-container { |
|
|
border-radius: 10px; |
|
|
margin-bottom: 20px; |
|
|
padding: 20px; |
|
|
} |
|
|
.user-message { |
|
|
background-color: #e6f3ff; |
|
|
padding: 15px; |
|
|
border-radius: 10px; |
|
|
margin: 10px 0; |
|
|
text-align: right; |
|
|
} |
|
|
.assistant-message { |
|
|
background-color: #f0f2f6; |
|
|
padding: 15px; |
|
|
border-radius: 10px; |
|
|
margin: 10px 0; |
|
|
} |
|
|
.source-info { |
|
|
font-size: 0.8em; |
|
|
color: #666; |
|
|
margin-top: 5px; |
|
|
padding-top: 5px; |
|
|
border-top: 1px solid #ddd; |
|
|
} |
|
|
.chat-input { |
|
|
position: fixed; |
|
|
bottom: 0; |
|
|
left: 0; |
|
|
right: 0; |
|
|
padding: 20px; |
|
|
background: white; |
|
|
border-top: 1px solid #ddd; |
|
|
} |
|
|
.main { |
|
|
margin-bottom: 100px; /* Space for fixed chat input */ |
|
|
} |
|
|
</style> |
|
|
""", unsafe_allow_html=True) |
|
|
|
|
|
|
|
|
# One-time session defaults (survive Streamlit's per-interaction reruns):
# 'messages' holds the chat transcript, 'text_data' the extracted PDF text.
for _key, _default in (('messages', []), ('text_data', None)):
    if _key not in st.session_state:
        st.session_state[_key] = _default
|
|
|
|
|
@st.cache_resource
def load_model():
    """Build and cache the extractive question-answering pipeline.

    st.cache_resource keeps a single pipeline instance alive across
    reruns and sessions, so the model is loaded only once per process.
    """
    model_id = "deepset/roberta-base-squad2"
    return pipeline("question-answering", model=model_id, tokenizer=model_id)
|
|
|
|
|
def extract_text_with_metadata(pdf_file):
    """Extract text from *pdf_file* as a list of paragraph records.

    Each record is a dict with keys 'text', 'page', 'paragraph' and
    'context' (page and paragraph numbers are 1-based), so answers can
    later be traced back to their location in the document.
    """
    records = []
    with pdfplumber.open(pdf_file) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            page_text = page.extract_text()
            if not page_text:
                # No extractable text on this page (e.g. a scanned image).
                continue
            for para_num, raw_para in enumerate(page_text.split('\n\n'), 1):
                cleaned = raw_para.strip()
                if not cleaned:
                    continue
                records.append({
                    'text': cleaned,
                    'page': page_num,
                    'paragraph': para_num,
                    'context': cleaned,
                })
    return records
|
|
|
|
|
def find_answer(question, text_data, qa_model):
    """Answer *question* from the extracted PDF text and locate its source.

    Parameters
    ----------
    question : str
        The user's natural-language question.
    text_data : list[dict]
        Paragraph records from extract_text_with_metadata
        (keys: 'text', 'page', 'paragraph', 'context').
    qa_model : callable
        A QA pipeline invoked as qa_model(question=..., context=...),
        returning a dict with at least 'answer' and 'score' keys.

    Returns
    -------
    dict | None
        {'answer', 'confidence', 'page', 'paragraph', 'context'} on
        success, or None when no text is available or the model fails.
    """
    # Guard against an empty document: previously this fell through to
    # text_data[0] inside the try block, raising IndexError that was
    # masked by the broad except and shown as a confusing error message.
    if not text_data:
        return None

    # Give the model the whole document as one context so it can pick
    # the best answer span anywhere in it.
    full_text = ' '.join(item['text'] for item in text_data)

    try:
        result = qa_model(question=question, context=full_text)
        answer_text = result['answer']

        # Map the answer span back to the paragraph that contains it so
        # the UI can cite page/paragraph numbers.
        for item in text_data:
            if answer_text in item['text']:
                return {
                    'answer': answer_text,
                    'confidence': result['score'],
                    'page': item['page'],
                    'paragraph': item['paragraph'],
                    'context': item['text'],
                }

        # Fallback: the span straddled a paragraph boundary (the joined
        # context inserts spaces between paragraphs), so no single
        # paragraph contains it. Attribute it to the first paragraph as
        # a best effort.
        return {
            'answer': answer_text,
            'confidence': result['score'],
            'page': 1,
            'paragraph': 1,
            'context': text_data[0]['text'],
        }
    except Exception as e:
        # Surface model failures in the UI rather than crashing the app.
        st.error(f"Error finding answer: {str(e)}")
        return None
|
|
|
|
|
def main():
    """Render the app: load the QA model, accept a PDF upload, replay the
    chat transcript, and answer new questions with source attribution."""
    st.title("π PDF Chat Assistant")

    # Model download/initialisation can fail (e.g. no network); without a
    # model the rest of the page is useless, so bail out early.
    try:
        qa_model = load_model()
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        return

    pdf_file = st.file_uploader("Upload PDF Document", type=['pdf'])

    # Extract text only on the first upload; the result is kept in
    # session_state so Streamlit's script reruns don't re-parse the PDF.
    if pdf_file and not st.session_state.text_data:
        with st.spinner("Processing PDF..."):
            try:
                st.session_state.text_data = extract_text_with_metadata(pdf_file)
                st.success("PDF processed successfully! You can now ask questions below.")
            except Exception as e:
                st.error(f"Error processing PDF: {str(e)}")
                return

    if st.session_state.text_data:
        # Replay the full transcript on every run — Streamlit re-executes
        # the whole script after each user interaction.
        st.markdown('<div class="chat-container">', unsafe_allow_html=True)
        for message in st.session_state.messages:
            if message["role"] == "user":
                st.markdown(f'<div class="user-message">{message["content"]}</div>',
                            unsafe_allow_html=True)
            else:
                # Assistant bubbles carry a source footer built from the
                # metadata recorded when the answer was produced.
                st.markdown(f"""
                <div class="assistant-message">
                    <div>{message["content"]}</div>
                    <div class="source-info">
                        Source: Page {message["metadata"]["page"]},
                        Paragraph {message["metadata"]["paragraph"]}
                        (Confidence: {message["metadata"]["confidence"]:.1%})
                    </div>
                </div>
                """, unsafe_allow_html=True)
        st.markdown('</div>', unsafe_allow_html=True)

        # Question input, styled as a fixed bottom bar via the
        # .chat-input CSS class injected at the top of the file.
        with st.container():
            st.markdown('<div class="chat-input">', unsafe_allow_html=True)
            question = st.text_input("Ask a question about the document:", key="question_input")
            st.markdown('</div>', unsafe_allow_html=True)

        if question:
            # Record the user turn first so it appears in the transcript.
            st.session_state.messages.append({"role": "user", "content": question})

            with st.spinner("Finding answer..."):
                answer = find_answer(question, st.session_state.text_data, qa_model)

            if answer:
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": answer["answer"],
                    "metadata": {
                        "page": answer["page"],
                        "paragraph": answer["paragraph"],
                        "confidence": answer["confidence"],
                        "context": answer["context"]
                    }
                })

                # Rerun so the refreshed transcript shows both new turns.
                st.rerun()
    else:
        # No document yet: show usage instructions instead of the chat UI.
        st.markdown("""
        ### Instructions:
        1. Upload a PDF document using the file uploader above
        2. Wait for the document to be processed
        3. Use the chat interface to ask questions
        4. Get answers with source information

        ### Features:
        - Chat-like interface
        - Source tracking
        - Context preservation
        - Multiple questions support
        """)
|
|
|
|
|
if __name__ == "__main__": |
|
|
main() |