|
|
import streamlit as st |
|
|
from transformers import pipeline |
|
|
import pdfplumber |
|
|
import torch |
|
|
from PyPDF2 import PdfReader |
|
|
import re |
|
|
|
|
|
|
|
|
# Global page chrome: browser-tab title/icon and a wide content layout.
# Must run before any other Streamlit call in the script.
st.set_page_config(page_title="PDF AI Chat", page_icon="π", layout="wide")
|
|
|
|
|
|
|
|
st.markdown(""" |
|
|
<style> |
|
|
.chat-container { |
|
|
border-radius: 10px; |
|
|
margin-bottom: 20px; |
|
|
padding: 20px; |
|
|
} |
|
|
.user-message { |
|
|
background-color: #e6f3ff; |
|
|
padding: 15px; |
|
|
border-radius: 10px; |
|
|
margin: 10px 0; |
|
|
text-align: right; |
|
|
} |
|
|
.assistant-message { |
|
|
background-color: #f0f2f6; |
|
|
padding: 15px; |
|
|
border-radius: 10px; |
|
|
margin: 10px 0; |
|
|
} |
|
|
.source-info { |
|
|
font-size: 0.8em; |
|
|
color: #666; |
|
|
margin-top: 5px; |
|
|
padding-top: 5px; |
|
|
border-top: 1px solid #ddd; |
|
|
} |
|
|
.chat-input { |
|
|
position: fixed; |
|
|
bottom: 0; |
|
|
left: 0; |
|
|
right: 0; |
|
|
padding: 20px; |
|
|
background: white; |
|
|
border-top: 1px solid #ddd; |
|
|
} |
|
|
.main { |
|
|
margin-bottom: 100px; /* Space for fixed chat input */ |
|
|
} |
|
|
</style> |
|
|
""", unsafe_allow_html=True) |
|
|
|
|
|
|
|
|
# One-time session defaults (survive Streamlit's per-interaction reruns):
# 'messages' holds the chat transcript, 'text_data' the extracted PDF text.
for _key, _default in (('messages', []), ('text_data', None)):
    if _key not in st.session_state:
        st.session_state[_key] = _default
|
|
|
|
|
@st.cache_resource
def load_model():
    """Build and cache the extractive question-answering pipeline.

    st.cache_resource keeps a single pipeline instance alive across
    reruns and sessions, so the model is loaded only once per process.
    """
    model_id = "deepset/roberta-base-squad2"
    return pipeline("question-answering", model=model_id, tokenizer=model_id)
|
|
|
|
|
def extract_text_with_metadata(pdf_file):
    """Extract text from *pdf_file* as a list of paragraph records.

    Each record is a dict with keys 'text', 'page', 'paragraph' and
    'context' (page and paragraph numbers are 1-based), so answers can
    later be traced back to their location in the document.
    """
    records = []
    with pdfplumber.open(pdf_file) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            page_text = page.extract_text()
            if not page_text:
                # No extractable text on this page (e.g. a scanned image).
                continue
            for para_num, raw_para in enumerate(page_text.split('\n\n'), 1):
                cleaned = raw_para.strip()
                if not cleaned:
                    continue
                records.append({
                    'text': cleaned,
                    'page': page_num,
                    'paragraph': para_num,
                    'context': cleaned,
                })
    return records
|
|
|
|
|
def find_answer(question, text_data, qa_model):
    """Answer *question* from the extracted PDF text and locate its source.

    Parameters
    ----------
    question : str
        The user's natural-language question.
    text_data : list[dict]
        Paragraph records from extract_text_with_metadata
        (keys: 'text', 'page', 'paragraph', 'context').
    qa_model : callable
        A QA pipeline invoked as qa_model(question=..., context=...),
        returning a dict with at least 'answer' and 'score' keys.

    Returns
    -------
    dict | None
        {'answer', 'confidence', 'page', 'paragraph', 'context'} on
        success, or None when no text is available or the model fails.
    """
    # Guard against an empty document: previously this fell through to
    # text_data[0] inside the try block, raising IndexError that was
    # masked by the broad except and shown as a confusing error message.
    if not text_data:
        return None

    # Give the model the whole document as one context so it can pick
    # the best answer span anywhere in it.
    full_text = ' '.join(item['text'] for item in text_data)

    try:
        result = qa_model(question=question, context=full_text)
        answer_text = result['answer']

        # Map the answer span back to the paragraph that contains it so
        # the UI can cite page/paragraph numbers.
        for item in text_data:
            if answer_text in item['text']:
                return {
                    'answer': answer_text,
                    'confidence': result['score'],
                    'page': item['page'],
                    'paragraph': item['paragraph'],
                    'context': item['text'],
                }

        # Fallback: the span straddled a paragraph boundary (the joined
        # context inserts spaces between paragraphs), so no single
        # paragraph contains it. Attribute it to the first paragraph as
        # a best effort.
        return {
            'answer': answer_text,
            'confidence': result['score'],
            'page': 1,
            'paragraph': 1,
            'context': text_data[0]['text'],
        }
    except Exception as e:
        # Surface model failures in the UI rather than crashing the app.
        st.error(f"Error finding answer: {str(e)}")
        return None
|
|
|
|
|
def main():
    """Render the app: load the QA model, accept a PDF upload, replay the
    chat transcript, and answer new questions with source attribution."""
    st.title("π PDF Chat Assistant")

    # Model download/initialisation can fail (e.g. no network); without a
    # model the rest of the page is useless, so bail out early.
    try:
        qa_model = load_model()
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        return

    pdf_file = st.file_uploader("Upload PDF Document", type=['pdf'])

    # Extract text only on the first upload; the result is kept in
    # session_state so Streamlit's script reruns don't re-parse the PDF.
    if pdf_file and not st.session_state.text_data:
        with st.spinner("Processing PDF..."):
            try:
                st.session_state.text_data = extract_text_with_metadata(pdf_file)
                st.success("PDF processed successfully! You can now ask questions below.")
            except Exception as e:
                st.error(f"Error processing PDF: {str(e)}")
                return

    if st.session_state.text_data:
        # Replay the full transcript on every run — Streamlit re-executes
        # the whole script after each user interaction.
        st.markdown('<div class="chat-container">', unsafe_allow_html=True)
        for message in st.session_state.messages:
            if message["role"] == "user":
                st.markdown(f'<div class="user-message">{message["content"]}</div>',
                            unsafe_allow_html=True)
            else:
                # Assistant bubbles carry a source footer built from the
                # metadata recorded when the answer was produced.
                st.markdown(f"""
                <div class="assistant-message">
                    <div>{message["content"]}</div>
                    <div class="source-info">
                        Source: Page {message["metadata"]["page"]},
                        Paragraph {message["metadata"]["paragraph"]}
                        (Confidence: {message["metadata"]["confidence"]:.1%})
                    </div>
                </div>
                """, unsafe_allow_html=True)
        st.markdown('</div>', unsafe_allow_html=True)

        # Question input, styled as a fixed bottom bar via the
        # .chat-input CSS class injected at the top of the file.
        with st.container():
            st.markdown('<div class="chat-input">', unsafe_allow_html=True)
            question = st.text_input("Ask a question about the document:", key="question_input")
            st.markdown('</div>', unsafe_allow_html=True)

        if question:
            # Record the user turn first so it appears in the transcript.
            st.session_state.messages.append({"role": "user", "content": question})

            with st.spinner("Finding answer..."):
                answer = find_answer(question, st.session_state.text_data, qa_model)

            if answer:
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": answer["answer"],
                    "metadata": {
                        "page": answer["page"],
                        "paragraph": answer["paragraph"],
                        "confidence": answer["confidence"],
                        "context": answer["context"]
                    }
                })

                # Rerun so the refreshed transcript shows both new turns.
                st.rerun()
    else:
        # No document yet: show usage instructions instead of the chat UI.
        st.markdown("""
        ### Instructions:
        1. Upload a PDF document using the file uploader above
        2. Wait for the document to be processed
        3. Use the chat interface to ask questions
        4. Get answers with source information

        ### Features:
        - Chat-like interface
        - Source tracking
        - Context preservation
        - Multiple questions support
        """)
|
|
|
|
|
if __name__ == "__main__": |
|
|
main() |