chatWithPdfs / app.py
Shubham2003's picture
Upload app.py
907e56c
import streamlit as st
# from dotenv import load_dotenv
from PyPDF2 import PdfReader
from transformers import pipeline, BertTokenizer
# import fitz
# Module-level BERT tokenizer ('bert-base-uncased'), loaded once at import time.
# preprocess_input() uses it to encode text and answer_question() uses it to
# decode those ids back into a string.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def preprocess_input(input_text):
    """Encode *input_text* into token ids framed by the CLS and SEP ids.

    The text is split with the module-level ``tokenizer``, each token is
    mapped to its vocabulary id, and the tokenizer's ``cls_token_id`` /
    ``sep_token_id`` are placed at the start and end of the sequence.

    Args:
        input_text: Raw text to encode.

    Returns:
        A list of token ids: ``[cls_id, *token_ids, sep_id]``.
    """
    wordpieces = tokenizer.tokenize(input_text)
    return [
        tokenizer.cls_token_id,
        *tokenizer.convert_tokens_to_ids(wordpieces),
        tokenizer.sep_token_id,
    ]
def extract_text_from_pdf(pdf_docs, input_text, *, chunk_size=1000):
    """Build one question-focused text per PDF.

    For every document, the text of all pages is concatenated, cut into
    fixed-size character chunks, and each chunk is passed through
    ``answer_question`` together with *input_text*. The per-chunk outputs
    are concatenated into a single string for that document.

    Args:
        pdf_docs: Iterable of PDF sources accepted by ``PdfReader``.
            ``None`` or an empty iterable yields an empty result.
        input_text: The question forwarded to ``answer_question`` for
            every chunk.
        chunk_size: Maximum number of characters per chunk (keyword-only,
            defaults to the previously hard-coded 1000).

    Returns:
        A list with one string per input document, in input order.

    Raises:
        ValueError: If *chunk_size* is not a positive number.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    all_relevant_text = []
    for pdf in pdf_docs or ():
        pdf_reader = PdfReader(pdf)
        # Guard with `or ""` so a page that yields no text (e.g. None for an
        # image-only page) cannot break the concatenation.
        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
        chunks = (
            text[start:start + chunk_size]
            for start in range(0, len(text), chunk_size)
        )
        relevant_text = "".join(
            answer_question(input_text, chunk) for chunk in chunks
        )
        all_relevant_text.append(relevant_text)
    return all_relevant_text
# Lazily-built summarization pipeline shared by all answer_question() calls
# made while this module namespace is alive.
_SUMMARIZATION_PIPELINE = None


def _get_summarization_pipeline():
    """Return the shared t5-small summarization pipeline, creating it on first use."""
    global _SUMMARIZATION_PIPELINE
    if _SUMMARIZATION_PIPELINE is None:
        _SUMMARIZATION_PIPELINE = pipeline(
            "summarization", model="t5-small", tokenizer="t5-small"
        )
    return _SUMMARIZATION_PIPELINE


def answer_question(question, context):
    """Summarize *context* with the question prepended as a prompt.

    Builds the prompt ``"question: <question> context: <context>"``,
    round-trips it through ``preprocess_input`` / ``tokenizer.decode``, and
    returns the ``summary_text`` produced by the t5-small summarization
    pipeline.

    Args:
        question: The user's question.
        context: The text to summarize. A non-string iterable of strings
            (such as the list returned by ``extract_text_from_pdf``) is
            joined with spaces instead of being formatted as a list repr.

    Returns:
        The generated summary string.
    """
    if not isinstance(context, str):
        context = " ".join(context)

    input_text = f"question: {question} context: {context}"
    # NOTE(review): the prompt is encoded and decoded with the module-level
    # BERT tokenizer before reaching T5. This presumably lowercases the text
    # and leaves the CLS/SEP marker strings in the prompt -- confirm that
    # this normalization is intended.
    input_ids = preprocess_input(input_text)
    input_text = tokenizer.decode(input_ids)

    # Reuse one pipeline instead of reloading the model for every chunk.
    summarizer = _get_summarization_pipeline()
    result = summarizer(
        input_text, max_length=1000, min_length=100, do_sample=True
    )
    return result[0]['summary_text']
def main():
    """Render the Streamlit page and answer the current question.

    Layout: a header and question box in the main area, a sidebar with a
    multi-file uploader plus a "Process" button that stores the uploaded
    files in ``st.session_state.pdf_docs``, and the running conversation
    history below the question box.

    When a question is present and documents have been processed, the
    question and the generated answer are appended to
    ``st.session_state.conversation_history``. When no documents have been
    stored yet, a warning is shown instead of raising.
    """
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
    st.header("Lets chat :books:")
    user_question = st.text_input("Ask a question about your documents:")

    if 'conversation_history' not in st.session_state:
        st.session_state.conversation_history = []

    if user_question:
        # pdf_docs is only stored after "Process" is clicked; reading the
        # attribute unconditionally fails on a fresh session.
        pdf_docs = (
            st.session_state.pdf_docs if 'pdf_docs' in st.session_state else None
        )
        if not pdf_docs:
            st.warning(
                "Please upload your PDFs and click 'Process' before asking a question."
            )
        else:
            # NOTE(review): the text input keeps its value, so presumably any
            # rerun (e.g. clicking "Process" again) repeats this branch and
            # appends duplicate history entries -- confirm and consider
            # wrapping the question in st.form.
            with st.spinner("Processing"):
                st.session_state.conversation_history.append(('user', user_question))
                document_texts = extract_text_from_pdf(pdf_docs, user_question)
                # Join the per-document texts so the prompt contains plain
                # text rather than the repr of a Python list.
                combined_text = " ".join(document_texts)
                summarized_text = answer_question(user_question, combined_text)
                st.session_state.conversation_history.append(('bot', summarized_text))

    with st.sidebar:
        st.subheader("Upload documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            st.session_state.pdf_docs = pdf_docs

    # Display conversation history
    speaker_labels = {'user': "You:", 'bot': "Bot:"}
    for role, message in st.session_state.conversation_history:
        label = speaker_labels.get(role)
        if label is not None:
            st.write(label, message)
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()