import streamlit as st # from dotenv import load_dotenv from PyPDF2 import PdfReader from transformers import pipeline, BertTokenizer # import fitz tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') def preprocess_input(input_text): tokens = tokenizer.tokenize(input_text) input_ids = tokenizer.convert_tokens_to_ids(tokens) input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id] return input_ids def extract_text_from_pdf(pdf_docs, input_text): all_relevant_text = [] for pdf in pdf_docs: pdf_reader = PdfReader(pdf) text="" for page in pdf_reader.pages: text += page.extract_text() chunk_size = 1000 # Set the desired chunk size chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)] relevant_text = "" for chunk in chunks: chunk_relevant_text = answer_question(input_text, chunk) relevant_text += chunk_relevant_text # relevant_text = answer_question(input_text, text) all_relevant_text.append(relevant_text) return all_relevant_text def answer_question(question, context): summarization_pipeline = pipeline("summarization", model="t5-small", tokenizer="t5-small") input_text = f"question: {question} context: {context}" input_ids = preprocess_input(input_text) input_text = tokenizer.decode(input_ids) summarized_text = summarization_pipeline(input_text, max_length=1000, min_length=100, do_sample=True)[0]['summary_text'] return summarized_text def main(): # load_dotenv() st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:") st.header("Lets chat :books:") user_question = st.text_input("Ask a question about your documents:") if 'conversation_history' not in st.session_state: st.session_state.conversation_history = [] if user_question: with st.spinner("Processing"): pdf_docs = st.session_state.pdf_docs st.session_state.conversation_history.append(('user', user_question)) document_texts = extract_text_from_pdf(pdf_docs,user_question) summarized_text =answer_question(user_question, document_texts) st.session_state.conversation_history.append(('bot', summarized_text)) with st.sidebar: st.subheader("Upload documents") pdf_docs = st.file_uploader( "Upload your PDFs here and click on 'Process'", accept_multiple_files=True) if st.button("Process"): st.session_state.pdf_docs = pdf_docs # for pdf in pdf_docs: # pdf_reader = PdfReader(pdf) # text="" # for page in pdf_reader.pages: # text += page.extract_text() # st.write("Extracted text: ",text) # Display conversation history for role, message in st.session_state.conversation_history: if role == 'user': st.write("You:", message) elif role == 'bot': st.write("Bot:", message) if __name__ == '__main__': main()