File size: 3,218 Bytes
907e56c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import streamlit as st
# from dotenv import load_dotenv
from PyPDF2 import PdfReader
from transformers import pipeline, BertTokenizer
# import fitz

# Module-level BERT tokenizer ('bert-base-uncased'), created once at import time
# and shared by preprocess_input() and answer_question().
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def preprocess_input(input_text):
    """Encode *input_text* as token ids framed by the tokenizer's special ids.

    The text is split into tokens with the module-level ``tokenizer``, each
    token is mapped to its vocabulary id, and the sequence is wrapped with
    the CLS id in front and the SEP id at the end.

    Args:
        input_text: Raw string to encode.

    Returns:
        A list of ids: the CLS id, one id per token, then the SEP id.
        No truncation or padding is applied.
    """
    wordpieces = tokenizer.tokenize(input_text)
    body_ids = tokenizer.convert_tokens_to_ids(wordpieces)
    return [tokenizer.cls_token_id, *body_ids, tokenizer.sep_token_id]

def extract_text_from_pdf(pdf_docs, input_text, chunk_size=1000):
    """Collect question-relevant text from each uploaded PDF.

    For every PDF the text of all pages is concatenated, cut into slices of
    ``chunk_size`` characters, and each slice is passed to
    ``answer_question`` together with ``input_text``. The per-slice results
    are concatenated into one string per PDF.

    Args:
        pdf_docs: Iterable of objects accepted by ``PdfReader`` (for example
            uploaded files). ``None`` is treated as "no documents".
        input_text: The user's question, forwarded to ``answer_question``.
        chunk_size: Number of characters per slice; must be positive.

    Returns:
        A list with one string per PDF, in input order. Empty when no
        documents are given.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    all_relevant_text = []
    # `or []` lets callers pass None when nothing has been uploaded yet.
    for pdf in pdf_docs or []:
        pdf_reader = PdfReader(pdf)
        # `or ""` is defensive: a page that yields no text must not break
        # the concatenation.
        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

        # Fixed-width character slices; the last one may be shorter.
        chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

        relevant_text = "".join(
            answer_question(input_text, chunk) for chunk in chunks
        )
        all_relevant_text.append(relevant_text)
    return all_relevant_text

# Lazily-built summarization pipeline, shared by every answer_question() call
# so the model is not reloaded for each chunk.
# NOTE(review): when the file is executed by Streamlit the script is re-run on
# each interaction, so this cache presumably only lives for one run;
# st.cache_resource would be the way to keep it longer -- confirm before adopting.
_SUMMARIZATION_PIPELINE = None


def _get_summarization_pipeline():
    """Return the shared t5-small summarization pipeline, building it on first use."""
    global _SUMMARIZATION_PIPELINE
    if _SUMMARIZATION_PIPELINE is None:
        _SUMMARIZATION_PIPELINE = pipeline("summarization", model="t5-small", tokenizer="t5-small")
    return _SUMMARIZATION_PIPELINE


def answer_question(question, context):
    """Produce a summary-style answer to *question* from *context*.

    The prompt ``"question: ... context: ..."`` is encoded with
    ``preprocess_input``, decoded back to a string with the module-level
    ``tokenizer``, and handed to the t5-small summarization pipeline.

    Args:
        question: The user's question.
        context: Text to answer from. A list or tuple of text pieces is also
            accepted and is joined with newlines first.

    Returns:
        The ``summary_text`` of the first pipeline result. Sampling is
        enabled, so repeated calls may return different text.
    """
    # Callers may pass the per-document list returned by
    # extract_text_from_pdf; join it rather than interpolating a list repr.
    if isinstance(context, (list, tuple)):
        context = "\n".join(str(part) for part in context)

    summarization_pipeline = _get_summarization_pipeline()
    input_text = f"question: {question} context: {context}"

    # NOTE(review): this round trip through the BERT tokenizer presumably
    # leaves BERT special-token markers and normalisation in the text given
    # to T5 -- confirm this is intended.
    input_ids = preprocess_input(input_text)
    input_text = tokenizer.decode(input_ids)
    summarized_text = summarization_pipeline(input_text, max_length=1000, min_length=100, do_sample=True)[0]['summary_text']
    return summarized_text



def main():
    """Render the Streamlit page: question box, PDF upload sidebar, chat log.

    A non-empty question is answered from the PDFs previously stored in
    ``st.session_state.pdf_docs`` (set when "Process" is clicked in the
    sidebar). The question and the answer are appended to
    ``st.session_state.conversation_history``, which is then written to the
    page in order.
    """
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")

    st.header("Lets chat :books:")
    user_question = st.text_input("Ask a question about your documents:")

    if 'conversation_history' not in st.session_state:
        st.session_state.conversation_history = []

    if user_question:
        # pdf_docs only exists in session state after "Process" was clicked;
        # reading it unconditionally raised AttributeError on a fresh session.
        pdf_docs = st.session_state.get('pdf_docs')
        if not pdf_docs:
            st.warning("Please upload your PDFs and click 'Process' before asking a question.")
        else:
            with st.spinner("Processing"):
                st.session_state.conversation_history.append(('user', user_question))

                document_texts = extract_text_from_pdf(pdf_docs, user_question)

                # extract_text_from_pdf returns one string per PDF; merge them
                # so answer_question receives plain text, not a list.
                combined_text = "\n".join(document_texts)
                summarized_text = answer_question(user_question, combined_text)

                st.session_state.conversation_history.append(('bot', summarized_text))

    with st.sidebar:
        st.subheader("Upload documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            # Keep the uploads across reruns so later questions can use them.
            st.session_state.pdf_docs = pdf_docs

    # Display conversation history
    for role, message in st.session_state.conversation_history:
        if role == 'user':
            st.write("You:", message)
        elif role == 'bot':
            st.write("Bot:", message)

# Start the app only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()