File size: 6,685 Bytes
031b10b
 
 
 
3c97b3b
031b10b
 
 
 
 
 
 
 
 
 
 
bd5e049
031b10b
 
 
 
b89bbe5
031b10b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5143f36
031b10b
 
 
aed0db9
 
 
 
 
 
 
031b10b
 
 
 
 
1cfe121
031b10b
 
 
 
09b9b2e
031b10b
 
 
 
 
b89bbe5
031b10b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be8b37c
 
031b10b
 
 
be8b37c
031b10b
 
 
 
 
 
 
 
4979e25
031b10b
 
 
 
 
 
 
 
 
 
 
 
 
 
62e52d3
031b10b
 
 
 
3469663
031b10b
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import gradio as gr
import os
import time
import pandas as pd
import traceback


from langchain.document_loaders import OnlinePDFLoader #for laoding the pdf
from langchain.embeddings import OpenAIEmbeddings # for creating embeddings
from langchain.vectorstores import Chroma # for the vectorization part
from langchain.chains import RetrievalQA # for conversing with chatGPT
from langchain.chat_models import ChatOpenAI # the LLM model we'll use (ChatGPT)
from langchain import PromptTemplate # to format the response


def load_pdf_and_generate_embeddings(pdf_doc, open_ai_key, relevant_pages):
    """Load a PDF, embed the selected pages and build the global RetrievalQA bot.

    Args:
        pdf_doc: uploaded file object from the gradio File component; its
            ``.name`` attribute is the path on disk.
        open_ai_key: the user's OpenAI API key (required).
        relevant_pages: optional comma-separated, 1-based page numbers; blank
            or entirely invalid input means the whole document is used.

    Returns:
        "Ready" on success, otherwise a human-readable status/error string
        that is shown in the UI's Status textbox.
    """
    # Guard clause: an empty or missing key cannot authenticate with OpenAI.
    # (The original only checked `is not None`, so "" slipped through.)
    if not open_ai_key:
        return "Please provide an OpenAI gpt-4 API key"

    os.environ['OPENAI_API_KEY'] = open_ai_key

    # Load the pdf file and split it into per-page documents.
    loader = OnlinePDFLoader(pdf_doc.name)
    pages = loader.load_and_split()
    print("PDF has been loaded and split")

    # OpenAIEmbeddings is responsible for generating embeddings for text.
    embeddings = OpenAIEmbeddings()

    # Collect only the pages the user asked for (1-based page numbers).
    pages_to_be_loaded = []
    if relevant_pages:
        for page_number in relevant_pages.split(","):
            token = page_number.strip()  # tolerate "3, 4, 5" style input
            if token.isdigit():
                page_index = int(token) - 1
                if 0 <= page_index < len(pages):
                    pages_to_be_loaded.append(pages[page_index])

    # In the scenario where none of the page numbers supplied exist in the
    # PDF, we revert to using the entire PDF.
    if not pages_to_be_loaded:
        pages_to_be_loaded = pages.copy()
    print("The document has been split into # of pages:", len(pages_to_be_loaded))

    # Create the vector store with Chroma from the selected pages and the
    # embeddings instance. The original code swallowed any failure here with a
    # bare `except:` and then crashed on the undefined `vectordb`; instead,
    # surface a status string so the UI shows a meaningful error.
    try:
        vectordb = Chroma.from_documents(documents=pages_to_be_loaded, embedding=embeddings, persist_directory='db')
        print("Vectordb has been created")
    except Exception:
        traceback.print_exc()
        return "Failed to create the vector store - see server logs for details"

    # The QA chain is stored in a module-level global so answer_query can use it.
    global pdf_qa

    # Configuring the Prompt Template is the key to getting the desired
    # response in the desired format.
    prompt_template = """Use the following pieces of context to answer the question at the end. If you encounter a date, return it in mm/dd/yyyy format. If there is a Preface section in the document, extract the chapter# and the short description from the Preface. Chapter numbers are listed to the left in Preface and always start with an alphabet, for example A1-1

        {context}

        Question: {question}
        Return the answer. Where applicable, break the answer into bullet points. When the sentences are long, try and break them into sub sections and include all the information and do not skip any information. If there is an exception to the answer, please do include it in a 'Note:' section. If there are no exceptions to the answer, please skip the 'Note:' section. Include a 'For additional details refer to' section when the document has more information to offer on the topic being questioned. If the document has a Preface or 'Table of Contents' section, extract the chapter# and a short description and include the info under the 'For additional details refer to' section. List only the chapters that contain information or skip this section altogether. Do not use page numbers as chapter numbers as they are different. If additional information is found in multiple pages within the same chapter, list the chapter only once. If chapter information cannot be extracted, include any other information that will help the user navigate to the relevant sections of the document. If the document does not contain a Preface or 'Table of Contents' section, please do not call that out. For example, do not include statements like the following in the answer - 'The document does not contain a Preface or 'Table of Contents' section'"""

    PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

    chain_type_kwargs = {"prompt": PROMPT}
    # "stuff" chain: all retrieved pages (k=4) are stuffed into one prompt.
    pdf_qa = RetrievalQA.from_chain_type(llm=ChatOpenAI(temperature=0, model_name="gpt-4"), chain_type="stuff", retriever=vectordb.as_retriever(search_kwargs={"k": 4}), chain_type_kwargs=chain_type_kwargs, return_source_documents=False)
    print("GPT-4 is ready to take questions!")
    return "Ready"
        
    
def answer_query(query):
    """Forward *query* to the global RetrievalQA chain and return its answer.

    Requires load_pdf_and_generate_embeddings to have run first, since that
    is what creates the module-level ``pdf_qa`` chain.
    """
    return pdf_qa.run(query)
    

# Custom CSS injected into the Blocks app: centers the main column and caps
# its width at 700px (matches the inline style used in the title HTML below).
css="""
#col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
"""

# Static HTML header rendered at the top of the page: logo image, app title,
# and short usage instructions. The img src uses gradio's "file=" prefix to
# serve a local file; assumes Vaultedge-logo-with-name.png sits next to this
# script — TODO confirm the file is shipped with the app.
title = """
<div style="text-align: center;max-width: 700px;">
    <img src="file=Vaultedge-logo-with-name.png" style="width: 30%; min-width: 60px; display: block; margin: auto; background-color: transparent;">
    <h1>Ask Moli - Chatbot for complex documents</h1> 
    <p style="text-align: center;">'Load a File', click the "Upload file to Moli" button, <br />
    wait for the Status to show Ready. Type your question, click on "Ask Moli" <br />
    The app is built on GPT-4</p>
    
</div>
"""

# UI layout and event wiring. NOTE(review): `.style(full_width=False)` and
# gr.File(type='file') belong to the pre-4.x gradio API — this file appears
# pinned to an older gradio; verify before upgrading.
with gr.Blocks(css=css,theme=gr.themes.Monochrome()) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML(title)
    
    with gr.Column():
        # Inputs consumed by load_pdf_and_generate_embeddings, in order.
        open_ai_key = gr.Textbox(label="Your GPT-4 OpenAI API key", type="password")
        pdf_doc = gr.File(label="Load a file",file_types=['.pdf'],type='file')
        relevant_pages = gr.Textbox(label="*Optional - Leave this field blank to use the entire PDF or provide comma separated page numbers like 3,4,5")
        
        with gr.Row():
            # Status shows the string returned by the loader ("Ready" / error).
            status = gr.Textbox(label="Status", placeholder="", interactive=False)
            load_pdf = gr.Button("Upload file to Moli").style(full_width=False)
            
              
        with gr.Row():
            # Question/answer pair wired to answer_query below.
            input = gr.Textbox(label="Type in your question")
            output = gr.Textbox(label="Answer")
            submit_query = gr.Button("Ask Moli").style(full_width=False)
        
 
    # Build the vector store + QA chain; result string lands in `status`.
    load_pdf.click(load_pdf_and_generate_embeddings, inputs=[pdf_doc, open_ai_key, relevant_pages], outputs=status)
        
    submit_query.click(answer_query,input,output)


# SECURITY NOTE(review): credentials are hard-coded in source and now burned —
# rotate them and load from environment variables instead, e.g.
# auth=(os.environ["MOLI_USER"], os.environ["MOLI_PASS"]).
demo.launch(debug=True, auth=("admin", "lm0R!Rm0#97r"))