Spaces:
Runtime error
Runtime error
Commit ·
031b10b
0
Parent(s):
Duplicate from lekkalar/mortgage-mate-ask-moli
Browse files- .gitattributes +34 -0
- README.md +13 -0
- app.py +111 -0
- requirements.txt +6 -0
.gitattributes
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Mortgage Mate - Ask Moli
|
| 3 |
+
emoji: 👁
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: gray
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 3.33.1
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
duplicated_from: lekkalar/mortgage-mate-ask-moli
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import os
|
| 3 |
+
import time
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
from langchain.document_loaders import OnlinePDFLoader #for laoding the pdf
|
| 8 |
+
from langchain.embeddings import OpenAIEmbeddings # for creating embeddings
|
| 9 |
+
from langchain.vectorstores import Chroma # for the vectorization part
|
| 10 |
+
from langchain.chains import RetrievalQA # for conversing with chatGPT
|
| 11 |
+
from langchain.chat_models import ChatOpenAI # the LLM model we'll use (ChatGPT)
|
| 12 |
+
from langchain import PromptTemplate # to format the response
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def load_pdf_and_generate_embeddings(pdf_doc, open_ai_key, relevant_pages):
    """Load a PDF, embed the selected pages into a Chroma vector store, and
    initialise the global ``pdf_qa`` RetrievalQA chain.

    Args:
        pdf_doc: Gradio file upload; an object exposing a ``.name`` path.
        open_ai_key: OpenAI API key string; exported to the environment.
        relevant_pages: optional comma-separated, 1-based page numbers.
            Blank, non-numeric, or out-of-range entries are ignored; if no
            valid page survives, the entire document is used.

    Returns:
        "Ready" on success, or an error prompt when no API key was given.
    """
    # BUG FIX: the original tested `openai_key` — the module-level Gradio
    # Textbox component, which is never None — instead of the
    # `open_ai_key` parameter, so the missing-key branch could never run.
    # A truthiness check also rejects the empty string Gradio submits for
    # a blank textbox.
    if open_ai_key:
        os.environ['OPENAI_API_KEY'] = open_ai_key

        # Load the pdf file and split it into per-page documents.
        loader = OnlinePDFLoader(pdf_doc.name)
        pages = loader.load_and_split()

        # OpenAIEmbeddings generates the embeddings for each page's text.
        embeddings = OpenAIEmbeddings()

        # Collect the user-requested pages, silently skipping anything
        # that is not a valid in-range page number.
        pages_to_be_loaded = []
        if relevant_pages:
            for page_number in relevant_pages.split(","):
                if page_number.strip().isdigit():
                    page_index = int(page_number) - 1  # 1-based -> 0-based
                    if 0 <= page_index < len(pages):
                        pages_to_be_loaded.append(pages[page_index])

        # If none of the supplied page numbers exist in the PDF (or none
        # were supplied), revert to using the entire document.
        if len(pages_to_be_loaded) == 0:
            pages_to_be_loaded = pages.copy()

        # Build the vector store from the chosen pages and their embeddings.
        vectordb = Chroma.from_documents(pages_to_be_loaded, embedding=embeddings)

        # The QA chain is stored globally so answer_query() can reach it.
        global pdf_qa

        # The prompt template is the key to getting answers in the
        # desired format (dates, bullet points, chapter references, ...).
        prompt_template = """Use the following pieces of context to answer the question at the end. If you encounter a date, return it in mm/dd/yyyy format. If there is a Preface section in the document, extract the chapter# and the short description from the Preface section of the document. Chapter numbers are listed to the left in Preface and always start with an alphabet, for example A1-1

{context}

Question: {question}
Return the answer. Where applicable, break the answer into bullet points. When the sentences are long, try and break them into sub sections and include all the information and do not skip any information. If there is an exception to the answer, please do include it in a 'Note:' section. If the document has a Preface section, include a 'For additional details refer to' section with chapter# and a short description. List only the chapters that contain information or skip this section altogether. Do not use page numbers as chapter numbers as they are different. If additional information is found in multiple pages within the same chapter, list the chapter only once. If chapter information cannot be extracted, include section# from the document"""

        PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

        chain_type_kwargs = {"prompt": PROMPT}
        pdf_qa = RetrievalQA.from_chain_type(
            llm=ChatOpenAI(temperature=0, model_name="gpt-4"),
            chain_type="stuff",
            retriever=vectordb.as_retriever(search_kwargs={"k": 4}),
            chain_type_kwargs=chain_type_kwargs,
            return_source_documents=False,
        )

        return "Ready"
    else:
        return "Please provide an OpenAI gpt-4 API key"
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def answer_query(query):
    """Answer *query* using the previously-built RetrievalQA chain.

    NOTE(review): relies on the global ``pdf_qa`` created by
    load_pdf_and_generate_embeddings(); calling this before a document
    has been uploaded raises a NameError.
    """
    return pdf_qa.run(query)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
# Page styling: centre the main column and cap its width.
css = """
#col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
"""

# Header HTML shown at the top of the app.
title = """
<div style="text-align: center;max-width: 700px;">
<h1>Ask Moli - Chatbot for complex documents</h1>
<p style="text-align: center;">'Load a File', click the "Upload file to Moli" button, <br />
wait for the Status to show Ready. Type your question, click on "Ask Moli" <br />
The app is built on GPT-4</p>
</div>
"""

with gr.Blocks(css=css, theme=gr.themes.Monochrome()) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML(title)

        with gr.Column():
            # NOTE: `openai_key` is also read (buggily) by
            # load_pdf_and_generate_embeddings, so its name must not change.
            openai_key = gr.Textbox(label="Your GPT-4 OpenAI API key", type="password")
            pdf_doc = gr.File(label="Load a file", file_types=['.pdf'], type='file')
            relevant_pages = gr.Textbox(label="*Optional - Leave this field blank to use the entire PDF or provide comma separated page numbers like 3,4,5")

        with gr.Row():
            status = gr.Textbox(label="Status", placeholder="", interactive=False)
            load_pdf = gr.Button("Upload file to Moli").style(full_width=False)

        with gr.Row():
            # Renamed from `input`/`output`: `input` shadows the builtin.
            question_box = gr.Textbox(label="Type in your question")
            answer_box = gr.Textbox(label="Answer")
            submit_query = gr.Button("Ask Moli").style(full_width=False)

    # Wire the buttons: upload builds the QA chain; query runs it.
    load_pdf.click(load_pdf_and_generate_embeddings, inputs=[pdf_doc, openai_key, relevant_pages], outputs=status)
    submit_query.click(answer_query, question_box, answer_box)

demo.launch()
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openai
|
| 2 |
+
tiktoken
|
| 3 |
+
chromadb
|
| 4 |
+
langchain
|
| 5 |
+
unstructured
|
| 6 |
+
unstructured[local-inference]
|