Spaces:
Runtime error
Runtime error
Implementation of multi-pdf chat
#2
by
thomasmz1 - opened
app.py
CHANGED
|
@@ -18,6 +18,7 @@ def preprocess(text):
|
|
| 18 |
return text
|
| 19 |
|
| 20 |
|
|
|
|
| 21 |
def pdf_to_text(path, start_page=1, end_page=None):
|
| 22 |
doc = fitz.open(path)
|
| 23 |
total_pages = doc.page_count
|
|
@@ -35,8 +36,8 @@ def pdf_to_text(path, start_page=1, end_page=None):
|
|
| 35 |
doc.close()
|
| 36 |
return text_list
|
| 37 |
|
| 38 |
-
|
| 39 |
-
def text_to_chunks(texts, word_length=150, start_page=1):
|
| 40 |
text_toks = [t.split(' ') for t in texts]
|
| 41 |
page_nums = []
|
| 42 |
chunks = []
|
|
@@ -49,7 +50,7 @@ def text_to_chunks(texts, word_length=150, start_page=1):
|
|
| 49 |
text_toks[idx+1] = chunk + text_toks[idx+1]
|
| 50 |
continue
|
| 51 |
chunk = ' '.join(chunk).strip()
|
| 52 |
-
chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
|
| 53 |
chunks.append(chunk)
|
| 54 |
return chunks
|
| 55 |
|
|
@@ -91,10 +92,12 @@ class SemanticSearch:
|
|
| 91 |
|
| 92 |
|
| 93 |
|
| 94 |
-
def load_recommender(
|
| 95 |
global recommender
|
| 96 |
-
texts =
|
| 97 |
-
chunks =
|
|
|
|
|
|
|
| 98 |
recommender.fit(chunks)
|
| 99 |
return 'Corpus Loaded.'
|
| 100 |
|
|
@@ -140,7 +143,7 @@ def generate_answer(question, openAI_key, model):
|
|
| 140 |
prompt += c + '\n\n'
|
| 141 |
|
| 142 |
prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
|
| 143 |
-
"Cite each reference using [ Page Number] notation. "\
|
| 144 |
"Only answer what is asked. The answer should be short and concise. \n\nQuery: "
|
| 145 |
|
| 146 |
prompt += f"{question}\nAnswer:"
|
|
@@ -148,13 +151,15 @@ def generate_answer(question, openAI_key, model):
|
|
| 148 |
return answer
|
| 149 |
|
| 150 |
|
| 151 |
-
def question_answer(chat_history, url,
|
| 152 |
try:
|
|
|
|
|
|
|
| 153 |
if openAI_key.strip()=='':
|
| 154 |
return '[ERROR]: Please enter your Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'
|
| 155 |
-
if url.strip() == '' and
|
| 156 |
return '[ERROR]: Both URL and PDF is empty. Provide at least one.'
|
| 157 |
-
if url.strip() != '' and
|
| 158 |
return '[ERROR]: Both URL and PDF is provided. Please provide only one (either URL or PDF).'
|
| 159 |
if model is None or model =='':
|
| 160 |
return '[ERROR]: You have not selected any model. Please choose an LLM model.'
|
|
@@ -163,11 +168,16 @@ def question_answer(chat_history, url, file, question, openAI_key, model):
|
|
| 163 |
download_pdf(glob_url, 'corpus.pdf')
|
| 164 |
load_recommender('corpus.pdf')
|
| 165 |
else:
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
if question.strip() == '':
|
| 172 |
return '[ERROR]: Question field is empty'
|
| 173 |
if model == "text-davinci-003" or model == "gpt-4" or model == "gpt-4-32k":
|
|
@@ -203,7 +213,7 @@ def generate_answer_text_davinci_003(question,openAI_key):
|
|
| 203 |
prompt += c + '\n\n'
|
| 204 |
|
| 205 |
prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
|
| 206 |
-
"Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
|
| 207 |
"Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
|
| 208 |
"with the same name, create separate answers for each. Only include information found in the results and "\
|
| 209 |
"don't add any additional information. Make sure the answer is correct and don't output false content. "\
|
|
@@ -212,6 +222,7 @@ def generate_answer_text_davinci_003(question,openAI_key):
|
|
| 212 |
"answer should be short and concise. \n\nQuery: {question}\nAnswer: "
|
| 213 |
|
| 214 |
prompt += f"Query: {question}\nAnswer:"
|
|
|
|
| 215 |
answer = generate_text_text_davinci_003(openAI_key, prompt,"text-davinci-003")
|
| 216 |
return answer
|
| 217 |
|
|
@@ -242,15 +253,14 @@ with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200; }""") as dem
|
|
| 242 |
gr.Markdown(f'<center><h3>{title}</h3></center>')
|
| 243 |
gr.Markdown(description)
|
| 244 |
|
| 245 |
-
with gr.Row():
|
| 246 |
-
|
| 247 |
with gr.Group():
|
| 248 |
gr.Markdown(f'<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>')
|
| 249 |
with gr.Accordion("API Key"):
|
| 250 |
openAI_key = gr.Textbox(label='Enter your OpenAI API key here', password=True)
|
| 251 |
-
url = gr.Textbox(label='Enter PDF URL here
|
| 252 |
gr.Markdown("<center><h4>OR<h4></center>")
|
| 253 |
-
|
| 254 |
question = gr.Textbox(label='Enter your question here')
|
| 255 |
gr.Examples(
|
| 256 |
[[q] for q in questions],
|
|
@@ -273,15 +283,12 @@ with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200; }""") as dem
|
|
| 273 |
with gr.Group():
|
| 274 |
chatbot = gr.Chatbot(placeholder="Chat History", label="Chat History", lines=50, elem_id="chatbot")
|
| 275 |
|
| 276 |
-
|
| 277 |
-
#
|
| 278 |
# Bind the click event of the button to the question_answer function
|
| 279 |
btn.click(
|
| 280 |
question_answer,
|
| 281 |
-
inputs=[chatbot, url,
|
| 282 |
outputs=[chatbot],
|
| 283 |
)
|
| 284 |
|
| 285 |
-
demo.launch()
|
| 286 |
-
|
| 287 |
-
|
|
|
|
| 18 |
return text
|
| 19 |
|
| 20 |
|
| 21 |
+
# converts pdf to text
|
| 22 |
def pdf_to_text(path, start_page=1, end_page=None):
|
| 23 |
doc = fitz.open(path)
|
| 24 |
total_pages = doc.page_count
|
|
|
|
| 36 |
doc.close()
|
| 37 |
return text_list
|
| 38 |
|
| 39 |
+
# converts a list of page texts into a list of chunks
|
| 40 |
+
def text_to_chunks(texts, word_length=150, start_page=1, file_number=1):
|
| 41 |
text_toks = [t.split(' ') for t in texts]
|
| 42 |
page_nums = []
|
| 43 |
chunks = []
|
|
|
|
| 50 |
text_toks[idx+1] = chunk + text_toks[idx+1]
|
| 51 |
continue
|
| 52 |
chunk = ' '.join(chunk).strip()
|
| 53 |
+
chunk = f'[File no. {file_number}] [Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
|
| 54 |
chunks.append(chunk)
|
| 55 |
return chunks
|
| 56 |
|
|
|
|
| 92 |
|
| 93 |
|
| 94 |
|
| 95 |
+
def load_recommender(paths, start_page=1):
    """Fit the global recommender on chunks extracted from one or more PDFs.

    Args:
        paths: A single PDF path given as a string, or an iterable of PDF
            paths. A bare string is treated as one path so that single-file
            callers such as ``load_recommender('corpus.pdf')`` keep working.
        start_page: Forwarded to both ``pdf_to_text`` and ``text_to_chunks``
            for every file.

    Returns:
        The status string ``'Corpus Loaded.'``.
    """
    global recommender
    # Iterating a bare string would yield single characters, not file paths.
    if isinstance(paths, str):
        paths = [paths]
    chunks = []
    # File numbers are 1-based; they end up in the "[File no. N]" chunk prefix.
    for file_number, path in enumerate(paths, start=1):
        page_texts = pdf_to_text(path, start_page=start_page)
        chunks += text_to_chunks(
            page_texts, start_page=start_page, file_number=file_number
        )
    recommender.fit(chunks)
    return 'Corpus Loaded.'
|
| 103 |
|
|
|
|
| 143 |
prompt += c + '\n\n'
|
| 144 |
|
| 145 |
prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
|
| 146 |
+
"Cite each reference using [File number][ Page Number] notation. "\
|
| 147 |
"Only answer what is asked. The answer should be short and concise. \n\nQuery: "
|
| 148 |
|
| 149 |
prompt += f"{question}\nAnswer:"
|
|
|
|
| 151 |
return answer
|
| 152 |
|
| 153 |
|
| 154 |
+
def question_answer(chat_history, url, files, question, openAI_key, model):
|
| 155 |
try:
|
| 156 |
+
if files == None:
|
| 157 |
+
files = []
|
| 158 |
if openAI_key.strip()=='':
|
| 159 |
return '[ERROR]: Please enter your Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'
|
| 160 |
+
if url.strip() == '' and files == []:
|
| 161 |
return '[ERROR]: Both URL and PDF is empty. Provide at least one.'
|
| 162 |
+
if url.strip() != '' and files is not []:
|
| 163 |
return '[ERROR]: Both URL and PDF is provided. Please provide only one (either URL or PDF).'
|
| 164 |
if model is None or model =='':
|
| 165 |
return '[ERROR]: You have not selected any model. Please choose an LLM model.'
|
|
|
|
| 168 |
download_pdf(glob_url, 'corpus.pdf')
|
| 169 |
load_recommender('corpus.pdf')
|
| 170 |
else:
|
| 171 |
+
filenames = []
|
| 172 |
+
for file in files:
|
| 173 |
+
old_file_name = file.name
|
| 174 |
+
file_name = file.name
|
| 175 |
+
file_name = file_name[:-12] + file_name[-4:]
|
| 176 |
+
os.rename(old_file_name, file_name)
|
| 177 |
+
filenames.append(file_name)
|
| 178 |
+
load_recommender(filenames)
|
| 179 |
+
|
| 180 |
+
|
| 181 |
if question.strip() == '':
|
| 182 |
return '[ERROR]: Question field is empty'
|
| 183 |
if model == "text-davinci-003" or model == "gpt-4" or model == "gpt-4-32k":
|
|
|
|
| 213 |
prompt += c + '\n\n'
|
| 214 |
|
| 215 |
prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
|
| 216 |
+
"Cite each reference using [File number] [ Page Number] notation (every result has this number at the beginning). "\
|
| 217 |
"Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
|
| 218 |
"with the same name, create separate answers for each. Only include information found in the results and "\
|
| 219 |
"don't add any additional information. Make sure the answer is correct and don't output false content. "\
|
|
|
|
| 222 |
"answer should be short and concise. \n\nQuery: {question}\nAnswer: "
|
| 223 |
|
| 224 |
prompt += f"Query: {question}\nAnswer:"
|
| 225 |
+
# print("prompt == " + str(prompt))
|
| 226 |
answer = generate_text_text_davinci_003(openAI_key, prompt,"text-davinci-003")
|
| 227 |
return answer
|
| 228 |
|
|
|
|
| 253 |
gr.Markdown(f'<center><h3>{title}</h3></center>')
|
| 254 |
gr.Markdown(description)
|
| 255 |
|
| 256 |
+
with gr.Row():
|
|
|
|
| 257 |
with gr.Group():
|
| 258 |
gr.Markdown(f'<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>')
|
| 259 |
with gr.Accordion("API Key"):
|
| 260 |
openAI_key = gr.Textbox(label='Enter your OpenAI API key here', password=True)
|
| 261 |
+
url = gr.Textbox(label='Enter PDF URL here (Example: https://arxiv.org/pdf/1706.03762.pdf )')
|
| 262 |
gr.Markdown("<center><h4>OR<h4></center>")
|
| 263 |
+
files = gr.File(label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'], file_count="multiple")
|
| 264 |
question = gr.Textbox(label='Enter your question here')
|
| 265 |
gr.Examples(
|
| 266 |
[[q] for q in questions],
|
|
|
|
| 283 |
with gr.Group():
|
| 284 |
chatbot = gr.Chatbot(placeholder="Chat History", label="Chat History", lines=50, elem_id="chatbot")
|
| 285 |
|
| 286 |
+
|
|
|
|
| 287 |
# Bind the click event of the button to the question_answer function
|
| 288 |
btn.click(
|
| 289 |
question_answer,
|
| 290 |
+
inputs=[chatbot, url, files, question, openAI_key, model],
|
| 291 |
outputs=[chatbot],
|
| 292 |
)
|
| 293 |
|
| 294 |
+
demo.launch()
|
|
|
|
|
|