Spaces:
Build error
Build error
Commit
·
c00c005
1
Parent(s):
87ff9aa
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from langchain.document_loaders import PyPDFLoader
|
| 3 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 4 |
+
from langchain.chat_models import ChatOpenAI
|
| 5 |
+
from langchain import OpenAI
|
| 6 |
+
from langchain import PromptTemplate
|
| 7 |
+
from langchain.chains.summarize import load_summarize_chain
|
| 8 |
+
import gradio as gr
|
| 9 |
+
|
| 10 |
+
# HTML banner rendered at the top of the Gradio app.
# NOTE: runtime strings are kept verbatim (including the "bellow" typos) —
# they are user-facing content, not comments.
title = '''
<div style="text-align: left; font-family:Arial; color:Black; font-size: 16px; max-width: 750px;">
<h1>Small PDF Summarizer</h1>
<p style="text-align: left;">Upload a .PDF from your computer, click the "Upload PDF" button and fill OpenAI API Key. <br />
Output will be on the textbox bellow. You can also change some LLM configurations from the 'config' tab<br/>
</div>
'''

# HTML help text shown on the "Config" tab, explaining the custom
# map/combine prompt mechanism of the map-reduce summarization pipeline.
desc_1 = '''
<div style="text-align: left; font-family:Arial; color:Black; font-size: 14px;">
<h3>Custom Prompt Template</h3>
<p style="text-align: left;">You can customize input prompt for the map and combine prompt of langchain's Map-Reduce Summarization pipeline
using the texboxt bellow.<br/>
Prompt which will be fed into LLM use the format of : <b>{textbox input} + {pdf_text} + "SUMMARY:"</b> <br/>
In essence each page of PDF will be summarized using map prompt, and each summary then be combined for final output using combine prompt.<br/>
<a href="https://python.langchain.com/docs/use_cases/summarization">More Info on Map-Reduce for Summarization</a>
</div>
'''

# Default per-page ("map") prompt; {text} is filled by langchain with the
# page content for each document chunk.
MAP_PROMPT = """
You will be given a page of text which section is enclosed in triple backticks (```).
Your goal is to give a summary of this section, ignoring references and footnote if present.
Your response should be at least 200 words only if input classified as academic text.
Your response must fully encompass what was said in the page.

```{text}```
SUMMARY:
"""
# Default "combine" (reduce) prompt applied to the concatenated page summaries.
COMBINE_PROMPT = """
Write a full summary of the following text enclosed in triple backticks (```).
Full summary consists of a descriptive summary of at least 100 words (if possible),
followed by numbered list which covers key points of the text.

```{text}```
SUMMARY:
"""
# Tooltip text for the config widgets (keys match the widget `info` usage below).
config_info = {'temperature': 'Higher means more randomness to the output.',
               'max_tokens' : 'The maximum number of tokens to generate in the output.',
               'llm_list' : ''}
# Maps an OpenAI model name to the langchain wrapper family it needs:
# 'chat' -> ChatOpenAI, 'instruct' -> OpenAI (completion-style).
model_list = {'gpt-3.5-turbo':'chat',
              'gpt-4':'chat',
              'gpt-3.5-turbo-instruct':'instruct',
              'text-davinci-003':'instruct'}

# Splitter used when loading the PDF: roughly page/paragraph sized chunks
# (10k chars with 250-char overlap) fed to the map step.
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=250)
|
| 55 |
+
|
| 56 |
+
def summarize_pdf(pdf_file, api_key,
                  model_name, temperature, llm_max_tokens,
                  custom_map_prompt, custom_combine_prompt):
    """Summarize an uploaded PDF with a langchain map-reduce chain.

    Args:
        pdf_file: Gradio file object; ``pdf_file.name`` is the path on disk.
        api_key: OpenAI API key, exported to the environment for langchain.
        model_name: key into ``model_list`` selecting chat vs. instruct wrapper.
        temperature: sampling temperature forwarded to the LLM.
        llm_max_tokens: max tokens per LLM completion (coerced to int).
        custom_map_prompt: optional user prompt prefix for the per-page step
            ("" means use ``MAP_PROMPT``).
        custom_combine_prompt: optional user prompt prefix for the reduce step
            ("" means use ``COMBINE_PROMPT``).

    Returns:
        The final combined summary text.

    Raises:
        gr.Error: via ``file_check`` when the PDF exceeds the size/page limits.
    """
    global pdf_docs  # file_check() reads this global to enforce the page limit
    # Read PDF and split it into chunks for the map step.
    loader = PyPDFLoader(pdf_file.name)
    pdf_docs = loader.load_and_split(text_splitter)
    file_check(pdf_file)

    # Build LLM model. The key is passed via the environment, which is how
    # langchain's OpenAI wrappers pick it up.
    os.environ["OPENAI_API_KEY"] = api_key
    if model_list[model_name] == 'chat':
        gpt_llm = ChatOpenAI(temperature=temperature, model_name=model_name, max_tokens=int(llm_max_tokens))
    else:
        gpt_llm = OpenAI(temperature=temperature, model_name=model_name, max_tokens=int(llm_max_tokens))

    # Build the map/combine prompt templates, preferring user-supplied text.
    if custom_map_prompt != "":
        map_template = PromptTemplate(template=generate_template(custom_map_prompt), input_variables=["text"])
    else:
        map_template = PromptTemplate(template=MAP_PROMPT, input_variables=["text"])

    if custom_combine_prompt != "":
        combine_template = PromptTemplate(template=generate_template(custom_combine_prompt), input_variables=["text"])
    else:
        combine_template = PromptTemplate(template=COMBINE_PROMPT, input_variables=["text"])

    # BUG FIX: this call previously passed the undefined names `map_prompt`
    # and `combine_prompt` (NameError at runtime); pass the templates that
    # were actually built above.
    map_reduce_chain = load_summarize_chain(
        gpt_llm,
        chain_type="map_reduce",
        map_prompt=map_template,
        combine_prompt=combine_template,
        return_intermediate_steps=True,
        token_max=3840  # limit the maximum number of tokens in the combined document (combine prompt).
    )
    map_reduce_outputs = map_reduce_chain({"input_documents": pdf_docs})
    return map_reduce_outputs['output_text']
|
| 93 |
+
|
| 94 |
+
def file_check(pdf_file):
    """Reject PDFs that are too large or too long.

    Reads the module-global ``pdf_docs`` (set by ``summarize_pdf``) for the
    page/chunk count. Raises ``gr.Error`` — which Gradio surfaces to the
    user — when a limit is exceeded; otherwise returns None.
    """
    size_mb = os.path.getsize(pdf_file.name) / 1024 ** 2
    if size_mb > 1:
        raise gr.Error("Maximum File Size is 1MB!")
    if len(pdf_docs) > 15:
        raise gr.Error("Maximum File Length is 15 Pages!")
|
| 101 |
+
|
| 102 |
+
def generate_template(custom_prompt):
    """Turn a user-written prompt prefix into a full summarization template.

    Appends the fixed suffix containing the ``{text}`` placeholder (filled by
    langchain with the document content) and the trailing ``SUMMARY:`` cue.
    """
    suffix = "\n\n```{text}```\nSUMMARY:\n"
    return custom_prompt + suffix
|
| 109 |
+
|
| 110 |
+
def main():
    """Build the Gradio UI and launch the app (blocking)."""
    with gr.Blocks() as demo:
        gr.HTML(title)
        # --- Main tab: upload, API key, trigger, and output box ---
        with gr.Tab("Main"):
            with gr.Column():
                # NOTE(review): type="file" is the legacy gr.File API —
                # presumably fine for the gradio version pinned by this Space;
                # confirm, as newer gradio removed this value.
                pdf_doc = gr.File(label="Uploaded PDF:", file_types=['.pdf'], type="file")
                API_KEY = gr.Textbox(label="OpenAI API Key:", lines=1, type="password")
                summarize_button = gr.Button(value="Summarize!")
                summarized_text = gr.Textbox(label="Summary", lines=10, show_copy_button=True)


        # --- Config tab: model choice, sampling params, custom prompts ---
        with gr.Tab("Config"):
            llm_model = gr.Dropdown(choices=model_list.keys(), label="LLM model used", value='gpt-3.5-turbo', interactive=True)
            with gr.Row():
                temperature = gr.Slider(minimum=0, maximum=0.5, step=0.1, label="temperature", info=config_info['temperature'])
                llm_max_tokens = gr.Radio(choices=[128, 256, 512], value=256, interactive=True, label="LLM max tokens", info=config_info['max_tokens'])
            gr.HTML(desc_1)
            with gr.Row():
                user_map_prompt = gr.Textbox(label="Map PROMPT", lines=10, interactive=True)
                user_comb_prompt = gr.Textbox(label="Combine PROMPT", lines=10, interactive=True)

            # Read-only view of the built-in default templates.
            with gr.Accordion("Default Template", open=False):
                with gr.Row():
                    default_map_prompt = gr.Textbox(label="Map PROMPT", value=MAP_PROMPT, lines=10, interactive=False)
                    default_comb_prompt = gr.Textbox(label="Combine PROMPT", value=COMBINE_PROMPT, lines=10, interactive=False)
            # Preview of what the user's custom prompts expand to.
            with gr.Accordion("User Custom Prompt Preview", open=False):
                prompt_preview_button = gr.Button(value="View Custom Prompt")
                with gr.Row():
                    custom_map_view = gr.Textbox(label="Map PROMPT", lines=10, interactive=False)
                    custom_comb_view = gr.Textbox(label="Combine PROMPT", lines=10, interactive=False)

            # Two listeners on the same button: each fills one preview box.
            prompt_preview_button.click(generate_template, inputs=[user_map_prompt], outputs=[custom_map_view])
            prompt_preview_button.click(generate_template, inputs=[user_comb_prompt], outputs=[custom_comb_view])

        # Order must match summarize_pdf's positional parameters.
        list_inputs = [pdf_doc, API_KEY, llm_model, temperature, llm_max_tokens, user_map_prompt, user_comb_prompt]
        summarize_button.click(summarize_pdf, inputs=list_inputs, outputs=[summarized_text])

    # Queue so long-running summarizations don't block concurrent users.
    demo.queue().launch(share=False)
|
| 148 |
+
|
| 149 |
+
# Script entry point: build and launch the Gradio app.
if __name__ == "__main__":
    main()
|