Spaces:
Build error
Build error
Commit
·
c00c005
1
Parent(s):
87ff9aa
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from langchain.document_loaders import PyPDFLoader
|
| 3 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 4 |
+
from langchain.chat_models import ChatOpenAI
|
| 5 |
+
from langchain import OpenAI
|
| 6 |
+
from langchain import PromptTemplate
|
| 7 |
+
from langchain.chains.summarize import load_summarize_chain
|
| 8 |
+
import gradio as gr
|
| 9 |
+
|
| 10 |
+
# HTML banner rendered at the top of the Gradio app.
# NOTE: runtime strings are kept verbatim (including the "bellow" typos) —
# they are user-facing content, not comments.
title = '''
<div style="text-align: left; font-family:Arial; color:Black; font-size: 16px; max-width: 750px;">
<h1>Small PDF Summarizer</h1>
<p style="text-align: left;">Upload a .PDF from your computer, click the "Upload PDF" button and fill OpenAI API Key. <br />
Output will be on the textbox bellow. You can also change some LLM configurations from the 'config' tab<br/>
</div>
'''

# HTML help text shown on the "Config" tab, explaining the custom
# map/combine prompt mechanism of the map-reduce summarization pipeline.
desc_1 = '''
<div style="text-align: left; font-family:Arial; color:Black; font-size: 14px;">
<h3>Custom Prompt Template</h3>
<p style="text-align: left;">You can customize input prompt for the map and combine prompt of langchain's Map-Reduce Summarization pipeline
using the texboxt bellow.<br/>
Prompt which will be fed into LLM use the format of : <b>{textbox input} + {pdf_text} + "SUMMARY:"</b> <br/>
In essence each page of PDF will be summarized using map prompt, and each summary then be combined for final output using combine prompt.<br/>
<a href="https://python.langchain.com/docs/use_cases/summarization">More Info on Map-Reduce for Summarization</a>
</div>
'''

# Default per-page ("map") prompt; {text} is filled by langchain with the
# page content for each document chunk.
MAP_PROMPT = """
You will be given a page of text which section is enclosed in triple backticks (```).
Your goal is to give a summary of this section, ignoring references and footnote if present.
Your response should be at least 200 words only if input classified as academic text.
Your response must fully encompass what was said in the page.

```{text}```
SUMMARY:
"""
# Default "combine" (reduce) prompt applied to the concatenated page summaries.
COMBINE_PROMPT = """
Write a full summary of the following text enclosed in triple backticks (```).
Full summary consists of a descriptive summary of at least 100 words (if possible),
followed by numbered list which covers key points of the text.

```{text}```
SUMMARY:
"""
# Tooltip text for the config widgets (keys match the widget `info` usage below).
config_info = {'temperature': 'Higher means more randomness to the output.',
               'max_tokens' : 'The maximum number of tokens to generate in the output.',
               'llm_list' : ''}
# Maps an OpenAI model name to the langchain wrapper family it needs:
# 'chat' -> ChatOpenAI, 'instruct' -> OpenAI (completion-style).
model_list = {'gpt-3.5-turbo':'chat',
              'gpt-4':'chat',
              'gpt-3.5-turbo-instruct':'instruct',
              'text-davinci-003':'instruct'}

# Splitter used when loading the PDF: roughly page/paragraph sized chunks
# (10k chars with 250-char overlap) fed to the map step.
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=250)
|
| 55 |
+
|
| 56 |
+
def summarize_pdf(pdf_file, api_key,
                  model_name, temperature, llm_max_tokens,
                  custom_map_prompt, custom_combine_prompt):
    """Summarize an uploaded PDF with a langchain map-reduce chain.

    Args:
        pdf_file: Gradio file object; ``pdf_file.name`` is the path on disk.
        api_key: OpenAI API key, exported to the environment for langchain.
        model_name: key into ``model_list`` selecting chat vs. instruct wrapper.
        temperature: sampling temperature forwarded to the LLM.
        llm_max_tokens: max tokens per LLM completion (coerced to int).
        custom_map_prompt: optional user prompt prefix for the per-page step
            ("" means use ``MAP_PROMPT``).
        custom_combine_prompt: optional user prompt prefix for the reduce step
            ("" means use ``COMBINE_PROMPT``).

    Returns:
        The final combined summary text.

    Raises:
        gr.Error: via ``file_check`` when the PDF exceeds the size/page limits.
    """
    global pdf_docs  # file_check() reads this global to enforce the page limit
    # Read PDF and split it into chunks for the map step.
    loader = PyPDFLoader(pdf_file.name)
    pdf_docs = loader.load_and_split(text_splitter)
    file_check(pdf_file)

    # Build LLM model. The key is passed via the environment, which is how
    # langchain's OpenAI wrappers pick it up.
    os.environ["OPENAI_API_KEY"] = api_key
    if model_list[model_name] == 'chat':
        gpt_llm = ChatOpenAI(temperature=temperature, model_name=model_name, max_tokens=int(llm_max_tokens))
    else:
        gpt_llm = OpenAI(temperature=temperature, model_name=model_name, max_tokens=int(llm_max_tokens))

    # Build the map/combine prompt templates, preferring user-supplied text.
    if custom_map_prompt != "":
        map_template = PromptTemplate(template=generate_template(custom_map_prompt), input_variables=["text"])
    else:
        map_template = PromptTemplate(template=MAP_PROMPT, input_variables=["text"])

    if custom_combine_prompt != "":
        combine_template = PromptTemplate(template=generate_template(custom_combine_prompt), input_variables=["text"])
    else:
        combine_template = PromptTemplate(template=COMBINE_PROMPT, input_variables=["text"])

    # BUG FIX: this call previously passed the undefined names `map_prompt`
    # and `combine_prompt` (NameError at runtime); pass the templates that
    # were actually built above.
    map_reduce_chain = load_summarize_chain(
        gpt_llm,
        chain_type="map_reduce",
        map_prompt=map_template,
        combine_prompt=combine_template,
        return_intermediate_steps=True,
        token_max=3840  # limit the maximum number of tokens in the combined document (combine prompt).
    )
    map_reduce_outputs = map_reduce_chain({"input_documents": pdf_docs})
    return map_reduce_outputs['output_text']
|
| 93 |
+
|
| 94 |
+
def file_check(pdf_file):
    """Reject PDFs that are too large or too long.

    Reads the module-global ``pdf_docs`` (set by ``summarize_pdf``) for the
    page/chunk count. Raises ``gr.Error`` — which Gradio surfaces to the
    user — when a limit is exceeded; otherwise returns None.
    """
    size_mb = os.path.getsize(pdf_file.name) / 1024 ** 2
    if size_mb > 1:
        raise gr.Error("Maximum File Size is 1MB!")
    if len(pdf_docs) > 15:
        raise gr.Error("Maximum File Length is 15 Pages!")
|
| 101 |
+
|
| 102 |
+
def generate_template(custom_prompt):
    """Turn a user-written prompt prefix into a full summarization template.

    Appends the fixed suffix containing the ``{text}`` placeholder (filled by
    langchain with the document content) and the trailing ``SUMMARY:`` cue.
    """
    suffix = "\n\n```{text}```\nSUMMARY:\n"
    return custom_prompt + suffix
|
| 109 |
+
|
| 110 |
+
def main():
    """Build the Gradio UI and launch the app (blocking)."""
    with gr.Blocks() as demo:
        gr.HTML(title)
        # --- Main tab: upload, API key, trigger, and output box ---
        with gr.Tab("Main"):
            with gr.Column():
                # NOTE(review): type="file" is the legacy gr.File API —
                # presumably fine for the gradio version pinned by this Space;
                # confirm, as newer gradio removed this value.
                pdf_doc = gr.File(label="Uploaded PDF:", file_types=['.pdf'], type="file")
                API_KEY = gr.Textbox(label="OpenAI API Key:", lines=1, type="password")
                summarize_button = gr.Button(value="Summarize!")
                summarized_text = gr.Textbox(label="Summary", lines=10, show_copy_button=True)


        # --- Config tab: model choice, sampling params, custom prompts ---
        with gr.Tab("Config"):
            llm_model = gr.Dropdown(choices=model_list.keys(), label="LLM model used", value='gpt-3.5-turbo', interactive=True)
            with gr.Row():
                temperature = gr.Slider(minimum=0, maximum=0.5, step=0.1, label="temperature", info=config_info['temperature'])
                llm_max_tokens = gr.Radio(choices=[128, 256, 512], value=256, interactive=True, label="LLM max tokens", info=config_info['max_tokens'])
            gr.HTML(desc_1)
            with gr.Row():
                user_map_prompt = gr.Textbox(label="Map PROMPT", lines=10, interactive=True)
                user_comb_prompt = gr.Textbox(label="Combine PROMPT", lines=10, interactive=True)

            # Read-only view of the built-in default templates.
            with gr.Accordion("Default Template", open=False):
                with gr.Row():
                    default_map_prompt = gr.Textbox(label="Map PROMPT", value=MAP_PROMPT, lines=10, interactive=False)
                    default_comb_prompt = gr.Textbox(label="Combine PROMPT", value=COMBINE_PROMPT, lines=10, interactive=False)
            # Preview of what the user's custom prompts expand to.
            with gr.Accordion("User Custom Prompt Preview", open=False):
                prompt_preview_button = gr.Button(value="View Custom Prompt")
                with gr.Row():
                    custom_map_view = gr.Textbox(label="Map PROMPT", lines=10, interactive=False)
                    custom_comb_view = gr.Textbox(label="Combine PROMPT", lines=10, interactive=False)

            # Two listeners on the same button: each fills one preview box.
            prompt_preview_button.click(generate_template, inputs=[user_map_prompt], outputs=[custom_map_view])
            prompt_preview_button.click(generate_template, inputs=[user_comb_prompt], outputs=[custom_comb_view])

        # Order must match summarize_pdf's positional parameters.
        list_inputs = [pdf_doc, API_KEY, llm_model, temperature, llm_max_tokens, user_map_prompt, user_comb_prompt]
        summarize_button.click(summarize_pdf, inputs=list_inputs, outputs=[summarized_text])

    # Queue so long-running summarizations don't block concurrent users.
    demo.queue().launch(share=False)
|
| 148 |
+
|
| 149 |
+
# Script entry point: build and launch the Gradio app.
if __name__ == "__main__":
    main()
|