import os

from pypdf import PdfReader
from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
import gradio as gr
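
# Gradio app: summarize a small PDF with LangChain's map-reduce summarization
# chain and an OpenAI model.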

title = '''
<div style="text-align: left; font-family:Arial; color:Black; font-size: 16px; max-width: 750px;">
<h1>Small PDF Summarizer</h1>
<p style="text-align: left;">This app can be used to summarize small PDFs (max. 1 MB, 15 pages).<br/>
How to use:<br/>
1. Upload a .pdf from your computer and fill in your OpenAI API key.<br/>
2. Click the "Upload!" button; if successful, a preview of your PDF text will be shown.<br/>
3. Click "Summarize!" and the output will be shown in the textbox below.<br/>
You can also change some LLM configurations from the "Config" tab.<br/>
</div>
'''

desc_1 = '''
<div style="text-align: left; font-family:Arial; color:Black; font-size: 14px;">
<h3>Custom Prompt Template</h3>
<p style="text-align: left;">You can customize the map and combine prompts of langchain's Map-Reduce summarization pipeline
using the textboxes below.<br/>
The prompt fed into the LLM uses the format: <b>{textbox input} + {pdf_text} + "SUMMARY:"</b> <br/>
In essence, each page of the PDF is summarized using the map prompt, and the page summaries are then combined into the final output using the combine prompt.<br/>
<a href="https://python.langchain.com/docs/use_cases/summarization">More Info on Map-Reduce for Summarization</a>
</div>
'''
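
# Default prompt templates for the map-reduce summarization chain. LangChain
# fills the {text} placeholder with each document chunk (map step) or with the
# joined page summaries (combine step); the trailing "SUMMARY:" cue marks where
# the model should start writing.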

MAP_PROMPT = """
You will be given a page of text whose content is enclosed in triple backticks (```).
Your goal is to give a summary of this section, ignoring references and footnotes if present.
Your response should be at least 200 words, but only if the input is classified as academic text.
Your response must fully encompass what was said on the page.
```{text}```
SUMMARY:
"""

COMBINE_PROMPT = """
Write a full summary of the following text enclosed in triple backticks (```).
The full summary consists of a descriptive summary of at least 100 words (if possible),
followed by a numbered list which covers the key points of the text.
```{text}```
SUMMARY:
"""

config_info = {'temperature': 'Higher means more randomness to the output.',
               'max_tokens': 'The maximum number of tokens to generate in the output.',
               'llm_list': ''}

model_list = {'gpt-3.5-turbo': 'chat',
              'gpt-4': 'chat',
              'gpt-3.5-turbo-instruct': 'instruct',
              'text-davinci-003': 'instruct'}
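
# The splitter below measures chunk_size / chunk_overlap in characters (the
# splitter's default length function), so PyPDFLoader.load_and_split() yields
# roughly one Document per PDF page unless a page exceeds 10,000 characters.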
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=250)


def parse_pdf(pdf_file):
    global pdf_docs, page_count
    loader = PyPDFLoader(pdf_file.name)
    pdf_docs = loader.load_and_split(text_splitter)
    page_count = len(pdf_docs)
    file_check(pdf_file)
    return pdf_docs[0].page_content[:100]
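
# pdf_docs and page_count are module-level globals: parse_pdf sets them and
# summarize_pdf reads them. The queue in main() runs with concurrency_count=1,
# so requests are processed one at a time.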

def file_check(pdf_file):
    if os.path.getsize(pdf_file.name) / 1024 ** 2 > 1:
        raise gr.Error("Maximum File Size is 1MB!")
    elif page_count > 15:
        raise gr.Error("Maximum File Length is 15 Pages!")


def summarize_pdf(api_key,
                  model_name, temperature, llm_max_tokens,
                  custom_map_prompt, custom_combine_prompt):
    try:
        if pdf_docs[0].page_content[:1]:
            pass
    except (NameError, IndexError):
        raise gr.Error("No PDF File Detected!")
    # Build the LLM: chat models go through ChatOpenAI, "instruct" models
    # through the completion-style OpenAI wrapper.
    os.environ["OPENAI_API_KEY"] = api_key
    if model_list[model_name] == 'chat':
        gpt_llm = ChatOpenAI(temperature=temperature, model_name=model_name, max_tokens=int(llm_max_tokens))
    else:
        gpt_llm = OpenAI(temperature=temperature, model_name=model_name, max_tokens=int(llm_max_tokens))

    # Summarize PDF: use the user's custom prompts if provided, otherwise fall
    # back to the default templates above.
    if custom_map_prompt != "":
        map_template = PromptTemplate(template=generate_template(custom_map_prompt), input_variables=["text"])
    else:
        map_template = PromptTemplate(template=MAP_PROMPT, input_variables=["text"])
    if custom_combine_prompt != "":
        combine_template = PromptTemplate(template=generate_template(custom_combine_prompt), input_variables=["text"])
    else:
        combine_template = PromptTemplate(template=COMBINE_PROMPT, input_variables=["text"])

    map_reduce_chain = load_summarize_chain(
        gpt_llm,
        chain_type="map_reduce",
        map_prompt=map_template,
        combine_prompt=combine_template,
        return_intermediate_steps=True,
        token_max=3840,  # limit on the number of tokens in the combined document fed to the combine prompt
    )
    map_reduce_outputs = map_reduce_chain({"input_documents": pdf_docs})
    return map_reduce_outputs['output_text']
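
# generate_template appends the ```{text}``` placeholder and "SUMMARY:" cue to a
# user-supplied prompt so it matches the format of the default templates; it also
# backs the "View Custom Prompt" preview button in the Config tab.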

def generate_template(custom_prompt):
    custom_template = custom_prompt + '''
```{text}```
SUMMARY:
'''
    return custom_template
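
# Gradio UI: a "Main" tab for uploading the PDF and requesting the summary, and
# a "Config" tab for model choice, generation settings, and custom prompts.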

def main():
    with gr.Blocks() as demo:
        gr.HTML(title)
        with gr.Tab("Main"):
            with gr.Column():
                pdf_doc = gr.File(label="Uploaded PDF:", file_types=['.pdf'])
                with gr.Row():
                    submit_button = gr.Button(value="Upload!")
                    pdf_preview = gr.Textbox(label="PDF Preview:", lines=2, interactive=False)
                API_KEY = gr.Textbox(label="OpenAI API Key:", lines=1, type="password")
                summarize_button = gr.Button(value="Summarize!")
                summarized_text = gr.Textbox(label="Summary", lines=10, show_copy_button=True)
| with gr.Tab("Config"): | |
| llm_model = gr.Dropdown(choices=model_list.keys(), label="LLM model used", value='gpt-3.5-turbo', interactive=True) | |
| with gr.Row(): | |
| temperature = gr.Slider(minimum=0, maximum=0.5, step=0.1, label="temperature", info=config_info['temperature']) | |
| llm_max_tokens = gr.Radio(choices=[128, 256, 512], value=256, interactive=True, label="LLM max tokens", info=config_info['max_tokens']) | |
| gr.HTML(desc_1) | |
            with gr.Row():
                user_map_prompt = gr.Textbox(label="Map PROMPT", lines=10, interactive=True)
                user_comb_prompt = gr.Textbox(label="Combine PROMPT", lines=10, interactive=True)
            with gr.Accordion("Default Template", open=False):
                with gr.Row():
                    default_map_prompt = gr.Textbox(label="Map PROMPT", value=MAP_PROMPT, lines=10, interactive=False)
                    default_comb_prompt = gr.Textbox(label="Combine PROMPT", value=COMBINE_PROMPT, lines=10, interactive=False)
            with gr.Accordion("User Custom Prompt Preview", open=False):
                prompt_preview_button = gr.Button(value="View Custom Prompt")
                with gr.Row():
                    custom_map_view = gr.Textbox(label="Map PROMPT", lines=10, interactive=False)
                    custom_comb_view = gr.Textbox(label="Combine PROMPT", lines=10, interactive=False)
            prompt_preview_button.click(generate_template, inputs=[user_map_prompt], outputs=[custom_map_view])
            prompt_preview_button.click(generate_template, inputs=[user_comb_prompt], outputs=[custom_comb_view])

        inputs_list = [API_KEY, llm_model, temperature, llm_max_tokens, user_map_prompt, user_comb_prompt]
        submit_button.click(parse_pdf, inputs=[pdf_doc], outputs=[pdf_preview])
        summarize_button.click(summarize_pdf, inputs=inputs_list, outputs=[summarized_text])

    demo.queue(concurrency_count=1).launch(share=True)


if __name__ == "__main__":
    main()