Spaces:

juniorjukeko
/

small-pdf-summarizer

Build error

App Files Community

juniorjukeko committed on Oct 16, 2023

Commit

56950ed

1 Parent(s): 1c3a79b

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -42

app.py CHANGED Viewed

@@ -13,8 +13,11 @@ import gradio as gr
 title = '''
 <div style="text-align: left; font-family:Arial; color:Black; font-size: 16px; max-width: 750px;">
     <h1>Small PDF Summarizer</h1>
-    <p style="text-align: left;">Upload a .PDF from your computer, click the "Upload PDF" button and fill OpenAI API Key. <br />
-    Output will be on the textbox bellow. You can also change some LLM configurations from the 'config' tab<br/>
 </div>
 '''
@@ -56,49 +59,57 @@ model_list = {'gpt-3.5-turbo':'chat',
 text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=250)
-def parse_pdf(file_path):
-    output = []
-    print(file_path)
-    pdf = PdfReader(file_path)
-    for page in pdf.pages:
-        text = page.extract_text()
-        output.append(text)
-    return output, len(pdf.pages)
-def preprocess_pdf_text(pdf_file): #(list_of_text):
-  global page_num
-  pdf_txt, page_num = parse_pdf(pdf_file.name)
-  file_check(pdf_file.name)
-  page_docs = [Document(page_content=page) for page in pdf_txt]
-  text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=250, chunk_overlap=50)
-  doc_sections = []
-  for page in page_docs:
-    sections_text = text_splitter.split_text(page.page_content)
-    sections_doc = [Document(page_content=section) for section in sections_text]
-    for section in sections_doc:
-      doc_sections.append(section)
   return doc_sections
-def dummy1(pdf_file):
     loader = PyPDFLoader(pdf_file.name)
     pdf_docs = loader.load_and_split(text_splitter)
-    return "FINISH"+pdf_docs[0].page_content
-def summarize_pdf(pdf_file, api_key,
                   model_name, temperature, llm_max_tokens,
                   custom_map_prompt, custom_combine_prompt):
-  # global page_num
-  # Read PDF
-  # pdf_txt, page_num = parse_pdf(pdf_file.name)
-  # pdf_doc = preprocess_pdf_text(pdf_txt)
   # Build LLM Model
   os.environ["OPENAI_API_KEY"] = api_key
@@ -126,17 +137,9 @@ def summarize_pdf(pdf_file, api_key,
     return_intermediate_steps=True,
     token_max=3840 # limit the maximum number of tokens in the combined document (combine prompt).
   )
-  map_reduce_outputs = map_reduce_chain({"input_documents": pdf_file})
   return map_reduce_outputs['output_text']
-def file_check(pdf_file):
-  if os.path.getsize(pdf_file.name)/1024 **2 > 1:
-    raise gr.Error("Maximum File Size is 1MB!")
-  elif page_num > 15:
-    raise gr.Error("Maximum File Length is 15 Pages!")
-  else:
-    pass
 def generate_template(custom_prompt):
   custom_template = custom_prompt + '''
@@ -151,13 +154,14 @@ def main():
     with gr.Tab("Main"):
       with gr.Column():
         pdf_doc = gr.File(label="Uploaded PDF:", file_types=['.pdf'])
         API_KEY = gr.Textbox(label="OpenAI API Key:", lines=1, type="password")
-        ingest_pdf = gr.State()
-        submit_button = gr.Button(value="Upload!")
         summarize_button = gr.Button(value="Summarize!")
         summarized_text  = gr.Textbox(label="Summary", lines=10, show_copy_button=True)
     with gr.Tab("Config"):
       llm_model = gr.Dropdown(choices=model_list.keys(), label="LLM model used", value='gpt-3.5-turbo', interactive=True)
       with gr.Row():
@@ -185,7 +189,8 @@ def main():
     # summarize_click = summarize_button.click(preprocess_pdf_text, inputs=[pdf_doc], outputs=[ingest_pdf]).\
     #   then(summarize_pdf, inputs=list_inputs, outputs=[summarized_text])
-    submit_button.click(dummy1, inputs=[pdf_doc], outputs=[summarized_text])
   demo.queue(concurrency_count=1).launch(share=True)
 if __name__ == "__main__":

 title = '''
 <div style="text-align: left; font-family:Arial; color:Black; font-size: 16px; max-width: 750px;">
     <h1>Small PDF Summarizer</h1>
+    <p style="text-align: left;">How to Use:<br/>
+    1. Upload a .PDF from your computer and fill OpenAI API key.<br/>
+    2. Click the "Upload PDF" button, if successful a preview of your PDF text will be shown.<br/>
+    3. Click "Summarize!" and the output will be shown on the textbox bellow.<br/>
+    You can also change some LLM configurations from the 'config' tab.<br/>
 </div>
 '''
 text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=250)
+# def parse_pdf(file_path):
+#     output = []
+#     print(file_path)
+#     pdf = PdfReader(file_path)
+#     for page in pdf.pages:
+#         text = page.extract_text()
+#         output.append(text)
+#     return output, len(pdf.pages)
+# def preprocess_pdf_text(pdf_file): #(list_of_text):
+#   global page_num
+#   pdf_txt, page_num = parse_pdf(pdf_file.name)
+#   file_check(pdf_file.name)
+#   page_docs = [Document(page_content=page) for page in pdf_txt]
+#   text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=250, chunk_overlap=50)
+#   doc_sections = []
+#   for page in page_docs:
+#     sections_text = text_splitter.split_text(page.page_content)
+#     sections_doc = [Document(page_content=section) for section in sections_text]
+#     for section in sections_doc:
+#       doc_sections.append(section)
   return doc_sections
+def parse_pdf(pdf_file):
+    global pdf_docs, page_count
     loader = PyPDFLoader(pdf_file.name)
     pdf_docs = loader.load_and_split(text_splitter)
+    page_count = len(pdf_docs)
+    file_check(pdf_file)
+    return pdf_docs[0].page_content[:100]
+def file_check(pdf_file):
+  if os.path.getsize(pdf_file.name)/1024 **2 > 1:
+    raise gr.Error("Maximum File Size is 1MB!")
+  elif page_count > 15:
+    raise gr.Error("Maximum File Length is 15 Pages!")
+  else:
+    pass
+def summarize_pdf(api_key,
                   model_name, temperature, llm_max_tokens,
                   custom_map_prompt, custom_combine_prompt):
   # Build LLM Model
   os.environ["OPENAI_API_KEY"] = api_key
     return_intermediate_steps=True,
     token_max=3840 # limit the maximum number of tokens in the combined document (combine prompt).
   )
+  map_reduce_outputs = map_reduce_chain({"input_documents": pdf_docs})
   return map_reduce_outputs['output_text']
 def generate_template(custom_prompt):
   custom_template = custom_prompt + '''
     with gr.Tab("Main"):
       with gr.Column():
         pdf_doc = gr.File(label="Uploaded PDF:", file_types=['.pdf'])
+        with gr.Row():
+            submit_button = gr.Button(value="Upload!")
+            pdf_preview = gr.Textbox(label="PDF Preview:", lines=2, interactive=False)
         API_KEY = gr.Textbox(label="OpenAI API Key:", lines=1, type="password")
         summarize_button = gr.Button(value="Summarize!")
         summarized_text  = gr.Textbox(label="Summary", lines=10, show_copy_button=True)
     with gr.Tab("Config"):
       llm_model = gr.Dropdown(choices=model_list.keys(), label="LLM model used", value='gpt-3.5-turbo', interactive=True)
       with gr.Row():
     # summarize_click = summarize_button.click(preprocess_pdf_text, inputs=[pdf_doc], outputs=[ingest_pdf]).\
     #   then(summarize_pdf, inputs=list_inputs, outputs=[summarized_text])
+    submit_button.click(dummy1, inputs=[pdf_doc], outputs=[pdf_preview])
   demo.queue(concurrency_count=1).launch(share=True)
 if __name__ == "__main__":