juniorjukeko committed on
Commit
56950ed
·
1 Parent(s): 1c3a79b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -42
app.py CHANGED
@@ -13,8 +13,11 @@ import gradio as gr
13
  title = '''
14
  <div style="text-align: left; font-family:Arial; color:Black; font-size: 16px; max-width: 750px;">
15
  <h1>Small PDF Summarizer</h1>
16
- <p style="text-align: left;">Upload a .PDF from your computer, click the "Upload PDF" button and fill OpenAI API Key. <br />
17
- Output will be on the textbox below. You can also change some LLM configurations from the 'config' tab<br/>
 
 
 
18
  </div>
19
  '''
20
 
@@ -56,49 +59,57 @@ model_list = {'gpt-3.5-turbo':'chat',
56
 
57
  text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=250)
58
 
59
- def parse_pdf(file_path):
60
- output = []
61
- print(file_path)
62
- pdf = PdfReader(file_path)
63
 
64
- for page in pdf.pages:
65
- text = page.extract_text()
66
- output.append(text)
67
 
68
- return output, len(pdf.pages)
69
 
70
- def preprocess_pdf_text(pdf_file): #(list_of_text):
71
- global page_num
72
 
73
- pdf_txt, page_num = parse_pdf(pdf_file.name)
74
- file_check(pdf_file.name)
75
 
76
- page_docs = [Document(page_content=page) for page in pdf_txt]
77
 
78
- text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=250, chunk_overlap=50)
79
- doc_sections = []
80
- for page in page_docs:
81
- sections_text = text_splitter.split_text(page.page_content)
82
- sections_doc = [Document(page_content=section) for section in sections_text]
83
 
84
- for section in sections_doc:
85
- doc_sections.append(section)
86
 
87
  return doc_sections
88
 
89
- def dummy1(pdf_file):
 
90
  loader = PyPDFLoader(pdf_file.name)
91
  pdf_docs = loader.load_and_split(text_splitter)
 
92
 
93
- return "FINISH"+pdf_docs[0].page_content
94
 
95
- def summarize_pdf(pdf_file, api_key,
 
 
 
 
 
 
 
 
 
 
96
  model_name, temperature, llm_max_tokens,
97
  custom_map_prompt, custom_combine_prompt):
98
- # global page_num
99
- # Read PDF
100
- # pdf_txt, page_num = parse_pdf(pdf_file.name)
101
- # pdf_doc = preprocess_pdf_text(pdf_txt)
102
 
103
  # Build LLM Model
104
  os.environ["OPENAI_API_KEY"] = api_key
@@ -126,17 +137,9 @@ def summarize_pdf(pdf_file, api_key,
126
  return_intermediate_steps=True,
127
  token_max=3840 # limit the maximum number of tokens in the combined document (combine prompt).
128
  )
129
- map_reduce_outputs = map_reduce_chain({"input_documents": pdf_file})
130
  return map_reduce_outputs['output_text']
131
 
132
- def file_check(pdf_file):
133
- if os.path.getsize(pdf_file.name)/1024 **2 > 1:
134
- raise gr.Error("Maximum File Size is 1MB!")
135
- elif page_num > 15:
136
- raise gr.Error("Maximum File Length is 15 Pages!")
137
- else:
138
- pass
139
-
140
  def generate_template(custom_prompt):
141
  custom_template = custom_prompt + '''
142
 
@@ -151,13 +154,14 @@ def main():
151
  with gr.Tab("Main"):
152
  with gr.Column():
153
  pdf_doc = gr.File(label="Uploaded PDF:", file_types=['.pdf'])
 
 
 
 
154
  API_KEY = gr.Textbox(label="OpenAI API Key:", lines=1, type="password")
155
- ingest_pdf = gr.State()
156
- submit_button = gr.Button(value="Upload!")
157
  summarize_button = gr.Button(value="Summarize!")
158
  summarized_text = gr.Textbox(label="Summary", lines=10, show_copy_button=True)
159
 
160
-
161
  with gr.Tab("Config"):
162
  llm_model = gr.Dropdown(choices=model_list.keys(), label="LLM model used", value='gpt-3.5-turbo', interactive=True)
163
  with gr.Row():
@@ -185,7 +189,8 @@ def main():
185
 
186
  # summarize_click = summarize_button.click(preprocess_pdf_text, inputs=[pdf_doc], outputs=[ingest_pdf]).\
187
  # then(summarize_pdf, inputs=list_inputs, outputs=[summarized_text])
188
- submit_button.click(dummy1, inputs=[pdf_doc], outputs=[summarized_text])
 
189
  demo.queue(concurrency_count=1).launch(share=True)
190
 
191
  if __name__ == "__main__":
 
13
  title = '''
14
  <div style="text-align: left; font-family:Arial; color:Black; font-size: 16px; max-width: 750px;">
15
  <h1>Small PDF Summarizer</h1>
16
+ <p style="text-align: left;">How to Use:<br/>
17
+ 1. Upload a .PDF from your computer and fill OpenAI API key.<br/>
18
+ 2. Click the "Upload PDF" button, if successful a preview of your PDF text will be shown.<br/>
19
+ 3. Click "Summarize!" and the output will be shown on the textbox below.<br/>
20
+ You can also change some LLM configurations from the 'config' tab.<br/>
21
  </div>
22
  '''
23
 
 
59
 
60
  text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=250)
61
 
62
+ # def parse_pdf(file_path):
63
+ # output = []
64
+ # print(file_path)
65
+ # pdf = PdfReader(file_path)
66
 
67
+ # for page in pdf.pages:
68
+ # text = page.extract_text()
69
+ # output.append(text)
70
 
71
+ # return output, len(pdf.pages)
72
 
73
+ # def preprocess_pdf_text(pdf_file): #(list_of_text):
74
+ # global page_num
75
 
76
+ # pdf_txt, page_num = parse_pdf(pdf_file.name)
77
+ # file_check(pdf_file.name)
78
 
79
+ # page_docs = [Document(page_content=page) for page in pdf_txt]
80
 
81
+ # text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=250, chunk_overlap=50)
82
+ # doc_sections = []
83
+ # for page in page_docs:
84
+ # sections_text = text_splitter.split_text(page.page_content)
85
+ # sections_doc = [Document(page_content=section) for section in sections_text]
86
 
87
+ # for section in sections_doc:
88
+ # doc_sections.append(section)
89
 
90
  return doc_sections
91
 
92
+ def parse_pdf(pdf_file):
93
+ global pdf_docs, page_count
94
  loader = PyPDFLoader(pdf_file.name)
95
  pdf_docs = loader.load_and_split(text_splitter)
96
+ page_count = len(pdf_docs)
97
 
98
+ file_check(pdf_file)
99
 
100
+ return pdf_docs[0].page_content[:100]
101
+
102
+ def file_check(pdf_file):
103
+ if os.path.getsize(pdf_file.name)/1024 **2 > 1:
104
+ raise gr.Error("Maximum File Size is 1MB!")
105
+ elif page_count > 15:
106
+ raise gr.Error("Maximum File Length is 15 Pages!")
107
+ else:
108
+ pass
109
+
110
+ def summarize_pdf(api_key,
111
  model_name, temperature, llm_max_tokens,
112
  custom_map_prompt, custom_combine_prompt):
 
 
 
 
113
 
114
  # Build LLM Model
115
  os.environ["OPENAI_API_KEY"] = api_key
 
137
  return_intermediate_steps=True,
138
  token_max=3840 # limit the maximum number of tokens in the combined document (combine prompt).
139
  )
140
+ map_reduce_outputs = map_reduce_chain({"input_documents": pdf_docs})
141
  return map_reduce_outputs['output_text']
142
 
 
 
 
 
 
 
 
 
143
  def generate_template(custom_prompt):
144
  custom_template = custom_prompt + '''
145
 
 
154
  with gr.Tab("Main"):
155
  with gr.Column():
156
  pdf_doc = gr.File(label="Uploaded PDF:", file_types=['.pdf'])
157
+ with gr.Row():
158
+ submit_button = gr.Button(value="Upload!")
159
+ pdf_preview = gr.Textbox(label="PDF Preview:", lines=2, interactive=False)
160
+
161
  API_KEY = gr.Textbox(label="OpenAI API Key:", lines=1, type="password")
 
 
162
  summarize_button = gr.Button(value="Summarize!")
163
  summarized_text = gr.Textbox(label="Summary", lines=10, show_copy_button=True)
164
 
 
165
  with gr.Tab("Config"):
166
  llm_model = gr.Dropdown(choices=model_list.keys(), label="LLM model used", value='gpt-3.5-turbo', interactive=True)
167
  with gr.Row():
 
189
 
190
  # summarize_click = summarize_button.click(preprocess_pdf_text, inputs=[pdf_doc], outputs=[ingest_pdf]).\
191
  # then(summarize_pdf, inputs=list_inputs, outputs=[summarized_text])
192
+ submit_button.click(dummy1, inputs=[pdf_doc], outputs=[pdf_preview])
193
+
194
  demo.queue(concurrency_count=1).launch(share=True)
195
 
196
  if __name__ == "__main__":