CurioChen committed on
Commit
97e7c1f
·
verified ·
1 Parent(s): 48926f3

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +25 -22
  2. requirements.txt +3 -2
app.py CHANGED
@@ -16,6 +16,7 @@ import gradio as gr
16
  import re
17
  import fitz # PyMuPDF
18
  import pandas as pd
 
19
 
20
  # Configure logging
21
  logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -145,7 +146,7 @@ def json_to_excel(json_data):
145
  'amount', 'notice_publish_date']
146
  ws.append(headers)
147
 
148
- # 创建一个辅助函数来进行精确匹配
149
  def exact_match(key, target):
150
  key = ''.join(c.lower() for c in key if c.isalnum())
151
  target = ''.join(c.lower() for c in target if c.isalnum())
@@ -154,7 +155,7 @@ def json_to_excel(json_data):
154
  for contract in data['contracts']:
155
  row = []
156
  for header in headers:
157
- # 使用精确匹配来查找对应的值
158
  matched_value = next((v for k, v in contract.items() if exact_match(header, k)), '')
159
  row.append(matched_value)
160
  ws.append(row)
@@ -164,58 +165,58 @@ def json_to_excel(json_data):
164
  return tmp.name
165
 
166
  def clean_url(input_text):
167
- # 去除可能存在的首尾引号
168
  cleaned_url = input_text.strip().strip('"')
169
  return cleaned_url
170
 
171
- # 新增函数:处理上传的PDF文件
172
  def process_pdf(file):
173
- logging.info(f"开始处理PDF文件: {type(file)}")
174
  try:
175
  if hasattr(file, 'name'):
176
- # 如果file是一个文件对象
177
  with fitz.open(file.name) as doc:
178
  text_content = ""
179
  for page in doc:
180
  text_content += page.get_text()
181
  else:
182
- # 如果file是一个字符串(文件路径)
183
  with fitz.open(file) as doc:
184
  text_content = ""
185
  for page in doc:
186
  text_content += page.get_text()
187
- logging.info("PDF处理成功")
188
  return text_content
189
  except Exception as e:
190
- logging.error(f"PDF处理错误: {str(e)}")
191
  raise
192
 
193
  def preview_excel(excel_path):
194
  try:
195
- df = pd.read_excel(excel_path, nrows=3)
196
- preview = df.iloc[:3, :3].to_html(index=False)
197
- return preview
198
  except Exception as e:
199
- logging.error(f"Error previewing Excel: {str(e)}")
200
- return "Unable to generate preview"
201
 
202
  def process_pdf_file(file):
203
  if file is None:
204
  logging.warning("No file uploaded")
205
- return "Please upload a PDF file.", None, ""
206
 
207
  try:
208
  logging.info(f"Received file: {type(file)}, {file.name if hasattr(file, 'name') else 'No name'}")
209
  pdf_content = process_pdf(file)
210
  except Exception as e:
211
  logging.error(f"Error processing PDF file: {str(e)}", exc_info=True)
212
- return f"Error processing PDF file: {str(e)}", None, ""
213
 
214
  try:
215
  json_data = extract_information(pdf_content)
216
  if json_data is None:
217
  logging.error("Failed to extract information")
218
- return "Error extracting information. Please try again later.", None, ""
219
 
220
  excel_path = json_to_excel(json_data)
221
  excel_preview = preview_excel(excel_path)
@@ -224,21 +225,23 @@ def process_pdf_file(file):
224
  return "Processing successful!", excel_path, excel_preview
225
  except Exception as e:
226
  logging.error(f"Error processing file: {str(e)}", exc_info=True)
227
- return f"Error processing file: {str(e)}", None, ""
228
 
229
- # Modified Gradio interface
230
  iface = gr.Interface(
231
  fn=process_pdf_file,
232
- inputs=gr.File(label="Upload PDF File", type="filepath", file_types=[".pdf"]),
 
 
233
  outputs=[
234
  gr.Textbox(label="Processing Status"),
235
  gr.File(label="Download Excel File"),
236
- gr.HTML(label="Excel Preview")
237
  ],
238
  title="PDF Document Processing and Information Extraction",
239
  description="Upload a PDF file, and the system will process it and generate an Excel result."
240
  )
241
 
242
- # Run Gradio application
243
  if __name__ == "__main__":
244
  iface.launch()
 
16
  import re
17
  import fitz # PyMuPDF
18
  import pandas as pd
19
+ from gradio_pdf import PDF # Import the new PDF component
20
 
21
  # Configure logging
22
  logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
 
146
  'amount', 'notice_publish_date']
147
  ws.append(headers)
148
 
149
+ # Create a helper function for exact matching
150
  def exact_match(key, target):
151
  key = ''.join(c.lower() for c in key if c.isalnum())
152
  target = ''.join(c.lower() for c in target if c.isalnum())
 
155
  for contract in data['contracts']:
156
  row = []
157
  for header in headers:
158
+ # Use exact matching to find the corresponding value
159
  matched_value = next((v for k, v in contract.items() if exact_match(header, k)), '')
160
  row.append(matched_value)
161
  ws.append(row)
 
165
  return tmp.name
166
 
167
  def clean_url(input_text):
168
+ # Remove any leading or trailing quotes
169
  cleaned_url = input_text.strip().strip('"')
170
  return cleaned_url
171
 
172
+ # New function: Process uploaded PDF
173
  def process_pdf(file):
174
+ logging.info(f"Start processing PDF file: {type(file)}")
175
  try:
176
  if hasattr(file, 'name'):
177
+ # If file is a file object
178
  with fitz.open(file.name) as doc:
179
  text_content = ""
180
  for page in doc:
181
  text_content += page.get_text()
182
  else:
183
+ # If file is a string (file path)
184
  with fitz.open(file) as doc:
185
  text_content = ""
186
  for page in doc:
187
  text_content += page.get_text()
188
+ logging.info("PDF processing successful")
189
  return text_content
190
  except Exception as e:
191
+ logging.error(f"PDF processing error: {str(e)}")
192
  raise
193
 
194
  def preview_excel(excel_path):
195
  try:
196
+ df = pd.read_excel(excel_path, nrows=10)
197
+ preview_df = df.iloc[:10, :8]
198
+ return gr.Dataframe(value=preview_df)
199
  except Exception as e:
200
+ logging.error(f"Excel preview error: {str(e)}")
201
+ return gr.Dataframe()
202
 
203
  def process_pdf_file(file):
204
  if file is None:
205
  logging.warning("No file uploaded")
206
+ return "Please upload a PDF file.", None, gr.Dataframe()
207
 
208
  try:
209
  logging.info(f"Received file: {type(file)}, {file.name if hasattr(file, 'name') else 'No name'}")
210
  pdf_content = process_pdf(file)
211
  except Exception as e:
212
  logging.error(f"Error processing PDF file: {str(e)}", exc_info=True)
213
+ return f"Error processing PDF file: {str(e)}", None, gr.Dataframe()
214
 
215
  try:
216
  json_data = extract_information(pdf_content)
217
  if json_data is None:
218
  logging.error("Failed to extract information")
219
+ return "Error extracting information. Please try again later.", None, gr.Dataframe()
220
 
221
  excel_path = json_to_excel(json_data)
222
  excel_preview = preview_excel(excel_path)
 
225
  return "Processing successful!", excel_path, excel_preview
226
  except Exception as e:
227
  logging.error(f"Error processing file: {str(e)}", exc_info=True)
228
+ return f"Error processing file: {str(e)}", None, gr.Dataframe()
229
 
230
+ # Gradio interface
231
  iface = gr.Interface(
232
  fn=process_pdf_file,
233
+ inputs=[
234
+ PDF(label="Upload PDF File") # Only keep the label parameter
235
+ ],
236
  outputs=[
237
  gr.Textbox(label="Processing Status"),
238
  gr.File(label="Download Excel File"),
239
+ gr.Dataframe(label="Excel Preview (First 10 rows, 8 columns)")
240
  ],
241
  title="PDF Document Processing and Information Extraction",
242
  description="Upload a PDF file, and the system will process it and generate an Excel result."
243
  )
244
 
245
+ # Run the Gradio app
246
  if __name__ == "__main__":
247
  iface.launch()
requirements.txt CHANGED
@@ -1,7 +1,8 @@
1
  openai
2
  openpyxl
3
  gradio
 
4
  PyMuPDF
5
  pandas
6
- requests
7
- ntplib
 
1
  openai
2
  openpyxl
3
  gradio
4
+ gradio_pdf
5
  PyMuPDF
6
  pandas
7
+ ntplib
8
+ requests