Spaces:

MicroTest007
/

Info_Extraction

Sleeping

App Files Community

CurioChen committed on Aug 22, 2024

Commit

97e7c1f

verified ·

1 Parent(s): 48926f3

Upload 2 files

Browse files

Files changed (2) hide show

app.py +25 -22
requirements.txt +3 -2

app.py CHANGED Viewed

@@ -16,6 +16,7 @@ import gradio as gr
 import re
 import fitz  # PyMuPDF
 import pandas as pd
 # Configure logging
 logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -145,7 +146,7 @@ def json_to_excel(json_data):
                'amount', 'notice_publish_date']
     ws.append(headers)
-    # 创建一个辅助函数来进行精确匹配
     def exact_match(key, target):
         key = ''.join(c.lower() for c in key if c.isalnum())
         target = ''.join(c.lower() for c in target if c.isalnum())
@@ -154,7 +155,7 @@ def json_to_excel(json_data):
     for contract in data['contracts']:
         row = []
         for header in headers:
-            # 使用精确匹配来查找对应的值
             matched_value = next((v for k, v in contract.items() if exact_match(header, k)), '')
             row.append(matched_value)
         ws.append(row)
@@ -164,58 +165,58 @@ def json_to_excel(json_data):
         return tmp.name
 def clean_url(input_text):
-    # 去除可能存在的首尾引号
     cleaned_url = input_text.strip().strip('"')
     return cleaned_url
-# 新增函数:处理上传的PDF文件
 def process_pdf(file):
-    logging.info(f"开始处理PDF文件: {type(file)}")
     try:
         if hasattr(file, 'name'):
-            # 如果file是一个文件对象
             with fitz.open(file.name) as doc:
                 text_content = ""
                 for page in doc:
                     text_content += page.get_text()
         else:
-            # 如果file是一个字符串（文件路径）
             with fitz.open(file) as doc:
                 text_content = ""
                 for page in doc:
                     text_content += page.get_text()
-        logging.info("PDF处理成功")
         return text_content
     except Exception as e:
-        logging.error(f"PDF处理错误: {str(e)}")
         raise
 def preview_excel(excel_path):
     try:
-        df = pd.read_excel(excel_path, nrows=3)
-        preview = df.iloc[:3, :3].to_html(index=False)
-        return preview
     except Exception as e:
-        logging.error(f"Error previewing Excel: {str(e)}")
-        return "Unable to generate preview"
 def process_pdf_file(file):
     if file is None:
         logging.warning("No file uploaded")
-        return "Please upload a PDF file.", None, ""
     try:
         logging.info(f"Received file: {type(file)}, {file.name if hasattr(file, 'name') else 'No name'}")
         pdf_content = process_pdf(file)
     except Exception as e:
         logging.error(f"Error processing PDF file: {str(e)}", exc_info=True)
-        return f"Error processing PDF file: {str(e)}", None, ""
     try:
         json_data = extract_information(pdf_content)
         if json_data is None:
             logging.error("Failed to extract information")
-            return "Error extracting information. Please try again later.", None, ""
         excel_path = json_to_excel(json_data)
         excel_preview = preview_excel(excel_path)
@@ -224,21 +225,23 @@ def process_pdf_file(file):
         return "Processing successful!", excel_path, excel_preview
     except Exception as e:
         logging.error(f"Error processing file: {str(e)}", exc_info=True)
-        return f"Error processing file: {str(e)}", None, ""
-# Modified Gradio interface
 iface = gr.Interface(
     fn=process_pdf_file,
-    inputs=gr.File(label="Upload PDF File", type="filepath", file_types=[".pdf"]),
     outputs=[
         gr.Textbox(label="Processing Status"),
         gr.File(label="Download Excel File"),
-        gr.HTML(label="Excel Preview")
     ],
     title="PDF Document Processing and Information Extraction",
     description="Upload a PDF file, and the system will process it and generate an Excel result."
 )
-# Run Gradio application
 if __name__ == "__main__":
     iface.launch()

 import re
 import fitz  # PyMuPDF
 import pandas as pd
+from gradio_pdf import PDF  # Import the new PDF component
 # Configure logging
 logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
                'amount', 'notice_publish_date']
     ws.append(headers)
+    # Create a helper function for exact matching
     def exact_match(key, target):
         key = ''.join(c.lower() for c in key if c.isalnum())
         target = ''.join(c.lower() for c in target if c.isalnum())
     for contract in data['contracts']:
         row = []
         for header in headers:
+            # Use exact matching to find the corresponding value
             matched_value = next((v for k, v in contract.items() if exact_match(header, k)), '')
             row.append(matched_value)
         ws.append(row)
         return tmp.name
 def clean_url(input_text):
+    # Remove any leading or trailing quotes
     cleaned_url = input_text.strip().strip('"')
     return cleaned_url
+# New function: Process uploaded PDF
 def process_pdf(file):
+    logging.info(f"Start processing PDF file: {type(file)}")
     try:
         if hasattr(file, 'name'):
+            # If file is a file object
             with fitz.open(file.name) as doc:
                 text_content = ""
                 for page in doc:
                     text_content += page.get_text()
         else:
+            # If file is a string (file path)
             with fitz.open(file) as doc:
                 text_content = ""
                 for page in doc:
                     text_content += page.get_text()
+        logging.info("PDF processing successful")
         return text_content
     except Exception as e:
+        logging.error(f"PDF processing error: {str(e)}")
         raise
 def preview_excel(excel_path):
     try:
+        df = pd.read_excel(excel_path, nrows=10)
+        preview_df = df.iloc[:10, :8]
+        return gr.Dataframe(value=preview_df)
     except Exception as e:
+        logging.error(f"Excel preview error: {str(e)}")
+        return gr.Dataframe()
 def process_pdf_file(file):
     if file is None:
         logging.warning("No file uploaded")
+        return "Please upload a PDF file.", None, gr.Dataframe()
     try:
         logging.info(f"Received file: {type(file)}, {file.name if hasattr(file, 'name') else 'No name'}")
         pdf_content = process_pdf(file)
     except Exception as e:
         logging.error(f"Error processing PDF file: {str(e)}", exc_info=True)
+        return f"Error processing PDF file: {str(e)}", None, gr.Dataframe()
     try:
         json_data = extract_information(pdf_content)
         if json_data is None:
             logging.error("Failed to extract information")
+            return "Error extracting information. Please try again later.", None, gr.Dataframe()
         excel_path = json_to_excel(json_data)
         excel_preview = preview_excel(excel_path)
         return "Processing successful!", excel_path, excel_preview
     except Exception as e:
         logging.error(f"Error processing file: {str(e)}", exc_info=True)
+        return f"Error processing file: {str(e)}", None, gr.Dataframe()
+# Gradio interface
 iface = gr.Interface(
     fn=process_pdf_file,
+    inputs=[
+        PDF(label="Upload PDF File")  # Only keep the label parameter
+    ],
     outputs=[
         gr.Textbox(label="Processing Status"),
         gr.File(label="Download Excel File"),
+        gr.Dataframe(label="Excel Preview (First 10 rows, 8 columns)")
     ],
     title="PDF Document Processing and Information Extraction",
     description="Upload a PDF file, and the system will process it and generate an Excel result."
 )
+# Run the Gradio app
 if __name__ == "__main__":
     iface.launch()

requirements.txt CHANGED Viewed

@@ -1,7 +1,8 @@
 openai
 openpyxl
 gradio
 PyMuPDF
 pandas
-requests
-ntplib

 openai
 openpyxl
 gradio
+gradio_pdf
 PyMuPDF
 pandas
+ntplib
+requests