Spaces:

sblumenf
/

pdf-convert

Sleeping

App Files Community

sblumenf committed on Dec 12, 2024

Commit

7cb3598

verified ·

1 Parent(s): 3403d47

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -9

app.py CHANGED Viewed

@@ -7,9 +7,21 @@ import io
 from PIL import Image
 import pandas as pd
 import pdfplumber
-import tempfile  # Import tempfile
 def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
     try:
         with open(pdf_file, 'rb') as file:
             text = ""
@@ -53,7 +65,6 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
                             df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
                         tables.append(df)
-            # Use a temporary file for the download
             with tempfile.NamedTemporaryFile(mode="w+b", delete=False, suffix="." + output_format.lower()) as tmp:
                 if output_format == "JSON":
                     json_data = {
@@ -61,8 +72,7 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
                         "tables": [table.to_dict(orient='records') for table in tables if not table.columns.duplicated().any()],
                         "images": images
                     }
-                    json.dump(json_data, tmp, indent=4)
-                    download_path = tmp.name
                 elif output_format == "Markdown":
                     markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
                     for i, table in enumerate(tables):
@@ -73,8 +83,7 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
                     for image in images:
                         image_path = os.path.join(os.getcwd(), image["filename"])
                         markdown_text += f'![Image]({image_path})\n'
-                    tmp.write(markdown_text.encode('utf-8'))
-                    download_path = tmp.name
                 elif output_format == "HTML":
                     html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
                     for i, table in enumerate(tables):
@@ -85,9 +94,8 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
                     for image in images:
                         image_path = os.path.join(os.getcwd(), image["filename"])
                         html_text += f'<img src="{image_path}" alt="Image"><br>\n'
-                    tmp.write(html_text.encode('utf-8'))
-                    download_path = tmp.name
             return text, download_path
     except Exception as main_e:

 from PIL import Image
 import pandas as pd
 import pdfplumber
+import tempfile
 def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
+    """
+    Parses a PDF file, extracts text, tables, and images, and formats the output.
+    Args:
+        pdf_file: Path to the uploaded PDF file.
+        output_format: Desired output format ("JSON", "Markdown", or "HTML").
+        progress: Gradio Progress object for displaying progress.
+    Returns:
+        tuple: Extracted text and the path of a temporary file containing the output in the specified format.
+            Returns an empty string and None if there is an error.
+    """
     try:
         with open(pdf_file, 'rb') as file:
             text = ""
                             df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
                         tables.append(df)
             with tempfile.NamedTemporaryFile(mode="w+b", delete=False, suffix="." + output_format.lower()) as tmp:
                 if output_format == "JSON":
                     json_data = {
                         "tables": [table.to_dict(orient='records') for table in tables if not table.columns.duplicated().any()],
                         "images": images
                     }
+                    json.dump(json_data, tmp, indent=4)
                 elif output_format == "Markdown":
                     markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
                     for i, table in enumerate(tables):
                     for image in images:
                         image_path = os.path.join(os.getcwd(), image["filename"])
                         markdown_text += f'![Image]({image_path})\n'
+                    tmp.write(markdown_text.encode('utf-8'))
                 elif output_format == "HTML":
                     html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
                     for i, table in enumerate(tables):
                     for image in images:
                         image_path = os.path.join(os.getcwd(), image["filename"])
                         html_text += f'<img src="{image_path}" alt="Image"><br>\n'
+                    tmp.write(html_text.encode('utf-8'))
+                download_path = tmp.name
             return text, download_path
     except Exception as main_e: