Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -6,9 +6,7 @@ import os
|
|
| 6 |
import io
|
| 7 |
from PIL import Image
|
| 8 |
import pandas as pd
|
| 9 |
-
import
|
| 10 |
-
import camelot
|
| 11 |
-
from PyPDF2 import PdfReader
|
| 12 |
|
| 13 |
def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
|
| 14 |
"""
|
|
@@ -55,17 +53,12 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
|
|
| 55 |
except Exception as e:
|
| 56 |
print(f"Error extracting image: {e}")
|
| 57 |
|
| 58 |
-
# Enhanced table extraction
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
camelot_tables = camelot.read_pdf(pdf_file)
|
| 65 |
-
for table in camelot_tables:
|
| 66 |
-
tables.append(table.df)
|
| 67 |
-
except Exception as e:
|
| 68 |
-
print(f"camelot also failed: {e}. No tables extracted.")
|
| 69 |
|
| 70 |
# Format extracted data based on user selection
|
| 71 |
if output_format == "JSON":
|
|
|
|
| 6 |
import io
|
| 7 |
from PIL import Image
|
| 8 |
import pandas as pd
|
| 9 |
+
import pdfplumber
|
|
|
|
|
|
|
| 10 |
|
| 11 |
def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
|
| 12 |
"""
|
|
|
|
| 53 |
except Exception as e:
|
| 54 |
print(f"Error extracting image: {e}")
|
| 55 |
|
| 56 |
+
# Enhanced table extraction using pdfplumber
|
| 57 |
+
with pdfplumber.open(pdf_file) as pdf:
|
| 58 |
+
for page_num, page in enumerate(pdf.pages):
|
| 59 |
+
for table in page.extract_tables():
|
| 60 |
+
df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
|
| 61 |
+
tables.append(df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
# Format extracted data based on user selection
|
| 64 |
if output_format == "JSON":
|