Spaces:
Runtime error
Runtime error
morisono committed on
Upload folder using huggingface_hub
Browse files- README.md +4 -23
- src/app/__pycache__/common.cpython-310.pyc +0 -0
- src/app/__pycache__/parser.cpython-310.pyc +0 -0
- src/app/common.py +24 -0
- src/app/parser.py +87 -0
- src/app/run.py +4 -106
README.md
CHANGED
|
@@ -32,7 +32,7 @@ This script extracts tables from PDF files and saves them as CSV files. It suppo
|
|
| 32 |
To run the script via CLI, use the following command:
|
| 33 |
|
| 34 |
```bash
|
| 35 |
-
python src/app/
|
| 36 |
```
|
| 37 |
|
| 38 |
#### Arguments:
|
|
@@ -52,7 +52,7 @@ python src/app/run.py input1.pdf input2.pdf output1.csv output2.csv
|
|
| 52 |
To run the script with the web UI, use the following command:
|
| 53 |
|
| 54 |
```bash
|
| 55 |
-
python src/app/run.py
|
| 56 |
```
|
| 57 |
|
| 58 |
This will launch a Gradio-based web application where you can upload PDFs and view the extracted tables interactively.
|
|
@@ -62,28 +62,9 @@ This will launch a Gradio-based web application where you can upload PDFs and vi
|
|
| 62 |
### CLI Example
|
| 63 |
|
| 64 |
```bash
|
| 65 |
-
python src/app/
|
| 66 |
```
|
| 67 |
|
| 68 |
-
### Web UI Example
|
| 69 |
-
|
| 70 |
-
```bash
|
| 71 |
-
python src/app/run.py data/demo.pdf data/output.csv --webui
|
| 72 |
-
```
|
| 73 |
-
|
| 74 |
-
## Handling Interruptions
|
| 75 |
-
|
| 76 |
-
The script handles `SIGINT` and `SIGTERM` signals gracefully, ensuring that processing can be interrupted safely.
|
| 77 |
-
|
| 78 |
## License
|
| 79 |
|
| 80 |
-
This project is licensed under the MIT License.
|
| 81 |
-
|
| 82 |
-
## Acknowledgements
|
| 83 |
-
|
| 84 |
-
This script uses the following libraries:
|
| 85 |
-
- [Rich](https://github.com/willmcgugan/rich) for console output and progress bars
|
| 86 |
-
- [Camelot](https://github.com/camelot-dev/camelot) for PDF table extraction
|
| 87 |
-
- [Polars](https://github.com/pola-rs/polars) for efficient DataFrame operations
|
| 88 |
-
- [Gradio](https://github.com/gradio-app/gradio) for the web UI
|
| 89 |
-
- [gradio_pdf](https://github.com/gradio-app/gradio) for PDF handling in Gradio
|
|
|
|
| 32 |
To run the script via CLI, use the following command:
|
| 33 |
|
| 34 |
```bash
|
| 35 |
+
python src/app/parser.py input1.pdf input2.pdf output1.csv output2.csv
|
| 36 |
```
|
| 37 |
|
| 38 |
#### Arguments:
|
|
|
|
| 52 |
To run the script with the web UI, use the following command:
|
| 53 |
|
| 54 |
```bash
|
| 55 |
+
python src/app/run.py
|
| 56 |
```
|
| 57 |
|
| 58 |
This will launch a Gradio-based web application where you can upload PDFs and view the extracted tables interactively.
|
|
|
|
| 62 |
### CLI Example
|
| 63 |
|
| 64 |
```bash
|
| 65 |
+
python src/app/parser.py data/demo.pdf data/output.csv --delimiter ";" --edge_tol 60 --row_tol 40
|
| 66 |
```
|
| 67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
## License
|
| 69 |
|
| 70 |
+
This project is licensed under the MIT License.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/app/__pycache__/common.cpython-310.pyc
ADDED
|
Binary file (1.1 kB). View file
|
|
|
src/app/__pycache__/parser.cpython-310.pyc
ADDED
|
Binary file (4.1 kB). View file
|
|
|
src/app/common.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import tempfile
|
| 3 |
+
import zipfile
|
| 4 |
+
|
| 5 |
+
class Interface:
    """Filesystem helpers: temporary directories and zip archives.

    Both helpers are static, so they can be called either on the class
    (``Interface.get_tempdir()``) or on an instance.
    """

    @staticmethod
    def get_tempdir():
        """Create a new temporary directory.

        Returns:
            A ``(timestamp, temp_dir)`` tuple: the current Unix time truncated
            to whole seconds, and the path of a directory created with
            ``tempfile.mkdtemp``. The directory is not removed automatically.
        """
        timestamp = int(time.time())
        temp_dir = tempfile.mkdtemp()
        return timestamp, temp_dir

    @staticmethod
    def create_zip(file_list, zip_path, password=None):
        """Write the given files and directories into a zip archive.

        Args:
            file_list: Iterable of paths. A directory is walked recursively
                and its files are stored relative to that directory; a plain
                file is stored under its base name.
            zip_path: Destination path of the archive (overwritten if present).
            password: Optional string handed to ``ZipFile.setpassword``.
        """
        # Imported locally: this module does not import ``os`` at the top.
        import os

        # ``allowZip64`` is the real keyword; the previous ``zipfilep64``
        # raised TypeError before anything was written.
        with zipfile.ZipFile(zip_path, "w", allowZip64=True) as zipf:
            if password:
                # NOTE(review): the standard-library zipfile module only uses
                # this password when *reading* encrypted members; it does not
                # encrypt what is written here. Kept for interface
                # compatibility.
                zipf.setpassword(password.encode("utf-8"))
            for item in file_list:
                if os.path.isdir(item):
                    for root, _, files in os.walk(item):
                        for file_name in files:
                            file_path = os.path.join(root, file_name)
                            zipf.write(file_path, os.path.relpath(file_path, item))
                else:
                    zipf.write(item, os.path.basename(item))
|
src/app/parser.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import camelot
|
| 3 |
+
import polars as pl
|
| 4 |
+
import signal
|
| 5 |
+
import argparse
|
| 6 |
+
from rich.console import Console
|
| 7 |
+
from rich.progress import track
|
| 8 |
+
|
| 9 |
+
console = Console()
|
| 10 |
+
|
| 11 |
+
class PDFTableParser:
    """Extract tables from PDF files with Camelot and write them as CSV.

    Input and output files are paired positionally: the tables found in
    ``input_files[i]`` are concatenated and written to ``output_files[i]``.
    """

    def __init__(self, input_files, output_files, delimiter, edge_tol, row_tol, pages):
        """Store the conversion settings.

        Args:
            input_files: Sequence of PDF paths to read.
            output_files: Sequence of CSV paths to write, one per input.
            delimiter: Field separator passed to ``DataFrame.write_csv``.
            edge_tol: Forwarded to ``camelot.read_pdf`` as ``edge_tol``.
            row_tol: Forwarded to ``camelot.read_pdf`` as ``row_tol``.
            pages: Forwarded to ``camelot.read_pdf`` as ``pages``.
        """
        self.input_files = input_files
        self.output_files = output_files
        self.delimiter = delimiter
        self.edge_tol = edge_tol
        self.row_tol = row_tol
        self.pages = pages

    def read_tables(self, file_name):
        """Read every table from ``file_name`` using Camelot's stream flavor.

        Returns:
            The object returned by ``camelot.read_pdf``, or ``None`` if
            reading failed (the error is printed to the console).
        """
        try:
            console.print(f"Reading tables from {file_name}...")
            tables = camelot.read_pdf(
                file_name,
                flavor='stream',
                edge_tol=self.edge_tol,
                row_tol=self.row_tol,
                pages=self.pages,
            )
            console.print(f"Found {len(tables)} tables in {file_name}.")
            return tables
        except Exception as e:
            # Best effort: report and let the caller move on to the next file.
            console.print(f"[red]Error reading {file_name}: {e}[/red]")
            return None

    def save_tables_as_csv(self, tables, output_file):
        """Concatenate ``tables`` into one frame and write it to ``output_file``.

        Each table's ``.df`` is converted to a Polars DataFrame before
        concatenation. Failures are printed, not raised.
        """
        try:
            console.print(f"Saving tables to {output_file}...")
            df = pl.concat([pl.DataFrame(table.df) for table in tables])
            df.write_csv(output_file, separator=self.delimiter)
            console.print(f"Saved tables to {output_file}.")
        except Exception as e:
            console.print(f"[red]Error saving to {output_file}: {e}[/red]")

    def estimate_processing_time(self, file_name):
        """Print and return a rough processing-time estimate in seconds.

        The estimate is a heuristic computed from the raw file bytes (decoded
        as UTF-8 with undecodable bytes dropped), not from parsed PDF content:
        ``lines / 1000 + words / 1000 + chars / 1000``.

        Returns:
            The estimate as a float, or ``0`` if the file could not be read.
        """
        try:
            with open(file_name, 'rb') as f:
                content = f.read().decode('utf-8', errors='ignore')
            # Previously this count was bound to ``pages`` while the formula
            # read the undefined ``lines``; the resulting NameError was
            # swallowed below and the method always returned 0.
            lines = content.count('\n')
            words = len(content.split())
            chars = len(content)
            estimated_time = (lines / 1000) + (words / 1000) + (chars / 1000)
            console.print(f"Estimated processing time for {file_name}: {estimated_time:.2f} seconds.")
            return estimated_time
        except Exception as e:
            console.print(f"[red]Error estimating processing time for {file_name}: {e}[/red]")
            return 0

    def process_files(self):
        """Convert every input/output pair, showing a progress bar."""
        # Materialise the pairs so ``track`` can read a length and display
        # real progress; a bare ``zip`` object has no ``len``.
        pairs = list(zip(self.input_files, self.output_files))
        for input_file, output_file in track(pairs, description="Processing files"):
            self.estimate_processing_time(input_file)
            tables = self.read_tables(input_file)
            if tables:
                self.save_tables_as_csv(tables, output_file)
|
| 59 |
+
|
| 60 |
+
def handle_signal(signum, frame):
    """Signal handler: announce the interruption and exit with status 1.

    ``signum`` and ``frame`` are supplied by the ``signal`` module and are
    not used.
    """
    console.print("\n[red]Process interrupted.[/red]")
    # Equivalent to sys.exit(1): terminates via SystemExit with code 1.
    raise SystemExit(1)
|
| 63 |
+
|
| 64 |
+
if __name__ == "__main__":
    # Command-line entry point: parse arguments, then either launch the web
    # UI or convert the given PDFs to CSV.

    # Exit cleanly on Ctrl-C or a termination request.
    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)

    parser = argparse.ArgumentParser(description="PDF Table Parser")
    parser.add_argument("input_files", nargs='+', help="List of input PDF files")
    parser.add_argument("output_files", nargs='+', help="List of output CSV files")
    parser.add_argument("--delimiter", default=',', help="Output file delimiter (default: ,)")
    parser.add_argument("--edge_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between text and table edges (default: 50)")
    parser.add_argument("--row_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between table rows (default: 50)")
    parser.add_argument("--pages", type=str, default='all', help="Pages you can pass the number of pages to process. (default: all)")
    parser.add_argument("--webui", action='store_true', help="Launch the web UI")

    args = parser.parse_args()

    # argparse matches two consecutive ``nargs='+'`` positionals greedily: the
    # first takes every path but the last. Re-split the combined list in half
    # so ``in1 in2 out1 out2`` pairs up as documented in the README.
    paths = args.input_files + args.output_files
    if len(paths) % 2 != 0:
        console.print("[red]The number of input files and output files must match.[/red]")
        sys.exit(1)
    half = len(paths) // 2
    args.input_files, args.output_files = paths[:half], paths[half:]

    if args.webui:
        # WebUI is defined in run.py next to this file, not in this module;
        # import it lazily so the plain CLI path does not need the UI stack.
        from run import WebUI

        webui = WebUI()
        webui.run()
    else:
        # There is no ``main`` in this module; drive the parser directly.
        PDFTableParser(
            args.input_files,
            args.output_files,
            args.delimiter,
            args.edge_tol,
            args.row_tol,
            args.pages,
        ).process_files()
|
src/app/run.py
CHANGED
|
@@ -1,89 +1,12 @@
|
|
| 1 |
-
import argparse
|
| 2 |
import os
|
| 3 |
-
import signal
|
| 4 |
-
import sys
|
| 5 |
import json
|
| 6 |
-
import time
|
| 7 |
-
import tempfile
|
| 8 |
-
import zipfile
|
| 9 |
-
from rich.console import Console
|
| 10 |
-
from rich.progress import track
|
| 11 |
-
import camelot
|
| 12 |
import polars as pl
|
| 13 |
import gradio as gr
|
| 14 |
from gradio_pdf import PDF
|
| 15 |
|
| 16 |
-
|
|
|
|
| 17 |
|
| 18 |
-
class Interface:
|
| 19 |
-
def get_tempdir():
|
| 20 |
-
timestamp = int(time.time())
|
| 21 |
-
temp_dir = tempfile.mkdtemp()
|
| 22 |
-
return timestamp, temp_dir
|
| 23 |
-
|
| 24 |
-
def create_zip(file_list, zip_path, password=None):
|
| 25 |
-
with zipfile.ZipFile(zip_path, "w", zipfilep64=True) as zipf:
|
| 26 |
-
if password:
|
| 27 |
-
zipf.setpassword(bytes(password, 'utf-8'))
|
| 28 |
-
for item in file_list:
|
| 29 |
-
if os.path.isdir(item):
|
| 30 |
-
for root, _, files in os.walk(item):
|
| 31 |
-
for file in files:
|
| 32 |
-
file_path = os.path.join(root, file)
|
| 33 |
-
arcname = os.path.relpath(file_path, item)
|
| 34 |
-
zipf.write(file_path, arcname)
|
| 35 |
-
else:
|
| 36 |
-
arcname = os.path.basename(item)
|
| 37 |
-
zipf.write(item, arcname)
|
| 38 |
-
|
| 39 |
-
class PDFTableParser:
|
| 40 |
-
def __init__(self, input_files, output_files, delimiter, edge_tol, row_tol, pages):
|
| 41 |
-
self.input_files = input_files
|
| 42 |
-
self.output_files = output_files
|
| 43 |
-
self.delimiter = delimiter
|
| 44 |
-
self.edge_tol = edge_tol
|
| 45 |
-
self.row_tol = row_tol
|
| 46 |
-
self.pages = pages
|
| 47 |
-
|
| 48 |
-
def read_tables(self, file_name):
|
| 49 |
-
try:
|
| 50 |
-
console.print(f"Reading tables from {file_name}...")
|
| 51 |
-
tables = camelot.read_pdf(file_name, flavor='stream', edge_tol=self.edge_tol, row_tol=self.row_tol, pages=self.pages)
|
| 52 |
-
console.print(f"Found {len(tables)} tables in {file_name}.")
|
| 53 |
-
return tables
|
| 54 |
-
except Exception as e:
|
| 55 |
-
console.print(f"[red]Error reading {file_name}: {e}[/red]")
|
| 56 |
-
return None
|
| 57 |
-
|
| 58 |
-
def save_tables_as_csv(self, tables, output_file):
|
| 59 |
-
try:
|
| 60 |
-
console.print(f"Saving tables to {output_file}...")
|
| 61 |
-
df = pl.concat([pl.DataFrame(table.df) for table in tables])
|
| 62 |
-
df.write_csv(output_file, separator=self.delimiter)
|
| 63 |
-
console.print(f"Saved tables to {output_file}.")
|
| 64 |
-
except Exception as e:
|
| 65 |
-
console.print(f"[red]Error saving to {output_file}: {e}[/red]")
|
| 66 |
-
|
| 67 |
-
def estimate_processing_time(self, file_name):
|
| 68 |
-
try:
|
| 69 |
-
with open(file_name, 'rb') as f:
|
| 70 |
-
content = f.read().decode('utf-8', errors='ignore')
|
| 71 |
-
pages = content.count('\n')
|
| 72 |
-
words = len(content.split())
|
| 73 |
-
chars = len(content)
|
| 74 |
-
estimated_time = (lines / 1000) + (words / 1000) + (chars / 1000)
|
| 75 |
-
console.print(f"Estimated processing time for {file_name}: {estimated_time:.2f} seconds.")
|
| 76 |
-
return estimated_time
|
| 77 |
-
except Exception as e:
|
| 78 |
-
console.print(f"[red]Error estimating processing time for {file_name}: {e}[/red]")
|
| 79 |
-
return 0
|
| 80 |
-
|
| 81 |
-
def process_files(self):
|
| 82 |
-
for input_file, output_file in track(zip(self.input_files, self.output_files), description="Processing files"):
|
| 83 |
-
self.estimate_processing_time(input_file)
|
| 84 |
-
tables = self.read_tables(input_file)
|
| 85 |
-
if tables:
|
| 86 |
-
self.save_tables_as_csv(tables, output_file)
|
| 87 |
|
| 88 |
class WebUI:
|
| 89 |
def __init__(self):
|
|
@@ -128,35 +51,10 @@ class WebUI:
|
|
| 128 |
|
| 129 |
app.launch()
|
| 130 |
|
| 131 |
-
def handle_signal(signum, frame):
|
| 132 |
-
console.print("\n[red]Process interrupted.[/red]")
|
| 133 |
-
sys.exit(1)
|
| 134 |
-
|
| 135 |
def main(args):
|
| 136 |
parser = PDFTableParser(args.input_files, args.output_files, args.delimiter, args.edge_tol, args.row_tol, args.pages)
|
| 137 |
parser.process_files()
|
| 138 |
|
| 139 |
if __name__ == "__main__":
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
parser = argparse.ArgumentParser(description="PDF Table Parser")
|
| 144 |
-
parser.add_argument("input_files", nargs='+', help="List of input PDF files")
|
| 145 |
-
parser.add_argument("output_files", nargs='+', help="List of output CSV files")
|
| 146 |
-
parser.add_argument("--delimiter", default=',', help="Output file delimiter (default: ,)")
|
| 147 |
-
parser.add_argument("--edge_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between text and table edges (default: 50)")
|
| 148 |
-
parser.add_argument("--row_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between table rows (default: 50)")
|
| 149 |
-
parser.add_argument("--pages", type=str, default='all', help="Pages you can pass the number of pages to process. (default: all)")
|
| 150 |
-
parser.add_argument("--webui", action='store_true', help="Launch the web UI")
|
| 151 |
-
|
| 152 |
-
args = parser.parse_args()
|
| 153 |
-
|
| 154 |
-
if len(args.input_files) != len(args.output_files):
|
| 155 |
-
console.print("[red]The number of input files and output files must match.[/red]")
|
| 156 |
-
sys.exit(1)
|
| 157 |
-
|
| 158 |
-
if args.webui:
|
| 159 |
-
webui = WebUI()
|
| 160 |
-
webui.run()
|
| 161 |
-
else:
|
| 162 |
-
main(args)
|
|
|
|
|
|
|
| 1 |
import os
|
|
|
|
|
|
|
| 2 |
import json
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
import polars as pl
|
| 4 |
import gradio as gr
|
| 5 |
from gradio_pdf import PDF
|
| 6 |
|
| 7 |
+
from common import Interface
|
| 8 |
+
from parser import PDFTableParser
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
class WebUI:
|
| 12 |
def __init__(self):
|
|
|
|
| 51 |
|
| 52 |
app.launch()
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
def main(args):
    """Build a PDFTableParser from the parsed arguments and process all files.

    ``args`` must expose ``input_files``, ``output_files``, ``delimiter``,
    ``edge_tol``, ``row_tol`` and ``pages`` attributes.
    """
    settings = (
        args.input_files,
        args.output_files,
        args.delimiter,
        args.edge_tol,
        args.row_tol,
        args.pages,
    )
    table_parser = PDFTableParser(*settings)
    table_parser.process_files()
|
| 57 |
|
| 58 |
if __name__ == "__main__":
    # Running this file directly starts the web UI.
    WebUI().run()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|