Spaces:
Runtime error
Runtime error
| import sys | |
| import camelot | |
| import polars as pl | |
| import signal | |
| import argparse | |
| from rich.console import Console | |
| from rich.progress import track | |
| console = Console() | |
class PDFTableParser:
    """Extract tables from PDF files with camelot and write them out as CSV.

    Input and output paths are paired positionally: every table found in
    ``input_files[i]`` is concatenated and written to ``output_files[i]``.
    All progress and error messages go to the module-level ``console``.
    """

    def __init__(self, input_files, output_files, delimiter, edge_tol, row_tol, pages):
        """Store the file pairing and the extraction/output options.

        Args:
            input_files: PDF paths to read.
            output_files: CSV paths to write, one per input file.
            delimiter: Field separator handed to ``DataFrame.write_csv``.
            edge_tol: Forwarded unchanged to ``camelot.read_pdf``.
            row_tol: Forwarded unchanged to ``camelot.read_pdf``.
            pages: Page selection forwarded unchanged to ``camelot.read_pdf``.
        """
        self.input_files = input_files
        self.output_files = output_files
        self.delimiter = delimiter
        self.edge_tol = edge_tol
        self.row_tol = row_tol
        self.pages = pages

    def read_tables(self, file_name):
        """Read the tables of one PDF using camelot's ``stream`` flavor.

        Args:
            file_name: Path of the PDF to parse.

        Returns:
            Whatever ``camelot.read_pdf`` returns, or ``None`` when reading
            raised; the error is reported on the console instead of
            propagating.
        """
        try:
            console.print(f"Reading tables from {file_name}...")
            tables = camelot.read_pdf(
                file_name,
                flavor='stream',
                edge_tol=self.edge_tol,
                row_tol=self.row_tol,
                pages=self.pages,
            )
            console.print(f"Found {len(tables)} tables in {file_name}.")
            return tables
        except Exception as e:
            # Best-effort: one unreadable file must not abort the whole batch.
            console.print(f"[red]Error reading {file_name}: {e}[/red]")
            return None

    def save_tables_as_csv(self, tables, output_file):
        """Concatenate the given tables and write them to a single CSV file.

        Args:
            tables: Iterable of objects exposing a ``.df`` attribute.
            output_file: Destination CSV path.

        Any failure (including concatenation errors) is reported on the
        console and swallowed; nothing is returned.
        """
        try:
            console.print(f"Saving tables to {output_file}...")
            df = pl.concat([pl.DataFrame(table.df) for table in tables])
            df.write_csv(output_file, separator=self.delimiter)
            console.print(f"Saved tables to {output_file}.")
        except Exception as e:
            console.print(f"[red]Error saving to {output_file}: {e}[/red]")

    @staticmethod
    def _estimate_seconds(content):
        """Return the heuristic time estimate (in seconds) for decoded text.

        The estimate is the sum of the newline count, the whitespace-separated
        word count and the character count, each divided by 1000.
        """
        lines = content.count('\n')
        words = len(content.split())
        chars = len(content)
        return (lines / 1000) + (words / 1000) + (chars / 1000)

    def estimate_processing_time(self, file_name):
        """Print and return a rough processing-time estimate for a file.

        The raw file bytes are decoded as UTF-8 (undecodable bytes dropped)
        and fed to ``_estimate_seconds``.

        Args:
            file_name: Path of the file to inspect.

        Returns:
            The estimate in seconds, or ``0`` when the file could not be read
            (the error is reported on the console).
        """
        try:
            with open(file_name, 'rb') as f:
                content = f.read().decode('utf-8', errors='ignore')
            # The original stored the newline count in `pages` but summed an
            # undefined `lines`, so every call failed and returned 0.
            estimated_time = self._estimate_seconds(content)
            console.print(f"Estimated processing time for {file_name}: {estimated_time:.2f} seconds.")
            return estimated_time
        except Exception as e:
            console.print(f"[red]Error estimating processing time for {file_name}: {e}[/red]")
            return 0

    def process_files(self):
        """Run estimate -> read -> save for every input/output pair."""
        # A bare zip() has no length, so the progress bar could not show a
        # total; a list lets rich work out how many files there are.
        file_pairs = list(zip(self.input_files, self.output_files))
        for input_file, output_file in track(file_pairs, description="Processing files"):
            self.estimate_processing_time(input_file)
            tables = self.read_tables(input_file)
            # Skips both a failed read (None) and a PDF with no tables.
            if tables:
                self.save_tables_as_csv(tables, output_file)
def handle_signal(signum, frame):
    """Signal handler: report the interruption and exit with status 1.

    Args:
        signum: Number of the signal that was delivered (unused).
        frame: Stack frame that was executing at delivery time (unused).

    Raises:
        SystemExit: Always, with exit code 1.
    """
    interrupted_notice = "\n[red]Process interrupted.[/red]"
    console.print(interrupted_notice)
    raise SystemExit(1)
def split_file_arguments(input_files, output_files):
    """Re-pair the positional file arguments into equal input/output halves.

    argparse fills the first of two ``nargs='+'`` positionals greedily, so a
    command line such as ``a.pdf b.pdf a.csv b.csv`` is parsed as inputs
    ``[a.pdf, b.pdf, a.csv]`` and outputs ``[b.csv]``. Pooling both lists and
    cutting them in the middle restores the intended pairing while leaving the
    one-input/one-output case untouched.

    Args:
        input_files: Values argparse assigned to ``input_files``.
        output_files: Values argparse assigned to ``output_files``.

    Returns:
        A ``(inputs, outputs)`` tuple of equally long lists, or ``None`` when
        the total number of files is odd and cannot be paired.
    """
    files = list(input_files) + list(output_files)
    half, leftover = divmod(len(files), 2)
    if leftover:
        return None
    return files[:half], files[half:]


if __name__ == "__main__":
    # Exit cleanly (status 1) on Ctrl-C or a termination request.
    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)

    parser = argparse.ArgumentParser(description="PDF Table Parser")
    parser.add_argument("input_files", nargs='+', help="List of input PDF files")
    parser.add_argument("output_files", nargs='+', help="List of output CSV files")
    parser.add_argument("--delimiter", default=',', help="Output file delimiter (default: ,)")
    parser.add_argument("--edge_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between text and table edges (default: 50)")
    parser.add_argument("--row_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between table rows (default: 50)")
    parser.add_argument("--pages", type=str, default='all', help="Pages you can pass the number of pages to process. (default: all)")
    parser.add_argument("--webui", action='store_true', help="Launch the web UI")
    args = parser.parse_args()

    # Undo argparse's greedy split before validating; comparing the raw list
    # lengths would reject every invocation with more than one file pair.
    paired = split_file_arguments(args.input_files, args.output_files)
    if paired is None:
        console.print("[red]The number of input files and output files must match.[/red]")
        sys.exit(1)
    args.input_files, args.output_files = paired

    # NOTE(review): neither `WebUI` nor `main` is defined in the visible part
    # of this file - confirm they exist elsewhere, otherwise both branches
    # raise NameError at runtime.
    if args.webui:
        webui = WebUI()
        webui.run()
    else:
        main(args)