Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """PDF to Excel converter using pdfplumber for table extraction.""" | |
| import argparse, json, sys, csv | |
| from pathlib import Path | |
def convert(input_path, output_path=None):
    """Extract tables (or plain text) from a PDF and save them as a spreadsheet.

    Every page is scanned for tables; each table row becomes one output row,
    with empty/None cells written as ''. A page on which no table is found
    contributes one single-cell row per line of its extracted text instead.

    Args:
        input_path: Path of the PDF file to read.
        output_path: Desired output location. Only its directory and stem are
            used: the suffix is always replaced by '.xlsx' (or '.csv' when
            openpyxl is not installed). When falsy, the path is derived from
            ``input_path``.

    Returns:
        The path actually written, as a string.

    Raises:
        ImportError: if pdfplumber is not installed.
        Any error raised by pdfplumber, openpyxl, or the file system while
        reading or writing is propagated unchanged.
    """
    import re

    import pdfplumber

    all_rows = []
    with pdfplumber.open(input_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()
            if tables:
                for table in tables:
                    for row in table:
                        all_rows.append([cell or '' for cell in row])
                continue
            # No table on this page: fall back to one row per text line.
            text = page.extract_text()
            if text:
                all_rows.extend([line.strip()] for line in text.split('\n'))

    # Both writers replace the suffix, so only directory + stem matter here.
    base = Path(output_path) if output_path else Path(input_path)

    # Guard only the import: an ImportError raised later (while building or
    # saving the workbook) must not be mistaken for "openpyxl is missing".
    try:
        import openpyxl
    except ImportError:
        openpyxl = None

    if openpyxl is None:
        csv_path = str(base.with_suffix('.csv'))
        with open(csv_path, 'w', newline='', encoding='utf-8') as f:
            csv.writer(f).writerows(all_rows)
        return csv_path

    # openpyxl refuses strings containing ASCII control characters (they are
    # not valid in the XLSX XML) and raises IllegalCharacterError, which would
    # abort the whole conversion. PDF text can contain them, so strip them.
    illegal_chars = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "Extracted Data"
    for row in all_rows:
        ws.append([
            illegal_chars.sub('', cell) if isinstance(cell, str) else cell
            for cell in row
        ])
    xlsx_path = str(base.with_suffix('.xlsx'))
    wb.save(xlsx_path)
    return xlsx_path
def main():
    """Command-line entry point: convert one PDF and report the result as JSON.

    Flags:
        --input: path of the PDF to convert (required).
        --output: desired output path (optional; when omitted, convert()
            derives the output path from the input path).

    Prints exactly one JSON object to stdout with the keys "success",
    "output" and "message". On any failure the object has
    ``"success": false`` and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(description='Convert PDF to Excel')
    parser.add_argument('--input', required=True)
    # convert() already handles a missing output path by deriving one from
    # the input, so the flag does not need to be mandatory.
    parser.add_argument('--output', default=None)
    args = parser.parse_args()

    try:
        result = convert(args.input, args.output)
    except Exception as e:
        # Top-level boundary: report every failure in the same JSON shape.
        # Some exceptions stringify to '', so fall back to the class name.
        message = str(e) or type(e).__name__
        print(json.dumps({"success": False, "output": "", "message": message}))
        sys.exit(1)

    print(json.dumps({"success": True, "output": result, "message": "PDF converted to Excel successfully"}))
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()