Spaces:
Sleeping
Sleeping
File size: 1,992 Bytes
#!/usr/bin/env python3
"""PDF to Excel converter using pdfplumber for table extraction."""
import argparse, json, sys, csv
from pathlib import Path
# Code points that openpyxl refuses to store in a cell (it raises
# IllegalCharacterError for them). Text extracted from PDFs can contain
# such control characters, so they are removed before writing XLSX.
_XLSX_ILLEGAL_CHARS = dict.fromkeys(
    [*range(0x00, 0x09), 0x0B, 0x0C, *range(0x0E, 0x20)]
)


def _extract_rows(pdf):
    """Return all rows from *pdf*: table rows, or text lines for table-less pages."""
    rows = []
    for page in pdf.pages:
        tables = page.extract_tables()
        for table in tables:
            for row in table:
                # Empty/None cells become '' so every cell is a string.
                rows.append([cell if cell else '' for cell in row])
        if not tables:
            # No table on this page: fall back to one single-cell row per
            # line of plain text.
            text = page.extract_text()
            if text:
                rows.extend([line.strip()] for line in text.split('\n'))
    return rows


def _xlsx_safe(row):
    """Return *row* with characters that XLSX cells cannot hold removed."""
    return [
        cell.translate(_XLSX_ILLEGAL_CHARS) if isinstance(cell, str) else cell
        for cell in row
    ]


def _write_csv(rows, csv_path):
    """Write *rows* to *csv_path* as UTF-8 CSV."""
    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(rows)


def convert(input_path, output_path):
    """Extract tabular data from a PDF and save it as a spreadsheet.

    Every table on every page is appended row by row. Pages on which no
    table is found contribute one single-cell row per line of text.

    Args:
        input_path: Path of the PDF file to read.
        output_path: Desired output path. Its suffix is always replaced
            (``.xlsx``, or ``.csv`` when openpyxl is unavailable). If
            falsy, the output is placed next to ``input_path``.

    Returns:
        The path of the file that was written, as a string.

    Raises:
        ImportError: If pdfplumber is not installed.
        Exception: Any error raised while reading the PDF or writing the
            output file is propagated to the caller.
    """
    # Imported lazily so a missing dependency surfaces as an exception from
    # this call rather than at module import time.
    import pdfplumber

    with pdfplumber.open(input_path) as pdf:
        all_rows = _extract_rows(pdf)

    if not output_path:
        # Only the stem matters here; the suffix is replaced below.
        output_path = str(Path(input_path).with_suffix('.csv'))

    # Only the import decides between XLSX and the CSV fallback; keeping the
    # try this narrow stops unrelated errors from being mistaken for a
    # missing openpyxl.
    try:
        import openpyxl
    except ImportError:
        csv_path = str(Path(output_path).with_suffix('.csv'))
        _write_csv(all_rows, csv_path)
        return csv_path

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "Extracted Data"
    for row in all_rows:
        ws.append(_xlsx_safe(row))
    xlsx_path = str(Path(output_path).with_suffix('.xlsx'))
    wb.save(xlsx_path)
    return xlsx_path
def main():
    """Command-line entry point: convert a PDF and report the result as JSON.

    Parses ``--input`` (required) and ``--output`` (optional), runs
    ``convert`` and prints a single JSON object with the keys ``success``,
    ``output`` and ``message`` to stdout. On failure the exception text is
    reported in ``message`` and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(description='Convert PDF to Excel')
    parser.add_argument('--input', required=True)
    # Optional: convert() derives the output path from the input path when
    # no output is given, so the CLI does not need to force one.
    parser.add_argument('--output', default=None)
    args = parser.parse_args()
    try:
        result = convert(args.input, args.output)
    except Exception as e:
        # Top-level boundary: report any failure in the same JSON shape the
        # success path uses, then signal it through the exit status.
        print(json.dumps({"success": False, "output": "", "message": str(e)}))
        sys.exit(1)
    print(json.dumps({"success": True, "output": result, "message": "PDF converted to Excel successfully"}))
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|