Spaces:

innoai
/

PDFConverter-ENG

Sleeping

File size: 1,992 Bytes

80a3675

#!/usr/bin/env python3
"""PDF to Excel converter using pdfplumber for table extraction."""
import argparse, json, sys, csv
from pathlib import Path

def convert(input_path, output_path):
    """Extract the contents of a PDF into a spreadsheet file.

    Each page is scanned with pdfplumber. Every row of every table found on
    a page is collected, with falsy cells replaced by ''. A page on which no
    tables are found contributes one single-cell row per line of its
    extracted text instead.

    The collected rows are written to an .xlsx workbook (sheet title
    "Extracted Data") when openpyxl can be imported; otherwise they are
    written to a UTF-8 CSV file. In both cases the suffix of ``output_path``
    is replaced to match the format actually written.

    Args:
        input_path: Path of the PDF file to read.
        output_path: Desired output path. If falsy, it is derived from
            ``input_path``.

    Returns:
        The path (as ``str``) of the file that was written.
    """
    import re

    import pdfplumber

    all_rows = []
    with pdfplumber.open(input_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()
            for table in tables:
                all_rows.extend(
                    [cell if cell else '' for cell in row] for row in table
                )
            if tables:
                continue
            # No tables on this page: fall back to its plain text, one
            # single-cell row per line.
            text = page.extract_text()
            if text:
                all_rows.extend([line.strip()] for line in text.split('\n'))

    if not output_path:
        output_path = str(Path(input_path).with_suffix('.csv'))

    # Only the import itself decides between XLSX and CSV; keeping the try
    # body minimal avoids masking unrelated errors raised while writing.
    try:
        import openpyxl
    except ImportError:
        csv_path = str(Path(output_path).with_suffix('.csv'))
        with open(csv_path, 'w', newline='', encoding='utf-8') as f:
            csv.writer(f).writerows(all_rows)
        return csv_path

    # openpyxl rejects ASCII control characters (everything below 0x20
    # except tab, LF and CR) with IllegalCharacterError. Text extracted from
    # PDFs can contain them, so strip them rather than abort the conversion.
    illegal_chars = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "Extracted Data"
    for row in all_rows:
        ws.append([
            illegal_chars.sub('', cell) if isinstance(cell, str) else cell
            for cell in row
        ])
    xlsx_path = str(Path(output_path).with_suffix('.xlsx'))
    wb.save(xlsx_path)
    return xlsx_path

def main():
    """Command-line entry point.

    Parses the required ``--input`` and ``--output`` options, runs
    ``convert`` and prints a single JSON object with the keys ``success``,
    ``output`` and ``message`` to stdout. If any exception is raised, the
    JSON carries the exception text and the process exits with status 1.
    """
    cli = argparse.ArgumentParser(description='Convert PDF to Excel')
    for flag in ('--input', '--output'):
        cli.add_argument(flag, required=True)
    opts = cli.parse_args()

    try:
        written = convert(opts.input, opts.output)
        report = {
            "success": True,
            "output": written,
            "message": "PDF converted to Excel successfully",
        }
        print(json.dumps(report))
    except Exception as exc:
        failure = {"success": False, "output": "", "message": str(exc)}
        print(json.dumps(failure))
        sys.exit(1)

# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()