Spaces:

innoai
/

PDFConverter-ENG

Sleeping

File size: 1,934 Bytes

80a3675

#!/usr/bin/env python3
"""PDF to Word (DOCX) converter using pdfplumber + python-docx approach via PyPDF2."""
import argparse, json, sys, os
from pathlib import Path

def convert(input_path, output_path):
    """Extract the text of a PDF and save it as a Word document.

    Text is pulled page by page with PyPDF2; pages for which extraction
    yields nothing are skipped. The result is written with python-docx under
    a 'Converted Document' title, one paragraph per extracted page, with a
    page break between consecutive pages. When python-docx cannot be
    imported, the text is written to a UTF-8 ``.txt`` file instead.

    Args:
        input_path: Path of the PDF to read.
        output_path: Destination path. When falsy, the input path with its
            suffix replaced by ``.docx`` is used.

    Returns:
        The path that was written. In the plain-text fallback this is the
        requested path with its suffix replaced by ``.txt``.

    Raises:
        ImportError: If PyPDF2 is not installed.
    """
    from PyPDF2 import PdfReader

    reader = PdfReader(input_path)
    # Keep only the pages for which extraction produced some text.
    page_texts = [
        text for text in (page.extract_text() for page in reader.pages) if text
    ]

    if not output_path:
        output_path = str(Path(input_path).with_suffix('.docx'))

    # Only the optional import is guarded, so errors raised while building or
    # saving the document are not mistaken for a missing dependency.
    try:
        from docx import Document
    except ImportError:
        Document = None

    if Document is None:
        # Fallback: save as text
        output_path = str(Path(output_path).with_suffix('.txt'))
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write('\n\n--- Page Break ---\n\n'.join(page_texts))
        return output_path

    import re

    # python-docx rejects strings holding characters that are not legal in
    # XML 1.0 (NUL, form feed, other control characters), which extracted PDF
    # text can contain. Strip them so one stray byte does not abort the
    # whole conversion.
    xml_illegal = re.compile(
        r'[^\t\n\r\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]'
    )

    doc = Document()
    doc.add_heading('Converted Document', 0)
    for index, page_text in enumerate(page_texts):
        # Break *between* pages only; a break after the final page would
        # leave an empty trailing page in the document.
        if index:
            doc.add_page_break()
        doc.add_paragraph(xml_illegal.sub('', page_text))
    doc.save(output_path)

    return output_path

def main():
    """Command-line entry point: convert one PDF and report the result as JSON.

    Parses ``--input`` (required) and ``--output`` (optional), calls
    ``convert`` and prints a single JSON object to stdout with the keys
    ``success``, ``output`` and ``message``. On any exception the object
    carries ``success: false`` with the exception text, and the process
    exits with status 1.
    """
    parser = argparse.ArgumentParser(description='Convert PDF to Word')
    parser.add_argument('--input', required=True, help='Input PDF path')
    # Optional: convert() derives "<input>.docx" when no output path is
    # given, a default that was unreachable while this flag was required.
    parser.add_argument(
        '--output',
        default=None,
        help='Output file path (default: input path with a .docx suffix)',
    )
    args = parser.parse_args()

    try:
        result_path = convert(args.input, args.output)
    except Exception as e:
        # Top-level boundary: report every failure in the same JSON shape
        # the success path uses, then signal it through the exit status.
        print(json.dumps({"success": False, "output": "", "message": str(e)}))
        sys.exit(1)
    else:
        print(json.dumps({"success": True, "output": result_path, "message": "PDF converted to Word successfully"}))

# Run the CLI only when this file is executed as a script, not when imported.
if "__main__" == __name__:
    main()