Spaces:
Sleeping
Sleeping
File size: 1,934 Bytes
80a3675 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 | #!/usr/bin/env python3
"""PDF to Word (DOCX) converter: extracts page text with PyPDF2 and writes it with python-docx (plain-text fallback when python-docx is not installed)."""
import argparse, json, sys, os
from pathlib import Path
def _xml_safe(text):
    """Return *text* with C0 control characters removed, keeping tab, LF and CR.

    python-docx stores paragraph text as XML, and XML 1.0 cannot represent
    NUL or most other control characters; text extracted from PDFs can
    contain them (form feeds, for example).
    """
    illegal = {code: None for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)}
    return text.translate(illegal)


def convert(input_path, output_path):
    """Extract the text of a PDF and save it as a DOCX (or plain-text) file.

    Args:
        input_path: Path of the PDF to read.
        output_path: Destination path. When empty or ``None``, the input
            path with its suffix replaced by ``.docx`` is used.

    Returns:
        The path actually written. If python-docx is not installed the
        suffix is changed to ``.txt`` and the pages are written as plain
        text separated by a ``--- Page Break ---`` marker.

    Raises:
        ImportError: If PyPDF2 is not installed.
        Exception: Whatever PyPDF2, python-docx or the file system raise
            while reading or writing.
    """
    from PyPDF2 import PdfReader

    reader = PdfReader(input_path)
    # Pages for which extract_text() returns nothing are skipped.
    extracted = (page.extract_text() for page in reader.pages)
    text_content = [page_text for page_text in extracted if page_text]

    if not output_path:
        output_path = str(Path(input_path).with_suffix('.docx'))

    # Only the import decides between DOCX and the text fallback, so it is
    # the only statement guarded by the ImportError handler.
    try:
        from docx import Document
    except ImportError:
        output_path = str(Path(output_path).with_suffix('.txt'))
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write('\n\n--- Page Break ---\n\n'.join(text_content))
        return output_path

    doc = Document()
    doc.add_heading('Converted Document', 0)
    for index, page_text in enumerate(text_content):
        # Break *between* pages only; a break after the last page would
        # leave an empty trailing page in the document.
        if index:
            doc.add_page_break()
        doc.add_paragraph(_xml_safe(page_text))
    doc.save(output_path)
    return output_path
def main():
    """Command-line entry point: convert a PDF and report the result as JSON.

    Parses ``--input`` (required) and ``--output`` (optional), calls
    ``convert`` and prints a single JSON object with the keys ``success``,
    ``output`` and ``message`` to stdout. On any exception raised by the
    conversion the JSON carries ``success: false`` and the exception text,
    and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(description='Convert PDF to Word')
    parser.add_argument('--input', required=True, help='Input PDF path')
    # Optional: convert() derives a default path from the input when no
    # output is given, so the CLI no longer needs to force one.
    parser.add_argument(
        '--output',
        default=None,
        help='Output file path (default: input path with a .docx suffix)',
    )
    args = parser.parse_args()

    try:
        result_path = convert(args.input, args.output)
    except Exception as e:
        # Top-level boundary: every failure is reported as JSON on stdout.
        print(json.dumps({"success": False, "output": "", "message": str(e)}))
        sys.exit(1)

    print(json.dumps({"success": True, "output": result_path, "message": "PDF converted to Word successfully"}))
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|