Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """PDF to Word (DOCX) converter: extracts text with PyPDF2 and writes it with python-docx (plain-text fallback when python-docx is unavailable).""" | |
| import argparse, json, sys, os | |
| from pathlib import Path | |
# C0 control characters other than tab, LF and CR are not legal in XML 1.0.
# PDF text extraction frequently yields them (form feeds, NULs), and the XML
# layer underneath python-docx is expected to reject such strings, so they
# are dropped before the text is handed to the DOCX writer.
_XML_ILLEGAL_CHARS = dict.fromkeys(
    code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
)


def _xml_safe(text):
    """Return *text* with XML-illegal control characters removed."""
    return text.translate(_XML_ILLEGAL_CHARS)


def convert(input_path, output_path):
    """Extract the text of a PDF and write it to a DOCX (or text) file.

    Text is read page by page with PyPDF2; pages that yield no text are
    skipped. When python-docx is importable, each remaining page becomes one
    paragraph under a 'Converted Document' heading, with a page break
    between consecutive pages. When python-docx is missing, the pages are
    written to a UTF-8 text file instead, joined by a '--- Page Break ---'
    marker, and the output suffix is changed to '.txt'.

    Args:
        input_path: Path of the PDF to read.
        output_path: Destination path. If falsy, the input path with its
            suffix replaced by '.docx' is used.

    Returns:
        The path that was actually written, as a string. It ends in '.txt'
        rather than the requested suffix when the text fallback was used.

    Raises:
        ImportError: If PyPDF2 is not installed.
    """
    from PyPDF2 import PdfReader

    reader = PdfReader(input_path)
    extracted = (page.extract_text() for page in reader.pages)
    text_content = [text for text in extracted if text]

    if not output_path:
        output_path = str(Path(input_path).with_suffix('.docx'))

    # Only the import decides whether the fallback is used; errors raised
    # while building or saving the document must propagate to the caller.
    try:
        from docx import Document
    except ImportError:
        # Fallback: save as text
        output_path = str(Path(output_path).with_suffix('.txt'))
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write('\n\n--- Page Break ---\n\n'.join(text_content))
        return output_path

    doc = Document()
    doc.add_heading('Converted Document', 0)
    for index, page_text in enumerate(text_content):
        # Break *between* pages only, so the file does not end with an
        # empty trailing page.
        if index:
            doc.add_page_break()
        doc.add_paragraph(_xml_safe(page_text))
    doc.save(output_path)
    return output_path
def main():
    """Command-line entry point: convert one PDF and report the result as JSON.

    Reads the required ``--input`` and ``--output`` options, runs the
    conversion, and prints a single JSON object with the keys ``success``,
    ``output`` and ``message`` to stdout. On any exception the object
    carries the error text and the process exits with status 1.
    """
    cli = argparse.ArgumentParser(description='Convert PDF to Word')
    option_specs = (
        ('--input', 'Input PDF path'),
        ('--output', 'Output file path'),
    )
    for flag, help_text in option_specs:
        cli.add_argument(flag, required=True, help=help_text)
    options = cli.parse_args()

    try:
        converted = convert(options.input, options.output)
        report = {
            "success": True,
            "output": converted,
            "message": "PDF converted to Word successfully",
        }
        print(json.dumps(report))
    except Exception as exc:
        # Failures are reported on stdout in the same JSON shape as
        # successes; the exit status signals the error.
        failure = {"success": False, "output": "", "message": str(exc)}
        print(json.dumps(failure))
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()