Spaces:

innoai
/

PDFConverter-ENG

Sleeping

App Files Files Community

PDFConverter-ENG / tools /python /pdf_to_word.py

innoai

Deploy PDF Converter - compiled .NET 8 app

80a3675 verified 3 months ago

Raw

History Blame Contribute Delete

1.93 kB

	#!/usr/bin/env python3
	"""PDF to Word (DOCX) converter: extracts text with PyPDF2 and writes it with python-docx (plain-text .txt fallback)."""
	import argparse, json, sys, os
	from pathlib import Path

	# Code points outside the XML 1.0 ``Char`` production. Text extracted from a
	# PDF can contain them, and a DOCX (which is XML) cannot store such strings.
	_XML_ILLEGAL_CHARS = dict.fromkeys(
	    [c for c in range(0x20) if c not in (0x09, 0x0A, 0x0D)]
	    + list(range(0xD800, 0xE000))
	    + [0xFFFE, 0xFFFF]
	)


	def convert(input_path, output_path):
	    """Extract the text of a PDF and save it as a DOCX (or plain-text) file.

	    Each page's text is read with PyPDF2; pages that yield no text are
	    skipped. When python-docx is importable the text is written as one
	    paragraph per page, separated by page breaks, under a
	    'Converted Document' heading. Otherwise the text is written to a UTF-8
	    ``.txt`` file next to the requested output path.

	    Args:
	        input_path: Path of the PDF to read.
	        output_path: Destination path. When falsy, the input path with a
	            ``.docx`` suffix is used.

	    Returns:
	        The path actually written, as a string. It differs from the
	        requested path when the plain-text fallback replaced the suffix
	        with ``.txt``.

	    Raises:
	        ImportError: If PyPDF2 is not installed.
	        Exception: Errors from reading the PDF or writing the output file
	            are propagated unchanged.
	    """
	    from PyPDF2 import PdfReader

	    reader = PdfReader(input_path)
	    page_texts = [
	        text for text in (page.extract_text() for page in reader.pages) if text
	    ]

	    if not output_path:
	        output_path = str(Path(input_path).with_suffix('.docx'))

	    # Only the import is guarded, so an ImportError raised while building or
	    # saving the document is not mistaken for "python-docx is missing".
	    try:
	        from docx import Document
	    except ImportError:
	        Document = None

	    if Document is None:
	        # Fallback: save as text
	        output_path = str(Path(output_path).with_suffix('.txt'))
	        with open(output_path, 'w', encoding='utf-8') as f:
	            f.write('\n\n--- Page Break ---\n\n'.join(page_texts))
	        return output_path

	    doc = Document()
	    doc.add_heading('Converted Document', 0)
	    for index, page_text in enumerate(page_texts):
	        # Break only *between* pages so the document does not end with an
	        # empty trailing page.
	        if index:
	            doc.add_page_break()
	        # Drop characters XML cannot represent; otherwise one stray control
	        # character in the PDF text aborts the whole conversion.
	        doc.add_paragraph(page_text.translate(_XML_ILLEGAL_CHARS))
	    doc.save(output_path)
	    return output_path

	def main():
	    """Command-line entry point: convert a PDF and report the result as JSON.

	    Parses ``--input`` (required) and ``--output`` (optional; ``convert``
	    derives ``<input>.docx`` when it is omitted), runs the conversion and
	    prints a single JSON object with the keys ``success``, ``output`` and
	    ``message`` to stdout. On any conversion error the JSON carries
	    ``success: false`` with the error text, and the process exits with
	    status 1.
	    """
	    parser = argparse.ArgumentParser(description='Convert PDF to Word')
	    parser.add_argument('--input', required=True, help='Input PDF path')
	    parser.add_argument(
	        '--output',
	        default=None,
	        help='Output file path (default: input path with a .docx suffix)',
	    )
	    args = parser.parse_args()

	    try:
	        result_path = convert(args.input, args.output)
	    except Exception as e:
	        # Top-level boundary: the caller consumes the JSON on stdout, so every
	        # failure is reported there rather than as a traceback.
	        print(json.dumps({"success": False, "output": "", "message": str(e)}))
	        sys.exit(1)

	    # convert() returns a different path only when it fell back to plain
	    # text, so do not claim a Word document was produced in that case.
	    # (A requested path that already ends in .txt cannot be told apart.)
	    requested_path = args.output or str(Path(args.input).with_suffix('.docx'))
	    if result_path == requested_path:
	        message = "PDF converted to Word successfully"
	    else:
	        message = "python-docx is not installed; extracted text was saved as plain text instead"
	    print(json.dumps({"success": True, "output": result_path, "message": message}))

	# Run the command-line interface only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()