Spaces:

innoai
/

PDFConverter-ENG

Sleeping

Deploy PDF Converter - compiled .NET 8 app

80a3675 verified 3 months ago

1.99 kB

	#!/usr/bin/env python3
	"""PDF to Excel converter using pdfplumber for table extraction."""
	import argparse, json, sys, csv
	from pathlib import Path

	def convert(input_path, output_path):
	    """Extract the contents of a PDF and write them to a spreadsheet file.

	    Every table found on a page is appended row by row, with empty cells
	    stored as ''. A page on which no table is found contributes its plain
	    text instead, one single-cell row per line.

	    The rows are saved as an .xlsx workbook when openpyxl can be imported
	    and as a UTF-8 CSV file otherwise. In both cases the suffix of
	    ``output_path`` is replaced to match the format actually written.

	    Args:
	        input_path: Path of the PDF file to read.
	        output_path: Desired output path. When falsy, the output path is
	            derived from ``input_path``.

	    Returns:
	        The path of the file that was written, as a string.
	    """
	    import pdfplumber

	    all_rows = []
	    with pdfplumber.open(input_path) as pdf:
	        for page in pdf.pages:
	            tables = page.extract_tables()
	            if tables:
	                for table in tables:
	                    all_rows.extend(
	                        [cell if cell else '' for cell in row]
	                        for row in table
	                    )
	                continue
	            # No table on this page: fall back to its raw text.
	            text = page.extract_text()
	            if text:
	                all_rows.extend([line.strip()] for line in text.split('\n'))

	    if not output_path:
	        output_path = str(Path(input_path).with_suffix('.csv'))

	    # Guard only the imports, so that an ImportError raised while building
	    # or saving the workbook is not mistaken for "openpyxl is missing".
	    try:
	        import openpyxl
	        from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
	    except ImportError:
	        openpyxl = None

	    if openpyxl is None:
	        csv_path = str(Path(output_path).with_suffix('.csv'))
	        with open(csv_path, 'w', newline='', encoding='utf-8') as f:
	            csv.writer(f).writerows(all_rows)
	        return csv_path

	    wb = openpyxl.Workbook()
	    ws = wb.active
	    ws.title = "Extracted Data"
	    for row in all_rows:
	        # Text extracted from PDFs may carry control characters, which
	        # openpyxl rejects with IllegalCharacterError; strip them first.
	        ws.append([
	            ILLEGAL_CHARACTERS_RE.sub('', cell) if isinstance(cell, str) else cell
	            for cell in row
	        ])
	    xlsx_path = str(Path(output_path).with_suffix('.xlsx'))
	    wb.save(xlsx_path)
	    return xlsx_path

	def main():
	    """Command-line entry point: convert a PDF and report the result as JSON.

	    Exactly one JSON object with the keys "success", "output" and
	    "message" is printed to stdout. On failure "output" is empty,
	    "message" describes the error and the process exits with status 1.
	    """
	    parser = argparse.ArgumentParser(description='Convert PDF to Excel')
	    parser.add_argument('--input', required=True,
	                        help='path of the PDF file to convert')
	    # Optional: convert() derives the output path from the input path
	    # when none is given.
	    parser.add_argument('--output', default=None,
	                        help='path of the file to write '
	                             '(default: derived from --input)')
	    args = parser.parse_args()
	    try:
	        result = convert(args.input, args.output)
	    except Exception as e:
	        # Top-level boundary: every failure is reported in the same JSON
	        # shape as a success. Some exceptions have an empty str(), so fall
	        # back to the exception type name to keep the message informative.
	        print(json.dumps({"success": False, "output": "", "message": str(e) or type(e).__name__}))
	        sys.exit(1)
	    else:
	        print(json.dumps({"success": True, "output": result, "message": "PDF converted to Excel successfully"}))

	# Run the command-line interface only when this file is executed directly,
	# not when it is imported as a module.
	if __name__ == '__main__':
	    main()