Spaces:

mfoud444
/

convert-docx

Paused

convert-docx / app.py

Mohammed Foud

first commit

c705885 9 months ago

6.74 kB

	import os
	import subprocess
	from flask import Flask, request, send_file, jsonify, after_this_request
	from docx import Document
	from docx.shared import Pt
	from docx.oxml import OxmlElement
	from docx.oxml.ns import qn
	from werkzeug.utils import secure_filename
	import glob
	import fitz # PyMuPDF for checking PDF pages

	# Working locations, all relative to the process's current directory.
	FONT_PATH = os.path.join("fonts", "majalla.ttf")  # font file checked by set_font_style
	UPLOAD_FOLDER = "uploads"  # received and restyled DOCX files
	CONVERTED_FOLDER = "converted"  # PDFs produced by LibreOffice

	# Flask application object; the /convert route below is registered on it.
	app = Flask(__name__)

	# Helper to apply font and styling
	def set_font_style(run, font_name='Sakkal Majalla', font_size=Pt(11)):
	    """Set the typeface and size of a python-docx run in place.

	    The typeface is written to the ``w:ascii``, ``w:hAnsi`` and ``w:cs``
	    attributes of the run's ``<w:rFonts>`` element so that the same font is
	    requested for every one of those slots.

	    Args:
	        run: The run whose formatting is modified.
	        font_name: Typeface name to assign.
	        font_size: Value assigned to ``run.font.size``.

	    Raises:
	        FileNotFoundError: If ``FONT_PATH`` does not exist on disk.
	    """
	    if not os.path.exists(FONT_PATH):
	        raise FileNotFoundError(f"Font file not found: {FONT_PATH}")

	    run.font.name = font_name
	    run.font.size = font_size

	    # Assigning ``run.font.name`` already creates a <w:rFonts> child, so reuse
	    # it. Appending a second element produced duplicate, out-of-order
	    # <w:rFonts> nodes that piled up every time the same run was styled again.
	    rPr = run._element.get_or_add_rPr()
	    rFonts = rPr.get_or_add_rFonts()
	    for attribute in ('w:ascii', 'w:hAnsi', 'w:cs'):
	        rFonts.set(qn(attribute), font_name)

	# Helper to drop blank rows and the "بيانات التوثيق" header row from a table
	def remove_empty_rows_from_table(table):
	    """Delete, in place, every row that is blank or carries the marker text.

	    A row is deleted when none of its cell paragraphs contains
	    non-whitespace text, or when any of its cell paragraphs contains
	    "بيانات التوثيق". All rows of the table are examined.

	    Args:
	        table: Table object exposing ``rows`` and ``_element``.
	    """
	    def paragraphs_of(row):
	        return [paragraph for cell in row.cells for paragraph in cell.paragraphs]

	    def is_blank(row):
	        return not any(paragraph.text.strip() for paragraph in paragraphs_of(row))

	    def has_marker(row):
	        return any("بيانات التوثيق" in paragraph.text for paragraph in paragraphs_of(row))

	    # Decide first, delete afterwards: every row is inspected while the table
	    # is still intact, and no positional index can go stale.
	    doomed = [row._element for row in table.rows if is_blank(row) or has_marker(row)]

	    parent = table._element
	    for row_element in doomed:
	        parent.remove(row_element)

	# Helper to drop data rows whose "نوع الإيراد" cell is blank
	def remove_empty_revenue_type_rows(table):
	    """Delete, in place, data rows that have no text in the revenue-type column.

	    The column is the first cell of the table's second row (index 1) whose
	    text contains "نوع الإيراد". Rows from index 2 onward are deleted when
	    their cell in that column holds only whitespace. Rows with too few cells
	    to reach the column are kept. Nothing happens when the table has fewer
	    than two rows or the header text is not found.

	    Args:
	        table: Table object exposing ``rows`` and ``_element``.
	    """
	    all_rows = list(table.rows)
	    if len(all_rows) < 2:
	        return

	    column = next(
	        (position for position, cell in enumerate(all_rows[1].cells)
	         if "نوع الإيراد" in cell.text),
	        None,
	    )
	    if column is None:
	        return

	    # Decide first, delete afterwards, so every row is inspected while the
	    # table is still intact.
	    doomed = []
	    for row in all_rows[2:]:
	        cells = row.cells
	        if len(cells) <= column:
	            continue
	        if not any(paragraph.text.strip() for paragraph in cells[column].paragraphs):
	            doomed.append(row._element)

	    parent = table._element
	    for row_element in doomed:
	        parent.remove(row_element)

	@app.route('/convert', methods=['POST'])
	def convert_docx_to_pdf():
	    """Restyle an uploaded DOCX, convert it to PDF and return the PDF.

	    Reads a multipart upload from the ``file`` form field, applies the
	    'Sakkal Majalla' font to every run in body paragraphs and table cells,
	    prunes table rows with the two row-removal helpers, converts the result
	    with headless LibreOffice, and drops the PDF's second page when that
	    page has no extractable text.

	    Every request works in its own temporary sub-directories of
	    ``UPLOAD_FOLDER`` and ``CONVERTED_FOLDER``, so concurrent uploads with
	    the same file name cannot overwrite each other. Both directories are
	    removed once the response has been built, on success and on the error
	    responses returned below.

	    Returns:
	        The PDF as an attachment on success. Otherwise a JSON body
	        ``{'error': ...}`` with status 400 (missing or non-DOCX upload) or
	        500 (LibreOffice failure or missing output).
	    """
	    # Imported here so this block stays self-contained.
	    import shutil
	    import tempfile

	    if 'file' not in request.files:
	        return jsonify({'error': 'No file part in the request'}), 400

	    file = request.files['file']
	    if file.filename == '':
	        return jsonify({'error': 'No selected file'}), 400

	    # Accept the extension in any letter case (".DOCX" is still a DOCX).
	    if not file.filename.lower().endswith('.docx'):
	        return jsonify({'error': 'Invalid file format. Only DOCX is supported.'}), 400

	    # secure_filename() strips non-ASCII characters, so a name such as
	    # "تقرير.docx" can lose its extension; fall back to a neutral name.
	    safe_name = secure_filename(file.filename)
	    if not safe_name.lower().endswith('.docx'):
	        safe_name = 'document.docx'

	    os.makedirs(UPLOAD_FOLDER, exist_ok=True)
	    os.makedirs(CONVERTED_FOLDER, exist_ok=True)
	    # Absolute paths, so LibreOffice (relative to the CWD) and send_file
	    # (relative to the app root) agree on where the files are.
	    upload_dir = os.path.abspath(tempfile.mkdtemp(dir=UPLOAD_FOLDER))
	    output_dir = os.path.abspath(tempfile.mkdtemp(dir=CONVERTED_FOLDER))

	    # Registered before any work is done so that error responses clean up too.
	    @after_this_request
	    def cleanup_files(response):
	        for directory in (upload_dir, output_dir):
	            try:
	                shutil.rmtree(directory)
	            except OSError as e:
	                app.logger.error(f"Error cleaning up files: {e}")
	        return response

	    filepath = os.path.join(upload_dir, safe_name)
	    file.save(filepath)

	    # Apply font styling to paragraphs
	    document = Document(filepath)
	    for paragraph in document.paragraphs:
	        for run in paragraph.runs:
	            set_font_style(run, font_name='Sakkal Majalla')

	    # Apply font styling to tables
	    for table in document.tables:
	        for row in table.rows:
	            for cell in row.cells:
	                for paragraph in cell.paragraphs:
	                    for run in paragraph.runs:
	                        set_font_style(run, font_name='Sakkal Majalla')

	    # Remove empty rows and rows without a revenue type from all tables
	    for table in document.tables:
	        remove_empty_rows_from_table(table)
	        remove_empty_revenue_type_rows(table)

	    # Save styled DOCX under the sanitised name, never the raw client name.
	    styled_docx_path = os.path.join(upload_dir, 'styled_' + safe_name)
	    document.save(styled_docx_path)

	    try:
	        # Convert DOCX to PDF using LibreOffice. stderr is captured so it can
	        # be reported; without capturing it, CalledProcessError.stderr is None.
	        subprocess.run(
	            [
	                'libreoffice',
	                '--headless',
	                '--convert-to',
	                'pdf',
	                '--outdir',
	                output_dir,
	                styled_docx_path,
	            ],
	            check=True,
	            stdout=subprocess.PIPE,
	            stderr=subprocess.PIPE,
	        )

	        # The output directory is private to this request, so the PDF name is
	        # known exactly and no stale file from another request can match.
	        base_filename = os.path.splitext(os.path.basename(styled_docx_path))[0]
	        output_pdf_path = os.path.join(output_dir, base_filename + '.pdf')
	        if not os.path.exists(output_pdf_path):
	            raise FileNotFoundError(f"No PDF file found for {base_filename} in {CONVERTED_FOLDER}")

	        # Drop the second page when it has no extractable text.
	        with fitz.open(output_pdf_path) as pdf_document:
	            if pdf_document.page_count > 1:
	                text = pdf_document[1].get_text("text").strip()
	                if not text:
	                    pdf_document.delete_page(1)
	                    new_pdf_path = os.path.join(output_dir, f"{base_filename}_fixed.pdf")
	                    pdf_document.save(new_pdf_path)
	                    output_pdf_path = new_pdf_path

	    except subprocess.CalledProcessError as e:
	        stderr_text = (e.stderr or b'').decode(errors='replace')
	        return jsonify({'error': f'LibreOffice failed: {stderr_text}'}), 500
	    except FileNotFoundError as e:
	        # Raised above for a missing PDF, or by subprocess when the
	        # 'libreoffice' executable is not on PATH.
	        return jsonify({'error': str(e)}), 500

	    return send_file(output_pdf_path, as_attachment=True)

	# if __name__ == '__main__':
	# app.run(debug=True)