# convert-docx / app.py
# Mohammed Foud
# first commit
# c705885
import os
import subprocess
from flask import Flask, request, send_file, jsonify, after_this_request
from docx import Document
from docx.shared import Pt
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from werkzeug.utils import secure_filename
import glob
import fitz # PyMuPDF for checking PDF pages
# Flask application object; the /convert route below is registered on it.
app = Flask(__name__)
# Relative directory where the uploaded DOCX and its restyled copy are written.
UPLOAD_FOLDER = 'uploads'
# Relative directory passed to LibreOffice as the PDF output directory.
CONVERTED_FOLDER = 'converted'
# Font file whose existence set_font_style() checks before styling a run.
FONT_PATH = os.path.join('fonts', 'majalla.ttf')
def set_font_style(run, font_name='Sakkal Majalla', font_size=Pt(11)):
    """Apply a font family and size to a single python-docx run.

    The font is set for the ASCII, high-ANSI and complex-script (``w:cs``)
    slots so that right-to-left text uses it as well.

    Args:
        run: The python-docx run to style.
        font_name: Font family name written into the run properties.
        font_size: Font size (a ``docx.shared`` length such as ``Pt(11)``).

    Raises:
        FileNotFoundError: If ``FONT_PATH`` does not exist on disk.
    """
    if not os.path.exists(FONT_PATH):
        raise FileNotFoundError(f"Font file not found: {FONT_PATH}")

    run.font.name = font_name
    run.font.size = font_size

    rPr = run._element.get_or_add_rPr()
    # Assigning run.font.name already creates a <w:rFonts> child. Reuse it
    # instead of appending a second one: a duplicate, out-of-order <w:rFonts>
    # makes the run properties invalid and leaves the complex-script font on
    # an element that consumers may ignore.
    rFonts = rPr.find(qn('w:rFonts'))
    if rFonts is None:
        rFonts = OxmlElement('w:rFonts')
        rPr.append(rFonts)
    for attribute in ('w:ascii', 'w:hAnsi', 'w:cs'):
        rFonts.set(qn(attribute), font_name)
def remove_empty_rows_from_table(table):
    """Strip blank rows and the "بيانات التوثيق" header row from *table*.

    Every row of the table is inspected. A row is deleted when no paragraph
    in any of its cells contains non-whitespace text, or when at least one
    paragraph contains the marker text. The table is edited in place and
    nothing is returned.
    """
    marker = "بيانات التوثيق"
    doomed = []
    for row in table.rows:
        texts = [para.text for cell in row.cells for para in cell.paragraphs]
        is_blank = not any(text.strip() for text in texts)
        has_marker = any(marker in text for text in texts)
        if is_blank or has_marker:
            doomed.append(row._element)

    # Detach bottom-up, mirroring the order used when removing by index.
    body = table._element
    for tr in reversed(doomed):
        body.remove(tr)
def remove_empty_revenue_type_rows(table):
    """Delete data rows whose "نوع الإيراد" cell is blank.

    The second row of the table (index 1) is treated as the header; the
    first cell in it whose text contains "نوع الإيراد" identifies the
    column to inspect. From the third row onward, any row that has a cell
    in that column with no non-whitespace paragraph text is removed.
    Tables with fewer than two rows, or without that header cell, are left
    untouched. The table is edited in place and nothing is returned.
    """
    all_rows = list(table.rows)
    if len(all_rows) < 2:
        return

    column = next(
        (
            position
            for position, cell in enumerate(all_rows[1].cells)
            if "نوع الإيراد" in cell.text
        ),
        None,
    )
    if column is None:
        return

    doomed = []
    for row in all_rows[2:]:
        cells = row.cells
        if len(cells) <= column:
            # Row is too short to have a cell in the inspected column.
            continue
        if not any(para.text.strip() for para in cells[column].paragraphs):
            doomed.append(row._element)

    # Detach bottom-up, mirroring the order used when removing by index.
    body = table._element
    for tr in reversed(doomed):
        body.remove(tr)
@app.route('/convert', methods=['POST'])
def convert_docx_to_pdf():
    """Convert an uploaded DOCX file to PDF and send it back as an attachment.

    Expects a multipart POST with the document in the ``file`` field. The
    document is restyled with the 'Sakkal Majalla' font, cleaned of empty
    table rows, converted with headless LibreOffice, and, if the second PDF
    page has no extractable text, that page is dropped.

    Returns:
        The PDF as a file download on success; otherwise a JSON body with an
        ``error`` key and status 400 (bad upload) or 500 (conversion failed).
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part in the request'}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400
    if not file.filename.lower().endswith('.docx'):
        return jsonify({'error': 'Invalid file format. Only DOCX is supported.'}), 400

    # Sanitize once and reuse for every path: the raw client-supplied name
    # may contain directory separators or "..".
    safe_name = secure_filename(file.filename)
    if not safe_name.lower().endswith('.docx'):
        # secure_filename() drops non-ASCII characters, which can strip the
        # whole stem (e.g. an Arabic file name) and leave no usable name.
        safe_name = 'document.docx'

    os.makedirs(UPLOAD_FOLDER, exist_ok=True)
    os.makedirs(CONVERTED_FOLDER, exist_ok=True)

    filepath = os.path.join(UPLOAD_FOLDER, safe_name)
    styled_docx_path = os.path.join(UPLOAD_FOLDER, 'styled_' + safe_name)
    base_filename = os.path.splitext(os.path.basename(styled_docx_path))[0]
    # LibreOffice names its output after the input file's stem.
    converted_pdf_path = os.path.join(CONVERTED_FOLDER, f"{base_filename}.pdf")
    temp_paths = [filepath, styled_docx_path, converted_pdf_path]

    # Registered before any file is written so error responses clean up too.
    @after_this_request
    def cleanup_files(response):
        for path in temp_paths:
            try:
                os.remove(path)
            except FileNotFoundError:
                # Not every file exists on every code path.
                pass
            except OSError as e:
                app.logger.error(f"Error cleaning up files: {e}")
        return response

    file.save(filepath)

    # Apply font styling to body paragraphs
    document = Document(filepath)
    for paragraph in document.paragraphs:
        for run in paragraph.runs:
            set_font_style(run, font_name='Sakkal Majalla')
    # Apply font styling to table paragraphs
    for table in document.tables:
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    for run in paragraph.runs:
                        set_font_style(run, font_name='Sakkal Majalla')
    # Remove empty rows and rows without a revenue type from all tables
    for table in document.tables:
        remove_empty_rows_from_table(table)
        remove_empty_revenue_type_rows(table)

    # Save styled DOCX
    document.save(styled_docx_path)

    # Convert DOCX to PDF using LibreOffice
    try:
        subprocess.run(
            [
                'libreoffice',
                '--headless',
                '--convert-to',
                'pdf',
                '--outdir',
                CONVERTED_FOLDER,
                styled_docx_path,
            ],
            check=True,
            # Capture stderr so a failure can be reported to the client.
            stderr=subprocess.PIPE,
        )

        if not os.path.exists(converted_pdf_path):
            raise FileNotFoundError(f"No PDF file found for {base_filename} in {CONVERTED_FOLDER}")
        output_pdf_path = converted_pdf_path

        # Drop the second page when it carries no extractable text at all.
        with fitz.open(output_pdf_path) as pdf_document:
            if pdf_document.page_count > 1:
                second_page = pdf_document[1]
                if not second_page.get_text("text").strip():
                    pdf_document.delete_page(1)
                    new_pdf_path = os.path.join(CONVERTED_FOLDER, f"{base_filename}_fixed.pdf")
                    pdf_document.save(new_pdf_path)
                    temp_paths.append(new_pdf_path)
                    output_pdf_path = new_pdf_path
    except subprocess.CalledProcessError as e:
        details = e.stderr.decode(errors='replace') if e.stderr else ''
        return jsonify({'error': f'LibreOffice failed: {details}'}), 500
    except FileNotFoundError as e:
        return jsonify({'error': str(e)}), 500

    return send_file(output_pdf_path, as_attachment=True)
# if __name__ == '__main__':
# app.run(debug=True)