"""Flask service that restyles an uploaded DOCX and converts it to PDF.

The single ``/convert`` endpoint applies the Sakkal Majalla font to every run,
prunes unwanted table rows, converts the result with headless LibreOffice and
drops a text-free second page from the resulting PDF.
"""
import glob
import os
import subprocess

import fitz  # PyMuPDF for checking PDF pages
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Pt
from flask import Flask, after_this_request, jsonify, request, send_file
from werkzeug.utils import secure_filename

app = Flask(__name__)

UPLOAD_FOLDER = 'uploads'
CONVERTED_FOLDER = 'converted'
FONT_PATH = os.path.join('fonts', 'majalla.ttf')

# Table texts that drive row removal (matched as substrings).
_DOCUMENTATION_HEADER = "بيانات التوثيق"
_REVENUE_TYPE_HEADER = "نوع الإيراد"


def set_font_style(run, font_name='Sakkal Majalla', font_size=Pt(11)):
    """Apply *font_name* and *font_size* to a python-docx run.

    The font is written to the ``w:ascii``, ``w:hAnsi`` and ``w:cs``
    attributes of the run's ``w:rFonts`` element so that complex-script
    text picks it up as well.

    Raises:
        FileNotFoundError: if ``FONT_PATH`` does not exist on disk.
    """
    if not os.path.exists(FONT_PATH):
        raise FileNotFoundError(f"Font file not found: {FONT_PATH}")

    run.font.name = font_name
    run.font.size = font_size

    rPr = run._element.get_or_add_rPr()
    # Reuse the rFonts element that setting run.font.name created; appending
    # a second one would leave duplicate <w:rFonts> children in the run.
    rFonts = rPr.find(qn('w:rFonts'))
    if rFonts is None:
        rFonts = OxmlElement('w:rFonts')
        rPr.append(rFonts)
    for attribute in ('w:ascii', 'w:hAnsi', 'w:cs'):
        rFonts.set(qn(attribute), font_name)


def _cell_has_text(cell):
    """Return True if any paragraph of *cell* has non-whitespace text."""
    return any(paragraph.text.strip() for paragraph in cell.paragraphs)


def _remove_rows(rows):
    """Detach each python-docx row in *rows* from its parent table element."""
    for row in rows:
        tr = row._element
        parent = tr.getparent()
        if parent is not None:
            parent.remove(tr)


def remove_empty_rows_from_table(table):
    """Remove blank rows and the "بيانات التوثيق" header row from *table*.

    Every row of the table is inspected. A row is removed when none of its
    cells contains non-whitespace text, or when any of its paragraphs
    contains the header text.
    """
    doomed = []
    for row in table.rows:
        cells = row.cells
        is_blank = not any(_cell_has_text(cell) for cell in cells)
        has_header = any(
            _DOCUMENTATION_HEADER in paragraph.text
            for cell in cells
            for paragraph in cell.paragraphs
        )
        if is_blank or has_header:
            doomed.append(row)
    # Rows are collected first and removed afterwards so the table is not
    # mutated while it is being scanned.
    _remove_rows(doomed)


def remove_empty_revenue_type_rows(table):
    """Remove data rows whose "نوع الإيراد" cell is empty.

    The column is located by searching the table's second row (index 1) for
    a cell containing the header text. Rows from index 2 onwards are removed
    when their cell in that column has no non-whitespace text. Tables with
    fewer than two rows, or without that header, are left untouched.
    """
    rows = list(table.rows)
    if len(rows) < 2:
        return

    revenue_type_col_index = next(
        (
            index
            for index, cell in enumerate(rows[1].cells)
            if _REVENUE_TYPE_HEADER in cell.text
        ),
        None,
    )
    if revenue_type_col_index is None:
        return

    doomed = []
    for row in rows[2:]:  # skip the two leading rows
        cells = row.cells
        if len(cells) > revenue_type_col_index and not _cell_has_text(
            cells[revenue_type_col_index]
        ):
            doomed.append(row)
    _remove_rows(doomed)


def _style_document(document):
    """Apply the Sakkal Majalla font to every run in body text and tables."""
    for paragraph in document.paragraphs:
        for run in paragraph.runs:
            set_font_style(run, font_name='Sakkal Majalla')

    for table in document.tables:
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    for run in paragraph.runs:
                        set_font_style(run, font_name='Sakkal Majalla')


def _find_converted_pdf(base_filename):
    """Return the path of the PDF LibreOffice produced for *base_filename*.

    The exact ``<base_filename>.pdf`` is preferred so that a stale
    ``<base_filename>_fixed.pdf`` from an earlier request is never picked up
    by accident; a sorted prefix match is kept as a fallback.

    Raises:
        FileNotFoundError: if no matching PDF exists in ``CONVERTED_FOLDER``.
    """
    expected = os.path.join(CONVERTED_FOLDER, base_filename + '.pdf')
    if os.path.isfile(expected):
        return expected

    pattern = os.path.join(CONVERTED_FOLDER, glob.escape(base_filename) + '*.pdf')
    candidates = sorted(glob.glob(pattern))
    if not candidates:
        raise FileNotFoundError(
            f"No PDF file found for {base_filename} in {CONVERTED_FOLDER}"
        )
    return candidates[0]


def _drop_blank_second_page(pdf_path, base_filename):
    """Delete page 2 of the PDF when it has no extractable text.

    Returns the path of the PDF to serve: *pdf_path* itself when nothing was
    changed, otherwise a new ``<base_filename>_fixed.pdf`` written to
    ``CONVERTED_FOLDER``.
    """
    with fitz.open(pdf_path) as pdf_document:
        if pdf_document.page_count <= 1:
            return pdf_path
        if pdf_document[1].get_text("text").strip():
            return pdf_path
        pdf_document.delete_page(1)
        new_pdf_path = os.path.join(CONVERTED_FOLDER, f"{base_filename}_fixed.pdf")
        pdf_document.save(new_pdf_path)
    return new_pdf_path


def _remove_quietly(path):
    """Delete *path*, ignoring a missing file and logging any other OS error."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass  # never created, or already gone
    except OSError as e:
        app.logger.error(f"Error cleaning up files: {e}")


@app.route('/convert', methods=['POST'])
def convert_docx_to_pdf():
    """Convert the uploaded ``file`` form field from DOCX to a styled PDF.

    Returns the PDF as an attachment on success, a JSON error with status
    400 for a missing/invalid upload, or a JSON error with status 500 when
    styling or the LibreOffice conversion fails.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part in the request'}), 400

    upload = request.files['file']
    if upload.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    if not upload.filename.lower().endswith('.docx'):
        return jsonify({'error': 'Invalid file format. Only DOCX is supported.'}), 400

    os.makedirs(UPLOAD_FOLDER, exist_ok=True)
    os.makedirs(CONVERTED_FOLDER, exist_ok=True)

    # The client-supplied name is untrusted: use the sanitised form for every
    # path derived from it (the styled copy previously used the raw name).
    safe_name = secure_filename(upload.filename) or 'upload.docx'
    filepath = os.path.join(UPLOAD_FOLDER, safe_name)
    styled_docx_path = os.path.join(UPLOAD_FOLDER, 'styled_' + safe_name)
    temp_paths = [filepath, styled_docx_path]

    # Registered before any work so temporary files are also removed when
    # styling or conversion fails part-way through.
    @after_this_request
    def cleanup_files(response):
        for path in temp_paths:
            _remove_quietly(path)
        return response

    upload.save(filepath)

    try:
        document = Document(filepath)
        _style_document(document)

        # Remove empty rows and rows without a revenue type from all tables
        for table in document.tables:
            remove_empty_rows_from_table(table)
            remove_empty_revenue_type_rows(table)

        document.save(styled_docx_path)

        # Convert DOCX to PDF using LibreOffice. Output is captured so that
        # stderr is available for the error message below.
        subprocess.run(
            [
                'libreoffice', '--headless', '--convert-to', 'pdf',
                '--outdir', CONVERTED_FOLDER, styled_docx_path,
            ],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        base_filename = os.path.splitext(os.path.basename(styled_docx_path))[0]
        converted_pdf_path = _find_converted_pdf(base_filename)
        temp_paths.append(converted_pdf_path)

        output_pdf_path = _drop_blank_second_page(converted_pdf_path, base_filename)
        if output_pdf_path != converted_pdf_path:
            temp_paths.append(output_pdf_path)
    except subprocess.CalledProcessError as e:
        detail = (e.stderr or b'').decode(errors='replace')
        return jsonify({'error': f'LibreOffice failed: {detail}'}), 500
    except FileNotFoundError as e:
        return jsonify({'error': str(e)}), 500

    # Absolute path: the files above are written relative to the working
    # directory, which is not necessarily the application root.
    return send_file(os.path.abspath(output_pdf_path), as_attachment=True)


# if __name__ == '__main__':
#     app.run(debug=True)