# Source: Hugging Face Space mfoud444/convert-docx (status: Paused)
# File size: 6,737 bytes; revision c705885

import os
import subprocess
from flask import Flask, request, send_file, jsonify, after_this_request
from docx import Document
from docx.shared import Pt
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from werkzeug.utils import secure_filename
import glob
import fitz  # PyMuPDF for checking PDF pages

# Flask application object; the /convert route below is registered on it.
app = Flask(__name__)

# Working directories, given as relative paths.
# UPLOAD_FOLDER receives the uploaded DOCX and its restyled copy;
# CONVERTED_FOLDER is passed to LibreOffice as the PDF output directory.
UPLOAD_FOLDER = 'uploads'
CONVERTED_FOLDER = 'converted'
# Font file whose presence set_font_style() checks before restyling a run.
FONT_PATH = os.path.join('fonts', 'majalla.ttf')

# Helper to apply font and styling
def set_font_style(run, font_name='Sakkal Majalla', font_size=Pt(11)):
    """Apply *font_name* and *font_size* to a python-docx run.

    Besides the regular ``run.font`` properties, the run's ``<w:rFonts>``
    element gets its ``w:ascii``, ``w:hAnsi`` and ``w:cs`` attributes set to
    *font_name*, so the font also applies to complex-script (e.g. Arabic) text.

    Args:
        run: The python-docx run to restyle (modified in place).
        font_name: Font family name to write into the run properties.
        font_size: Font size, as a python-docx length (default 11 pt).

    Raises:
        FileNotFoundError: If the font file at ``FONT_PATH`` does not exist.
    """
    if not os.path.exists(FONT_PATH):
        raise FileNotFoundError(f"Font file not found: {FONT_PATH}")

    run.font.name = font_name
    run.font.size = font_size

    rPr = run._element.get_or_add_rPr()

    # Reuse the <w:rFonts> element that is already present (setting
    # run.font.name above normally creates one). Appending a fresh element on
    # every call left the run with several <w:rFonts> children, which is not
    # valid run-property markup and grows each time the run is restyled.
    rFonts = rPr.find(qn('w:rFonts'))
    if rFonts is None:
        rFonts = OxmlElement('w:rFonts')
        rPr.append(rFonts)

    for attribute in ('w:ascii', 'w:hAnsi', 'w:cs'):
        rFonts.set(qn(attribute), font_name)

# Helper to remove empty cells in a table
def remove_empty_rows_from_table(table):
    """Delete blank rows and the "بيانات التوثيق" header row from *table*.

    Every row of the table is inspected. A row is removed when either

    * none of its cells contains a paragraph with non-whitespace text, or
    * any paragraph in any of its cells contains the text "بيانات التوثيق".

    The table is modified in place; nothing is returned.
    """
    marker = "بيانات التوثيق"

    def is_blank(row):
        # True when no paragraph of any cell holds visible text.
        return not any(
            para.text.strip() for cell in row.cells for para in cell.paragraphs
        )

    def has_marker(row):
        # True when some paragraph of some cell mentions the marker text.
        return any(
            marker in para.text for cell in row.cells for para in cell.paragraphs
        )

    # Capture the underlying row elements first so that removing one row
    # cannot affect which rows are selected.
    doomed = [
        row._element
        for row in table.rows
        if is_blank(row) or has_marker(row)
    ]

    body = table._element
    # Bottom-up, matching the removal order of an index-based deletion.
    for tr in reversed(doomed):
        body.remove(tr)

# Helper to remove rows with empty revenue type
def remove_empty_revenue_type_rows(table):
    """Delete data rows whose "نوع الإيراد" cell is blank.

    The second row of the table (index 1) is treated as the header row: the
    first cell in it whose text contains "نوع الإيراد" identifies the column
    to check. Rows from index 2 onwards are removed when their cell in that
    column has no paragraph with non-whitespace text. Rows with too few cells
    to reach that column are left alone.

    Nothing happens when the table has fewer than two rows or when no header
    cell matches. The table is modified in place; nothing is returned.
    """
    label = "نوع الإيراد"

    all_rows = list(table.rows)
    if len(all_rows) < 2:
        return

    # Locate the first header cell that mentions the label.
    column = next(
        (pos for pos, cell in enumerate(all_rows[1].cells) if label in cell.text),
        None,
    )
    if column is None:
        return

    doomed = []
    for row in all_rows[2:]:  # skip the two leading header rows
        cells = row.cells
        if len(cells) <= column:
            continue
        if all(not para.text.strip() for para in cells[column].paragraphs):
            doomed.append(row._element)

    body = table._element
    # Bottom-up, matching the removal order of an index-based deletion.
    for tr in reversed(doomed):
        body.remove(tr)

def _remove_quietly(paths):
    """Delete each file in *paths*; skip missing files and log other failures."""
    for path in paths:
        try:
            os.remove(path)
        except FileNotFoundError:
            # Never created (e.g. the request failed early) - nothing to do.
            continue
        except OSError as e:
            app.logger.error(f"Error cleaning up files: {e}")


@app.route('/convert', methods=['POST'])
def convert_docx_to_pdf():
    """Restyle an uploaded DOCX, convert it to PDF and return the PDF.

    Expects a multipart POST with the document in the ``file`` field. The
    handler:

    1. saves the upload under ``UPLOAD_FOLDER``;
    2. applies the 'Sakkal Majalla' font to every run in body paragraphs and
       table cells, then prunes table rows via
       ``remove_empty_rows_from_table`` and ``remove_empty_revenue_type_rows``;
    3. saves the result as ``styled_<name>`` and converts it with headless
       LibreOffice into ``CONVERTED_FOLDER``;
    4. drops page 2 of the PDF when it contains no extractable text;
    5. sends the PDF as an attachment and deletes all intermediate files.

    Returns:
        The PDF as a file download, or a JSON ``{'error': ...}`` body with
        status 400 (missing/invalid upload) or 500 (conversion failure).
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part in the request'}), 400

    file = request.files['file']
    filename = file.filename or ''  # filename may be None for odd clients
    if filename == '':
        return jsonify({'error': 'No selected file'}), 400

    # Case-insensitive so that e.g. "REPORT.DOCX" is accepted as well.
    if not filename.lower().endswith('.docx'):
        return jsonify({'error': 'Invalid file format. Only DOCX is supported.'}), 400

    os.makedirs(UPLOAD_FOLDER, exist_ok=True)
    os.makedirs(CONVERTED_FOLDER, exist_ok=True)

    filepath = os.path.join(UPLOAD_FOLDER, secure_filename(filename))
    # The client-supplied name must not be joined into a path as-is: a value
    # such as "../../x.docx" would escape UPLOAD_FOLDER. Only the final path
    # component is kept; secure_filename() is not used here because it strips
    # non-ASCII characters and would change the download name of documents
    # with Arabic file names.
    styled_docx_path = os.path.join(
        UPLOAD_FOLDER, 'styled_' + os.path.basename(filename)
    )

    # Every file this request creates is recorded here and removed once the
    # response has been built. Registered up front so the error responses
    # below clean up too.
    # NOTE(review): deleting the PDF before it has been streamed relies on the
    # OS keeping an already-open file readable after unlink - confirm for the
    # deployment platform.
    temp_paths = [filepath, styled_docx_path]

    @after_this_request
    def cleanup_files(response):
        _remove_quietly(temp_paths)
        return response

    file.save(filepath)

    # Apply font styling to body paragraphs
    document = Document(filepath)
    for paragraph in document.paragraphs:
        for run in paragraph.runs:
            set_font_style(run, font_name='Sakkal Majalla')

    # Apply font styling to tables
    for table in document.tables:
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    for run in paragraph.runs:
                        set_font_style(run, font_name='Sakkal Majalla')

    # Remove empty rows and rows without a revenue type from all tables
    for table in document.tables:
        remove_empty_rows_from_table(table)
        remove_empty_revenue_type_rows(table)

    # Save styled DOCX
    document.save(styled_docx_path)

    try:
        # Convert DOCX to PDF using LibreOffice. stderr is captured so that it
        # can be reported if the conversion fails.
        subprocess.run(
            [
                'libreoffice',
                '--headless',
                '--convert-to',
                'pdf',
                '--outdir',
                CONVERTED_FOLDER,
                styled_docx_path,
            ],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        # LibreOffice names the output after the input file with a .pdf
        # extension. Look for exactly that file: a wildcard match could pick
        # up a PDF belonging to a different upload with a similar name.
        base_filename = os.path.splitext(os.path.basename(styled_docx_path))[0]
        output_pdf_path = os.path.join(CONVERTED_FOLDER, base_filename + '.pdf')
        if not os.path.exists(output_pdf_path):
            raise FileNotFoundError(
                f"No PDF file found for {base_filename} in {CONVERTED_FOLDER}"
            )
        temp_paths.append(output_pdf_path)

        # Drop the second page when it carries no extractable text at all.
        with fitz.open(output_pdf_path) as pdf_document:
            if pdf_document.page_count > 1:
                text = pdf_document[1].get_text("text").strip()
                if not text:
                    pdf_document.delete_page(1)
                    new_pdf_path = os.path.join(
                        CONVERTED_FOLDER, f"{base_filename}_fixed.pdf"
                    )
                    # Recorded before saving so a partial file is removed too.
                    temp_paths.append(new_pdf_path)
                    pdf_document.save(new_pdf_path)
                    output_pdf_path = new_pdf_path

    except subprocess.CalledProcessError as e:
        stderr_text = (e.stderr or b'').decode(errors='replace')
        return jsonify({'error': f'LibreOffice failed: {stderr_text}'}), 500
    except FileNotFoundError as e:
        # Missing PDF, or the libreoffice executable itself was not found.
        return jsonify({'error': str(e)}), 500

    # Absolute path: the files above were written relative to the current
    # working directory, and send_file may resolve a relative path against a
    # different base directory.
    return send_file(os.path.abspath(output_pdf_path), as_attachment=True)

# if __name__ == '__main__':
#     app.run(debug=True)