File size: 6,737 Bytes
c705885
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import os
import subprocess
from flask import Flask, request, send_file, jsonify, after_this_request
from docx import Document
from docx.shared import Pt
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from werkzeug.utils import secure_filename
import glob
import fitz  # PyMuPDF for checking PDF pages

# Flask application object; the /convert route below is registered on it.
app = Flask(__name__)

# Working directories and font location. All three are relative paths, so they
# are resolved against the process's current working directory.
# UPLOAD_FOLDER holds the uploaded DOCX and its restyled copy.
UPLOAD_FOLDER = 'uploads'
# CONVERTED_FOLDER is passed to LibreOffice as --outdir for the generated PDF.
CONVERTED_FOLDER = 'converted'
# FONT_PATH is only checked for existence by set_font_style; the font itself is
# referenced by name inside the document.
FONT_PATH = os.path.join('fonts', 'majalla.ttf')

# Helper to apply the font family and size to a single run
def set_font_style(run, font_name='Sakkal Majalla', font_size=Pt(11)):
    """Set the typeface and size of a python-docx run, including complex scripts.

    Args:
        run: The python-docx run to restyle.
        font_name: Font family written to the ascii, hAnsi and cs (complex
            script) slots of the run's ``w:rFonts`` element.
        font_size: Font size as a python-docx length (defaults to 11 pt).

    Raises:
        FileNotFoundError: If the font file at ``FONT_PATH`` does not exist.
    """
    if not os.path.exists(FONT_PATH):
        raise FileNotFoundError(f"Font file not found: {FONT_PATH}")

    run.font.name = font_name
    run.font.size = font_size

    rPr = run._element.get_or_add_rPr()

    # Setting run.font.name above normally makes python-docx create a w:rFonts
    # child already. Reuse it instead of appending a second one, which would
    # leave duplicate w:rFonts elements inside the same w:rPr.
    rFonts = rPr.find(qn('w:rFonts'))
    if rFonts is None:
        rFonts = OxmlElement('w:rFonts')
        rPr.append(rFonts)

    # w:cs is the slot used for complex-script (e.g. Arabic) text; python-docx's
    # font.name setter does not fill it, so it is set explicitly here.
    for slot in ('w:ascii', 'w:hAnsi', 'w:cs'):
        rFonts.set(qn(slot), font_name)

# Helper to drop blank rows and the "documentation data" header row from a table
def remove_empty_rows_from_table(table):
    """Delete unwanted rows from ``table`` in place.

    A row is deleted when either:
      * none of its cells contains a paragraph with non-whitespace text, or
      * any paragraph in any of its cells contains the header text
        "بيانات التوثيق".

    Args:
        table: A python-docx table (anything exposing ``rows`` and
            ``_element`` in the same way works).
    """
    doomed = []
    for row in table.rows:
        paragraphs = [p for cell in row.cells for p in cell.paragraphs]
        is_blank = not any(p.text.strip() for p in paragraphs)
        has_header = any("بيانات التوثيق" in p.text for p in paragraphs)
        if is_blank or has_header:
            # Keep the underlying <w:tr> element so removal does not depend
            # on row indices shifting as rows disappear.
            doomed.append(row._element)

    tbl = table._element
    for tr in reversed(doomed):
        tbl.remove(tr)

# Helper to drop data rows whose revenue-type cell is blank
def remove_empty_revenue_type_rows(table):
    """Delete, in place, data rows that have no revenue type.

    The second row (index 1) is treated as the header row. The first cell in
    it whose text contains "نوع الإيراد" identifies the revenue-type column.
    Every row from index 2 onward whose cell in that column has no paragraph
    with non-whitespace text is removed. Rows with too few cells to reach
    that column are kept. Nothing happens when the table has fewer than two
    rows or the header text is not found.

    Args:
        table: A python-docx table (anything exposing ``rows`` and
            ``_element`` in the same way works).
    """
    all_rows = list(table.rows)
    if len(all_rows) < 2:
        return

    target_col = next(
        (
            pos
            for pos, header_cell in enumerate(all_rows[1].cells)
            if "نوع الإيراد" in header_cell.text
        ),
        None,
    )
    if target_col is None:
        return

    blank_trs = []
    for data_row in all_rows[2:]:
        cells = data_row.cells
        if target_col >= len(cells):
            # Row is too short to have a revenue-type cell; leave it alone.
            continue
        if any(p.text.strip() for p in cells[target_col].paragraphs):
            continue
        blank_trs.append(data_row._element)

    tbl = table._element
    for tr in reversed(blank_trs):
        tbl.remove(tr)

@app.route('/convert', methods=['POST'])
def convert_docx_to_pdf():
    """Restyle an uploaded DOCX, prune its tables and return it as a PDF.

    Expects a multipart upload under the form field ``file``. The document's
    runs (body paragraphs and table cells) are restyled with
    ``set_font_style``, blank / header / revenue-less table rows are removed,
    and the result is converted to PDF by running LibreOffice headless. If
    the PDF has more than one page and the second page contains no
    extractable text, that page is dropped.

    Returns:
        The PDF as an attachment on success; otherwise a JSON ``{'error': ...}``
        body with status 400 (missing / invalid upload) or 500 (conversion
        failed or the expected PDF was not produced).
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part in the request'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    # Extension check is case-insensitive so "REPORT.DOCX" is accepted too.
    if not (file and file.filename.lower().endswith('.docx')):
        return jsonify({'error': 'Invalid file format. Only DOCX is supported.'}), 400

    # Every on-disk path is derived from the sanitised name. The raw
    # client-supplied filename may contain path separators and must never be
    # joined into a filesystem path.
    safe_name = secure_filename(file.filename)
    if not safe_name.lower().endswith('.docx'):
        # secure_filename strips non-ASCII characters, which can swallow the
        # whole stem (and with it the ".docx" suffix) of e.g. Arabic names.
        safe_name = 'document.docx'

    os.makedirs(UPLOAD_FOLDER, exist_ok=True)
    os.makedirs(CONVERTED_FOLDER, exist_ok=True)

    filepath = os.path.join(UPLOAD_FOLDER, safe_name)
    styled_docx_path = os.path.join(UPLOAD_FOLDER, 'styled_' + safe_name)
    base_filename = os.path.splitext(os.path.basename(styled_docx_path))[0]
    # LibreOffice names its output after the input file's stem.
    converted_pdf_path = os.path.join(CONVERTED_FOLDER, f"{base_filename}.pdf")
    fixed_pdf_path = os.path.join(CONVERTED_FOLDER, f"{base_filename}_fixed.pdf")
    temp_paths = (filepath, styled_docx_path, converted_pdf_path, fixed_pdf_path)

    # Registered before any file is written so that error responses clean up
    # as well. Each path is removed independently so one failure does not
    # leave the others behind.
    @after_this_request
    def cleanup_files(response):
        for path in temp_paths:
            try:
                os.remove(path)
            except FileNotFoundError:
                # Not every path is created on every request.
                pass
            except OSError as e:
                app.logger.error(f"Error cleaning up files: {e}")
        return response

    file.save(filepath)

    # Apply font styling to paragraphs
    document = Document(filepath)
    for paragraph in document.paragraphs:
        for run in paragraph.runs:
            set_font_style(run, font_name='Sakkal Majalla')

    # Apply font styling to tables
    for table in document.tables:
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    for run in paragraph.runs:
                        set_font_style(run, font_name='Sakkal Majalla')

    # Remove blank rows, the header row and rows without a revenue type
    for table in document.tables:
        remove_empty_rows_from_table(table)
        remove_empty_revenue_type_rows(table)

    # Save styled DOCX
    document.save(styled_docx_path)

    # Convert DOCX to PDF using LibreOffice
    try:
        # capture_output is required for CalledProcessError.stderr to be
        # populated; without it stderr is None.
        subprocess.run([
            'libreoffice',
            '--headless',
            '--convert-to',
            'pdf',
            '--outdir',
            CONVERTED_FOLDER,
            styled_docx_path
        ], check=True, capture_output=True)

        # Look for the exact expected output rather than globbing, so a stale
        # "<name>_fixed.pdf" or another file sharing the prefix is never
        # picked up by mistake.
        if not os.path.exists(converted_pdf_path):
            raise FileNotFoundError(f"No PDF file found for {base_filename} in {CONVERTED_FOLDER}")

        output_pdf_path = converted_pdf_path

        # Drop the second page when it carries no extractable text
        with fitz.open(output_pdf_path) as pdf_document:
            if pdf_document.page_count > 1:
                second_page = pdf_document[1]
                text = second_page.get_text("text").strip()

                if not text:
                    pdf_document.delete_page(1)
                    # Saved under a new name because the source PDF is still
                    # open at this point.
                    pdf_document.save(fixed_pdf_path)
                    output_pdf_path = fixed_pdf_path

    except subprocess.CalledProcessError as e:
        stderr_text = e.stderr.decode(errors='replace') if e.stderr else ''
        return jsonify({'error': f'LibreOffice failed: {stderr_text}'}), 500
    except FileNotFoundError as e:
        # Also covers a missing "libreoffice" executable.
        return jsonify({'error': str(e)}), 500

    return send_file(output_pdf_path, as_attachment=True)

# if __name__ == '__main__':
#     app.run(debug=True)