Spaces:
Paused
Paused
File size: 6,737 Bytes
c705885 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
import os
import subprocess
from flask import Flask, request, send_file, jsonify, after_this_request
from docx import Document
from docx.shared import Pt
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from werkzeug.utils import secure_filename
import glob
import fitz # PyMuPDF for checking PDF pages
# Flask application object; the /convert route below is registered on it.
app = Flask(__name__)
# Directory (relative path) where the uploaded DOCX and its re-styled copy are written.
UPLOAD_FOLDER = 'uploads'
# Directory (relative path) passed to LibreOffice as --outdir for the generated PDF.
CONVERTED_FOLDER = 'converted'
# Font file whose presence set_font_style() checks before styling a run.
FONT_PATH = os.path.join('fonts', 'majalla.ttf')
# Helper to apply font and styling
def set_font_style(run, font_name='Sakkal Majalla', font_size=Pt(11)):
    """Apply a typeface and size to a python-docx run, including complex script.

    ``run.font.name`` only sets the ``w:ascii`` and ``w:hAnsi`` font slots, so
    the ``w:cs`` (complex script) slot is set here explicitly as well.

    Args:
        run: The python-docx run to restyle (modified in place).
        font_name: Font family name written to the ascii, hAnsi and cs slots.
        font_size: Font size as a ``docx.shared`` length.

    Raises:
        FileNotFoundError: If the font file at ``FONT_PATH`` does not exist.
    """
    if not os.path.exists(FONT_PATH):
        raise FileNotFoundError(f"Font file not found: {FONT_PATH}")

    run.font.name = font_name
    run.font.size = font_size

    rPr = run._element.get_or_add_rPr()
    # Assigning run.font.name normally leaves a w:rFonts child in place already.
    # Reuse it: appending a fresh element on every call would leave several
    # w:rFonts siblings in the same w:rPr, one more each time the run is styled.
    rFonts = rPr.find(qn('w:rFonts'))
    if rFonts is None:
        rFonts = OxmlElement('w:rFonts')
        rPr.append(rFonts)

    for slot in ('w:ascii', 'w:hAnsi', 'w:cs'):
        rFonts.set(qn(slot), font_name)
# Helper to remove empty cells in a table
def remove_empty_rows_from_table(table):
    """Delete blank rows and the "بيانات التوثيق" header row from ``table``.

    A row is deleted when no paragraph in any of its cells has
    non-whitespace text, or when any paragraph in the row contains the
    header text. The table is modified in place and nothing is returned.
    """
    header_text = "بيانات التوثيق"

    def is_blank(row):
        # Blank means: not a single paragraph with visible text in any cell.
        for cell in row.cells:
            for paragraph in cell.paragraphs:
                if paragraph.text.strip():
                    return False
        return True

    def has_header(row):
        return any(
            header_text in paragraph.text
            for cell in row.cells
            for paragraph in cell.paragraphs
        )

    doomed = [
        row._element
        for row in table.rows
        if is_blank(row) or has_header(row)
    ]

    # Detach bottom-up, the same order an index-based removal would use.
    body = table._element
    for tr in reversed(doomed):
        body.remove(tr)
# Helper to remove rows with empty revenue type
def remove_empty_revenue_type_rows(table):
    """Delete data rows whose "نوع الإيراد" cell has no visible text.

    The second row of the table (index 1) is treated as the header row. The
    first header cell whose text contains "نوع الإيراد" picks the column to
    inspect. Rows from index 2 onward are deleted when that column's cell
    has no paragraph with non-whitespace text. Rows too short to have that
    column are kept. The table is modified in place; nothing is returned.
    """
    rows = list(table.rows)
    if len(rows) < 2:
        # No header row to look the column up in.
        return

    column = next(
        (pos for pos, cell in enumerate(rows[1].cells) if "نوع الإيراد" in cell.text),
        None,
    )
    if column is None:
        return

    blank = []
    for row in rows[2:]:
        cells = row.cells
        if len(cells) <= column:
            continue
        if any(paragraph.text.strip() for paragraph in cells[column].paragraphs):
            continue
        blank.append(row._element)

    # Detach bottom-up, the same order an index-based removal would use.
    body = table._element
    for tr in reversed(blank):
        body.remove(tr)
@app.route('/convert', methods=['POST'])
def convert_docx_to_pdf():
    """Restyle an uploaded DOCX, convert it to PDF and return the PDF.

    Expects a multipart upload under the form field ``file``. The document's
    runs are restyled with 'Sakkal Majalla', empty rows / the documentation
    header row / rows with a blank revenue type are removed from every table,
    and the result is converted with headless LibreOffice. If the PDF has
    more than one page and the second page has no extractable text, that
    page is dropped before the file is sent.

    Returns:
        The PDF as an attachment on success; otherwise a JSON ``{'error': ...}``
        body with status 400 (bad upload) or 500 (conversion failure).
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part in the request'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    # Accept the extension in any letter case (.docx, .DOCX, ...).
    if not file.filename.lower().endswith('.docx'):
        return jsonify({'error': 'Invalid file format. Only DOCX is supported.'}), 400

    # Every on-disk name is derived from the sanitised filename; the raw,
    # client-supplied name may contain path separators or "..".
    safe_name = secure_filename(file.filename)
    if not safe_name.lower().endswith('.docx'):
        # secure_filename() keeps ASCII only, so a non-ASCII name can lose its
        # stem and the dot before the extension; fall back to a fixed name.
        safe_name = 'upload.docx'

    os.makedirs(UPLOAD_FOLDER, exist_ok=True)
    os.makedirs(CONVERTED_FOLDER, exist_ok=True)

    filepath = os.path.join(UPLOAD_FOLDER, safe_name)
    styled_docx_path = os.path.join(UPLOAD_FOLDER, 'styled_' + safe_name)
    base_filename = os.path.splitext(os.path.basename(styled_docx_path))[0]
    # LibreOffice names its output after the input file, so the path is known
    # up front; no wildcard search that could match another upload's PDF.
    converted_pdf_path = os.path.join(CONVERTED_FOLDER, f"{base_filename}.pdf")
    fixed_pdf_path = os.path.join(CONVERTED_FOLDER, f"{base_filename}_fixed.pdf")

    temp_paths = (filepath, styled_docx_path, converted_pdf_path, fixed_pdf_path)

    # Registered before any work is done so that error responses clean up too.
    @after_this_request
    def cleanup_files(response):
        for path in temp_paths:
            try:
                os.remove(path)
            except FileNotFoundError:
                # Not every intermediate file exists on every code path.
                pass
            except OSError as e:
                app.logger.error(f"Error cleaning up files: {e}")
        return response

    file.save(filepath)

    # Apply font styling to body paragraphs
    document = Document(filepath)
    for paragraph in document.paragraphs:
        for run in paragraph.runs:
            set_font_style(run, font_name='Sakkal Majalla')

    # Apply font styling to tables
    for table in document.tables:
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    for run in paragraph.runs:
                        set_font_style(run, font_name='Sakkal Majalla')

    # Remove empty rows and rows without a revenue type from all tables
    for table in document.tables:
        remove_empty_rows_from_table(table)
        remove_empty_revenue_type_rows(table)

    # Save styled DOCX
    document.save(styled_docx_path)

    # Convert DOCX to PDF using LibreOffice
    try:
        subprocess.run(
            [
                'libreoffice',
                '--headless',
                '--convert-to',
                'pdf',
                '--outdir',
                CONVERTED_FOLDER,
                styled_docx_path,
            ],
            check=True,
            # Without capturing, CalledProcessError.stderr is None and the
            # error handler below could not report anything.
            capture_output=True,
        )
    except subprocess.CalledProcessError as e:
        stderr_text = e.stderr.decode(errors='replace') if e.stderr else ''
        return jsonify({'error': f'LibreOffice failed: {stderr_text}'}), 500
    except FileNotFoundError as e:
        # Raised when the libreoffice executable itself cannot be found.
        return jsonify({'error': str(e)}), 500

    if not os.path.exists(converted_pdf_path):
        return jsonify({
            'error': f"No PDF file found for {base_filename} in {CONVERTED_FOLDER}"
        }), 500

    output_pdf_path = converted_pdf_path

    # Drop the second page when it carries no extractable text at all.
    # NOTE(review): a page holding only images would also count as empty here.
    with fitz.open(converted_pdf_path) as pdf_document:
        if pdf_document.page_count > 1:
            second_page_text = pdf_document[1].get_text("text").strip()
            if not second_page_text:
                pdf_document.delete_page(1)
                pdf_document.save(fixed_pdf_path)
                output_pdf_path = fixed_pdf_path

    return send_file(output_pdf_path, as_attachment=True)
# if __name__ == '__main__':
# app.run(debug=True)
|