Spaces:
Paused
Paused
| import os | |
| import subprocess | |
| from flask import Flask, request, send_file, jsonify, after_this_request | |
| from docx import Document | |
| from docx.shared import Pt | |
| from docx.oxml import OxmlElement | |
| from docx.oxml.ns import qn | |
| from werkzeug.utils import secure_filename | |
| import glob | |
| import fitz # PyMuPDF for checking PDF pages | |
# Flask application object; also used by the request handler for logging.
app = Flask(__name__)

# Relative working directories: uploaded/styled DOCX files go to the first,
# LibreOffice writes its PDF output to the second.
UPLOAD_FOLDER, CONVERTED_FOLDER = 'uploads', 'converted'

# Font file whose presence set_font_style() verifies before styling a run.
FONT_PATH = os.path.join('fonts', 'majalla.ttf')
| # Helper to apply font and styling | |
def set_font_style(run, font_name='Sakkal Majalla', font_size=Pt(11)):
    """Apply a font family and size to a python-docx run, in place.

    Args:
        run: The run whose character formatting is updated.
        font_name: Font family written to the run's ``w:ascii``, ``w:hAnsi``
            and ``w:cs`` font slots.
        font_size: Size assigned to ``run.font.size``.

    Raises:
        FileNotFoundError: If ``FONT_PATH`` does not exist on disk.
    """
    if not os.path.exists(FONT_PATH):
        raise FileNotFoundError(f"Font file not found: {FONT_PATH}")

    run.font.name = font_name
    run.font.size = font_size

    # Setting run.font.name already creates a <w:rFonts> child. Reuse it
    # instead of appending a second one: a duplicate element at the end of
    # <w:rPr> is out of schema order and may be ignored or rejected by
    # consumers of the document.
    r_fonts = run._element.get_or_add_rPr().get_or_add_rFonts()
    # w:cs is the complex-script slot; python-docx's font.name only fills
    # ascii/hAnsi, so it is set explicitly here together with the others.
    for slot in ('w:ascii', 'w:hAnsi', 'w:cs'):
        r_fonts.set(qn(slot), font_name)
| # Helper to remove empty rows from a table | |
def remove_empty_rows_from_table(table):
    """Delete blank rows and the "بيانات التوثيق" header row from a table.

    Every row of the table is inspected. A row is dropped when none of its
    cell paragraphs contains non-whitespace text, or when any of its cell
    paragraphs contains the text "بيانات التوثيق". The table is modified in
    place; nothing is returned.
    """
    doomed = []
    for row in table.rows:
        texts = [
            paragraph.text
            for cell in row.cells
            for paragraph in cell.paragraphs
        ]
        is_blank = not any(text.strip() for text in texts)
        is_header = any("بيانات التوثيق" in text for text in texts)
        if is_blank or is_header:
            # Remember the row's XML element rather than its index so the
            # deletions below cannot shift each other.
            doomed.append(row._element)

    tbl = table._element
    for tr in doomed:
        tbl.remove(tr)
| # Helper to remove rows with empty revenue type | |
def remove_empty_revenue_type_rows(table):
    """Delete data rows whose "نوع الإيراد" cell is blank.

    The second row of the table (index 1) is treated as the header row; the
    first cell in it whose text contains "نوع الإيراد" selects the column to
    check. Rows from index 2 onward are removed when their cell in that
    column has no non-whitespace paragraph text. Rows with too few cells to
    reach that column are kept. Tables with fewer than two rows, or without
    a matching header cell, are left unchanged. The table is modified in
    place; nothing is returned.
    """
    all_rows = list(table.rows)
    if len(all_rows) < 2:
        return

    column = next(
        (
            position
            for position, cell in enumerate(all_rows[1].cells)
            if "نوع الإيراد" in cell.text
        ),
        None,
    )
    if column is None:
        return

    # Collect the XML elements first so removals cannot disturb the scan.
    doomed = [
        row._element
        for row in all_rows[2:]
        if len(row.cells) > column
        and all(not p.text.strip() for p in row.cells[column].paragraphs)
    ]

    tbl = table._element
    for tr in doomed:
        tbl.remove(tr)
def convert_docx_to_pdf():
    """Convert an uploaded DOCX file to PDF and return it as a download.

    Reads the upload from ``request.files['file']``, applies the
    'Sakkal Majalla' font to every run in body paragraphs and table cells,
    prunes table rows via remove_empty_rows_from_table() and
    remove_empty_revenue_type_rows(), converts the styled document with
    headless LibreOffice, and drops the second PDF page when it contains no
    extractable text.

    NOTE(review): no route decorator is visible on this function in this
    file; it is presumably registered as a view elsewhere -- confirm.

    Returns:
        The ``send_file`` response for the PDF on success, or a
        ``(json, status)`` tuple: 400 for a missing/invalid upload, 500 when
        LibreOffice fails or the expected PDF (or the ``libreoffice``
        executable) cannot be found.

    Raises:
        FileNotFoundError: Propagated from set_font_style() when the font
            file is missing (raised outside the handled section).
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part in the request'}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400
    # Extension check is case-insensitive so "REPORT.DOCX" is accepted too.
    if not (file.filename or '').lower().endswith('.docx'):
        return jsonify({'error': 'Invalid file format. Only DOCX is supported.'}), 400

    # The client-supplied name is untrusted: every path below is built from
    # the sanitized name only. secure_filename() can strip the whole stem
    # (e.g. a non-ASCII name), so restore a usable ".docx" name if needed.
    safe_name = secure_filename(file.filename)
    if not safe_name.lower().endswith('.docx'):
        safe_name = f"{safe_name or 'upload'}.docx"

    os.makedirs(UPLOAD_FOLDER, exist_ok=True)
    os.makedirs(CONVERTED_FOLDER, exist_ok=True)

    filepath = os.path.join(UPLOAD_FOLDER, safe_name)
    styled_docx_path = os.path.join(UPLOAD_FOLDER, 'styled_' + safe_name)
    base_filename = os.path.splitext(os.path.basename(styled_docx_path))[0]
    # LibreOffice names its output "<input stem>.pdf" inside --outdir, so the
    # exact path is known; no wildcard search that could match another file.
    converted_pdf_path = os.path.join(CONVERTED_FOLDER, f"{base_filename}.pdf")
    fixed_pdf_path = os.path.join(CONVERTED_FOLDER, f"{base_filename}_fixed.pdf")
    temp_paths = (filepath, styled_docx_path, converted_pdf_path, fixed_pdf_path)

    # Registered up front so temporary files are removed on every return
    # path, including the error responses below.
    @after_this_request
    def cleanup_files(response):
        for path in temp_paths:
            try:
                os.remove(path)
            except FileNotFoundError:
                pass  # This step never produced the file.
            except OSError as e:
                app.logger.error(f"Error cleaning up files: {e}")
        return response

    file.save(filepath)

    # Apply font styling to paragraphs
    document = Document(filepath)
    for paragraph in document.paragraphs:
        for run in paragraph.runs:
            set_font_style(run, font_name='Sakkal Majalla')

    # Apply font styling to tables
    for table in document.tables:
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    for run in paragraph.runs:
                        set_font_style(run, font_name='Sakkal Majalla')

    # Remove empty rows and rows without a revenue type from all tables
    for table in document.tables:
        remove_empty_rows_from_table(table)
        remove_empty_revenue_type_rows(table)

    # Save styled DOCX
    document.save(styled_docx_path)

    # Convert DOCX to PDF using LibreOffice
    try:
        subprocess.run(
            [
                'libreoffice',
                '--headless',
                '--convert-to',
                'pdf',
                '--outdir',
                CONVERTED_FOLDER,
                styled_docx_path,
            ],
            check=True,
            # Capture output so stderr is available for the error response.
            capture_output=True,
        )

        if not os.path.exists(converted_pdf_path):
            raise FileNotFoundError(
                f"No PDF file found for {base_filename} in {CONVERTED_FOLDER}"
            )
        output_pdf_path = converted_pdf_path

        # Drop the second page when it has no extractable text at all.
        with fitz.open(output_pdf_path) as pdf_document:
            if pdf_document.page_count > 1:
                text = pdf_document[1].get_text("text").strip()
                if not text:
                    pdf_document.delete_page(1)
                    pdf_document.save(fixed_pdf_path)
                    output_pdf_path = fixed_pdf_path
    except subprocess.CalledProcessError as e:
        stderr = (e.stderr or b'').decode(errors='replace')
        return jsonify({'error': f'LibreOffice failed: {stderr}'}), 500
    except FileNotFoundError as e:
        # Also reached when the "libreoffice" executable itself is missing.
        return jsonify({'error': str(e)}), 500

    # Absolute path: the file served is the one written above, however
    # send_file resolves relative paths.
    return send_file(os.path.abspath(output_pdf_path), as_attachment=True)
| # if __name__ == '__main__': | |
| # app.run(debug=True) | |