Mohammed Foud committed on
Commit
c705885
·
1 Parent(s): 4526bb8

first commit

Browse files
Files changed (9) hide show
  1. .gitattributes +1 -0
  2. .gitignore +25 -0
  3. Dockerfile +54 -0
  4. app.py +175 -0
  5. converted/.keep +0 -0
  6. d.sh +3 -0
  7. fonts/majalla.ttf +3 -0
  8. requirements.txt +11 -0
  9. uploads/.keep +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.ttf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### AL ###
2
+ #Template for AL projects for Dynamics 365 Business Central
3
+ #launch.json folder
4
+ .vscode/
5
+ #Cache folder
6
+ .alcache/
7
+ #Symbols folder
8
+ .alpackages/
9
+ #Snapshots folder
10
+ .snapshots/
11
+ #Testing Output folder
12
+ .output/
13
+ #Extension App-file
14
+ *.app
15
+ #Rapid Application Development File
16
+ rad.json
17
+ #Translation Base-file
18
+ *.g.xlf
19
+ #License-file
20
+ *.flf
21
+ #Test results file
22
+ TestResults.xml
23
+
24
+ # converted
25
+ # uploads
Dockerfile ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use an official Python runtime as a parent image
2
+ FROM python:3.9-slim
3
+
4
+ # Set environment variables
5
+ ENV PYTHONDONTWRITEBYTECODE 1
6
+ ENV PYTHONUNBUFFERED 1
7
+
8
+ # Create a non-root user with ID 1000
9
+ RUN useradd -m -u 1000 user
10
+
11
+ # Set the user to the non-root user
12
+ USER user
13
+ ENV PATH="/home/user/.local/bin:$PATH"
14
+
15
+ # Set the working directory in the container
16
+ WORKDIR /app
17
+
18
+ # Copy the requirements file into the container and install dependencies
19
+ COPY --chown=user ./requirements.txt requirements.txt
20
+ RUN pip install --no-cache-dir -r requirements.txt
21
+
22
+ # Ensure upload directories exist with correct permissions
23
+ RUN mkdir -p /app/uploads /app/converted && chmod -R 777 /app/uploads /app/converted
24
+
25
+ # Install LibreOffice and Java dependencies
26
+ USER root
27
+ RUN apt-get update \
28
+ && apt-get install -y \
29
+ libreoffice-core \
30
+ libreoffice-writer \
31
+ libreoffice-java-common \
32
+ default-jre \
33
+ fontconfig \
34
+ --no-install-recommends \
35
+ && apt-get clean \
36
+ && rm -rf /var/lib/apt/lists/*
37
+
38
+ # Switch back to the non-root user
39
+ USER user
40
+
41
+ # Add the custom font
42
+ COPY --chown=user fonts/majalla.ttf /usr/share/fonts/truetype/majalla.ttf
43
+
44
+ # Rebuild font cache
45
+ RUN fc-cache -fv
46
+
47
+ # Copy the rest of the application code
48
+ COPY --chown=user . .
49
+
50
+ # Expose the port the app runs on
51
+ EXPOSE 7860
52
+
53
+ # Run the script using Gunicorn
54
+ CMD ["gunicorn", "-w", "4", "-b", "0.0.0.0:7860", "app:app"]
app.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ from flask import Flask, request, send_file, jsonify, after_this_request
4
+ from docx import Document
5
+ from docx.shared import Pt
6
+ from docx.oxml import OxmlElement
7
+ from docx.oxml.ns import qn
8
+ from werkzeug.utils import secure_filename
9
+ import glob
10
+ import fitz # PyMuPDF for checking PDF pages
11
+
12
+ app = Flask(__name__)
13
+
14
+ UPLOAD_FOLDER = 'uploads'
15
+ CONVERTED_FOLDER = 'converted'
16
+ FONT_PATH = os.path.join('fonts', 'majalla.ttf')
17
+
18
+ # Helper to apply font and styling
19
def set_font_style(run, font_name='Sakkal Majalla', font_size=Pt(11)):
    """Apply a font family and size to a single python-docx run.

    The font name is written to the ASCII, high-ANSI and complex-script
    slots of the run's ``<w:rFonts>`` element so that Arabic (complex
    script) text is rendered with the same face as Latin text.

    Args:
        run: The python-docx ``Run`` to restyle (modified in place).
        font_name: Font family name to apply.
        font_size: Font size as a python-docx length (e.g. ``Pt(11)``).

    Raises:
        FileNotFoundError: If the font file at ``FONT_PATH`` does not exist.
    """
    if not os.path.exists(FONT_PATH):
        raise FileNotFoundError(f"Font file not found: {FONT_PATH}")

    run.font.name = font_name
    run.font.size = font_size

    # Setting run.font.name above already creates the run's <w:rFonts>
    # element. Reuse that element instead of appending a second one:
    # duplicate <w:rFonts> children (placed at the end of <w:rPr>) are not
    # valid WordprocessingML and leave it up to the consumer which wins.
    rPr = run._element.get_or_add_rPr()
    rFonts = rPr.get_or_add_rFonts()
    for attribute in ('w:ascii', 'w:hAnsi', 'w:cs'):
        rFonts.set(qn(attribute), font_name)
33
+
34
+ # Helper to remove empty rows (and the "بيانات التوثيق" header row) from a table
35
def remove_empty_rows_from_table(table):
    """Delete blank rows and the "بيانات التوثيق" header row from *table*.

    Every row of the table is inspected. A row is deleted when either:

    * no paragraph in any of its cells contains non-whitespace text, or
    * any paragraph in any of its cells contains the text "بيانات التوثيق".

    The table is modified in place; nothing is returned.
    """

    def is_blank(row):
        # True when no paragraph of any cell carries visible text.
        for cell in row.cells:
            for paragraph in cell.paragraphs:
                if paragraph.text.strip():
                    return False
        return True

    def has_documentation_header(row):
        # True when any paragraph of any cell mentions the header text.
        for cell in row.cells:
            for paragraph in cell.paragraphs:
                if "بيانات التوثيق" in paragraph.text:
                    return True
        return False

    # Decide first, mutate afterwards, so the scan sees an unmodified table.
    doomed = [
        row._element
        for row in table.rows
        if is_blank(row) or has_documentation_header(row)
    ]

    # Remove bottom-up, mirroring the original end-to-start removal order.
    for tr in reversed(doomed):
        table._element.remove(tr)
53
+
54
+ # Helper to remove rows with empty revenue type
55
def remove_empty_revenue_type_rows(table):
    """Delete data rows whose "نوع الإيراد" (revenue type) cell is blank.

    The second row of the table (index 1) is treated as the header row; the
    first cell in it whose text contains "نوع الإيراد" identifies the column
    to check. Rows from index 2 onward are deleted when their cell in that
    column has no paragraph with non-whitespace text. Rows too short to
    have that column are kept. Tables with fewer than two rows, or without
    the header text, are left untouched.

    The table is modified in place; nothing is returned.
    """
    rows = list(table.rows)
    if len(rows) <= 1:
        return

    # Locate the revenue-type column in the header row (first match wins).
    column = next(
        (
            position
            for position, cell in enumerate(rows[1].cells)
            if "نوع الإيراد" in cell.text
        ),
        None,
    )
    if column is None:
        return

    # Collect first, then delete, so the scan sees an unmodified table.
    doomed = []
    for row in rows[2:]:
        cells = row.cells
        if len(cells) <= column:
            continue
        has_text = any(
            paragraph.text.strip() for paragraph in cells[column].paragraphs
        )
        if not has_text:
            doomed.append(row._element)

    # Remove bottom-up, mirroring the original end-to-start removal order.
    for tr in reversed(doomed):
        table._element.remove(tr)
82
+
83
@app.route('/convert', methods=['POST'])
def convert_docx_to_pdf():
    """Convert an uploaded DOCX to PDF and return it as a download.

    Expects a multipart POST with the document in the ``file`` field. The
    document is restyled with the Sakkal Majalla font, stripped of empty
    table rows, converted with headless LibreOffice, and a second page that
    contains no extractable text is dropped from the resulting PDF.

    Returns:
        The PDF as an attachment on success. Otherwise a JSON body with an
        ``error`` message and status 400 (missing/invalid upload) or 500
        (conversion failed or the PDF was not produced).
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part in the request'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    # Case-insensitive so that e.g. "REPORT.DOCX" is accepted too.
    if not file.filename.lower().endswith('.docx'):
        return jsonify({'error': 'Invalid file format. Only DOCX is supported.'}), 400

    # Never build paths from the raw client filename (path traversal).
    # secure_filename() drops non-ASCII characters, so an Arabic name such
    # as "ملف.docx" collapses to "docx"; fall back to a generic stem then.
    safe_name = secure_filename(file.filename)
    stem = os.path.splitext(safe_name)[0] if safe_name.lower().endswith('.docx') else ''
    stem = stem or 'document'

    # A random per-request prefix keeps concurrent uploads (the app runs
    # under several workers) with the same name from clobbering each other.
    token = os.urandom(8).hex()
    base_filename = f"{token}_styled_{stem}"
    filepath = os.path.join(UPLOAD_FOLDER, f"{token}_{stem}.docx")
    styled_docx_path = os.path.join(UPLOAD_FOLDER, f"{base_filename}.docx")
    converted_pdf_path = os.path.join(CONVERTED_FOLDER, f"{base_filename}.pdf")
    fixed_pdf_path = os.path.join(CONVERTED_FOLDER, f"{base_filename}_fixed.pdf")
    temp_paths = (filepath, styled_docx_path, converted_pdf_path, fixed_pdf_path)

    # Registered before any file is written so that error responses clean
    # up as well, not only the success path.
    @after_this_request
    def cleanup_files(response):
        # Delete each file independently: one failure must not leave the
        # remaining files behind.
        for path in temp_paths:
            try:
                os.remove(path)
            except FileNotFoundError:
                pass  # not every path is created on every request
            except OSError as e:
                app.logger.error(f"Error cleaning up files: {e}")
        return response

    file.save(filepath)

    # Apply font styling to body paragraphs
    document = Document(filepath)
    for paragraph in document.paragraphs:
        for run in paragraph.runs:
            set_font_style(run, font_name='Sakkal Majalla')

    # Apply font styling to table cells
    for table in document.tables:
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    for run in paragraph.runs:
                        set_font_style(run, font_name='Sakkal Majalla')

    # Remove empty rows and rows without a revenue type from all tables
    for table in document.tables:
        remove_empty_rows_from_table(table)
        remove_empty_revenue_type_rows(table)

    # Save styled DOCX
    document.save(styled_docx_path)

    try:
        # capture_output=True is required for CalledProcessError.stderr to
        # be populated; without it stderr is None and the error handler
        # below would itself crash.
        subprocess.run([
            'libreoffice',
            '--headless',
            '--convert-to',
            'pdf',
            '--outdir',
            CONVERTED_FOLDER,
            styled_docx_path
        ], check=True, capture_output=True)

        # LibreOffice names the output after the input stem; check that
        # exact path instead of globbing by prefix, which could match a
        # different upload's PDF.
        if not os.path.exists(converted_pdf_path):
            raise FileNotFoundError(f"No PDF file found for {base_filename} in {CONVERTED_FOLDER}")

        output_pdf_path = converted_pdf_path
        download_name = f"styled_{stem}.pdf"

        # Drop the second page when it has no extractable text at all.
        with fitz.open(converted_pdf_path) as pdf_document:
            if pdf_document.page_count > 1:
                text = pdf_document[1].get_text("text").strip()
                if not text:
                    pdf_document.delete_page(1)
                    pdf_document.save(fixed_pdf_path)
                    output_pdf_path = fixed_pdf_path
                    download_name = f"styled_{stem}_fixed.pdf"

    except subprocess.CalledProcessError as e:
        stderr = e.stderr.decode(errors='replace') if e.stderr else ''
        return jsonify({'error': f'LibreOffice failed: {stderr}'}), 500
    except FileNotFoundError as e:
        return jsonify({'error': str(e)}), 500

    # download_name keeps the random per-request prefix out of the filename
    # the client sees.
    return send_file(output_pdf_path, as_attachment=True, download_name=download_name)
173
+
174
+ # if __name__ == '__main__':
175
+ # app.run(debug=True)
converted/.keep ADDED
File without changes
d.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ git add .
2
+ git commit -m "first commit"
3
+ git push -u origin main
fonts/majalla.ttf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2602b7a98a0f10bf765dea99c43e7cab39d2c98a77733c5acd598ff7c7cf173d
3
+ size 370084
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Flask
2
+ python-docx
3
+ fpdf
4
+ docx2pdf
5
+ qrcode
6
+ werkzeug
7
+ requests
8
+ lxml
9
+ uvicorn[standard]
10
+ gunicorn
11
+ pymupdf
uploads/.keep ADDED
File without changes