ABDALLALSWAITI committed on
Commit
e80d253
·
verified ·
1 Parent(s): 8d2aae9

Update api.py

Browse files
Files changed (1) hide show
  1. api.py +146 -41
api.py CHANGED
@@ -10,7 +10,7 @@ import re
10
  import mimetypes
11
  from typing import List, Optional
12
 
13
- app = FastAPI(title="HTML to PDF Converter API")
14
 
15
  app.add_middleware(
16
  CORSMiddleware,
@@ -73,14 +73,10 @@ def image_to_base64(image_bytes, filename):
73
  return None
74
 
75
  def embed_images_as_base64(html_content, images: List[UploadFile]):
76
- """
77
- Embed all images directly as base64 data URLs in the HTML
78
- This ensures images are always included in the PDF
79
- """
80
  if not images:
81
  return html_content, {}
82
 
83
- # Create mapping of filename to base64 data URL
84
  image_data_urls = {}
85
  for img in images:
86
  img.file.seek(0)
@@ -93,11 +89,9 @@ def embed_images_as_base64(html_content, images: List[UploadFile]):
93
  if not image_data_urls:
94
  return html_content, {}
95
 
96
- # Track replacements
97
  replacements = {}
98
 
99
  for filename, data_url in image_data_urls.items():
100
- # Escape filename for regex
101
  escaped_name = re.escape(filename)
102
 
103
  # Pattern 1: img src attribute
@@ -124,7 +118,6 @@ def embed_images_as_base64(html_content, images: List[UploadFile]):
124
  html_content = re.sub(pattern3, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
125
  replacements[f"{filename} (url)"] = count3
126
 
127
- # Log results
128
  if replacements:
129
  print("=== Image Replacements ===")
130
  for key, count in replacements.items():
@@ -135,31 +128,127 @@ def embed_images_as_base64(html_content, images: List[UploadFile]):
135
 
136
  return html_content, replacements
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  def convert_html_to_pdf(html_content: str, aspect_ratio: str, temp_dir: str):
139
- """Convert HTML content to PDF using Puppeteer"""
140
  try:
141
- # Inject CSS to preserve styles
142
- style_injection = """
143
- <style>
144
- @page { margin: 0; }
145
- * {
146
- -webkit-print-color-adjust: exact !important;
147
- print-color-adjust: exact !important;
148
- color-adjust: exact !important;
149
- }
150
- body {
151
- -webkit-print-color-adjust: exact !important;
152
- print-color-adjust: exact !important;
153
- }
154
- </style>
155
- """
156
-
157
- if '</head>' in html_content:
158
- html_content = html_content.replace('</head>', style_injection + '</head>')
159
- elif '<body' in html_content:
160
- html_content = html_content.replace('<body', style_injection + '<body', 1)
161
- else:
162
- html_content = style_injection + html_content
163
 
164
  # Save HTML to temp file
165
  html_file = os.path.join(temp_dir, "input.html")
@@ -208,17 +297,24 @@ def convert_html_to_pdf(html_content: str, aspect_ratio: str, temp_dir: str):
208
  @app.get("/")
209
  async def root():
210
  return {
211
- "message": "HTML to PDF Converter API with Base64 Image Embedding",
212
- "version": "2.1",
 
 
 
 
 
 
213
  "endpoints": {
214
- "/convert": "POST - Convert HTML to PDF (images embedded as base64)",
215
- "/health": "GET - Health check"
 
216
  }
217
  }
218
 
219
  @app.get("/health")
220
  async def health():
221
- return {"status": "healthy"}
222
 
223
  @app.post("/convert")
224
  async def convert_to_pdf(
@@ -229,7 +325,7 @@ async def convert_to_pdf(
229
  images: Optional[List[UploadFile]] = File(None)
230
  ):
231
  """
232
- Convert HTML to PDF with embedded base64 images
233
 
234
  Parameters:
235
  - html_file: HTML file upload (optional)
@@ -238,8 +334,13 @@ async def convert_to_pdf(
238
  - auto_detect: Auto-detect aspect ratio from HTML (default: True)
239
  - images: List of image files - will be embedded as base64 in HTML (optional)
240
 
 
 
 
 
 
241
  Returns:
242
- - PDF file as bytes with images embedded
243
  """
244
  temp_dir = None
245
 
@@ -289,7 +390,7 @@ async def convert_to_pdf(
289
  raise HTTPException(status_code=400, detail="Invalid aspect_ratio. Must be '16:9', '1:1', or '9:16'")
290
  print(f"Using specified aspect ratio: {aspect_ratio}")
291
 
292
- # Convert to PDF
293
  pdf_bytes, error = convert_html_to_pdf(html, aspect_ratio, temp_dir)
294
 
295
  if error:
@@ -310,7 +411,8 @@ async def convert_to_pdf(
310
  headers={
311
  "Content-Disposition": f"attachment; filename={output_filename}",
312
  "X-Aspect-Ratio": aspect_ratio,
313
- "X-Images-Embedded": str(len(images)) if images else "0"
 
314
  }
315
  )
316
 
@@ -326,4 +428,7 @@ async def convert_to_pdf(
326
 
327
  if __name__ == "__main__":
328
  import uvicorn
 
 
 
329
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
10
  import mimetypes
11
  from typing import List, Optional
12
 
13
+ app = FastAPI(title="HTML to PDF Converter API with Page Break Support")
14
 
15
  app.add_middleware(
16
  CORSMiddleware,
 
73
  return None
74
 
75
  def embed_images_as_base64(html_content, images: List[UploadFile]):
76
+ """Embed all images directly as base64 data URLs in the HTML"""
 
 
 
77
  if not images:
78
  return html_content, {}
79
 
 
80
  image_data_urls = {}
81
  for img in images:
82
  img.file.seek(0)
 
89
  if not image_data_urls:
90
  return html_content, {}
91
 
 
92
  replacements = {}
93
 
94
  for filename, data_url in image_data_urls.items():
 
95
  escaped_name = re.escape(filename)
96
 
97
  # Pattern 1: img src attribute
 
118
  html_content = re.sub(pattern3, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
119
  replacements[f"{filename} (url)"] = count3
120
 
 
121
  if replacements:
122
  print("=== Image Replacements ===")
123
  for key, count in replacements.items():
 
128
 
129
  return html_content, replacements
130
 
131
def inject_page_breaks(html_content: str, aspect_ratio: str):
    """Inject page-size and page-break CSS into an HTML document.

    One ``<style id="auto-page-breaks">`` element is inserted immediately
    before the first ``</head>`` tag. If the document has no ``</head>``,
    it goes immediately before the first ``<body`` tag; if neither tag is
    present it is prepended to the content. Tag matching is
    case-insensitive.

    Args:
        html_content: HTML markup to augment.
        aspect_ratio: "16:9" selects an A4 landscape page and "1:1" a
            210mm x 210mm page; any other value (normally "9:16") selects
            A4 portrait.

    Returns:
        The HTML with the stylesheet inserted exactly once.
    """
    # Anything that is not an explicitly listed ratio falls back to portrait.
    page_sizes = {"16:9": "A4 landscape", "1:1": "210mm 210mm"}
    page_size = page_sizes.get(aspect_ratio, "A4 portrait")

    # NOTE(review): div[class*="page"] is a substring match, so it also
    # selects div elements carrying this stylesheet's own utility classes
    # (.page-break, .page-break-before, .no-page-break) -- confirm that is
    # intended before relying on those classes on a <div>.
    page_css = f"""
<style id="auto-page-breaks">
    /* Define page size */
    @page {{
        size: {page_size};
        margin: 0;
    }}

    /* Reset body */
    html, body {{
        margin: 0 !important;
        padding: 0 !important;
        width: 100% !important;
        height: 100% !important;
    }}

    /* Page containers - each should be one page */
    .page, .slide, section.page, article.page, div[class*="page"], div[class*="slide"] {{
        width: 100% !important;
        min-height: 100vh !important;
        height: 100vh !important;
        page-break-after: always !important;
        break-after: page !important;
        page-break-inside: avoid !important;
        break-inside: avoid !important;
        position: relative !important;
        box-sizing: border-box !important;
        overflow: hidden !important;
    }}

    /* Last page shouldn't force a break */
    .page:last-child, .slide:last-child,
    section.page:last-child, article.page:last-child {{
        page-break-after: auto !important;
        break-after: auto !important;
    }}

    /* If no explicit page class, treat direct body children as pages */
    body > section:not(.no-page-break),
    body > article:not(.no-page-break),
    body > div:not(.no-page-break) {{
        page-break-after: always !important;
        break-after: page !important;
        min-height: 100vh;
    }}

    body > section:last-child,
    body > article:last-child,
    body > div:last-child {{
        page-break-after: auto !important;
    }}

    /* Utility classes for manual control */
    .page-break, .page-break-after {{
        page-break-after: always !important;
        break-after: page !important;
    }}

    .page-break-before {{
        page-break-before: always !important;
        break-before: page !important;
    }}

    .no-page-break, .keep-together {{
        page-break-inside: avoid !important;
        break-inside: avoid !important;
    }}

    /* Prevent awkward breaks in content */
    h1, h2, h3, h4, h5, h6 {{
        page-break-after: avoid !important;
        break-after: avoid !important;
        page-break-inside: avoid !important;
        break-inside: avoid !important;
    }}

    img, figure, table, pre, blockquote {{
        page-break-inside: avoid !important;
        break-inside: avoid !important;
    }}

    /* Preserve colors and backgrounds */
    * {{
        -webkit-print-color-adjust: exact !important;
        print-color-adjust: exact !important;
        color-adjust: exact !important;
    }}
</style>
"""

    # Locate a single insertion point. Splicing by index (instead of an
    # unbounded str.replace) guarantees the stylesheet is added once even
    # when "</head>" appears again later, e.g. inside a script string.
    anchor = re.search(r"</head\s*>", html_content, flags=re.IGNORECASE)
    if anchor is None:
        anchor = re.search(r"<body\b", html_content, flags=re.IGNORECASE)
    insert_at = anchor.start() if anchor else 0

    return html_content[:insert_at] + page_css + html_content[insert_at:]
245
+
246
  def convert_html_to_pdf(html_content: str, aspect_ratio: str, temp_dir: str):
247
+ """Convert HTML content to PDF using Puppeteer with proper page breaks"""
248
  try:
249
+ # Step 1: Inject page break CSS
250
+ print("Injecting page break CSS...")
251
+ html_content = inject_page_breaks(html_content, aspect_ratio)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
  # Save HTML to temp file
254
  html_file = os.path.join(temp_dir, "input.html")
 
297
  @app.get("/")
298
  async def root():
299
  return {
300
+ "message": "HTML to PDF Converter API with Proper Page Break Support",
301
+ "version": "3.0",
302
+ "features": [
303
+ "Base64 image embedding",
304
+ "Automatic page break detection",
305
+ "Custom CSS @page rules",
306
+ "Multiple aspect ratios (16:9, 1:1, 9:16)"
307
+ ],
308
  "endpoints": {
309
+ "/convert": "POST - Convert HTML to PDF",
310
+ "/health": "GET - Health check",
311
+ "/docs": "GET - API documentation"
312
  }
313
  }
314
 
315
  @app.get("/health")
316
  async def health():
317
+ return {"status": "healthy", "version": "3.0"}
318
 
319
  @app.post("/convert")
320
  async def convert_to_pdf(
 
325
  images: Optional[List[UploadFile]] = File(None)
326
  ):
327
  """
328
+ Convert HTML to PDF with proper page breaks and embedded base64 images
329
 
330
  Parameters:
331
  - html_file: HTML file upload (optional)
 
334
  - auto_detect: Auto-detect aspect ratio from HTML (default: True)
335
  - images: List of image files - will be embedded as base64 in HTML (optional)
336
 
337
+ HTML Structure for Page Breaks:
338
+ - Use class="page" on div elements for separate pages
339
+ - Or use class="slide" for presentation-style pages
340
+ - Each page will automatically break to a new PDF page
341
+
342
  Returns:
343
+ - PDF file as bytes with proper page separation
344
  """
345
  temp_dir = None
346
 
 
390
  raise HTTPException(status_code=400, detail="Invalid aspect_ratio. Must be '16:9', '1:1', or '9:16'")
391
  print(f"Using specified aspect ratio: {aspect_ratio}")
392
 
393
+ # Convert to PDF with page breaks
394
  pdf_bytes, error = convert_html_to_pdf(html, aspect_ratio, temp_dir)
395
 
396
  if error:
 
411
  headers={
412
  "Content-Disposition": f"attachment; filename={output_filename}",
413
  "X-Aspect-Ratio": aspect_ratio,
414
+ "X-Images-Embedded": str(len(images)) if images else "0",
415
+ "X-Page-Breaks": "enabled"
416
  }
417
  )
418
 
 
428
 
429
  if __name__ == "__main__":
430
  import uvicorn
431
+ print("Starting HTML to PDF Converter API with Page Break Support")
432
+ print("Features: Base64 images, automatic page breaks, multiple aspect ratios")
433
+ print("API docs available at: http://localhost:7860/docs")
434
  uvicorn.run(app, host="0.0.0.0", port=7860)