ABDALLALSWAITI committed on
Commit
f5befe6
·
verified ·
1 Parent(s): 7857567

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -84
app.py CHANGED
@@ -9,7 +9,6 @@ import subprocess
9
  import os
10
  import tempfile
11
  import shutil
12
- import base64
13
  import re
14
  import mimetypes
15
  from typing import List, Optional
@@ -57,62 +56,51 @@ def detect_aspect_ratio(html_content):
57
 
58
  return "9:16"
59
 
60
def image_to_base64(image_bytes, filename):
    """Convert image bytes to a base64 ``data:`` URL.

    Args:
        image_bytes: Raw bytes of the image.
        filename: Name of the uploaded file; only used to pick the MIME
            type. May be ``None`` or empty, in which case ``image/png`` is
            assumed.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.

    Raises:
        HTTPException: With status 400 if the bytes cannot be encoded.
    """
    # Consulted when ``mimetypes`` has no image answer for the extension.
    fallback_types = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.svg': 'image/svg+xml',
        '.webp': 'image/webp',
        '.bmp': 'image/bmp'
    }
    try:
        # An upload may arrive without a filename; treat it as "unknown"
        # instead of letting guess_type() fail on None.
        name = filename or ""
        mime_type, _ = mimetypes.guess_type(name)

        # Only trust the guess when it is an image type: a data URL with
        # e.g. "text/plain" would not render as an image.
        if not mime_type or not mime_type.startswith('image/'):
            ext = os.path.splitext(name)[1].lower()
            mime_type = fallback_types.get(ext, 'image/png')

        b64_data = base64.b64encode(image_bytes).decode('utf-8')
        return f"data:{mime_type};base64,{b64_data}"
    except Exception as e:
        # Surface any encoding problem to the API client as a 400, keeping
        # the original exception as the cause for server-side debugging.
        raise HTTPException(
            status_code=400,
            detail=f"Error converting {filename} to base64: {str(e)}"
        ) from e
82
-
83
def embed_images_as_base64(html_content, images_dict):
    """Embed images directly into the HTML as base64 data URLs.

    Every reference to one of the given filenames (optionally preceded by a
    directory path) is replaced by its data URL in three contexts:
    ``<img src=...>``, ``background-image: url(...)`` and any other CSS
    ``url(...)``.

    Args:
        html_content: HTML document as a string.
        images_dict: Mapping of image filename to its data URL.

    Returns:
        A tuple ``(html_content, replacements)`` where ``replacements`` maps
        ``"<filename> (<context>)"`` to the number of substitutions made;
        contexts are ``img src``, ``bg-image`` and ``url``.
    """
    if not images_dict:
        return html_content, {}

    replacements = {}

    for filename, data_url in images_dict.items():
        escaped_name = re.escape(filename)

        # (report label, pattern, flags) for each place an image can appear.
        targets = (
            (
                "img src",
                rf'(<img[^>]*\s+src\s*=\s*)(["\'])(?:[^"\']*?/)?{escaped_name}\2',
                re.IGNORECASE | re.DOTALL,
            ),
            (
                "bg-image",
                rf'(background-image\s*:\s*url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))',
                re.IGNORECASE,
            ),
            (
                "url",
                rf'(url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))',
                re.IGNORECASE,
            ),
        )

        def _swap(match, _url=data_url):
            # Keep the author's quoting style. Forcing double quotes would
            # terminate an enclosing style="..." attribute, and a data URL
            # is valid inside an unquoted url().
            opener, quote = match.group(1), match.group(2)
            closer = match.group(3) if match.re.groups > 2 else ""
            return f"{opener}{quote}{_url}{quote}{closer}"

        for label, pattern, flags in targets:
            # A callable replacement inserts the data URL literally (no
            # backslash/group-reference interpretation), and subn() yields
            # the count without a second scan of the document.
            html_content, count = re.subn(pattern, _swap, html_content, flags=flags)
            if count:
                replacements[f"{filename} ({label})"] = count

    return html_content, replacements
118
 
@@ -217,15 +205,27 @@ def inject_page_breaks(html_content: str, aspect_ratio: str):
217
 
218
  return html_content
219
 
220
- def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
221
  """Convert HTML content to PDF using Puppeteer"""
222
  try:
 
 
 
 
223
  html_content = inject_page_breaks(html_content, aspect_ratio)
224
 
 
225
  html_file = os.path.join(temp_dir, "input.html")
226
  with open(html_file, 'w', encoding='utf-8') as f:
227
  f.write(html_content)
228
 
 
 
 
 
 
 
 
229
  # Find puppeteer script
230
  possible_paths = [
231
  'puppeteer_pdf.js',
@@ -247,7 +247,7 @@ def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
247
  capture_output=True,
248
  text=True,
249
  timeout=60,
250
- cwd=os.path.dirname(os.path.abspath(puppeteer_script))
251
  )
252
 
253
  if result.returncode != 0:
@@ -260,7 +260,7 @@ def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
260
  with open(pdf_file, 'rb') as f:
261
  pdf_bytes = f.read()
262
 
263
- return pdf_bytes
264
 
265
  except subprocess.TimeoutExpired:
266
  raise Exception("PDF conversion timed out (60 seconds)")
@@ -290,15 +290,15 @@ async def convert_to_pdf(
290
  html_file: UploadFile = File(..., description="HTML file to convert"),
291
  aspect_ratio: Optional[str] = Form(None, description="Aspect ratio: 16:9, 1:1, or 9:16"),
292
  auto_detect: bool = Form(True, description="Auto-detect aspect ratio from HTML"),
293
- images: Optional[List[UploadFile]] = File(None, description="Images to embed in HTML")
294
  ):
295
  """
296
- Convert HTML to PDF with optional image embedding
297
 
298
  - **html_file**: HTML file to convert (required)
299
  - **aspect_ratio**: Page aspect ratio (optional if auto_detect=true)
300
  - **auto_detect**: Auto-detect aspect ratio from HTML content
301
- - **images**: Image files to embed as base64 in HTML
302
  """
303
  temp_dir = None
304
  try:
@@ -320,20 +320,36 @@ async def convert_to_pdf(
320
  if aspect_ratio not in ["16:9", "1:1", "9:16"]:
321
  raise HTTPException(status_code=400, detail="Invalid aspect ratio. Must be 16:9, 1:1, or 9:16")
322
 
323
- # Process images if provided
324
- image_replacements = {}
 
 
 
325
  if images:
326
- images_dict = {}
327
  for img in images:
328
  img_bytes = await img.read()
329
- data_url = image_to_base64(img_bytes, img.filename)
330
- images_dict[img.filename] = data_url
331
-
332
- html_content, image_replacements = embed_images_as_base64(html_content, images_dict)
333
-
334
- # Create temp directory and convert
335
- temp_dir = tempfile.mkdtemp()
336
- pdf_bytes = convert_html_to_pdf(html_content, aspect_ratio, temp_dir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
 
338
  # Return PDF
339
  return Response(
@@ -342,7 +358,7 @@ async def convert_to_pdf(
342
  headers={
343
  "Content-Disposition": f"attachment; filename=converted.pdf",
344
  "X-Aspect-Ratio": aspect_ratio,
345
- "X-Image-Replacements": str(len(image_replacements)),
346
  "X-PDF-Size": str(len(pdf_bytes))
347
  }
348
  )
@@ -362,7 +378,7 @@ async def convert_to_pdf_base64(
362
  auto_detect: bool = Form(True, description="Auto-detect aspect ratio from HTML")
363
  ):
364
  """
365
- Convert HTML string to PDF and return as base64
366
 
367
  - **html_content**: HTML content as string (required)
368
  - **aspect_ratio**: Page aspect ratio (optional if auto_detect=true)
@@ -383,17 +399,18 @@ async def convert_to_pdf_base64(
383
 
384
  # Create temp directory and convert
385
  temp_dir = tempfile.mkdtemp()
386
- pdf_bytes = convert_html_to_pdf(html_content, aspect_ratio, temp_dir)
387
 
388
- # Convert to base64
389
- pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')
390
-
391
- return JSONResponse({
392
- "success": True,
393
- "pdf_base64": pdf_base64,
394
- "aspect_ratio": aspect_ratio,
395
- "size_bytes": len(pdf_bytes)
396
- })
 
397
 
398
  except HTTPException:
399
  raise
 
9
  import os
10
  import tempfile
11
  import shutil
 
12
  import re
13
  import mimetypes
14
  from typing import List, Optional
 
56
 
57
  return "9:16"
58
 
59
def normalize_image_paths(html_content):
    """Rewrite local image references so they point at bare filenames.

    Uploaded images are saved next to the HTML file, so a reference such as
    ``../assets/img/logo.png`` has to become ``logo.png`` to resolve. Three
    contexts are handled: ``<img src=...>``, ``background-image: url(...)``
    and any other CSS ``url(...)``. Only references ending in a known image
    extension (jpg, jpeg, png, gif, svg, webp, bmp) are considered.

    References that already are bare filenames are left alone and are not
    reported. Remote references (``http://``, ``https://`` or
    protocol-relative ``//``) are also left untouched, because stripping the
    host would turn a loadable URL into a missing local file.

    Args:
        html_content: HTML document as a string.

    Returns:
        A tuple ``(html_content, replacements)`` where ``replacements`` maps
        ``"<context>: <original matched text>"`` to the filename that
        replaced the path; contexts are ``img src``, ``bg-image`` and
        ``url``.
    """
    replacements = {}
    remote_prefixes = ("http://", "https://", "//")

    def _make_replacer(label, closing_group=None):
        """Build a re.sub callback that strips the directory (group 3)."""
        def _replace(match):
            directory = match.group(3)
            # Nothing to strip: bare filename, keep text and do not count it.
            if not directory:
                return match.group(0)
            # Remote image: keep the full URL so the renderer can fetch it.
            if directory.strip().lower().startswith(remote_prefixes):
                return match.group(0)

            opener = match.group(1)
            quote = match.group(2)
            filename = match.group(4)
            closer = match.group(closing_group) if closing_group else ''
            replacements[f"{label}: {match.group(0)}"] = filename
            return f'{opener}{quote}{filename}{quote}{closer}'
        return _replace

    # Pattern 1: img src with paths - extract filename only
    pattern1 = r'(<img[^>]*\s+src\s*=\s*)(["\'])([^"\']*?/)?([^/"\'>]+\.(jpg|jpeg|png|gif|svg|webp|bmp))(\2)'
    html_content = re.sub(
        pattern1, _make_replacer("img src"), html_content, flags=re.IGNORECASE
    )

    # Pattern 2: background-image with paths
    pattern2 = r'(background-image\s*:\s*url\s*\()(["\']?)([^)"\']*/)?([^/")\']+\.(jpg|jpeg|png|gif|svg|webp|bmp))(\2)(\))'
    html_content = re.sub(
        pattern2, _make_replacer("bg-image", 7), html_content, flags=re.IGNORECASE
    )

    # Pattern 3: any remaining CSS url() with paths. References already
    # handled by pattern 2 no longer carry a directory part, so they fall
    # through unchanged and are never counted twice.
    pattern3 = r'(url\s*\()(["\']?)([^)"\']*/)?([^/")\']+\.(jpg|jpeg|png|gif|svg|webp|bmp))(\2)(\))'
    html_content = re.sub(
        pattern3, _make_replacer("url", 7), html_content, flags=re.IGNORECASE
    )

    return html_content, replacements
106
 
 
205
 
206
  return html_content
207
 
208
+ def convert_html_to_pdf(html_content, aspect_ratio, temp_dir, images=None):
209
  """Convert HTML content to PDF using Puppeteer"""
210
  try:
211
+ # Normalize image paths in HTML
212
+ html_content, path_replacements = normalize_image_paths(html_content)
213
+
214
+ # Inject page breaks
215
  html_content = inject_page_breaks(html_content, aspect_ratio)
216
 
217
+ # Save HTML file
218
  html_file = os.path.join(temp_dir, "input.html")
219
  with open(html_file, 'w', encoding='utf-8') as f:
220
  f.write(html_content)
221
 
222
+ # Save image files to the same directory
223
+ if images:
224
+ for img in images:
225
+ img_path = os.path.join(temp_dir, img.filename)
226
+ with open(img_path, 'wb') as f:
227
+ f.write(img.file.read())
228
+
229
  # Find puppeteer script
230
  possible_paths = [
231
  'puppeteer_pdf.js',
 
247
  capture_output=True,
248
  text=True,
249
  timeout=60,
250
+ cwd=temp_dir # Run in temp directory so images are accessible
251
  )
252
 
253
  if result.returncode != 0:
 
260
  with open(pdf_file, 'rb') as f:
261
  pdf_bytes = f.read()
262
 
263
+ return pdf_bytes, path_replacements
264
 
265
  except subprocess.TimeoutExpired:
266
  raise Exception("PDF conversion timed out (60 seconds)")
 
290
  html_file: UploadFile = File(..., description="HTML file to convert"),
291
  aspect_ratio: Optional[str] = Form(None, description="Aspect ratio: 16:9, 1:1, or 9:16"),
292
  auto_detect: bool = Form(True, description="Auto-detect aspect ratio from HTML"),
293
+ images: Optional[List[UploadFile]] = File(None, description="Images referenced in HTML")
294
  ):
295
  """
296
+ Convert HTML to PDF with image files in same directory
297
 
298
  - **html_file**: HTML file to convert (required)
299
  - **aspect_ratio**: Page aspect ratio (optional if auto_detect=true)
300
  - **auto_detect**: Auto-detect aspect ratio from HTML content
301
+ - **images**: Image files referenced in HTML (saved to temp directory)
302
  """
303
  temp_dir = None
304
  try:
 
320
  if aspect_ratio not in ["16:9", "1:1", "9:16"]:
321
  raise HTTPException(status_code=400, detail="Invalid aspect ratio. Must be 16:9, 1:1, or 9:16")
322
 
323
+ # Create temp directory and convert
324
+ temp_dir = tempfile.mkdtemp()
325
+
326
+ # Read images into memory before conversion
327
+ images_list = []
328
  if images:
 
329
  for img in images:
330
  img_bytes = await img.read()
331
+ # Create a simple object to hold filename and bytes
332
+ class ImageFile:
333
+ def __init__(self, filename, content):
334
+ self.filename = filename
335
+ self.content = content
336
+ self.file = None
337
+
338
+ def get_bytes(self):
339
+ return self.content
340
+
341
+ img_obj = ImageFile(img.filename, img_bytes)
342
+ # Create a file-like object for backwards compatibility
343
+ import io
344
+ img_obj.file = io.BytesIO(img_bytes)
345
+ images_list.append(img_obj)
346
+
347
+ pdf_bytes, path_replacements = convert_html_to_pdf(
348
+ html_content,
349
+ aspect_ratio,
350
+ temp_dir,
351
+ images_list if images_list else None
352
+ )
353
 
354
  # Return PDF
355
  return Response(
 
358
  headers={
359
  "Content-Disposition": f"attachment; filename=converted.pdf",
360
  "X-Aspect-Ratio": aspect_ratio,
361
+ "X-Path-Replacements": str(len(path_replacements)),
362
  "X-PDF-Size": str(len(pdf_bytes))
363
  }
364
  )
 
378
  auto_detect: bool = Form(True, description="Auto-detect aspect ratio from HTML")
379
  ):
380
  """
381
+ Convert HTML string to PDF (for HTML without external images)
382
 
383
  - **html_content**: HTML content as string (required)
384
  - **aspect_ratio**: Page aspect ratio (optional if auto_detect=true)
 
399
 
400
  # Create temp directory and convert
401
  temp_dir = tempfile.mkdtemp()
402
+ pdf_bytes, path_replacements = convert_html_to_pdf(html_content, aspect_ratio, temp_dir)
403
 
404
+ return Response(
405
+ content=pdf_bytes,
406
+ media_type="application/pdf",
407
+ headers={
408
+ "Content-Disposition": f"attachment; filename=converted.pdf",
409
+ "X-Aspect-Ratio": aspect_ratio,
410
+ "X-Path-Replacements": str(len(path_replacements)),
411
+ "X-PDF-Size": str(len(pdf_bytes))
412
+ }
413
+ )
414
 
415
  except HTTPException:
416
  raise