ABDALLALSWAITI committed on
Commit
83a36bb
·
verified ·
1 Parent(s): f5befe6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -50
app.py CHANGED
@@ -60,47 +60,67 @@ def normalize_image_paths(html_content):
60
  """Replace complex image paths with just filenames"""
61
  replacements = {}
62
 
63
- # Pattern 1: img src with paths - extract filename only
64
- pattern1 = r'(<img[^>]*\s+src\s*=\s*)(["\'])([^"\']*?/)?([^/"\'>]+\.(jpg|jpeg|png|gif|svg|webp|bmp))(\2)'
65
-
66
  def replace_img_src(match):
 
67
  prefix = match.group(1)
68
  quote = match.group(2)
 
69
  filename = match.group(4)
70
- replacements[f"img src: {match.group(0)}"] = filename
71
- return f'{prefix}{quote}{filename}{quote}'
72
-
73
- html_content = re.sub(pattern1, replace_img_src, html_content, flags=re.IGNORECASE)
 
74
 
75
- # Pattern 2: background-image with paths
76
- pattern2 = r'(background-image\s*:\s*url\s*\()(["\']?)([^)"\']*/)?([^/")\']+\.(jpg|jpeg|png|gif|svg|webp|bmp))(\2)(\))'
 
 
 
 
77
 
 
78
  def replace_bg_image(match):
 
79
  prefix = match.group(1)
80
  quote = match.group(2)
 
81
  filename = match.group(4)
82
  suffix = match.group(7)
83
- replacements[f"bg-image: {match.group(0)}"] = filename
84
- return f'{prefix}{quote}{filename}{quote}{suffix}'
85
-
86
- html_content = re.sub(pattern2, replace_bg_image, html_content, flags=re.IGNORECASE)
 
87
 
88
- # Pattern 3: CSS url() with paths
89
- pattern3 = r'(url\s*\()(["\']?)([^)"\']*/)?([^/")\']+\.(jpg|jpeg|png|gif|svg|webp|bmp))(\2)(\))'
 
 
 
 
90
 
 
91
  def replace_url(match):
92
- # Skip if already processed by background-image pattern
93
- if 'background-image' in html_content[max(0, match.start()-50):match.start()]:
94
- return match.group(0)
95
-
96
  prefix = match.group(1)
97
  quote = match.group(2)
 
98
  filename = match.group(4)
99
  suffix = match.group(7)
100
- replacements[f"url: {match.group(0)}"] = filename
101
- return f'{prefix}{quote}{filename}{quote}{suffix}'
 
 
 
102
 
103
- html_content = re.sub(pattern3, replace_url, html_content, flags=re.IGNORECASE)
 
 
 
 
 
104
 
105
  return html_content, replacements
106
 
@@ -205,7 +225,7 @@ def inject_page_breaks(html_content: str, aspect_ratio: str):
205
 
206
  return html_content
207
 
208
- def convert_html_to_pdf(html_content, aspect_ratio, temp_dir, images=None):
209
  """Convert HTML content to PDF using Puppeteer"""
210
  try:
211
  # Normalize image paths in HTML
@@ -220,11 +240,11 @@ def convert_html_to_pdf(html_content, aspect_ratio, temp_dir, images=None):
220
  f.write(html_content)
221
 
222
  # Save image files to the same directory
223
- if images:
224
- for img in images:
225
- img_path = os.path.join(temp_dir, img.filename)
226
  with open(img_path, 'wb') as f:
227
- f.write(img.file.read())
228
 
229
  # Find puppeteer script
230
  possible_paths = [
@@ -242,12 +262,13 @@ def convert_html_to_pdf(html_content, aspect_ratio, temp_dir, images=None):
242
  if not puppeteer_script:
243
  raise Exception("puppeteer_pdf.js not found")
244
 
 
245
  result = subprocess.run(
246
  ['node', puppeteer_script, html_file, aspect_ratio],
247
  capture_output=True,
248
  text=True,
249
  timeout=60,
250
- cwd=temp_dir # Run in temp directory so images are accessible
251
  )
252
 
253
  if result.returncode != 0:
@@ -320,35 +341,22 @@ async def convert_to_pdf(
320
  if aspect_ratio not in ["16:9", "1:1", "9:16"]:
321
  raise HTTPException(status_code=400, detail="Invalid aspect ratio. Must be 16:9, 1:1, or 9:16")
322
 
323
- # Create temp directory and convert
324
  temp_dir = tempfile.mkdtemp()
325
 
326
- # Read images into memory before conversion
327
- images_list = []
328
  if images:
329
  for img in images:
330
  img_bytes = await img.read()
331
- # Create a simple object to hold filename and bytes
332
- class ImageFile:
333
- def __init__(self, filename, content):
334
- self.filename = filename
335
- self.content = content
336
- self.file = None
337
-
338
- def get_bytes(self):
339
- return self.content
340
-
341
- img_obj = ImageFile(img.filename, img_bytes)
342
- # Create a file-like object for backwards compatibility
343
- import io
344
- img_obj.file = io.BytesIO(img_bytes)
345
- images_list.append(img_obj)
346
 
 
347
  pdf_bytes, path_replacements = convert_html_to_pdf(
348
  html_content,
349
  aspect_ratio,
350
  temp_dir,
351
- images_list if images_list else None
352
  )
353
 
354
  # Return PDF
@@ -371,8 +379,8 @@ async def convert_to_pdf(
371
  if temp_dir and os.path.exists(temp_dir):
372
  shutil.rmtree(temp_dir, ignore_errors=True)
373
 
374
- @app.post("/convert-base64")
375
- async def convert_to_pdf_base64(
376
  html_content: str = Form(..., description="HTML content as string"),
377
  aspect_ratio: Optional[str] = Form(None, description="Aspect ratio: 16:9, 1:1, or 9:16"),
378
  auto_detect: bool = Form(True, description="Auto-detect aspect ratio from HTML")
@@ -399,7 +407,7 @@ async def convert_to_pdf_base64(
399
 
400
  # Create temp directory and convert
401
  temp_dir = tempfile.mkdtemp()
402
- pdf_bytes, path_replacements = convert_html_to_pdf(html_content, aspect_ratio, temp_dir)
403
 
404
  return Response(
405
  content=pdf_bytes,
 
60
  """Replace complex image paths with just filenames"""
61
  replacements = {}
62
 
63
+ # Pattern for img src with paths
 
 
64
  def replace_img_src(match):
65
+ full_match = match.group(0)
66
  prefix = match.group(1)
67
  quote = match.group(2)
68
+ path = match.group(3) if match.group(3) else ""
69
  filename = match.group(4)
70
+
71
+ if path: # Only replace if there's a path
72
+ replacements[f"img: {path}{filename}"] = filename
73
+ return f'{prefix}{quote}{filename}{quote}'
74
+ return full_match
75
 
76
+ html_content = re.sub(
77
+ r'(<img[^>]*\s+src\s*=\s*)(["\'])([^"\']*?/)?([^/"\'>]+\.(jpg|jpeg|png|gif|svg|webp|bmp|JPG|JPEG|PNG|GIF|SVG|WEBP|BMP))(\2)',
78
+ replace_img_src,
79
+ html_content,
80
+ flags=re.IGNORECASE
81
+ )
82
 
83
+ # Pattern for background-image
84
  def replace_bg_image(match):
85
+ full_match = match.group(0)
86
  prefix = match.group(1)
87
  quote = match.group(2)
88
+ path = match.group(3) if match.group(3) else ""
89
  filename = match.group(4)
90
  suffix = match.group(7)
91
+
92
+ if path: # Only replace if there's a path
93
+ replacements[f"bg: {path}{filename}"] = filename
94
+ return f'{prefix}{quote}{filename}{quote}{suffix}'
95
+ return full_match
96
 
97
+ html_content = re.sub(
98
+ r'(background-image\s*:\s*url\s*\()(["\']?)([^)"\']*/)?([^/")\']+\.(jpg|jpeg|png|gif|svg|webp|bmp|JPG|JPEG|PNG|GIF|SVG|WEBP|BMP))(\2)(\))',
99
+ replace_bg_image,
100
+ html_content,
101
+ flags=re.IGNORECASE
102
+ )
103
 
104
+ # Pattern for CSS url()
105
  def replace_url(match):
106
+ full_match = match.group(0)
 
 
 
107
  prefix = match.group(1)
108
  quote = match.group(2)
109
+ path = match.group(3) if match.group(3) else ""
110
  filename = match.group(4)
111
  suffix = match.group(7)
112
+
113
+ if path: # Only replace if there's a path
114
+ replacements[f"url: {path}{filename}"] = filename
115
+ return f'{prefix}{quote}{filename}{quote}{suffix}'
116
+ return full_match
117
 
118
+ html_content = re.sub(
119
+ r'(url\s*\()(["\']?)([^)"\']*/)?([^/")\']+\.(jpg|jpeg|png|gif|svg|webp|bmp|JPG|JPEG|PNG|GIF|SVG|WEBP|BMP))(\2)(\))',
120
+ replace_url,
121
+ html_content,
122
+ flags=re.IGNORECASE
123
+ )
124
 
125
  return html_content, replacements
126
 
 
225
 
226
  return html_content
227
 
228
+ def convert_html_to_pdf(html_content, aspect_ratio, temp_dir, image_files=None):
229
  """Convert HTML content to PDF using Puppeteer"""
230
  try:
231
  # Normalize image paths in HTML
 
240
  f.write(html_content)
241
 
242
  # Save image files to the same directory
243
+ if image_files:
244
+ for filename, img_bytes in image_files.items():
245
+ img_path = os.path.join(temp_dir, filename)
246
  with open(img_path, 'wb') as f:
247
+ f.write(img_bytes)
248
 
249
  # Find puppeteer script
250
  possible_paths = [
 
262
  if not puppeteer_script:
263
  raise Exception("puppeteer_pdf.js not found")
264
 
265
+ # Run Puppeteer
266
  result = subprocess.run(
267
  ['node', puppeteer_script, html_file, aspect_ratio],
268
  capture_output=True,
269
  text=True,
270
  timeout=60,
271
+ cwd=os.path.dirname(os.path.abspath(puppeteer_script))
272
  )
273
 
274
  if result.returncode != 0:
 
341
  if aspect_ratio not in ["16:9", "1:1", "9:16"]:
342
  raise HTTPException(status_code=400, detail="Invalid aspect ratio. Must be 16:9, 1:1, or 9:16")
343
 
344
+ # Create temp directory
345
  temp_dir = tempfile.mkdtemp()
346
 
347
+ # Read images into dictionary
348
+ image_files = {}
349
  if images:
350
  for img in images:
351
  img_bytes = await img.read()
352
+ image_files[img.filename] = img_bytes
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
 
354
+ # Convert to PDF
355
  pdf_bytes, path_replacements = convert_html_to_pdf(
356
  html_content,
357
  aspect_ratio,
358
  temp_dir,
359
+ image_files
360
  )
361
 
362
  # Return PDF
 
379
  if temp_dir and os.path.exists(temp_dir):
380
  shutil.rmtree(temp_dir, ignore_errors=True)
381
 
382
+ @app.post("/convert-string")
383
+ async def convert_string_to_pdf(
384
  html_content: str = Form(..., description="HTML content as string"),
385
  aspect_ratio: Optional[str] = Form(None, description="Aspect ratio: 16:9, 1:1, or 9:16"),
386
  auto_detect: bool = Form(True, description="Auto-detect aspect ratio from HTML")
 
407
 
408
  # Create temp directory and convert
409
  temp_dir = tempfile.mkdtemp()
410
+ pdf_bytes, path_replacements = convert_html_to_pdf(html_content, aspect_ratio, temp_dir, None)
411
 
412
  return Response(
413
  content=pdf_bytes,