htmlpdfs

Sleeping

App Files Files Community

ABDALLALSWAITI committed on Oct 17, 2025

Commit

f5befe6

verified ·

1 Parent(s): 7857567

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -84

app.py CHANGED Viewed

@@ -9,7 +9,6 @@ import subprocess
 import os
 import tempfile
 import shutil
-import base64
 import re
 import mimetypes
 from typing import List, Optional
@@ -57,62 +56,51 @@ def detect_aspect_ratio(html_content):
     return "9:16"
-def image_to_base64(image_bytes, filename):
-    """Convert image bytes to base64 data URL"""
-    try:
-        mime_type, _ = mimetypes.guess_type(filename)
-        if not mime_type:
-            ext = os.path.splitext(filename)[1].lower()
-            mime_map = {
-                '.jpg': 'image/jpeg',
-                '.jpeg': 'image/jpeg',
-                '.png': 'image/png',
-                '.gif': 'image/gif',
-                '.svg': 'image/svg+xml',
-                '.webp': 'image/webp',
-                '.bmp': 'image/bmp'
-            }
-            mime_type = mime_map.get(ext, 'image/png')
-        b64_data = base64.b64encode(image_bytes).decode('utf-8')
-        data_url = f"data:{mime_type};base64,{b64_data}"
-        return data_url
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=f"Error converting {filename} to base64: {str(e)}")
-def embed_images_as_base64(html_content, images_dict):
-    """Embed all images directly as base64 data URLs in the HTML"""
-    if not images_dict:
-        return html_content, {}
     replacements = {}
-    for filename, data_url in images_dict.items():
-        escaped_name = re.escape(filename)
-        # Pattern 1: img src attribute
-        pattern1 = rf'(<img[^>]*\s+src\s*=\s*)(["\'])(?:[^"\']*?/)?{escaped_name}\2'
-        matches1 = list(re.finditer(pattern1, html_content, flags=re.IGNORECASE | re.DOTALL))
-        count1 = len(matches1)
-        if matches1:
-            html_content = re.sub(pattern1, rf'\1\2{data_url}\2', html_content, flags=re.IGNORECASE | re.DOTALL)
-            replacements[f"{filename} (img src)"] = count1
-        # Pattern 2: background-image
-        pattern2 = rf'(background-image\s*:\s*url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
-        matches2 = list(re.finditer(pattern2, html_content, flags=re.IGNORECASE))
-        count2 = len(matches2)
-        if matches2:
-            html_content = re.sub(pattern2, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
-            replacements[f"{filename} (bg-image)"] = count2
-        # Pattern 3: CSS url()
-        pattern3 = rf'(url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
-        matches3 = list(re.finditer(pattern3, html_content, flags=re.IGNORECASE))
-        count3 = len(matches3)
-        if matches3:
-            html_content = re.sub(pattern3, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
-            replacements[f"{filename} (url)"] = count3
     return html_content, replacements
@@ -217,15 +205,27 @@ def inject_page_breaks(html_content: str, aspect_ratio: str):
     return html_content
-def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
     """Convert HTML content to PDF using Puppeteer"""
     try:
         html_content = inject_page_breaks(html_content, aspect_ratio)
         html_file = os.path.join(temp_dir, "input.html")
         with open(html_file, 'w', encoding='utf-8') as f:
             f.write(html_content)
         # Find puppeteer script
         possible_paths = [
             'puppeteer_pdf.js',
@@ -247,7 +247,7 @@ def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
             capture_output=True,
             text=True,
             timeout=60,
-            cwd=os.path.dirname(os.path.abspath(puppeteer_script))
         )
         if result.returncode != 0:
@@ -260,7 +260,7 @@ def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
         with open(pdf_file, 'rb') as f:
             pdf_bytes = f.read()
-        return pdf_bytes
     except subprocess.TimeoutExpired:
         raise Exception("PDF conversion timed out (60 seconds)")
@@ -290,15 +290,15 @@ async def convert_to_pdf(
     html_file: UploadFile = File(..., description="HTML file to convert"),
     aspect_ratio: Optional[str] = Form(None, description="Aspect ratio: 16:9, 1:1, or 9:16"),
     auto_detect: bool = Form(True, description="Auto-detect aspect ratio from HTML"),
-    images: Optional[List[UploadFile]] = File(None, description="Images to embed in HTML")
 ):
     """
-    Convert HTML to PDF with optional image embedding
     - **html_file**: HTML file to convert (required)
     - **aspect_ratio**: Page aspect ratio (optional if auto_detect=true)
     - **auto_detect**: Auto-detect aspect ratio from HTML content
-    - **images**: Image files to embed as base64 in HTML
     """
     temp_dir = None
     try:
@@ -320,20 +320,36 @@ async def convert_to_pdf(
         if aspect_ratio not in ["16:9", "1:1", "9:16"]:
             raise HTTPException(status_code=400, detail="Invalid aspect ratio. Must be 16:9, 1:1, or 9:16")
-        # Process images if provided
-        image_replacements = {}
         if images:
-            images_dict = {}
             for img in images:
                 img_bytes = await img.read()
-                data_url = image_to_base64(img_bytes, img.filename)
-                images_dict[img.filename] = data_url
-            html_content, image_replacements = embed_images_as_base64(html_content, images_dict)
-        # Create temp directory and convert
-        temp_dir = tempfile.mkdtemp()
-        pdf_bytes = convert_html_to_pdf(html_content, aspect_ratio, temp_dir)
         # Return PDF
         return Response(
@@ -342,7 +358,7 @@ async def convert_to_pdf(
             headers={
                 "Content-Disposition": f"attachment; filename=converted.pdf",
                 "X-Aspect-Ratio": aspect_ratio,
-                "X-Image-Replacements": str(len(image_replacements)),
                 "X-PDF-Size": str(len(pdf_bytes))
             }
         )
@@ -362,7 +378,7 @@ async def convert_to_pdf_base64(
     auto_detect: bool = Form(True, description="Auto-detect aspect ratio from HTML")
 ):
     """
-    Convert HTML string to PDF and return as base64
     - **html_content**: HTML content as string (required)
     - **aspect_ratio**: Page aspect ratio (optional if auto_detect=true)
@@ -383,17 +399,18 @@ async def convert_to_pdf_base64(
         # Create temp directory and convert
         temp_dir = tempfile.mkdtemp()
-        pdf_bytes = convert_html_to_pdf(html_content, aspect_ratio, temp_dir)
-        # Convert to base64
-        pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')
-        return JSONResponse({
-            "success": True,
-            "pdf_base64": pdf_base64,
-            "aspect_ratio": aspect_ratio,
-            "size_bytes": len(pdf_bytes)
-        })
     except HTTPException:
         raise

 import os
 import tempfile
 import shutil
 import re
 import mimetypes
 from typing import List, Optional
     return "9:16"
+def normalize_image_paths(html_content):
+    """Replace complex image paths with just filenames"""
     replacements = {}
+    # Pattern 1: img src with paths - extract filename only
+    pattern1 = r'(<img[^>]*\s+src\s*=\s*)(["\'])([^"\']*?/)?([^/"\'>]+\.(jpg|jpeg|png|gif|svg|webp|bmp))(\2)'
+    def replace_img_src(match):
+        prefix = match.group(1)
+        quote = match.group(2)
+        filename = match.group(4)
+        replacements[f"img src: {match.group(0)}"] = filename
+        return f'{prefix}{quote}{filename}{quote}'
+    html_content = re.sub(pattern1, replace_img_src, html_content, flags=re.IGNORECASE)
+    # Pattern 2: background-image with paths
+    pattern2 = r'(background-image\s*:\s*url\s*\()(["\']?)([^)"\']*/)?([^/")\']+\.(jpg|jpeg|png|gif|svg|webp|bmp))(\2)(\))'
+    def replace_bg_image(match):
+        prefix = match.group(1)
+        quote = match.group(2)
+        filename = match.group(4)
+        suffix = match.group(7)
+        replacements[f"bg-image: {match.group(0)}"] = filename
+        return f'{prefix}{quote}{filename}{quote}{suffix}'
+    html_content = re.sub(pattern2, replace_bg_image, html_content, flags=re.IGNORECASE)
+    # Pattern 3: CSS url() with paths
+    pattern3 = r'(url\s*\()(["\']?)([^)"\']*/)?([^/")\']+\.(jpg|jpeg|png|gif|svg|webp|bmp))(\2)(\))'
+    def replace_url(match):
+        # Skip if already processed by background-image pattern
+        if 'background-image' in html_content[max(0, match.start()-50):match.start()]:
+            return match.group(0)
+        prefix = match.group(1)
+        quote = match.group(2)
+        filename = match.group(4)
+        suffix = match.group(7)
+        replacements[f"url: {match.group(0)}"] = filename
+        return f'{prefix}{quote}{filename}{quote}{suffix}'
+    html_content = re.sub(pattern3, replace_url, html_content, flags=re.IGNORECASE)
     return html_content, replacements
     return html_content
+def convert_html_to_pdf(html_content, aspect_ratio, temp_dir, images=None):
     """Convert HTML content to PDF using Puppeteer"""
     try:
+        # Normalize image paths in HTML
+        html_content, path_replacements = normalize_image_paths(html_content)
+        # Inject page breaks
         html_content = inject_page_breaks(html_content, aspect_ratio)
+        # Save HTML file
         html_file = os.path.join(temp_dir, "input.html")
         with open(html_file, 'w', encoding='utf-8') as f:
             f.write(html_content)
+        # Save image files to the same directory
+        if images:
+            for img in images:
+                img_path = os.path.join(temp_dir, img.filename)
+                with open(img_path, 'wb') as f:
+                    f.write(img.file.read())
         # Find puppeteer script
         possible_paths = [
             'puppeteer_pdf.js',
             capture_output=True,
             text=True,
             timeout=60,
+            cwd=temp_dir  # Run in temp directory so images are accessible
         )
         if result.returncode != 0:
         with open(pdf_file, 'rb') as f:
             pdf_bytes = f.read()
+        return pdf_bytes, path_replacements
     except subprocess.TimeoutExpired:
         raise Exception("PDF conversion timed out (60 seconds)")
     html_file: UploadFile = File(..., description="HTML file to convert"),
     aspect_ratio: Optional[str] = Form(None, description="Aspect ratio: 16:9, 1:1, or 9:16"),
     auto_detect: bool = Form(True, description="Auto-detect aspect ratio from HTML"),
+    images: Optional[List[UploadFile]] = File(None, description="Images referenced in HTML")
 ):
     """
+    Convert HTML to PDF with image files in same directory
     - **html_file**: HTML file to convert (required)
     - **aspect_ratio**: Page aspect ratio (optional if auto_detect=true)
     - **auto_detect**: Auto-detect aspect ratio from HTML content
+    - **images**: Image files referenced in HTML (saved to temp directory)
     """
     temp_dir = None
     try:
         if aspect_ratio not in ["16:9", "1:1", "9:16"]:
             raise HTTPException(status_code=400, detail="Invalid aspect ratio. Must be 16:9, 1:1, or 9:16")
+        # Create temp directory and convert
+        temp_dir = tempfile.mkdtemp()
+        # Read images into memory before conversion
+        images_list = []
         if images:
             for img in images:
                 img_bytes = await img.read()
+                # Create a simple object to hold filename and bytes
+                class ImageFile:
+                    def __init__(self, filename, content):
+                        self.filename = filename
+                        self.content = content
+                        self.file = None
+                    def get_bytes(self):
+                        return self.content
+                img_obj = ImageFile(img.filename, img_bytes)
+                # Create a file-like object for backwards compatibility
+                import io
+                img_obj.file = io.BytesIO(img_bytes)
+                images_list.append(img_obj)
+        pdf_bytes, path_replacements = convert_html_to_pdf(
+            html_content,
+            aspect_ratio,
+            temp_dir,
+            images_list if images_list else None
+        )
         # Return PDF
         return Response(
             headers={
                 "Content-Disposition": f"attachment; filename=converted.pdf",
                 "X-Aspect-Ratio": aspect_ratio,
+                "X-Path-Replacements": str(len(path_replacements)),
                 "X-PDF-Size": str(len(pdf_bytes))
             }
         )
     auto_detect: bool = Form(True, description="Auto-detect aspect ratio from HTML")
 ):
     """
+    Convert HTML string to PDF (for HTML without external images)
     - **html_content**: HTML content as string (required)
     - **aspect_ratio**: Page aspect ratio (optional if auto_detect=true)
         # Create temp directory and convert
         temp_dir = tempfile.mkdtemp()
+        pdf_bytes, path_replacements = convert_html_to_pdf(html_content, aspect_ratio, temp_dir)
+        return Response(
+            content=pdf_bytes,
+            media_type="application/pdf",
+            headers={
+                "Content-Disposition": f"attachment; filename=converted.pdf",
+                "X-Aspect-Ratio": aspect_ratio,
+                "X-Path-Replacements": str(len(path_replacements)),
+                "X-PDF-Size": str(len(pdf_bytes))
+            }
+        )
     except HTTPException:
         raise