Spaces:

ABDALLALSWAITI
/

htmlpdf

Sleeping

App Files Files Community

ABDALLALSWAITI committed on Oct 16, 2025

Commit

e80d253

verified ·

1 Parent(s): 8d2aae9

Update api.py

Browse files

Files changed (1) hide show

api.py +146 -41

api.py CHANGED Viewed

@@ -10,7 +10,7 @@ import re
 import mimetypes
 from typing import List, Optional
-app = FastAPI(title="HTML to PDF Converter API")
 app.add_middleware(
     CORSMiddleware,
@@ -73,14 +73,10 @@ def image_to_base64(image_bytes, filename):
         return None
 def embed_images_as_base64(html_content, images: List[UploadFile]):
-    """
-    Embed all images directly as base64 data URLs in the HTML
-    This ensures images are always included in the PDF
-    """
     if not images:
         return html_content, {}
-    # Create mapping of filename to base64 data URL
     image_data_urls = {}
     for img in images:
         img.file.seek(0)
@@ -93,11 +89,9 @@ def embed_images_as_base64(html_content, images: List[UploadFile]):
     if not image_data_urls:
         return html_content, {}
-    # Track replacements
     replacements = {}
     for filename, data_url in image_data_urls.items():
-        # Escape filename for regex
         escaped_name = re.escape(filename)
         # Pattern 1: img src attribute
@@ -124,7 +118,6 @@ def embed_images_as_base64(html_content, images: List[UploadFile]):
             html_content = re.sub(pattern3, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
             replacements[f"{filename} (url)"] = count3
-    # Log results
     if replacements:
         print("=== Image Replacements ===")
         for key, count in replacements.items():
@@ -135,31 +128,127 @@ def embed_images_as_base64(html_content, images: List[UploadFile]):
     return html_content, replacements
 def convert_html_to_pdf(html_content: str, aspect_ratio: str, temp_dir: str):
-    """Convert HTML content to PDF using Puppeteer"""
     try:
-        # Inject CSS to preserve styles
-        style_injection = """
-        <style>
-            @page { margin: 0; }
-            * {
-                -webkit-print-color-adjust: exact !important;
-                print-color-adjust: exact !important;
-                color-adjust: exact !important;
-            }
-            body {
-                -webkit-print-color-adjust: exact !important;
-                print-color-adjust: exact !important;
-            }
-        </style>
-        """
-        if '</head>' in html_content:
-            html_content = html_content.replace('</head>', style_injection + '</head>')
-        elif '<body' in html_content:
-            html_content = html_content.replace('<body', style_injection + '<body', 1)
-        else:
-            html_content = style_injection + html_content
         # Save HTML to temp file
         html_file = os.path.join(temp_dir, "input.html")
@@ -208,17 +297,24 @@ def convert_html_to_pdf(html_content: str, aspect_ratio: str, temp_dir: str):
 @app.get("/")
 async def root():
     return {
-        "message": "HTML to PDF Converter API with Base64 Image Embedding",
-        "version": "2.1",
         "endpoints": {
-            "/convert": "POST - Convert HTML to PDF (images embedded as base64)",
-            "/health": "GET - Health check"
         }
     }
 @app.get("/health")
 async def health():
-    return {"status": "healthy"}
 @app.post("/convert")
 async def convert_to_pdf(
@@ -229,7 +325,7 @@ async def convert_to_pdf(
     images: Optional[List[UploadFile]] = File(None)
 ):
     """
-    Convert HTML to PDF with embedded base64 images
     Parameters:
     - html_file: HTML file upload (optional)
@@ -238,8 +334,13 @@ async def convert_to_pdf(
     - auto_detect: Auto-detect aspect ratio from HTML (default: True)
     - images: List of image files - will be embedded as base64 in HTML (optional)
     Returns:
-    - PDF file as bytes with images embedded
     """
     temp_dir = None
@@ -289,7 +390,7 @@ async def convert_to_pdf(
                 raise HTTPException(status_code=400, detail="Invalid aspect_ratio. Must be '16:9', '1:1', or '9:16'")
             print(f"Using specified aspect ratio: {aspect_ratio}")
-        # Convert to PDF
         pdf_bytes, error = convert_html_to_pdf(html, aspect_ratio, temp_dir)
         if error:
@@ -310,7 +411,8 @@ async def convert_to_pdf(
             headers={
                 "Content-Disposition": f"attachment; filename={output_filename}",
                 "X-Aspect-Ratio": aspect_ratio,
-                "X-Images-Embedded": str(len(images)) if images else "0"
             }
         )
@@ -326,4 +428,7 @@ async def convert_to_pdf(
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=7860)

 import mimetypes
 from typing import List, Optional
+app = FastAPI(title="HTML to PDF Converter API with Page Break Support")
 app.add_middleware(
     CORSMiddleware,
         return None
 def embed_images_as_base64(html_content, images: List[UploadFile]):
+    """Embed all images directly as base64 data URLs in the HTML"""
     if not images:
         return html_content, {}
     image_data_urls = {}
     for img in images:
         img.file.seek(0)
     if not image_data_urls:
         return html_content, {}
     replacements = {}
     for filename, data_url in image_data_urls.items():
         escaped_name = re.escape(filename)
         # Pattern 1: img src attribute
             html_content = re.sub(pattern3, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
             replacements[f"{filename} (url)"] = count3
     if replacements:
         print("=== Image Replacements ===")
         for key, count in replacements.items():
     return html_content, replacements
def inject_page_breaks(html_content: str, aspect_ratio: str) -> str:
    """Inject the print stylesheet that sizes pages and forces page breaks.

    The stylesheet (``<style id="auto-page-breaks">``) is inserted exactly
    once, at the first location that applies:

    1. immediately before the first ``</head>`` (any letter case),
    2. immediately before the first ``<body`` tag (any letter case),
    3. right after a leading ``<!DOCTYPE ...>`` declaration, if present,
    4. at the very start of the document.

    Args:
        html_content: The HTML document or fragment to decorate.
        aspect_ratio: ``"16:9"`` selects ``A4 landscape``, ``"1:1"`` selects
            a ``210mm 210mm`` square; any other value (normally ``"9:16"``)
            selects ``A4 portrait``.

    Returns:
        A new string containing ``html_content`` with the stylesheet added.
    """
    if aspect_ratio == "16:9":
        page_size = "A4 landscape"
    elif aspect_ratio == "1:1":
        page_size = "210mm 210mm"
    else:  # 9:16, and the fallback for anything unrecognised
        page_size = "A4 portrait"

    # Literal CSS braces are doubled because this is an f-string; only
    # {page_size} is interpolated.
    page_css = f"""
    <style id="auto-page-breaks">
        /* Define page size */
        @page {{
            size: {page_size};
            margin: 0;
        }}
        /* Reset body */
        html, body {{
            margin: 0 !important;
            padding: 0 !important;
            width: 100% !important;
            height: 100% !important;
        }}
        /* Page containers - each should be one page.
           The substring match on "page" must not capture the utility
           classes below (page-break, page-break-before, no-page-break). */
        .page, .slide, section.page, article.page,
        div[class*="page"]:not([class*="page-break"]),
        div[class*="slide"] {{
            width: 100% !important;
            min-height: 100vh !important;
            height: 100vh !important;
            page-break-after: always !important;
            break-after: page !important;
            page-break-inside: avoid !important;
            break-inside: avoid !important;
            position: relative !important;
            box-sizing: border-box !important;
            overflow: hidden !important;
        }}
        /* Last page shouldn't force a break */
        .page:last-child, .slide:last-child,
        section.page:last-child, article.page:last-child {{
            page-break-after: auto !important;
            break-after: auto !important;
        }}
        /* If no explicit page class, treat direct body children as pages */
        body > section:not(.no-page-break),
        body > article:not(.no-page-break),
        body > div:not(.no-page-break) {{
            page-break-after: always !important;
            break-after: page !important;
            min-height: 100vh;
        }}
        body > section:last-child,
        body > article:last-child,
        body > div:last-child {{
            page-break-after: auto !important;
            break-after: auto !important;
        }}
        /* Utility classes for manual control */
        .page-break, .page-break-after {{
            page-break-after: always !important;
            break-after: page !important;
        }}
        .page-break-before {{
            page-break-before: always !important;
            break-before: page !important;
        }}
        .no-page-break, .keep-together {{
            page-break-inside: avoid !important;
            break-inside: avoid !important;
        }}
        /* Prevent awkward breaks in content */
        h1, h2, h3, h4, h5, h6 {{
            page-break-after: avoid !important;
            break-after: avoid !important;
            page-break-inside: avoid !important;
            break-inside: avoid !important;
        }}
        img, figure, table, pre, blockquote {{
            page-break-inside: avoid !important;
            break-inside: avoid !important;
        }}
        /* Preserve colors and backgrounds */
        * {{
            -webkit-print-color-adjust: exact !important;
            print-color-adjust: exact !important;
            color-adjust: exact !important;
        }}
    </style>
    """

    # Insert before the FIRST matching tag only, ignoring case. Slicing is
    # used instead of str.replace/re.sub so a later literal "</head>" (for
    # example inside a script string) does not receive a second copy.
    for tag_pattern in (r"</head\s*>", r"<body\b"):
        tag = re.search(tag_pattern, html_content, flags=re.IGNORECASE)
        if tag:
            at = tag.start()
            return html_content[:at] + page_css + html_content[at:]

    # No head/body: keep a leading doctype first, since markup placed ahead
    # of the doctype makes browsers parse the document in quirks mode.
    doctype = re.match(r"\s*<!doctype[^>]*>", html_content, flags=re.IGNORECASE)
    at = doctype.end() if doctype else 0
    return html_content[:at] + page_css + html_content[at:]
 def convert_html_to_pdf(html_content: str, aspect_ratio: str, temp_dir: str):
+    """Convert HTML content to PDF using Puppeteer with proper page breaks"""
     try:
+        # Step 1: Inject page break CSS
+        print("Injecting page break CSS...")
+        html_content = inject_page_breaks(html_content, aspect_ratio)
         # Save HTML to temp file
         html_file = os.path.join(temp_dir, "input.html")
 @app.get("/")
 async def root():
     """Return the service banner: message, version, features and routes."""
     feature_list = [
         "Base64 image embedding",
         "Automatic page break detection",
         "Custom CSS @page rules",
         "Multiple aspect ratios (16:9, 1:1, 9:16)",
     ]
     # Route path -> short human-readable description.
     route_help = {
         "/convert": "POST - Convert HTML to PDF",
         "/health": "GET - Health check",
         "/docs": "GET - API documentation",
     }
     return {
         "message": "HTML to PDF Converter API with Proper Page Break Support",
         "version": "3.0",
         "features": feature_list,
         "endpoints": route_help,
     }
 @app.get("/health")
 async def health():
     """Return a fixed liveness payload with the API version string."""
     payload = dict(status="healthy", version="3.0")
     return payload
 @app.post("/convert")
 async def convert_to_pdf(
     images: Optional[List[UploadFile]] = File(None)
 ):
     """
+    Convert HTML to PDF with proper page breaks and embedded base64 images
     Parameters:
     - html_file: HTML file upload (optional)
     - auto_detect: Auto-detect aspect ratio from HTML (default: True)
     - images: List of image files - will be embedded as base64 in HTML (optional)
+    HTML Structure for Page Breaks:
+    - Use class="page" on div elements for separate pages
+    - Or use class="slide" for presentation-style pages
+    - Each page will automatically break to a new PDF page
     Returns:
+    - PDF file as bytes with proper page separation
     """
     temp_dir = None
                 raise HTTPException(status_code=400, detail="Invalid aspect_ratio. Must be '16:9', '1:1', or '9:16'")
             print(f"Using specified aspect ratio: {aspect_ratio}")
+        # Convert to PDF with page breaks
         pdf_bytes, error = convert_html_to_pdf(html, aspect_ratio, temp_dir)
         if error:
             headers={
                 "Content-Disposition": f"attachment; filename={output_filename}",
                 "X-Aspect-Ratio": aspect_ratio,
+                "X-Images-Embedded": str(len(images)) if images else "0",
+                "X-Page-Breaks": "enabled"
             }
         )
 if __name__ == "__main__":
     import uvicorn

     # Startup banner: one message per output line.
     print(
         "Starting HTML to PDF Converter API with Page Break Support",
         "Features: Base64 images, automatic page breaks, multiple aspect ratios",
         "API docs available at: http://localhost:7860/docs",
         sep="\n",
     )
     # Serve on all interfaces on port 7860.
     uvicorn.run(app, port=7860, host="0.0.0.0")