htmlpdfs

Sleeping

App Files Files Community

ABDALLALSWAITI committed on Oct 17, 2025

Commit

6830c81

verified ·

1 Parent(s): e58da64

Upload api.py

Browse files

Files changed (1) hide show

api.py +408 -0

api.py ADDED Viewed

	@@ -0,0 +1,408 @@

+"""
+FastAPI Backend for HTML to PDF Conversion
+Runs alongside Streamlit on port 7860
+"""
+from fastapi import FastAPI, UploadFile, File, Form, HTTPException
+from fastapi.responses import Response, JSONResponse
+from fastapi.middleware.cors import CORSMiddleware
+import subprocess
+import os
+import tempfile
+import shutil
+import base64
+import re
+import mimetypes
+from typing import List, Optional
+from pathlib import Path
# Application object; the metadata below is what FastAPI is given for the API title/description/version.
app = FastAPI(title="HTML to PDF API", description="Convert HTML to PDF with image support and page breaks", version="1.0.0")

# CORS settings: every origin, method and header is accepted, with credentials enabled.
# NOTE(review): this is fully permissive - confirm that is intended for the deployment.
_CORS_OPTIONS = dict(
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
app.add_middleware(CORSMiddleware, **_CORS_OPTIONS)
def detect_aspect_ratio(html_content):
    """Guess the page aspect ratio of an HTML document.

    The checks run in this order and the first one that decides wins:

    1. A ``<meta ... viewport ... content="...">`` tag whose content contains
       ``orientation=portrait`` or ``orientation=landscape``.
    2. A CSS ``aspect-ratio: W / H`` declaration with integer operands
       (ratio > 1.5 is landscape, < 0.7 is portrait, otherwise square).
    3. Presentation keywords (``reveal.js``, ``impress.js``, ``slide``,
       ``presentation``) anywhere in the document.

    Args:
        html_content: The HTML document as a string.

    Returns:
        One of ``"16:9"``, ``"1:1"`` or ``"9:16"``; ``"9:16"`` is the default
        when nothing matches.
    """
    viewport_match = re.search(r'<meta[^>]*viewport[^>]*content=["\']([^"\']*)["\']', html_content, re.IGNORECASE)
    if viewport_match:
        viewport = viewport_match.group(1).lower()
        if 'orientation=portrait' in viewport:
            return "9:16"
        if 'orientation=landscape' in viewport:
            return "16:9"

    aspect_match = re.search(r'aspect-ratio\s*:\s*(\d+)\s*/\s*(\d+)', html_content, re.IGNORECASE)
    if aspect_match:
        width = int(aspect_match.group(1))
        height = int(aspect_match.group(2))
        # A zero denominator (e.g. "aspect-ratio: 16/0") carries no usable
        # ratio; skip it instead of raising ZeroDivisionError and let the
        # remaining heuristics decide.
        if height:
            ratio = width / height
            if ratio > 1.5:
                return "16:9"
            if ratio < 0.7:
                return "9:16"
            return "1:1"

    # Lower-case once rather than once per keyword.
    lowered = html_content.lower()
    if any(keyword in lowered for keyword in ['reveal.js', 'impress.js', 'slide', 'presentation']):
        return "16:9"
    return "9:16"
# Extension -> MIME type table consulted only when mimetypes cannot guess.
_EXTENSION_MIME_FALLBACKS = {
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.png': 'image/png',
    '.gif': 'image/gif',
    '.svg': 'image/svg+xml',
    '.webp': 'image/webp',
    '.bmp': 'image/bmp'
}


def image_to_base64(image_bytes, filename):
    """Encode raw image bytes as a ``data:<mime>;base64,<payload>`` URL.

    The MIME type comes from ``mimetypes.guess_type(filename)``; when that
    yields nothing, the lower-cased file extension is looked up in a small
    fallback table, and ``image/png`` is used if the extension is unknown.

    Raises:
        HTTPException: status 400 if any step of the conversion fails.
    """
    try:
        guessed = mimetypes.guess_type(filename)[0]
        if guessed:
            mime_type = guessed
        else:
            suffix = os.path.splitext(filename)[1].lower()
            mime_type = _EXTENSION_MIME_FALLBACKS.get(suffix, 'image/png')
        payload = base64.b64encode(image_bytes).decode('utf-8')
        return f"data:{mime_type};base64,{payload}"
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Error converting {filename} to base64: {str(e)}")
def _substitute_data_url(html_content, pattern, flags, data_url):
    """Replace every match of *pattern* with *data_url*; return ``(html, count)``.

    Group 1 is the text leading up to the reference, group 2 the quote
    character (possibly empty) and, when the pattern has one, group 3 the
    trailing text. The quote found in the document is reused so the result
    stays valid in its context.
    """
    def build(match):
        quote = match.group(2)
        tail = match.group(3) if match.re.groups >= 3 else ''
        return f"{match.group(1)}{quote}{data_url}{quote}{tail}"

    # A callable replacement inserts the data URL literally; a template
    # string would interpret any backslash in it as a group reference.
    return re.subn(pattern, build, html_content, flags=flags)


def embed_images_as_base64(html_content, images_dict):
    """Inline uploaded images into the HTML as base64 data URLs.

    For every ``filename -> data URL`` pair, references to that file name
    (optionally preceded by a path) are rewritten in three places, in order:
    ``<img src=...>`` attributes, ``background-image: url(...)`` declarations,
    and any remaining CSS ``url(...)``.

    The quoting style found in the document is preserved. Forcing double
    quotes around a CSS URL would terminate an enclosing ``style="..."``
    attribute and corrupt the markup.

    Args:
        html_content: The HTML document as a string.
        images_dict: Mapping of image file name to its data URL.

    Returns:
        ``(html, replacements)`` where *replacements* maps
        ``"<filename> (img src|bg-image|url)"`` to the number of references
        rewritten for that kind. Kinds with no match are omitted.
    """
    if not images_dict:
        return html_content, {}

    replacements = {}
    for filename, data_url in images_dict.items():
        escaped_name = re.escape(filename)
        rules = (
            ("img src",
             rf'(<img[^>]*\s+src\s*=\s*)(["\'])(?:[^"\']*?/)?{escaped_name}\2',
             re.IGNORECASE | re.DOTALL),
            ("bg-image",
             rf'(background-image\s*:\s*url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))',
             re.IGNORECASE),
            ("url",
             rf'(url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))',
             re.IGNORECASE),
        )
        for label, pattern, flags in rules:
            html_content, count = _substitute_data_url(html_content, pattern, flags, data_url)
            if count:
                replacements[f"{filename} ({label})"] = count
    return html_content, replacements
# Insertion points for the generated stylesheet, matched case-insensitively
# so documents written as </HEAD> or <BODY> are handled too.
_HEAD_CLOSE_RE = re.compile(r'</head\s*>', re.IGNORECASE)
_BODY_OPEN_RE = re.compile(r'<body\b', re.IGNORECASE)


def inject_page_breaks(html_content: str, aspect_ratio: str):
    """Insert a print stylesheet that sets the page size and page breaks.

    The ``@page`` size is ``A4 landscape`` for ``"16:9"``, ``210mm 210mm``
    for ``"1:1"`` and ``A4 portrait`` for anything else.

    The ``<style id="auto-page-breaks">`` element is inserted exactly once:
    before the first closing ``</head>`` tag if there is one, otherwise
    before the first opening ``<body`` tag, otherwise at the very start of
    the document.

    Args:
        html_content: The HTML document as a string.
        aspect_ratio: ``"16:9"``, ``"1:1"`` or ``"9:16"``.

    Returns:
        The HTML with the stylesheet added.
    """
    if aspect_ratio == "16:9":
        page_size = "A4 landscape"
    elif aspect_ratio == "1:1":
        page_size = "210mm 210mm"
    else:
        page_size = "A4 portrait"
    page_css = f"""
    <style id="auto-page-breaks">
        @page {{
            size: {page_size};
            margin: 0;
        }}
        html, body {{
            margin: 0 !important;
            padding: 0 !important;
            width: 100% !important;
            height: 100% !important;
        }}
        .page, .slide, section.page, article.page, div[class*="page"], div[class*="slide"] {{
            width: 100% !important;
            min-height: 100vh !important;
            height: 100vh !important;
            page-break-after: always !important;
            break-after: page !important;
            page-break-inside: avoid !important;
            break-inside: avoid !important;
            position: relative !important;
            box-sizing: border-box !important;
            overflow: hidden !important;
        }}
        .page:last-child, .slide:last-child,
        section.page:last-child, article.page:last-child {{
            page-break-after: auto !important;
            break-after: auto !important;
        }}
        body > section:not(.no-page-break),
        body > article:not(.no-page-break),
        body > div:not(.no-page-break) {{
            page-break-after: always !important;
            break-after: page !important;
            min-height: 100vh;
        }}
        body > section:last-child,
        body > article:last-child,
        body > div:last-child {{
            page-break-after: auto !important;
        }}
        .page-break, .page-break-after {{
            page-break-after: always !important;
            break-after: page !important;
        }}
        .page-break-before {{
            page-break-before: always !important;
            break-before: page !important;
        }}
        .no-page-break, .keep-together {{
            page-break-inside: avoid !important;
            break-inside: avoid !important;
        }}
        h1, h2, h3, h4, h5, h6 {{
            page-break-after: avoid !important;
            break-after: avoid !important;
            page-break-inside: avoid !important;
            break-inside: avoid !important;
        }}
        img, figure, table, pre, blockquote {{
            page-break-inside: avoid !important;
            break-inside: avoid !important;
        }}
        * {{
            -webkit-print-color-adjust: exact !important;
            print-color-adjust: exact !important;
            color-adjust: exact !important;
        }}
    </style>
    """
    # Insert at the first anchor only: a later "</head>" (for instance inside
    # a script string) must not receive a second copy of the stylesheet.
    for anchor in (_HEAD_CLOSE_RE, _BODY_OPEN_RE):
        match = anchor.search(html_content)
        if match:
            position = match.start()
            return html_content[:position] + page_css + html_content[position:]
    return page_css + html_content
def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
    """Render HTML to PDF by running the ``puppeteer_pdf.js`` Node script.

    The HTML first gets the page-break stylesheet from
    ``inject_page_breaks`` and is written to ``<temp_dir>/input.html``. The
    script is then run as ``node puppeteer_pdf.js <html_file> <aspect_ratio>``
    with a 60 second timeout, and the resulting PDF is read back.

    Args:
        html_content: The HTML document as a string.
        aspect_ratio: Aspect ratio string passed through to the script.
        temp_dir: Existing directory used for the input and output files.

    Returns:
        The generated PDF as bytes.

    Raises:
        Exception: if the script cannot be found, exits non-zero, times out,
            or no PDF file exists afterwards. The original error is chained
            as ``__cause__``.
    """
    try:
        html_content = inject_page_breaks(html_content, aspect_ratio)
        html_file = os.path.join(temp_dir, "input.html")
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(html_content)

        # Candidate script locations, tried in order; the first hit wins.
        possible_paths = [
            'puppeteer_pdf.js',
            '/app/puppeteer_pdf.js',
            os.path.join(os.path.dirname(__file__), 'puppeteer_pdf.js'),
        ]
        puppeteer_script = next((path for path in possible_paths if os.path.exists(path)), None)
        if not puppeteer_script:
            raise Exception("puppeteer_pdf.js not found")

        result = subprocess.run(
            ['node', puppeteer_script, html_file, aspect_ratio],
            capture_output=True,
            text=True,
            timeout=60,
            cwd=os.path.dirname(os.path.abspath(puppeteer_script))
        )
        if result.returncode != 0:
            raise Exception(f"PDF conversion failed: {result.stderr}")

        # Swap only the file extension. str.replace('.html', '.pdf') would also
        # rewrite a ".html" occurring in the directory part of the path.
        # NOTE(review): assumes the script writes the PDF next to the input
        # file with the same stem - confirm against puppeteer_pdf.js.
        pdf_file = os.path.splitext(html_file)[0] + '.pdf'
        if not os.path.exists(pdf_file):
            raise Exception("PDF file was not generated")
        with open(pdf_file, 'rb') as f:
            return f.read()
    except subprocess.TimeoutExpired as e:
        raise Exception("PDF conversion timed out (60 seconds)") from e
    except Exception as e:
        raise Exception(f"Error: {str(e)}") from e
@app.get("/")
async def root():
    """API root endpoint: returns the service name, version and endpoint list."""
    return {
        "message": "HTML to PDF Converter API",
        "version": "1.0.0",
        "endpoints": {
            "POST /convert": "Convert HTML to PDF",
            # Listed so the index matches the routes this module registers.
            "POST /convert-base64": "Convert HTML string to PDF (base64 JSON response)",
            "GET /health": "Health check",
            "GET /docs": "API documentation"
        }
    }
@app.get("/health")
async def health():
    """Liveness probe: always answers with a static "healthy" status."""
    status_payload = {"status": "healthy"}
    return status_payload
@app.post("/convert")
async def convert_to_pdf(
    html_file: UploadFile = File(..., description="HTML file to convert"),
    aspect_ratio: Optional[str] = Form(None, description="Aspect ratio: 16:9, 1:1, or 9:16"),
    auto_detect: bool = Form(True, description="Auto-detect aspect ratio from HTML"),
    images: Optional[List[UploadFile]] = File(None, description="Images to embed in HTML")
):
    """
    Convert HTML to PDF with optional image embedding
    - **html_file**: HTML file to convert (required)
    - **aspect_ratio**: Page aspect ratio (optional if auto_detect=true)
    - **auto_detect**: Auto-detect aspect ratio from HTML content; when true, the detected value replaces any supplied aspect_ratio
    - **images**: Image files to embed as base64 in HTML
    """
    # Local import keeps this handler self-contained; asyncio is only needed
    # to push the blocking conversion off the event loop.
    import asyncio

    temp_dir = None
    try:
        # Read the upload and decode it, falling back to latin-1 (which
        # accepts any byte sequence) when the payload is not valid UTF-8.
        raw_html = await html_file.read()
        try:
            html_content = raw_html.decode('utf-8')
        except UnicodeDecodeError:
            html_content = raw_html.decode('latin-1')

        # Detect or use provided aspect ratio
        if auto_detect:
            aspect_ratio = detect_aspect_ratio(html_content)
        elif not aspect_ratio:
            aspect_ratio = "9:16"
        if aspect_ratio not in ["16:9", "1:1", "9:16"]:
            raise HTTPException(status_code=400, detail="Invalid aspect ratio. Must be 16:9, 1:1, or 9:16")

        # Inline any uploaded images as data URLs, keyed by their file name.
        image_replacements = {}
        if images:
            images_dict = {}
            for img in images:
                img_bytes = await img.read()
                images_dict[img.filename] = image_to_base64(img_bytes, img.filename)
            html_content, image_replacements = embed_images_as_base64(html_content, images_dict)

        temp_dir = tempfile.mkdtemp()
        # The conversion blocks on a subprocess for up to 60 seconds; running
        # it in the default executor keeps the event loop free to serve other
        # requests (including /health) in the meantime.
        loop = asyncio.get_running_loop()
        pdf_bytes = await loop.run_in_executor(
            None, convert_html_to_pdf, html_content, aspect_ratio, temp_dir
        )

        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers={
                "Content-Disposition": "attachment; filename=converted.pdf",
                "X-Aspect-Ratio": aspect_ratio,
                "X-Image-Replacements": str(len(image_replacements)),
                "X-PDF-Size": str(len(pdf_bytes))
            }
        )
    except HTTPException:
        # Already carries the intended status code; pass it through untouched.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        if temp_dir and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir, ignore_errors=True)
@app.post("/convert-base64")
async def convert_to_pdf_base64(
    html_content: str = Form(..., description="HTML content as string"),
    aspect_ratio: Optional[str] = Form(None, description="Aspect ratio: 16:9, 1:1, or 9:16"),
    auto_detect: bool = Form(True, description="Auto-detect aspect ratio from HTML")
):
    """
    Convert HTML string to PDF and return as base64
    - **html_content**: HTML content as string (required)
    - **aspect_ratio**: Page aspect ratio (optional if auto_detect=true)
    - **auto_detect**: Auto-detect aspect ratio from HTML content; when true, the detected value replaces any supplied aspect_ratio
    """
    # Local import keeps this handler self-contained; asyncio is only needed
    # to push the blocking conversion off the event loop.
    import asyncio

    temp_dir = None
    try:
        # Detect or use provided aspect ratio
        if auto_detect:
            aspect_ratio = detect_aspect_ratio(html_content)
        elif not aspect_ratio:
            aspect_ratio = "9:16"
        if aspect_ratio not in ["16:9", "1:1", "9:16"]:
            raise HTTPException(status_code=400, detail="Invalid aspect ratio. Must be 16:9, 1:1, or 9:16")

        temp_dir = tempfile.mkdtemp()
        # The conversion blocks on a subprocess for up to 60 seconds; running
        # it in the default executor keeps the event loop free to serve other
        # requests in the meantime.
        loop = asyncio.get_running_loop()
        pdf_bytes = await loop.run_in_executor(
            None, convert_html_to_pdf, html_content, aspect_ratio, temp_dir
        )

        pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')
        return JSONResponse({
            "success": True,
            "pdf_base64": pdf_base64,
            "aspect_ratio": aspect_ratio,
            "size_bytes": len(pdf_bytes)
        })
    except HTTPException:
        # Already carries the intended status code; pass it through untouched.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        if temp_dir and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir, ignore_errors=True)
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces, port 7860.
    from uvicorn import run as _serve

    _serve(app, host="0.0.0.0", port=7860)