Spaces:

ABDALLALSWAITI
/

htmlpdf

Sleeping

App Files Community

ABDALLALSWAITI committed on Oct 16, 2025

Commit

4c51b8c

verified ·

1 Parent(s): 703de2f

Update api.py

Browse files

Files changed (1) hide show

api.py +223 -152

api.py CHANGED Viewed

@@ -1,9 +1,9 @@
 """
-FastAPI HTML to PDF Converter for Hugging Face Spaces
-File: app.py
 """
-from fastapi import FastAPI, File, UploadFile, Form, HTTPException
-from fastapi.responses import Response, HTMLResponse
 from fastapi.middleware.cors import CORSMiddleware
 import subprocess
 import os
@@ -13,13 +13,15 @@ import base64
 import re
 import mimetypes
 from typing import List, Optional
 app = FastAPI(
-    title="HTML to PDF Converter API",
-    description="Convert HTML to PDF with page breaks and embedded images",
-    version="3.0"
 )
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -76,83 +78,133 @@ def image_to_base64(image_bytes, filename):
         data_url = f"data:{mime_type};base64,{b64_data}"
         return data_url
     except Exception as e:
-        print(f"Error converting {filename} to base64: {str(e)}")
-        return None
-def embed_images_as_base64(html_content, images: List[UploadFile]):
     """Embed all images directly as base64 data URLs in the HTML"""
-    if not images:
-        return html_content, {}
-    image_data_urls = {}
-    for img in images:
-        img.file.seek(0)
-        image_bytes = img.file.read()
-        data_url = image_to_base64(image_bytes, img.filename)
-        if data_url:
-            image_data_urls[img.filename] = data_url
-            print(f"✓ Converted {img.filename} to base64")
-    if not image_data_urls:
         return html_content, {}
     replacements = {}
-    for filename, data_url in image_data_urls.items():
         escaped_name = re.escape(filename)
-        # Pattern 1: img src
         pattern1 = rf'(<img[^>]*\s+src\s*=\s*)(["\'])(?:[^"\']*?/)?{escaped_name}\2'
         matches1 = list(re.finditer(pattern1, html_content, flags=re.IGNORECASE | re.DOTALL))
         if matches1:
             html_content = re.sub(pattern1, rf'\1\2{data_url}\2', html_content, flags=re.IGNORECASE | re.DOTALL)
-            replacements[f"{filename} (img)"] = len(matches1)
         # Pattern 2: background-image
         pattern2 = rf'(background-image\s*:\s*url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
         matches2 = list(re.finditer(pattern2, html_content, flags=re.IGNORECASE))
         if matches2:
             html_content = re.sub(pattern2, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
-            replacements[f"{filename} (bg)"] = len(matches2)
-        # Pattern 3: url()
         pattern3 = rf'(url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
         matches3 = list(re.finditer(pattern3, html_content, flags=re.IGNORECASE))
         if matches3:
             html_content = re.sub(pattern3, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
-            replacements[f"{filename} (url)"] = len(matches3)
     return html_content, replacements
 def inject_page_breaks(html_content: str, aspect_ratio: str):
-    """Inject page break CSS"""
-    page_size = "A4 landscape" if aspect_ratio == "16:9" else ("210mm 210mm" if aspect_ratio == "1:1" else "A4 portrait")
     page_css = f"""
     <style id="auto-page-breaks">
-        @page {{ size: {page_size}; margin: 0; }}
-        html, body {{ margin: 0 !important; padding: 0 !important; width: 100% !important; height: 100% !important; }}
         .page, .slide, section.page, article.page, div[class*="page"], div[class*="slide"] {{
-            width: 100% !important; min-height: 100vh !important; height: 100vh !important;
-            page-break-after: always !important; break-after: page !important;
-            page-break-inside: avoid !important; break-inside: avoid !important;
-            position: relative !important; box-sizing: border-box !important; overflow: hidden !important;
         }}
-        .page:last-child, .slide:last-child, section.page:last-child, article.page:last-child {{
-            page-break-after: auto !important; break-after: auto !important;
         }}
-        body > section:not(.no-page-break), body > article:not(.no-page-break), body > div:not(.no-page-break) {{
-            page-break-after: always !important; break-after: page !important; min-height: 100vh;
         }}
-        body > section:last-child, body > article:last-child, body > div:last-child {{
             page-break-after: auto !important;
         }}
-        .page-break, .page-break-after {{ page-break-after: always !important; break-after: page !important; }}
-        .page-break-before {{ page-break-before: always !important; break-before: page !important; }}
-        .no-page-break, .keep-together {{ page-break-inside: avoid !important; break-inside: avoid !important; }}
-        h1, h2, h3, h4, h5, h6 {{ page-break-after: avoid !important; break-after: avoid !important; }}
-        img, figure, table, pre, blockquote {{ page-break-inside: avoid !important; break-inside: avoid !important; }}
-        * {{ -webkit-print-color-adjust: exact !important; print-color-adjust: exact !important; color-adjust: exact !important; }}
     </style>
     """
@@ -165,8 +217,8 @@ def inject_page_breaks(html_content: str, aspect_ratio: str):
     return html_content
-def convert_html_to_pdf(html_content: str, aspect_ratio: str, temp_dir: str):
-    """Convert HTML to PDF using Puppeteer"""
     try:
         html_content = inject_page_breaks(html_content, aspect_ratio)
@@ -174,10 +226,21 @@ def convert_html_to_pdf(html_content: str, aspect_ratio: str, temp_dir: str):
         with open(html_file, 'w', encoding='utf-8') as f:
             f.write(html_content)
-        puppeteer_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'puppeteer_pdf.js')
-        if not os.path.exists(puppeteer_script):
-            return None, f"Error: puppeteer_pdf.js not found at {puppeteer_script}"
         result = subprocess.run(
             ['node', puppeteer_script, html_file, aspect_ratio],
@@ -188,138 +251,99 @@ def convert_html_to_pdf(html_content: str, aspect_ratio: str, temp_dir: str):
         )
         if result.returncode != 0:
-            return None, f"PDF conversion failed: {result.stderr}"
         pdf_file = html_file.replace('.html', '.pdf')
         if not os.path.exists(pdf_file):
-            return None, "PDF file was not generated"
         with open(pdf_file, 'rb') as f:
             pdf_bytes = f.read()
-        return pdf_bytes, None
     except subprocess.TimeoutExpired:
-        return None, "Error: PDF conversion timed out"
     except Exception as e:
-        return None, f"Error: {str(e)}"
-@app.get("/", response_class=HTMLResponse)
 async def root():
-    """Root endpoint with documentation"""
-    html = """
-    <!DOCTYPE html>
-    <html>
-    <head>
-        <title>HTML to PDF Converter API</title>
-        <style>
-            body { font-family: Arial; max-width: 800px; margin: 50px auto; padding: 20px; }
-            h1 { color: #667eea; }
-            code { background: #f4f4f4; padding: 2px 6px; border-radius: 3px; }
-            pre { background: #f4f4f4; padding: 15px; border-radius: 5px; overflow-x: auto; }
-        </style>
-    </head>
-    <body>
-        <h1>📄 HTML to PDF Converter API</h1>
-        <p>Convert HTML to PDF with proper page breaks and embedded images.</p>
-        <h2>🚀 Quick Start</h2>
-        <pre>curl -X POST https://abdallalswaiti-htmlpdf.hf.space/convert \\
-  -F 'html_content=&lt;html&gt;&lt;body&gt;&lt;div class="page"&gt;Hello&lt;/div&gt;&lt;/body&gt;&lt;/html&gt;' \\
-  --output output.pdf</pre>
-        <h2>📚 Endpoints</h2>
-        <ul>
-            <li><code>GET /</code> - This page</li>
-            <li><code>GET /health</code> - Health check</li>
-            <li><code>POST /convert</code> - Convert HTML to PDF</li>
-            <li><code>GET /docs</code> - Interactive API documentation</li>
-        </ul>
-        <h2>💡 Features</h2>
-        <ul>
-            <li>✅ Automatic page break detection</li>
-            <li>✅ Base64 image embedding</li>
-            <li>✅ Multiple aspect ratios (16:9, 1:1, 9:16)</li>
-            <li>✅ CSS @page support</li>
-        </ul>
-        <p><a href="/docs">📖 View Full API Documentation</a></p>
-    </body>
-    </html>
-    """
-    return html
 @app.get("/health")
 async def health():
     """Health check endpoint"""
-    return {
-        "status": "healthy",
-        "version": "3.0",
-        "api": "HTML to PDF Converter"
-    }
 @app.post("/convert")
 async def convert_to_pdf(
-    html_file: Optional[UploadFile] = File(None),
-    html_content: Optional[str] = Form(None),
-    aspect_ratio: Optional[str] = Form(None),
-    auto_detect: bool = Form(True),
-    images: Optional[List[UploadFile]] = File(None)
 ):
     """
-    Convert HTML to PDF with page breaks and embedded images
-    - **html_file**: HTML file upload (optional)
-    - **html_content**: Raw HTML content (optional)
-    - **aspect_ratio**: "16:9", "1:1", or "9:16"
-    - **auto_detect**: Auto-detect aspect ratio
-    - **images**: Image files to embed
     """
     temp_dir = None
     try:
-        if not html_file and not html_content:
-            raise HTTPException(status_code=400, detail="Either html_file or html_content must be provided")
-        if html_file:
-            content = await html_file.read()
-            try:
-                html = content.decode('utf-8')
-            except UnicodeDecodeError:
-                html = content.decode('latin-1')
-            filename = html_file.filename
-        else:
-            html = html_content
-            filename = "converted.pdf"
-        temp_dir = tempfile.mkdtemp()
-        if images:
-            html, replacements = embed_images_as_base64(html, images)
-        if auto_detect or not aspect_ratio:
-            aspect_ratio = detect_aspect_ratio(html)
-        else:
-            if aspect_ratio not in ["16:9", "1:1", "9:16"]:
-                raise HTTPException(status_code=400, detail="Invalid aspect_ratio")
-        pdf_bytes, error = convert_html_to_pdf(html, aspect_ratio, temp_dir)
-        if error:
-            raise HTTPException(status_code=500, detail=error)
-        output_filename = filename.replace('.html', '.pdf').replace('.htm', '.pdf')
-        if not output_filename.endswith('.pdf'):
-            output_filename = 'converted.pdf'
         return Response(
             content=pdf_bytes,
             media_type="application/pdf",
             headers={
-                "Content-Disposition": f"attachment; filename={output_filename}",
                 "X-Aspect-Ratio": aspect_ratio,
-                "X-Images-Embedded": str(len(images)) if images else "0"
             }
         )
@@ -331,7 +355,54 @@ async def convert_to_pdf(
         if temp_dir and os.path.exists(temp_dir):
             shutil.rmtree(temp_dir, ignore_errors=True)
 if __name__ == "__main__":
     import uvicorn
-    port = int(os.environ.get("PORT", 7860))
-    uvicorn.run(app, host="0.0.0.0", port=port)

 """
+FastAPI Backend for HTML to PDF Conversion
+Runs alongside Streamlit on port 7860
 """
+from fastapi import FastAPI, UploadFile, File, Form, HTTPException
+from fastapi.responses import Response, JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 import subprocess
 import os
 import re
 import mimetypes
 from typing import List, Optional
+from pathlib import Path
 app = FastAPI(
+    title="HTML to PDF API",
+    description="Convert HTML to PDF with image support and page breaks",
+    version="1.0.0"
 )
+# Add CORS middleware
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
         data_url = f"data:{mime_type};base64,{b64_data}"
         return data_url
     except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Error converting {filename} to base64: {str(e)}")
+def embed_images_as_base64(html_content, images_dict):
     """Embed all images directly as base64 data URLs in the HTML"""
+    if not images_dict:
         return html_content, {}
     replacements = {}
+    for filename, data_url in images_dict.items():
         escaped_name = re.escape(filename)
+        # Pattern 1: img src attribute
         pattern1 = rf'(<img[^>]*\s+src\s*=\s*)(["\'])(?:[^"\']*?/)?{escaped_name}\2'
         matches1 = list(re.finditer(pattern1, html_content, flags=re.IGNORECASE | re.DOTALL))
+        count1 = len(matches1)
         if matches1:
             html_content = re.sub(pattern1, rf'\1\2{data_url}\2', html_content, flags=re.IGNORECASE | re.DOTALL)
+            replacements[f"{filename} (img src)"] = count1
         # Pattern 2: background-image
         pattern2 = rf'(background-image\s*:\s*url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
         matches2 = list(re.finditer(pattern2, html_content, flags=re.IGNORECASE))
+        count2 = len(matches2)
         if matches2:
             html_content = re.sub(pattern2, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
+            replacements[f"{filename} (bg-image)"] = count2
+        # Pattern 3: CSS url()
         pattern3 = rf'(url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
         matches3 = list(re.finditer(pattern3, html_content, flags=re.IGNORECASE))
+        count3 = len(matches3)
         if matches3:
             html_content = re.sub(pattern3, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
+            replacements[f"{filename} (url)"] = count3
     return html_content, replacements
 def inject_page_breaks(html_content: str, aspect_ratio: str):
+    """Automatically inject page breaks and page sizing CSS"""
+    if aspect_ratio == "16:9":
+        page_size = "A4 landscape"
+    elif aspect_ratio == "1:1":
+        page_size = "210mm 210mm"
+    else:
+        page_size = "A4 portrait"
     page_css = f"""
     <style id="auto-page-breaks">
+        @page {{
+            size: {page_size};
+            margin: 0;
+        }}
+        html, body {{
+            margin: 0 !important;
+            padding: 0 !important;
+            width: 100% !important;
+            height: 100% !important;
+        }}
         .page, .slide, section.page, article.page, div[class*="page"], div[class*="slide"] {{
+            width: 100% !important;
+            min-height: 100vh !important;
+            height: 100vh !important;
+            page-break-after: always !important;
+            break-after: page !important;
+            page-break-inside: avoid !important;
+            break-inside: avoid !important;
+            position: relative !important;
+            box-sizing: border-box !important;
+            overflow: hidden !important;
         }}
+        .page:last-child, .slide:last-child,
+        section.page:last-child, article.page:last-child {{
+            page-break-after: auto !important;
+            break-after: auto !important;
         }}
+        body > section:not(.no-page-break),
+        body > article:not(.no-page-break),
+        body > div:not(.no-page-break) {{
+            page-break-after: always !important;
+            break-after: page !important;
+            min-height: 100vh;
         }}
+        body > section:last-child,
+        body > article:last-child,
+        body > div:last-child {{
             page-break-after: auto !important;
         }}
+        .page-break, .page-break-after {{
+            page-break-after: always !important;
+            break-after: page !important;
+        }}
+        .page-break-before {{
+            page-break-before: always !important;
+            break-before: page !important;
+        }}
+        .no-page-break, .keep-together {{
+            page-break-inside: avoid !important;
+            break-inside: avoid !important;
+        }}
+        h1, h2, h3, h4, h5, h6 {{
+            page-break-after: avoid !important;
+            break-after: avoid !important;
+            page-break-inside: avoid !important;
+            break-inside: avoid !important;
+        }}
+        img, figure, table, pre, blockquote {{
+            page-break-inside: avoid !important;
+            break-inside: avoid !important;
+        }}
+        * {{
+            -webkit-print-color-adjust: exact !important;
+            print-color-adjust: exact !important;
+            color-adjust: exact !important;
+        }}
     </style>
     """
     return html_content
+def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
+    """Convert HTML content to PDF using Puppeteer"""
     try:
         html_content = inject_page_breaks(html_content, aspect_ratio)
         with open(html_file, 'w', encoding='utf-8') as f:
             f.write(html_content)
+        # Find puppeteer script
+        possible_paths = [
+            'puppeteer_pdf.js',
+            '/app/puppeteer_pdf.js',
+            os.path.join(os.path.dirname(__file__), 'puppeteer_pdf.js'),
+        ]
+        puppeteer_script = None
+        for path in possible_paths:
+            if os.path.exists(path):
+                puppeteer_script = path
+                break
+        if not puppeteer_script:
+            raise Exception("puppeteer_pdf.js not found")
         result = subprocess.run(
             ['node', puppeteer_script, html_file, aspect_ratio],
         )
         if result.returncode != 0:
+            raise Exception(f"PDF conversion failed: {result.stderr}")
         pdf_file = html_file.replace('.html', '.pdf')
         if not os.path.exists(pdf_file):
+            raise Exception("PDF file was not generated")
         with open(pdf_file, 'rb') as f:
             pdf_bytes = f.read()
+        return pdf_bytes
     except subprocess.TimeoutExpired:
+        raise Exception("PDF conversion timed out (60 seconds)")
     except Exception as e:
+        raise Exception(f"Error: {str(e)}")
+@app.get("/")
 async def root():
+    """API root endpoint"""
+    return {
+        "message": "HTML to PDF Converter API",
+        "version": "1.0.0",
+        "endpoints": {
+            "POST /convert": "Convert HTML to PDF",
+            "GET /health": "Health check",
+            "GET /docs": "API documentation"
+        }
+    }
 @app.get("/health")
 async def health():
     """Health check endpoint"""
+    return {"status": "healthy"}
 @app.post("/convert")
 async def convert_to_pdf(
+    html_file: UploadFile = File(..., description="HTML file to convert"),
+    aspect_ratio: Optional[str] = Form(None, description="Aspect ratio: 16:9, 1:1, or 9:16"),
+    auto_detect: bool = Form(True, description="Auto-detect aspect ratio from HTML"),
+    images: Optional[List[UploadFile]] = File(None, description="Images to embed in HTML")
 ):
     """
+    Convert HTML to PDF with optional image embedding
+    - **html_file**: HTML file to convert (required)
+    - **aspect_ratio**: Page aspect ratio (optional if auto_detect=true)
+    - **auto_detect**: Auto-detect aspect ratio from HTML content
+    - **images**: Image files to embed as base64 in HTML
     """
     temp_dir = None
     try:
+        # Read HTML content
+        html_content = await html_file.read()
+        try:
+            html_content = html_content.decode('utf-8')
+        except UnicodeDecodeError:
+            html_content = html_content.decode('latin-1')
+        # Detect or use provided aspect ratio
+        if auto_detect:
+            detected_ratio = detect_aspect_ratio(html_content)
+            aspect_ratio = detected_ratio
+        elif not aspect_ratio:
+            aspect_ratio = "9:16"
+        # Validate aspect ratio
+        if aspect_ratio not in ["16:9", "1:1", "9:16"]:
+            raise HTTPException(status_code=400, detail="Invalid aspect ratio. Must be 16:9, 1:1, or 9:16")
+        # Process images if provided
+        image_replacements = {}
+        if images:
+            images_dict = {}
+            for img in images:
+                img_bytes = await img.read()
+                data_url = image_to_base64(img_bytes, img.filename)
+                images_dict[img.filename] = data_url
+            html_content, image_replacements = embed_images_as_base64(html_content, images_dict)
+        # Create temp directory and convert
+        temp_dir = tempfile.mkdtemp()
+        pdf_bytes = convert_html_to_pdf(html_content, aspect_ratio, temp_dir)
+        # Return PDF
         return Response(
             content=pdf_bytes,
             media_type="application/pdf",
             headers={
+                "Content-Disposition": f"attachment; filename=converted.pdf",
                 "X-Aspect-Ratio": aspect_ratio,
+                "X-Image-Replacements": str(len(image_replacements)),
+                "X-PDF-Size": str(len(pdf_bytes))
             }
         )
         if temp_dir and os.path.exists(temp_dir):
             shutil.rmtree(temp_dir, ignore_errors=True)
+@app.post("/convert-base64")
+async def convert_to_pdf_base64(
+    html_content: str = Form(..., description="HTML content as string"),
+    aspect_ratio: Optional[str] = Form(None, description="Aspect ratio: 16:9, 1:1, or 9:16"),
+    auto_detect: bool = Form(True, description="Auto-detect aspect ratio from HTML")
+):
+    """
+    Convert HTML string to PDF and return as base64
+    - **html_content**: HTML content as string (required)
+    - **aspect_ratio**: Page aspect ratio (optional if auto_detect=true)
+    - **auto_detect**: Auto-detect aspect ratio from HTML content
+    """
+    temp_dir = None
+    try:
+        # Detect or use provided aspect ratio
+        if auto_detect:
+            detected_ratio = detect_aspect_ratio(html_content)
+            aspect_ratio = detected_ratio
+        elif not aspect_ratio:
+            aspect_ratio = "9:16"
+        # Validate aspect ratio
+        if aspect_ratio not in ["16:9", "1:1", "9:16"]:
+            raise HTTPException(status_code=400, detail="Invalid aspect ratio. Must be 16:9, 1:1, or 9:16")
+        # Create temp directory and convert
+        temp_dir = tempfile.mkdtemp()
+        pdf_bytes = convert_html_to_pdf(html_content, aspect_ratio, temp_dir)
+        # Convert to base64
+        pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')
+        return JSONResponse({
+            "success": True,
+            "pdf_base64": pdf_base64,
+            "aspect_ratio": aspect_ratio,
+            "size_bytes": len(pdf_bytes)
+        })
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+    finally:
+        if temp_dir and os.path.exists(temp_dir):
+            shutil.rmtree(temp_dir, ignore_errors=True)
 if __name__ == "__main__":
     import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)