Spaces:

ABDALLALSWAITI
/

htmlpdf

Sleeping

App Files Files Community

ABDALLALSWAITI committed on Oct 16, 2025

Commit

5b2c21f

verified ·

1 Parent(s): 45054fd

Update api.py

Browse files

Files changed (1) hide show

api.py +132 -134

api.py CHANGED Viewed

@@ -1,17 +1,17 @@
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
-from fastapi.responses import Response, JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 import subprocess
 import os
 import tempfile
 import shutil
-from pathlib import Path
 import re
 from typing import List, Optional
 app = FastAPI(title="HTML to PDF Converter API")
-# Enable CORS
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -21,21 +21,15 @@ app.add_middleware(
 )
 def detect_aspect_ratio(html_content):
-    """
-    Detect aspect ratio from HTML content
-    Returns: "16:9", "1:1", or "9:16"
-    """
-    # Check for viewport meta tag
     viewport_match = re.search(r'<meta[^>]*viewport[^>]*content=["\']([^"\']*)["\']', html_content, re.IGNORECASE)
     if viewport_match:
         viewport = viewport_match.group(1).lower()
-        if 'width=device-width' in viewport or 'width=100%' in viewport:
-            if 'orientation=portrait' in viewport:
-                return "9:16"
-            elif 'orientation=landscape' in viewport:
-                return "16:9"
-    # Check for CSS aspect-ratio property
     aspect_match = re.search(r'aspect-ratio\s*:\s*(\d+)\s*/\s*(\d+)', html_content, re.IGNORECASE)
     if aspect_match:
         width = int(aspect_match.group(1))
@@ -48,110 +42,106 @@ def detect_aspect_ratio(html_content):
         else:
             return "1:1"
-    # Check for common presentation frameworks
     if any(keyword in html_content.lower() for keyword in ['reveal.js', 'impress.js', 'slide', 'presentation']):
         return "16:9"
-    # Default to A4 portrait
     return "9:16"
-def save_uploaded_images(images: List[UploadFile], temp_dir: str):
-    """Save uploaded images and return mapping"""
-    image_mapping = {}
-    images_dir = os.path.join(temp_dir, "images")
-    os.makedirs(images_dir, exist_ok=True)
-    for image in images:
-        # Save image
-        image_path = os.path.join(images_dir, image.filename)
-        with open(image_path, 'wb') as f:
-            content = image.file.read()
-            f.write(content)
-        # Create mapping
-        image_mapping[image.filename] = f"images/{image.filename}"
-        print(f"API: Saved image: {image.filename} -> {image_path}")
-    return image_mapping
-def process_html_with_images(html_content: str, temp_dir: str, image_mapping: dict):
-    """Process HTML to handle image references with absolute file paths"""
-    replacements_made = []
-    for original_name, relative_path in image_mapping.items():
-        # Get absolute path for the image
-        absolute_path = os.path.abspath(os.path.join(temp_dir, relative_path))
-        file_url = f"file://{absolute_path}"
-        # Escape the filename for regex
-        escaped_name = re.escape(original_name)
-        # Pattern 1: src with any path prefix
-        pattern1 = rf'src=(["\'])(?:[^"\']*?/)?{escaped_name}\1'
-        matches1 = re.findall(pattern1, html_content, flags=re.IGNORECASE)
-        html_content = re.sub(
-            pattern1,
-            f'src=\\1{file_url}\\1',
-            html_content,
-            flags=re.IGNORECASE
-        )
         if matches1:
-            replacements_made.append(f"Pattern 1 (src): Found {len(matches1)} matches for {original_name}")
-        # Pattern 2: url() with any path prefix
-        pattern2 = rf'url\((["\']?)(?:[^)"\']*/)?{escaped_name}\1\)'
-        matches2 = re.findall(pattern2, html_content, flags=re.IGNORECASE)
-        html_content = re.sub(
-            pattern2,
-            f'url("{file_url}")',
-            html_content,
-            flags=re.IGNORECASE
-        )
         if matches2:
-            replacements_made.append(f"Pattern 2 (url): Found {len(matches2)} matches for {original_name}")
-        # Pattern 3: href with any path prefix
-        pattern3 = rf'href=(["\'])(?:[^"\']*?/)?{escaped_name}\1'
-        matches3 = re.findall(pattern3, html_content, flags=re.IGNORECASE)
-        html_content = re.sub(
-            pattern3,
-            f'href=\\1{file_url}\\1',
-            html_content,
-            flags=re.IGNORECASE
-        )
         if matches3:
-            replacements_made.append(f"Pattern 3 (href): Found {len(matches3)} matches for {original_name}")
-    # Print debug info
-    if replacements_made:
-        print("=== API Image Replacements Made ===")
-        for msg in replacements_made:
-            print(f"  ✓ {msg}")
     else:
-        print("=== API WARNING: No image replacements made ===")
-        print(f"Looking for images: {list(image_mapping.keys())}")
-    return html_content
 def convert_html_to_pdf(html_content: str, aspect_ratio: str, temp_dir: str):
-    """
-    Convert HTML content to PDF using Puppeteer
-    Args:
-        html_content: String containing HTML content
-        aspect_ratio: One of "16:9", "1:1", or "9:16"
-        temp_dir: Temporary directory for processing
-    Returns:
-        Tuple of (pdf_bytes, error_message)
-    """
     try:
-        # Inject CSS to preserve styles better
         style_injection = """
         <style>
-            @page {
-                margin: 0;
-            }
             * {
                 -webkit-print-color-adjust: exact !important;
                 print-color-adjust: exact !important;
@@ -164,7 +154,6 @@ def convert_html_to_pdf(html_content: str, aspect_ratio: str, temp_dir: str):
         </style>
         """
-        # Insert style injection
         if '</head>' in html_content:
             html_content = html_content.replace('</head>', style_injection + '</head>')
         elif '<body' in html_content:
@@ -172,57 +161,57 @@ def convert_html_to_pdf(html_content: str, aspect_ratio: str, temp_dir: str):
         else:
             html_content = style_injection + html_content
-        # Save HTML content to temporary file
         html_file = os.path.join(temp_dir, "input.html")
         with open(html_file, 'w', encoding='utf-8') as f:
             f.write(html_content)
-        # Get the path to puppeteer_pdf.js
         puppeteer_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'puppeteer_pdf.js')
-        print(f"API: Running Puppeteer conversion with aspect ratio: {aspect_ratio}")
-        print(f"API: HTML file: {html_file}")
-        print(f"API: Puppeteer script: {puppeteer_script}")
-        # Run Node.js script to convert HTML to PDF
         result = subprocess.run(
             ['node', puppeteer_script, html_file, aspect_ratio],
             capture_output=True,
             text=True,
             timeout=60,
-            cwd=os.path.dirname(os.path.abspath(__file__))
         )
         if result.returncode != 0:
-            print(f"API: Puppeteer error: {result.stderr}")
             return None, f"PDF conversion failed: {result.stderr}"
-        # Get the generated PDF path
         pdf_file = html_file.replace('.html', '.pdf')
         if not os.path.exists(pdf_file):
             return None, "PDF file was not generated"
-        # Read PDF file into memory
         with open(pdf_file, 'rb') as f:
             pdf_bytes = f.read()
-        print(f"API: PDF generated successfully, size: {len(pdf_bytes)} bytes")
         return pdf_bytes, None
     except subprocess.TimeoutExpired:
         return None, "Error: PDF conversion timed out (60 seconds)"
     except Exception as e:
-        print(f"API: Conversion error: {str(e)}")
         return None, f"Error: {str(e)}"
 @app.get("/")
 async def root():
     return {
-        "message": "HTML to PDF Converter API",
-        "version": "2.0",
         "endpoints": {
-            "/convert": "POST - Convert HTML to PDF (supports file upload or raw HTML)",
             "/health": "GET - Health check"
         }
     }
@@ -240,17 +229,17 @@ async def convert_to_pdf(
     images: Optional[List[UploadFile]] = File(None)
 ):
     """
-    Convert HTML to PDF
     Parameters:
     - html_file: HTML file upload (optional)
     - html_content: Raw HTML content (optional, used if html_file not provided)
     - aspect_ratio: "16:9", "1:1", or "9:16" (optional if auto_detect is True)
     - auto_detect: Auto-detect aspect ratio from HTML (default: True)
-    - images: List of image files referenced in the HTML (optional)
     Returns:
-    - PDF file as bytes
     """
     temp_dir = None
@@ -271,35 +260,38 @@ async def convert_to_pdf(
             html = html_content
             filename = "converted.pdf"
         # Create temp directory
         temp_dir = tempfile.mkdtemp()
-        print(f"API: Created temp directory: {temp_dir}")
-        # Process images if provided
         if images:
-            print(f"API: Processing {len(images)} uploaded images")
-            image_mapping = save_uploaded_images(images, temp_dir)
-            html = process_html_with_images(html, temp_dir, image_mapping)
-            print(f"API: Image processing complete")
         # Determine aspect ratio
         if auto_detect or not aspect_ratio:
             detected_ratio = detect_aspect_ratio(html)
             aspect_ratio = detected_ratio
-            print(f"API: Auto-detected aspect ratio: {aspect_ratio}")
         else:
-            # Validate aspect ratio
             if aspect_ratio not in ["16:9", "1:1", "9:16"]:
                 raise HTTPException(status_code=400, detail="Invalid aspect_ratio. Must be '16:9', '1:1', or '9:16'")
-            print(f"API: Using specified aspect ratio: {aspect_ratio}")
         # Convert to PDF
         pdf_bytes, error = convert_html_to_pdf(html, aspect_ratio, temp_dir)
-        # Cleanup
-        if temp_dir:
-            shutil.rmtree(temp_dir, ignore_errors=True)
         if error:
             raise HTTPException(status_code=500, detail=error)
@@ -308,23 +300,29 @@ async def convert_to_pdf(
         if not output_filename.endswith('.pdf'):
             output_filename = 'converted.pdf'
-        # Return PDF as response
         return Response(
             content=pdf_bytes,
             media_type="application/pdf",
             headers={
                 "Content-Disposition": f"attachment; filename={output_filename}",
-                "X-Aspect-Ratio": aspect_ratio
             }
         )
     except HTTPException:
         raise
     except Exception as e:
-        if temp_dir:
-            shutil.rmtree(temp_dir, ignore_errors=True)
-        print(f"API: Error in convert endpoint: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 if __name__ == "__main__":
     import uvicorn

 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
+from fastapi.responses import Response
 from fastapi.middleware.cors import CORSMiddleware
 import subprocess
 import os
 import tempfile
 import shutil
+import base64
 import re
+import mimetypes
 from typing import List, Optional
 app = FastAPI(title="HTML to PDF Converter API")
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
 )
 def detect_aspect_ratio(html_content):
+    """Detect aspect ratio from HTML content"""
     viewport_match = re.search(r'<meta[^>]*viewport[^>]*content=["\']([^"\']*)["\']', html_content, re.IGNORECASE)
     if viewport_match:
         viewport = viewport_match.group(1).lower()
+        if 'orientation=portrait' in viewport:
+            return "9:16"
+        elif 'orientation=landscape' in viewport:
+            return "16:9"
     aspect_match = re.search(r'aspect-ratio\s*:\s*(\d+)\s*/\s*(\d+)', html_content, re.IGNORECASE)
     if aspect_match:
         width = int(aspect_match.group(1))
         else:
             return "1:1"
     if any(keyword in html_content.lower() for keyword in ['reveal.js', 'impress.js', 'slide', 'presentation']):
         return "16:9"
     return "9:16"
# MIME types for the common image extensions. These are fixed here instead of
# being left to the host's ``mimetypes`` database, whose answers for some of
# them (e.g. ``.bmp``, ``.webp``) differ between platforms and Python versions.
_IMAGE_MIME_TYPES = {
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.png': 'image/png',
    '.gif': 'image/gif',
    '.svg': 'image/svg+xml',
    '.webp': 'image/webp',
    '.bmp': 'image/bmp',
}


def image_to_base64(image_bytes, filename):
    """Convert image bytes to a base64 ``data:`` URL.

    The MIME type is resolved from the filename extension: first from the
    built-in table of common image types, then from ``mimetypes.guess_type``,
    and finally defaults to ``image/png`` when neither knows the extension.

    Args:
        image_bytes: Raw image content (bytes-like).
        filename: Name of the image file; only used to pick the MIME type.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``, or ``None``
        when the inputs cannot be converted (for example ``filename`` is
        ``None`` or ``image_bytes`` is not bytes-like). The failure is
        printed rather than raised.
    """
    try:
        ext = os.path.splitext(filename)[1].lower()
        mime_type = _IMAGE_MIME_TYPES.get(ext)
        if not mime_type:
            # Unknown extension: ask the system database before giving up.
            mime_type, _ = mimetypes.guess_type(filename)
        if not mime_type:
            mime_type = 'image/png'
        # Base64 output is pure ASCII.
        b64_data = base64.b64encode(image_bytes).decode('ascii')
        return f"data:{mime_type};base64,{b64_data}"
    except (TypeError, ValueError) as e:
        # Best effort: report the problem and let the caller skip this image.
        print(f"Error converting {filename} to base64: {str(e)}")
        return None
def embed_images_as_base64(html_content, images: List[UploadFile]):
    """Inline uploaded images into the HTML as base64 ``data:`` URLs.

    Every upload is read in full and converted with ``image_to_base64``.
    References to its filename (optionally preceded by a directory path) are
    then replaced, case-insensitively, in three places: the ``src`` attribute
    of ``<img>`` tags, ``background-image: url(...)`` declarations, and any
    remaining CSS ``url(...)``.

    Args:
        html_content: HTML document text.
        images: Uploaded files. Each must expose ``filename`` and a seekable,
            readable ``file``. May be ``None`` or empty.

    Returns:
        A tuple ``(html, replacements)``. ``html`` is the rewritten document
        and ``replacements`` maps ``"<filename> (<kind>)"`` to the number of
        substitutions made, where ``<kind>`` is ``img src``, ``bg-image`` or
        ``url``. ``replacements`` is empty when nothing was substituted.
    """
    if not images:
        return html_content, {}

    # Create mapping of filename to base64 data URL
    image_data_urls = {}
    for img in images:
        # A nameless upload cannot be referenced from the HTML, and an empty
        # name would make the patterns below match any src ending in "/".
        if not img.filename:
            print("✗ Skipped an uploaded image without a filename")
            continue
        img.file.seek(0)
        image_bytes = img.file.read()
        data_url = image_to_base64(image_bytes, img.filename)
        if data_url:
            image_data_urls[img.filename] = data_url
            print(f"✓ Converted {img.filename} to base64 ({len(data_url)} chars)")

    if not image_data_urls:
        return html_content, {}

    # Track replacements
    replacements = {}
    for filename, data_url in image_data_urls.items():
        # Escape filename for regex
        escaped_name = re.escape(filename)

        # (label, pattern, flags). Order matters: the generic url() rule runs
        # last so it only picks up what the background-image rule left over.
        rules = (
            (
                "img src",
                rf'(<img[^>]*\s+src\s*=\s*)(["\'])(?:[^"\']*?/)?{escaped_name}\2',
                re.IGNORECASE | re.DOTALL,
            ),
            (
                "bg-image",
                rf'(background-image\s*:\s*url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))',
                re.IGNORECASE,
            ),
            (
                "url",
                rf'(url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))',
                re.IGNORECASE,
            ),
        )

        def _swap(match, data_url=data_url):
            # Keep whatever quoting the author used (including none): forcing
            # double quotes would terminate a surrounding style="..."
            # attribute. A base64 data URL is valid unquoted inside url().
            quote = match.group(2)
            closing = match.group(3) if match.re.groups >= 3 else ""
            return f"{match.group(1)}{quote}{data_url}{quote}{closing}"

        for label, pattern, flags in rules:
            # A callable replacement avoids parsing the (potentially huge)
            # data URL as a regex replacement template; subn counts in the
            # same pass.
            html_content, count = re.subn(pattern, _swap, html_content, flags=flags)
            if count:
                replacements[f"{filename} ({label})"] = count

    # Log results
    if replacements:
        print("=== Image Replacements ===")
        for key, count in replacements.items():
            print(f"  ✓ {key}: {count} replacement(s)")
    else:
        print("=== WARNING: No image replacements made ===")
        print(f"Looking for: {list(image_data_urls.keys())}")

    return html_content, replacements
 def convert_html_to_pdf(html_content: str, aspect_ratio: str, temp_dir: str):
+    """Convert HTML content to PDF using Puppeteer"""
     try:
+        # Inject CSS to preserve styles
         style_injection = """
         <style>
+            @page { margin: 0; }
             * {
                 -webkit-print-color-adjust: exact !important;
                 print-color-adjust: exact !important;
         </style>
         """
         if '</head>' in html_content:
             html_content = html_content.replace('</head>', style_injection + '</head>')
         elif '<body' in html_content:
         else:
             html_content = style_injection + html_content
+        # Save HTML to temp file
         html_file = os.path.join(temp_dir, "input.html")
         with open(html_file, 'w', encoding='utf-8') as f:
             f.write(html_content)
+        print(f"Saved HTML: {os.path.getsize(html_file):,} bytes")
+        # Find puppeteer script
         puppeteer_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'puppeteer_pdf.js')
+        if not os.path.exists(puppeteer_script):
+            return None, f"Error: puppeteer_pdf.js not found at {puppeteer_script}"
+        print(f"Using Puppeteer: {puppeteer_script}")
+        # Run conversion
         result = subprocess.run(
             ['node', puppeteer_script, html_file, aspect_ratio],
             capture_output=True,
             text=True,
             timeout=60,
+            cwd=os.path.dirname(os.path.abspath(puppeteer_script))
         )
         if result.returncode != 0:
             return None, f"PDF conversion failed: {result.stderr}"
+        # Read PDF
         pdf_file = html_file.replace('.html', '.pdf')
         if not os.path.exists(pdf_file):
             return None, "PDF file was not generated"
         with open(pdf_file, 'rb') as f:
             pdf_bytes = f.read()
+        print(f"PDF generated: {len(pdf_bytes):,} bytes")
         return pdf_bytes, None
     except subprocess.TimeoutExpired:
         return None, "Error: PDF conversion timed out (60 seconds)"
     except Exception as e:
+        print(f"Conversion error: {str(e)}")
         return None, f"Error: {str(e)}"
 @app.get("/")
 async def root():
     return {
+        "message": "HTML to PDF Converter API with Base64 Image Embedding",
+        "version": "2.1",
         "endpoints": {
+            "/convert": "POST - Convert HTML to PDF (images embedded as base64)",
             "/health": "GET - Health check"
         }
     }
     images: Optional[List[UploadFile]] = File(None)
 ):
     """
+    Convert HTML to PDF with embedded base64 images
     Parameters:
     - html_file: HTML file upload (optional)
     - html_content: Raw HTML content (optional, used if html_file not provided)
     - aspect_ratio: "16:9", "1:1", or "9:16" (optional if auto_detect is True)
     - auto_detect: Auto-detect aspect ratio from HTML (default: True)
+    - images: List of image files - will be embedded as base64 in HTML (optional)
     Returns:
+    - PDF file as bytes with images embedded
     """
     temp_dir = None
             html = html_content
             filename = "converted.pdf"
+        print(f"\n{'='*60}")
+        print(f"Processing HTML: {len(html)} characters")
         # Create temp directory
         temp_dir = tempfile.mkdtemp()
+        print(f"Temp directory: {temp_dir}")
+        # Embed images as base64 if provided
         if images:
+            print(f"Processing {len(images)} uploaded images...")
+            html, replacements = embed_images_as_base64(html, images)
+            if replacements:
+                print(f"Successfully embedded {len(replacements)} image reference(s)")
+            else:
+                print("WARNING: Images uploaded but no matches found in HTML")
+        else:
+            print("No images provided")
         # Determine aspect ratio
         if auto_detect or not aspect_ratio:
             detected_ratio = detect_aspect_ratio(html)
             aspect_ratio = detected_ratio
+            print(f"Auto-detected aspect ratio: {aspect_ratio}")
         else:
             if aspect_ratio not in ["16:9", "1:1", "9:16"]:
                 raise HTTPException(status_code=400, detail="Invalid aspect_ratio. Must be '16:9', '1:1', or '9:16'")
+            print(f"Using specified aspect ratio: {aspect_ratio}")
         # Convert to PDF
         pdf_bytes, error = convert_html_to_pdf(html, aspect_ratio, temp_dir)
         if error:
             raise HTTPException(status_code=500, detail=error)
         if not output_filename.endswith('.pdf'):
             output_filename = 'converted.pdf'
+        print(f"Success! Generated {output_filename}")
+        print(f"{'='*60}\n")
+        # Return PDF
         return Response(
             content=pdf_bytes,
             media_type="application/pdf",
             headers={
                 "Content-Disposition": f"attachment; filename={output_filename}",
+                "X-Aspect-Ratio": aspect_ratio,
+                "X-Images-Embedded": str(len(images)) if images else "0"
             }
         )
     except HTTPException:
         raise
     except Exception as e:
+        print(f"Error in convert endpoint: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
+    finally:
+        # Cleanup
+        if temp_dir and os.path.exists(temp_dir):
+            shutil.rmtree(temp_dir, ignore_errors=True)
 if __name__ == "__main__":
     import uvicorn