Spaces:
Sleeping
Sleeping
| """ | |
| FastAPI Backend for HTML to PDF Conversion | |
| Runs alongside Streamlit on port 7860 | |
| """ | |
| from fastapi import FastAPI, UploadFile, File, Form, HTTPException | |
| from fastapi.responses import Response, JSONResponse | |
| from fastapi.middleware.cors import CORSMiddleware | |
| import subprocess | |
| import os | |
| import tempfile | |
| import shutil | |
| import re | |
| import mimetypes | |
| from typing import List, Optional | |
| from pathlib import Path | |
# Application instance; title/description/version are passed straight to FastAPI.
app = FastAPI(
    title="HTML to PDF API",
    description="Convert HTML to PDF with image support and page breaks",
    version="1.0.0",
)

# CORS: the API is open to every origin, method and request header.
# Credentials are disabled on purpose: FastAPI's CORS documentation states that
# wildcard origins/methods/headers must not be combined with
# allow_credentials=True, and nothing in this module reads cookies or
# Authorization headers.
# expose_headers lets cross-origin browser clients read the metadata headers
# that the conversion endpoints attach to the PDF response.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
    expose_headers=[
        "Content-Disposition",
        "X-Aspect-Ratio",
        "X-Path-Replacements",
        "X-PDF-Size",
    ],
)
def detect_aspect_ratio(html_content):
    """Guess the page aspect ratio from HTML text.

    Checks, in order:
      1. a ``<meta ... viewport ... content="...">`` tag whose content contains
         ``orientation=portrait`` or ``orientation=landscape``;
      2. the first CSS ``aspect-ratio: W / H`` declaration with integer
         operands (above 1.5 is landscape, below 0.7 is portrait, anything
         else is square);
      3. presentation keywords (``reveal.js``, ``impress.js``, ``slide``,
         ``presentation``) anywhere in the document.

    Args:
        html_content: The HTML document as a string.

    Returns:
        One of ``"16:9"``, ``"1:1"`` or ``"9:16"``; ``"9:16"`` is the fallback
        when nothing matches.
    """
    viewport_match = re.search(
        r'<meta[^>]*viewport[^>]*content=["\']([^"\']*)["\']',
        html_content,
        re.IGNORECASE,
    )
    if viewport_match:
        viewport = viewport_match.group(1).lower()
        if 'orientation=portrait' in viewport:
            return "9:16"
        if 'orientation=landscape' in viewport:
            return "16:9"

    aspect_match = re.search(
        r'aspect-ratio\s*:\s*(\d+)\s*/\s*(\d+)',
        html_content,
        re.IGNORECASE,
    )
    if aspect_match:
        width = int(aspect_match.group(1))
        height = int(aspect_match.group(2))
        # A zero denominator (e.g. "aspect-ratio: 16/0") carries no usable
        # ratio; skip it instead of raising ZeroDivisionError and fall through
        # to the keyword heuristics below.
        if height > 0:
            ratio = width / height
            if ratio > 1.5:
                return "16:9"
            if ratio < 0.7:
                return "9:16"
            return "1:1"

    lowered = html_content.lower()
    if any(keyword in lowered for keyword in ('reveal.js', 'impress.js', 'slide', 'presentation')):
        return "16:9"
    return "9:16"
# Image file extensions recognised in src/url() references (matched case-insensitively).
_IMAGE_EXTENSIONS = r'(?:jpe?g|png|gif|svg|webp|bmp)'

# <img ... src="path.ext"> with a quoted src value.
_IMG_SRC_RE = re.compile(
    r'(?P<prefix><img[^>]*\s+src\s*=\s*)'
    r'(?P<quote>["\'])'
    r'(?P<path>[^"\']+\.' + _IMAGE_EXTENSIONS + r')'
    r'(?P=quote)',
    re.IGNORECASE,
)

# CSS url(path.ext), optionally quoted, optionally preceded by "background-image:".
_CSS_URL_RE = re.compile(
    r'(?P<decl>background-image\s*:\s*)?'
    r'(?P<open>url\s*\()'
    r'(?P<quote>["\']?)'
    r'(?P<path>[^)"\']+\.' + _IMAGE_EXTENSIONS + r')'
    r'(?P=quote)'
    r'(?P<close>\))',
    re.IGNORECASE,
)

# scheme://..., protocol-relative //..., or data: URIs.
_REMOTE_URL_RE = re.compile(r'(?:[a-z][a-z0-9+.\-]*:)?//|data:', re.IGNORECASE)


def normalize_image_paths(html_content):
    """Rewrite image references so they point at bare filenames.

    ``<img src="...">`` values and CSS ``url(...)`` values that end in a known
    image extension are reduced to ``os.path.basename`` of the referenced
    path.

    CSS ``url(...)`` values that are remote (``scheme://``, ``//host`` or
    ``data:``) are left untouched. ``<img src>`` values are always reduced to
    their basename, remote or not.

    Args:
        html_content: The HTML document as a string.

    Returns:
        A ``(html, replacements)`` tuple. ``replacements`` maps
        ``"img: <path>"``, ``"bg: <path>"`` (``background-image`` URLs) or
        ``"url: <path>"`` (any other CSS ``url()``) to the filename that was
        substituted.
    """
    replacements = {}

    def replace_img_src(match):
        full_path = match.group('path')
        filename = os.path.basename(full_path)
        replacements[f"img: {full_path}"] = filename
        quote = match.group('quote')
        return f"{match.group('prefix')}{quote}{filename}{quote}"

    def replace_css_url(match):
        full_path = match.group('path')
        if _REMOTE_URL_RE.match(full_path):
            # Remote resource: keep the declaration exactly as written.
            return match.group(0)
        filename = os.path.basename(full_path)
        declaration = match.group('decl') or ''
        label = "bg" if declaration else "url"
        replacements[f"{label}: {full_path}"] = filename
        quote = match.group('quote')
        # Rebuild the whole token, including the closing parenthesis, so the
        # resulting CSS stays well-formed.
        return (
            f"{declaration}{match.group('open')}"
            f"{quote}{filename}{quote}{match.group('close')}"
        )

    html_content = _IMG_SRC_RE.sub(replace_img_src, html_content)
    # One pass covers both background-image and generic url() references, so
    # no reference is rewritten or recorded twice.
    html_content = _CSS_URL_RE.sub(replace_css_url, html_content)
    return html_content, replacements
def inject_page_breaks(html_content: str, aspect_ratio: str) -> str:
    """Insert a print stylesheet that sets the page size and page-break rules.

    The ``<style id="auto-page-breaks">`` element is inserted exactly once:
    immediately before the first ``</head>`` if there is one, otherwise
    immediately before the first ``<body`` tag, otherwise at the very start of
    the document. Tag detection is case-insensitive.

    Args:
        html_content: The HTML document as a string.
        aspect_ratio: ``"16:9"`` gives a 288mm x 162mm page and ``"1:1"`` a
            210mm x 210mm page; any other value gives 162mm x 288mm.

    Returns:
        The HTML with the stylesheet inserted.
    """
    if aspect_ratio == "16:9":
        page_size = "288mm 162mm"
    elif aspect_ratio == "1:1":
        page_size = "210mm 210mm"
    else:
        page_size = "162mm 288mm"

    page_css = f"""
<style id="auto-page-breaks">
@page {{
    size: {page_size};
    margin: 0;
}}
html, body {{
    margin: 0 !important;
    padding: 0 !important;
    width: 100% !important;
    height: 100% !important;
}}
.page, .slide, section.page, article.page, div[class*="page"], div[class*="slide"] {{
    width: 100% !important;
    min-height: 100vh !important;
    page-break-after: always !important;
    break-after: page !important;
    page-break-inside: avoid !important;
    break-inside: avoid !important;
    position: relative !important;
    box-sizing: border-box !important;
}}
.page:last-child, .slide:last-child,
section.page:last-child, article.page:last-child {{
    page-break-after: auto !important;
    break-after: auto !important;
}}
body > section:not(.no-page-break),
body > article:not(.no-page-break),
body > div:not(.no-page-break) {{
    page-break-after: always !important;
    break-after: page !important;
    min-height: 100vh;
    box-sizing: border-box !important;
    position: relative !important;
}}
body > section:last-child,
body > article:last-child,
body > div:last-child {{
    page-break-after: auto !important;
}}
.page-break, .page-break-after {{
    page-break-after: always !important;
    break-after: page !important;
}}
.page-break-before {{
    page-break-before: always !important;
    break-before: page !important;
}}
.no-page-break, .keep-together {{
    page-break-inside: avoid !important;
    break-inside: avoid !important;
}}
h1, h2, h3, h4, h5, h6 {{
    page-break-after: avoid !important;
    break-after: avoid !important;
    page-break-inside: avoid !important;
    break-inside: avoid !important;
}}
img, figure, table, pre, blockquote {{
    page-break-inside: avoid !important;
    break-inside: avoid !important;
}}
* {{
    -webkit-print-color-adjust: exact !important;
    print-color-adjust: exact !important;
    color-adjust: exact !important;
}}
</style>
"""

    # Pick a single insertion point. Slicing (rather than str.replace) ensures
    # the stylesheet is injected once even if "</head>" also occurs later in
    # the document, e.g. inside a script string or a comment.
    head_close = re.search(r'</head\s*>', html_content, re.IGNORECASE)
    if head_close:
        insert_at = head_close.start()
    else:
        body_open = re.search(r'<body\b', html_content, re.IGNORECASE)
        insert_at = body_open.start() if body_open else 0

    return html_content[:insert_at] + page_css + html_content[insert_at:]
def convert_html_to_pdf(html_content, aspect_ratio, temp_dir, image_files=None):
    """Convert HTML content to PDF by running the ``puppeteer_pdf.js`` Node script.

    The HTML is normalised (image paths flattened, page-break CSS injected),
    written to ``<temp_dir>/input.html`` together with any supplied images,
    and ``node puppeteer_pdf.js <html_file> <aspect_ratio>`` is run with a
    60 second timeout. The script is expected to leave ``input.pdf`` next to
    ``input.html``.

    Args:
        html_content: The HTML document as a string.
        aspect_ratio: Aspect ratio string, forwarded to the page-break
            injection and to the Node script.
        temp_dir: Existing directory used as the working area; the caller owns
            its cleanup.
        image_files: Optional mapping of filename -> image bytes to write next
            to the HTML file.

    Returns:
        A ``(pdf_bytes, path_replacements)`` tuple, where ``path_replacements``
        is the mapping returned by ``normalize_image_paths``.

    Raises:
        Exception: On any failure (script not found, non-zero exit, timeout,
            missing output, I/O error); the original error is chained as the
            cause.
    """
    try:
        # Normalize image paths in HTML
        html_content, path_replacements = normalize_image_paths(html_content)
        # Inject page breaks
        html_content = inject_page_breaks(html_content, aspect_ratio)

        # Save HTML file
        html_file = os.path.join(temp_dir, "input.html")
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(html_content)

        # Save image files to the same directory
        if image_files:
            print(f"Saving {len(image_files)} image(s) to {temp_dir}")
            for filename, img_bytes in image_files.items():
                # Keep only the final path component so a name can neither
                # escape temp_dir nor resolve to the directory itself.
                safe_name = os.path.basename(filename or "")
                if safe_name in ("", ".", ".."):
                    print(f" Skipped image with unusable name: {filename!r}")
                    continue
                img_path = os.path.join(temp_dir, safe_name)
                with open(img_path, 'wb') as f:
                    f.write(img_bytes)
                print(f" Saved: {safe_name} ({len(img_bytes)} bytes)")

        # Debug: Log what's in the temp directory
        dir_contents = os.listdir(temp_dir)
        print(f"Temp directory contents: {dir_contents}")

        # Find puppeteer script: current directory, /app, then next to this module.
        possible_paths = [
            'puppeteer_pdf.js',
            '/app/puppeteer_pdf.js',
            os.path.join(os.path.dirname(__file__), 'puppeteer_pdf.js'),
        ]
        puppeteer_script = next(
            (path for path in possible_paths if os.path.exists(path)),
            None,
        )
        if not puppeteer_script:
            raise Exception("puppeteer_pdf.js not found")

        # Run Puppeteer from the script's own directory.
        result = subprocess.run(
            ['node', puppeteer_script, html_file, aspect_ratio],
            capture_output=True,
            text=True,
            timeout=60,
            cwd=os.path.dirname(os.path.abspath(puppeteer_script)),
        )
        if result.returncode != 0:
            raise Exception(f"PDF conversion failed: {result.stderr}")

        # Swap only the file extension; str.replace would also rewrite any
        # ".html" that happens to appear in the directory part of the path.
        pdf_file = os.path.splitext(html_file)[0] + '.pdf'
        if not os.path.exists(pdf_file):
            raise Exception("PDF file was not generated")
        with open(pdf_file, 'rb') as f:
            pdf_bytes = f.read()
        return pdf_bytes, path_replacements
    except subprocess.TimeoutExpired as e:
        raise Exception("PDF conversion timed out (60 seconds)") from e
    except Exception as e:
        raise Exception(f"Error: {str(e)}") from e
# NOTE(review): no route decorator is visible above this handler in this view;
# confirm it is registered on `app`.
async def root():
    """Return the service banner: name, version and the advertised endpoints."""
    advertised_endpoints = {
        "POST /convert": "Convert HTML to PDF",
        "GET /health": "Health check",
        "GET /docs": "API documentation",
    }
    banner = {}
    banner["message"] = "HTML to PDF Converter API"
    banner["version"] = "1.0.0"
    banner["endpoints"] = advertised_endpoints
    return banner
# NOTE(review): no route decorator is visible above this handler in this view;
# confirm it is registered on `app`.
async def health():
    """Report liveness with a fixed ``{"status": "healthy"}`` payload."""
    payload = dict(status="healthy")
    return payload
# NOTE(review): no route decorator is visible above this handler in this view;
# confirm it is registered on `app`.
async def convert_to_pdf(
    html_file: UploadFile = File(..., description="HTML file to convert"),
    aspect_ratio: Optional[str] = Form(None, description="Aspect ratio: 16:9, 1:1, or 9:16"),
    auto_detect: bool = Form(True, description="Auto-detect aspect ratio from HTML"),
    images: Optional[List[UploadFile]] = File(None, description="Images referenced in HTML")
):
    """
    Convert HTML to PDF with image files in same directory

    - **html_file**: HTML file to convert (required)
    - **aspect_ratio**: Page aspect ratio; only used when auto_detect=false (defaults to 9:16)
    - **auto_detect**: Auto-detect aspect ratio from HTML content (overrides aspect_ratio)
    - **images**: Image files referenced in HTML (saved to temp directory)
    """
    # Function-local import so this block is self-contained; fastapi is
    # already a dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    temp_dir = None
    try:
        # Read HTML content; fall back to latin-1, which accepts any byte sequence.
        raw_html = await html_file.read()
        try:
            html_content = raw_html.decode('utf-8')
        except UnicodeDecodeError:
            html_content = raw_html.decode('latin-1')

        # Detect or use provided aspect ratio
        if auto_detect:
            aspect_ratio = detect_aspect_ratio(html_content)
        elif not aspect_ratio:
            aspect_ratio = "9:16"

        # Validate aspect ratio
        if aspect_ratio not in ["16:9", "1:1", "9:16"]:
            raise HTTPException(status_code=400, detail="Invalid aspect ratio. Must be 16:9, 1:1, or 9:16")

        # Create temp directory
        temp_dir = tempfile.mkdtemp()

        # Read images into a dictionary keyed by bare filename
        # (e.g. "images/photo.png" -> "photo.png").
        image_files = {}
        for img in images or []:
            clean_filename = os.path.basename(img.filename or "")
            if not clean_filename:
                # Some clients submit an empty file part for an unused field;
                # an empty name would resolve to the temp directory itself and
                # fail the whole conversion.
                continue
            image_files[clean_filename] = await img.read()

        # The conversion blocks on a Node subprocess (up to 60 s); run it in
        # the thread pool so the event loop keeps serving other requests.
        pdf_bytes, path_replacements = await run_in_threadpool(
            convert_html_to_pdf,
            html_content,
            aspect_ratio,
            temp_dir,
            image_files,
        )

        # Return PDF
        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers={
                "Content-Disposition": "attachment; filename=converted.pdf",
                "X-Aspect-Ratio": aspect_ratio,
                "X-Path-Replacements": str(len(path_replacements)),
                "X-PDF-Size": str(len(pdf_bytes)),
            },
        )
    except HTTPException:
        # Preserve deliberate HTTP errors (e.g. the 400 above) as-is.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        if temp_dir and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir, ignore_errors=True)
# NOTE(review): no route decorator is visible above this handler in this view;
# confirm it is registered on `app`.
async def convert_string_to_pdf(
    html_content: str = Form(..., description="HTML content as string"),
    aspect_ratio: Optional[str] = Form(None, description="Aspect ratio: 16:9, 1:1, or 9:16"),
    auto_detect: bool = Form(True, description="Auto-detect aspect ratio from HTML")
):
    """
    Convert HTML string to PDF (for HTML without external images)

    - **html_content**: HTML content as string (required)
    - **aspect_ratio**: Page aspect ratio; only used when auto_detect=false (defaults to 9:16)
    - **auto_detect**: Auto-detect aspect ratio from HTML content (overrides aspect_ratio)
    """
    # Function-local import so this block is self-contained; fastapi is
    # already a dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    temp_dir = None
    try:
        # Detect or use provided aspect ratio
        if auto_detect:
            aspect_ratio = detect_aspect_ratio(html_content)
        elif not aspect_ratio:
            aspect_ratio = "9:16"

        # Validate aspect ratio
        if aspect_ratio not in ["16:9", "1:1", "9:16"]:
            raise HTTPException(status_code=400, detail="Invalid aspect ratio. Must be 16:9, 1:1, or 9:16")

        # Create temp directory and convert. The conversion blocks on a Node
        # subprocess (up to 60 s), so it runs in the thread pool to keep the
        # event loop responsive.
        temp_dir = tempfile.mkdtemp()
        pdf_bytes, path_replacements = await run_in_threadpool(
            convert_html_to_pdf,
            html_content,
            aspect_ratio,
            temp_dir,
            None,
        )

        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers={
                "Content-Disposition": "attachment; filename=converted.pdf",
                "X-Aspect-Ratio": aspect_ratio,
                "X-Path-Replacements": str(len(path_replacements)),
                "X-PDF-Size": str(len(pdf_bytes)),
            },
        )
    except HTTPException:
        # Preserve deliberate HTTP errors (e.g. the 400 above) as-is.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        if temp_dir and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir, ignore_errors=True)
if __name__ == "__main__":
    # Script entry point: serve `app` on all interfaces, port 7860.
    # uvicorn is imported here so it is only required when run as a script.
    from uvicorn import run as serve

    serve(app, port=7860, host="0.0.0.0")