Spaces:
Running
Running
| import sys | |
| import asyncio | |
| import os | |
| from contextlib import suppress | |
| from typing import Optional | |
| import time | |
| import uuid | |
| import re | |
| # Set Windows event loop policy for Playwright compatibility | |
| if sys.platform == 'win32': | |
| asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy()) | |
| from fastapi import FastAPI, HTTPException, Body | |
| from fastapi.responses import Response | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from pydantic import BaseModel, Field, field_validator | |
| from typing import List, Dict, Any | |
| import markdown | |
| from jinja2 import Template, Environment, select_autoescape | |
| from playwright.async_api import async_playwright, Browser, BrowserContext, Page, Error as PlaywrightError | |
| from datetime import datetime | |
| import re | |
| from urllib.parse import quote | |
| import gc | |
| import io | |
| import uvicorn | |
| # ==================== APP INITIALIZATION ==================== | |
app = FastAPI(
    title="Chat PDF Export Service",
    description="Production-grade API for exporting chat conversations to PDF",
    version="1.0.0",
)

# Add CORS for web clients.
# Credentials are disabled because a wildcard origin combined with
# allow_credentials=True is not a valid CORS configuration (browsers refuse
# "Access-Control-Allow-Origin: *" on credentialed requests, and FastAPI's
# documentation requires explicit origins in that case). If cookies or
# Authorization headers are ever needed cross-origin, list the exact origins
# here and re-enable credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Configure for your domain in production
    allow_credentials=False,
    allow_methods=["GET", "POST"],
    allow_headers=["*"],
)
| # ==================== GLOBAL CONFIGURATION ==================== | |
# Upper bound on the combined length of all message contents in one export
# request. The endpoint compares it against a sum of len() over strings, so
# the unit is characters, not bytes.
MAX_CONTENT_LENGTH = 50_000
# Intended PDF generation budget, in seconds.
# NOTE(review): not referenced in the visible part of this file — the endpoint
# passes a literal 30000 ms to Playwright; confirm whether it should use this.
PDF_GENERATION_TIMEOUT = 30  # seconds
# Number of pages served by one browser instance before the pool recycles it.
MAX_REQUESTS_PER_CONNECTION = 100  # After this, browser is restarted
| # ==================== PYDANTIC MODELS ==================== | |
class ExportRequest(BaseModel):
    """Request body for the chat-to-PDF export endpoint.

    Fields:
        messages: Non-empty list of chat messages; each must be a dict with
            "role" and "content" keys.
        language: Two-letter language code, normalised to lower case.
        font_family: Optional custom font family.
    """

    messages: List[dict] = Field(..., min_length=1)
    language: str = Field(default="en", description="ISO 639-1 language code")
    font_family: Optional[str] = Field(default=None, description="Custom font family")

    # The validators must be registered with @field_validator (and be
    # classmethods); as plain methods pydantic never calls them.
    @field_validator('messages')
    @classmethod
    def validate_messages(cls, v: list) -> list:
        """Reject messages that are not dicts or lack "role"/"content"."""
        for msg in v:
            if not isinstance(msg, dict):
                raise ValueError('Each message must be a dictionary')
            if 'role' not in msg or 'content' not in msg:
                raise ValueError('Message must have "role" and "content" keys')
        return v

    @field_validator('language')
    @classmethod
    def validate_language(cls, v: str) -> str:
        """Require exactly two ASCII letters and return them lower-cased.

        The value ends up in the download filename and the HTML ``lang``
        attribute, so digits, punctuation and control characters are refused.
        """
        if (
            not isinstance(v, str)
            or len(v) != 2
            or not (v.isascii() and v.isalpha())
        ):
            raise ValueError('Language must be a 2-letter ISO code')
        return v.lower()
| # ==================== PLAYWRIGHT BROWSER MANAGEMENT ==================== | |
class PlaywrightBrowserPool:
    """Own one shared Chromium browser/context and hand out pages from it.

    Despite the name this is a single long-lived browser, not a set of
    browsers. The browser is (re)created lazily, replaced when it is found
    disconnected, and recycled after MAX_REQUESTS_PER_CONNECTION pages.
    """

    def __init__(self):
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        self.playwright = None
        self.request_count = 0  # pages handed out by the current browser
        self._lock = asyncio.Lock()  # serialises launch/restart/page creation
        self._last_maintenance = time.time()

    async def get_page(self) -> Page:
        """Return a new page, launching or restarting the browser if needed.

        The caller is responsible for closing the returned page.
        """
        async with self._lock:
            browser_dead = self.browser is not None and not self.browser.is_connected()
            if browser_dead:
                # Browser crashed (OOM, timeout, etc.)
                print("[browser_pool] Browser disconnected — restarting…")

            recycle_due = self.request_count >= MAX_REQUESTS_PER_CONNECTION
            missing = not self.browser or not self.context

            if browser_dead or recycle_due or missing:
                # Always go through the restart path so that a stale
                # Playwright driver is stopped before a new one is started.
                # NOTE(review): a restart closes the browser for every page
                # that is still rendering in another request — confirm this
                # is acceptable or track in-flight pages before recycling.
                await self._restart_browser()

            try:
                page = await self.context.new_page()
            except Exception:
                # Browser died between the check and page creation — restart
                print("[browser_pool] Failed to create page — restarting browser…")
                await self._restart_browser()
                page = await self.context.new_page()
            self.request_count += 1
            return page

    async def _create_browser(self):
        """Start Playwright, launch Chromium and open the shared context.

        If the launch or context creation fails, the partially started
        Playwright driver is shut down again before the error propagates.
        """
        self.playwright = await async_playwright().start()
        try:
            # NOTE(review): '--disable-web-security' and '--no-sandbox' are
            # risky when the rendered HTML derives from untrusted input;
            # confirm they are still required.
            self.browser = await self.playwright.chromium.launch(
                headless=True,
                args=[
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-gpu',
                    '--no-zygote',  # Critical for Docker: skip forking zygote process
                    '--single-process',  # Critical for Docker: run everything in one process
                    '--disable-web-security',
                    '--disable-features=VizDisplayCompositor',
                    '--font-render-hinting=none',  # Prevents blurry PDF text
                    '--disable-lcd-text',  # Disable subpixel AA (fuzz in PDFs)
                    '--enable-font-antialiasing',
                    '--force-color-profile=srgb',
                ]
            )
            self.context = await self.browser.new_context()
        except Exception:
            await self._shutdown()
            raise
        self.request_count = 0
        self._last_maintenance = time.time()
        # The Browser object may not expose a process handle; read it
        # defensively so that logging can never break a successful launch.
        process = getattr(self.browser, "process", None)
        pid = getattr(process, "pid", "?") if process else "?"
        print(f"[browser_pool] Browser launched successfully (pid={pid})")

    async def _shutdown(self):
        """Best-effort close of the browser and Playwright driver; clears state."""
        with suppress(Exception):
            if self.browser:
                await self.browser.close()
        with suppress(Exception):
            if self.playwright:
                await self.playwright.stop()
        self.browser = None
        self.context = None
        self.playwright = None

    async def _restart_browser(self):
        """Restart browser to free memory and resources"""
        # Close existing browser + Playwright server gracefully
        await self._shutdown()
        gc.collect()
        await self._create_browser()

    async def close(self):
        """Clean up browser instances"""
        await self._shutdown()
# Initialize global browser pool.
# Constructing the pool only sets up bookkeeping (see __init__); Chromium is
# launched on the first get_page() call.
browser_pool = PlaywrightBrowserPool()
| # ==================== MULTILINGUAL FONT MAPPING ==================== | |
# Maps a lower-case two-letter language code to the font used for it.
MULTILINGUAL_FONTS = {
    # (display_name, google_font_url_param, is_system_font)
    # is_system_font=True means no Google Font link needed (the font is pre-installed)
    'en': ('Georgia', '', True),
    'hi': ('Noto Sans Devanagari', 'Noto+Sans+Devanagari:wght@400;600;700', False),
    'ar': ('Noto Sans Arabic', 'Noto+Sans+Arabic:wght@400;600;700', False),
    # Persian is treated as right-to-left by the direction helpers, so it
    # needs an Arabic-script font as well.
    'fa': ('Noto Sans Arabic', 'Noto+Sans+Arabic:wght@400;600;700', False),
    'zh': ('Noto Sans SC', 'Noto+Sans+SC:wght@400;600;700', False),
    'ja': ('Noto Sans JP', 'Noto+Sans+JP:wght@400;600;700', False),
    'ko': ('Noto Sans KR', 'Noto+Sans+KR:wght@400;600;700', False),
    'th': ('Noto Sans Thai', 'Noto+Sans+Thai:wght@400;600;700', False),
    'he': ('Noto Serif Hebrew', 'Noto+Serif+Hebrew:wght@400;600;700', False),
    'bn': ('Noto Sans Bengali', 'Noto+Sans+Bengali:wght@400;600;700', False),
    'ta': ('Noto Sans Tamil', 'Noto+Sans+Tamil:wght@400;600;700', False),
    'te': ('Noto Serif Telugu', 'Noto+Serif+Telugu:wght@400;600;700', False),
    'ml': ('Noto Serif Malayalam', 'Noto+Serif+Malayalam:wght@400;600;700', False),
    'ru': ('Georgia', '', True),
    'ur': ('Noto Nastaliq Urdu', 'Noto+Nastaliq+Urdu', False),
}
def get_font_for_language(lang: str) -> str:
    """Return the Google Fonts URL parameter registered for *lang*.

    The lookup is case-insensitive. Languages mapped to a system font, and
    languages missing from MULTILINGUAL_FONTS, yield an empty string.
    """
    fallback = ('Georgia', '', True)
    _display_name, google_param, _is_system = MULTILINGUAL_FONTS.get(
        lang.lower(), fallback
    )
    return google_param
| # ==================== HTML TEMPLATE - FIXED VERSION ==================== | |
# Jinja2 template for the chat export. Expected variables: language, title,
# font_family (Google Fonts URL parameter such as "Noto+Sans+SC:wght@400;700",
# or "" when no web font is needed), text_direction, text_alignment,
# document_title, date and messages (items with .role and .content_html).
PDF_HTML_TEMPLATE = """
<!DOCTYPE html>
<html lang="{{ language }}">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{{ title }}</title>
{# Only request a web font when one is configured; an empty family= query is an invalid request. #}
{% if font_family %}
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family={{ font_family }}&display=swap" rel="stylesheet">
{% endif %}
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/github-dark.min.css">
<style>
* {
  font-kerning: normal;
  text-rendering: optimizeLegibility;
  -webkit-font-smoothing: antialiased;
  -webkit-print-color-adjust: exact !important;
  print-color-adjust: exact !important;
  box-sizing: border-box;
}
body {
  {# The URL parameter uses "+" for spaces; the CSS family name needs real spaces. default(..., true) also covers the empty string. #}
  font-family: '{{ font_family.split(":")[0] | replace("+", " ") | default("Inter", true) }}', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
  max-width: 100%;
  margin: 0 auto;
  padding: 0;
  color: #000000;
  direction: {{ text_direction }};
  font-size: 11pt;
  line-height: 1.6;
}
/* HEADER STYLING */
.header {
  text-align: center;
  border-bottom: 2pt solid #000;
  padding-bottom: 15pt;
  margin-bottom: 20pt;
  page-break-after: avoid;
}
.logo {
  font-size: 18pt;
  font-weight: 800;
  color: #000;
  letter-spacing: -0.5px;
}
.date {
  color: #6b7280;
  font-size: 10pt;
  margin-top: 5pt;
}
/* MESSAGE CONTAINERS - FIXED SPACING */
.message {
  margin-bottom: 16pt;
  page-break-inside: auto;
}
h1, h2, h3, h4, h5, h6, .user .content {
  page-break-after: avoid;
}
.message:last-child {
  margin-bottom: 0;
}
/* USER MESSAGE - QUESTION HEADER */
.user .content {
  font-weight: 700;
  font-size: 12pt;
  color: #000000;
  margin: 0 0 8pt 0;
  padding: 0 0 8pt 0;
  border-bottom: 1px solid #e5e7eb;
  background: none;
  border-left: none;
}
img, svg {
  max-width: 100%;
  height: auto;
  display: block;
  margin: 12pt auto;
  page-break-inside: avoid;
}
/* SVG DIAGRAM STYLING */
.svg-diagram-container,
.mermaid-diagram-container {
  max-width: 100%;
  margin: 16pt 0;
  padding: 12pt;
  background: #f8f9fa;
  border: 1px solid #e9ecef;
  border-radius: 6pt;
  page-break-inside: avoid;
  overflow: hidden;
}
.svg-diagram-container svg,
.mermaid-diagram-container svg {
  max-width: 100%;
  max-height: 600px; /* Professional dimension limit */
  height: auto;
  margin: 0 auto;
  display: block;
}
/* INLINE SVG FROM MARKDOWN */
svg:not([class]) {
  max-width: 100%;
  height: auto;
  page-break-inside: avoid;
}
/* ASSISTANT MESSAGE - ANSWER BODY */
.assistant .content {
  font-weight: 400;
  color: #000000;
  padding: 0;
  margin: 0;
  font-size: 11pt;
}
/* CODE BLOCKS - FIXED WRAPPING ISSUES */
pre {
  background: #f8f9fa;
  border: 1px solid #e9ecef;
  border-radius: 6pt;
  margin: 12pt 0;
  padding: 12pt;
  page-break-inside: auto;
  orphans: 3;
  widows: 3;
  overflow-x: auto;
  white-space: pre-wrap;
  word-wrap: break-word;
  font-size: 10pt;
}
pre code {
  display: block;
  padding: 0;
  background: transparent;
  color: #000000;
  font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
  font-size: 10pt;
  line-height: 1.5;
  white-space: pre-wrap;
  word-wrap: break-word;
}
/* INLINE CODE */
code {
  font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
  font-size: 10pt;
  background-color: #f3f4f6;
  color: #000000;
  padding: 2px 4px;
  border-radius: 4px;
  white-space: normal;
}
/* TABLES */
table {
  width: 100%;
  border-collapse: collapse;
  margin: 12pt 0;
  font-size: 10pt;
  page-break-inside: auto;
  table-layout: fixed;
  word-wrap: break-word;
}
td {
  word-break: break-word;
  overflow-wrap: break-word;
}
th, td {
  border: 1px solid #d1d5db;
  padding: 8pt;
  text-align: {{ text_alignment }};
}
th {
  background: #f9fafb;
  font-weight: 600;
}
/* LISTS */
ul, ol {
  padding-left: 30px;
  margin: 8pt 0;
}
/* Nested unordered list style differentiation */
ul { list-style: disc; }
ul ul { list-style: circle; }
ul ul ul { list-style: square; }
/* Nested ordered list style differentiation */
ol { list-style: decimal; }
ol ol { list-style: lower-alpha; }
ol ol ol { list-style: lower-roman; }
li {
  margin: 4pt 0;
}
/* PARAGRAPHS AND TEXT ELEMENTS */
p {
  margin: 8pt 0;
}
h1, h2, h3, h4, h5, h6 {
  margin: 16pt 0 8pt 0;
  font-weight: 600;
  line-height: 1.3;
}
h1 { font-size: 16pt; }
h2 { font-size: 14pt; }
h3 { font-size: 13pt; }
/* PDF PAGE SETUP */
@page {
  size: A4;
  margin: 20mm;
}
@media print {
  body {
    -webkit-print-color-adjust: exact;
    print-color-adjust: exact;
  }
}
</style>
</head>
<body>
<div class="header">
<div class="logo">{{ document_title }}</div>
<div class="date">{{ date }}</div>
</div>
{% for msg in messages %}
<div class="message {{ msg.role }}">
<div class="content">
{{ msg.content_html | safe }}
</div>
</div>
{% endfor %}
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/highlight.min.js"></script>
<script>
// Enhanced loading strategy for highlight.js
(function() {
  function initHighlighting() {
    if (window.hljs) {
      try {
        hljs.highlightAll();
      } catch (e) {
        console.log('Highlight.js error:', e);
      }
    } else {
      console.log('Highlight.js not available, skipping syntax highlighting.');
    }
  }
  // Try to load with timeout
  Promise.race([
    new Promise(function(resolve) {
      if (document.fonts && document.fonts.ready) {
        document.fonts.ready.then(resolve).catch(resolve);
      } else {
        resolve();
      }
    }),
    new Promise(resolve => setTimeout(resolve, 3000))
  ]).then(initHighlighting);
})();
</script>
</body>
</html>
"""
# Create Jinja2 environment for security: values interpolated into the HTML
# templates are escaped unless explicitly marked with the `safe` filter.
# NOTE(review): templates here are built with from_string(), so escaping
# relies on select_autoescape's default for string templates — confirm
# against the installed Jinja2 version.
jinja_env = Environment(autoescape=select_autoescape(['html', 'xml']))
def get_text_direction(lang: str) -> str:
    """Return the CSS ``direction`` value for a language code.

    Arabic, Hebrew, Urdu and Persian are right-to-left ('rtl'); every other
    code is 'ltr'. The comparison is case-insensitive, matching how font
    lookup normalises language codes.
    """
    return 'rtl' if lang.lower() in ('ar', 'he', 'ur', 'fa') else 'ltr'
def get_text_alignment(lang: str) -> str:
    """Return the CSS ``text-align`` value for a language code.

    Arabic, Hebrew, Urdu and Persian align 'right'; every other code aligns
    'left'. The comparison is case-insensitive, matching how font lookup
    normalises language codes.
    """
    return 'right' if lang.lower() in ('ar', 'he', 'ur', 'fa') else 'left'
| # ==================== UTILITY FUNCTIONS ==================== | |
def sanitize_content(content: str) -> str:
    """Strip dangerous HTML from narrative text while preserving code.

    Strategy: mask code -> sanitize the remaining text -> unmask code.

    Fenced blocks (``` ... ```) and inline code spans (` ... `) are swapped
    for unique placeholder tokens, so markup shown as code is never altered.
    In the remaining text, dangerous tags are removed and event-handler
    attributes plus href/src values containing ``javascript:`` or ``data:``
    are stripped.

    This is a best-effort regex filter, not a full HTML sanitizer.

    Args:
        content: Raw message text; coerced with ``str()`` and stripped.

    Returns:
        The sanitized text with code spans restored verbatim.
    """
    content = str(content).strip()
    if not content:
        return content

    # Storage for the code spans that are temporarily hidden.
    placeholders = {}

    def mask_match(match):
        """Replace a code span with a unique token and remember the span."""
        token = f"__SAFE_CODE_BLOCK_{uuid.uuid4().hex}__"
        placeholders[token] = match.group(0)
        return token

    # --- PHASE 1: MASKING (protect code) ---
    # Pattern A: fenced code blocks (``` ... ```)
    content = re.sub(r'(```[\s\S]*?```)', mask_match, content)
    # Pattern B: inline code (` ... `); newlines are excluded so broken
    # syntax cannot over-match.
    content = re.sub(r'(`[^`\n]+`)', mask_match, content)

    # --- PHASE 2: FILTRATION (neutralise threats in narrative text) ---
    # 'link' and 'meta' are included because outside code blocks they are
    # likely injection attempts (CSS injection or redirects).
    dangerous_tags = ['script', 'iframe', 'object', 'embed', 'applet', 'form', 'link', 'meta']
    for tag in dangerous_tags:
        # Remove the element with its content (e.g. <script>...</script>).
        # \b keeps look-alike names such as <metadata> or <format> intact.
        content = re.sub(
            rf'<{tag}\b[^>]*>.*?</{tag}\s*>', '', content,
            flags=re.IGNORECASE | re.DOTALL,
        )
        # Remove unpaired opening, self-closing and stray closing tags.
        content = re.sub(rf'</?{tag}\b[^>]*>', '', content, flags=re.IGNORECASE)

    # Quoted attribute values. Each quote style is matched separately so that
    # onclick="alert('x')" is removed whole instead of stopping at the
    # first nested quote. These run over the whole text.
    quoted_attr_patterns = [
        # Event handlers (onclick, onload, onmouseover, etc.)
        r'\son[a-z]+\s*=\s*(?:"[^"]*"|\'[^\']*\')',
        # Javascript protocol in href/src
        r'\s(?:href|src)\s*=\s*(?:"[^"]*javascript:[^"]*"|\'[^\']*javascript:[^\']*\')',
        # Data URI exploits in href/src (Base64 HTML injection)
        r'\s(?:href|src)\s*=\s*(?:"[^"]*data:[^"]*"|\'[^\']*data:[^\']*\')',
    ]
    for attr_pattern in quoted_attr_patterns:
        content = re.sub(attr_pattern, '', content, flags=re.IGNORECASE)

    # Inside tags only, also catch unquoted values and '/' used as the
    # attribute separator (<img src=x onerror=alert(1)>, <svg/onload=...>).
    # Restricting this to tags keeps ordinary prose such as "one = 1" intact.
    in_tag_patterns = [
        re.compile(
            r'[\s/]on[a-z]+\s*=\s*(?:"[^"]*"|\'[^\']*\'|[^\s>]+)',
            re.IGNORECASE,
        ),
        re.compile(
            r'[\s/](?:href|src)\s*=\s*(?:'
            r'"[^"]*(?:javascript|data):[^"]*"'
            r'|\'[^\']*(?:javascript|data):[^\']*\''
            r'|(?!["\'])[^\s>]*(?:javascript|data):[^\s>]*)',
            re.IGNORECASE,
        ),
    ]

    def scrub_tag(match):
        """Drop dangerous attributes from a single tag."""
        tag_text = match.group(0)
        for pattern in in_tag_patterns:
            tag_text = pattern.sub('', tag_text)
        return tag_text

    content = re.sub(r'<[a-zA-Z][^<>]*>', scrub_tag, content)

    # --- PHASE 3: UNMASKING (restore code) ---
    # Restore in reverse order of creation: an inline span masked later may
    # itself contain the token of a fenced block masked earlier.
    for token, original_code in reversed(list(placeholders.items())):
        content = content.replace(token, original_code)
    return content
def fix_markdown_tables(content: str) -> str:
    """Insert the blank line Markdown needs before a table header.

    When a table header row (a line starting with ``|``) directly follows a
    line of text and is itself followed by a separator row such as
    ``|---|---|``, the single newline in front of the header becomes two.
    Text that already has the gap is returned unchanged.
    """
    # (non-space char) newline (| header | row) newline (|---|---| separator)
    header_without_gap = r'(?<=\S)\n(\|.*\|.*\n\|[- :|]+\|)'
    return re.sub(header_without_gap, r'\n\n\1', content)
def generate_filename(language: str) -> str:
    """Build the download filename for an export.

    The result has the form ``NeuralStream_Export_<YYYYmmdd_HHMMSS>_<lang>.pdf``
    using the current local time.

    The language part is reduced to ASCII letters, digits, ``_`` and ``-``
    because the filename is placed in a response header; quotes, spaces and
    line breaks must never reach it. If nothing is left, ``und`` is used.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    safe_language = re.sub(r'[^A-Za-z0-9_-]', '', str(language)) or 'und'
    return f"NeuralStream_Export_{timestamp}_{safe_language}.pdf"
def validate_content_size(content_length: int) -> bool:
    """Return True when *content_length* does not exceed MAX_CONTENT_LENGTH."""
    within_limit = content_length <= MAX_CONTENT_LENGTH
    return within_limit
| # ==================== API ENDPOINT ==================== | |
# NOTE(review): no route decorator is visible on this handler in this view of
# the file; it must be registered on `app` (e.g. with app.post) to be
# reachable — confirm the intended path.
async def export_pdf_endpoint(request: ExportRequest):
    """Export chat conversation to PDF with comprehensive multilingual support.

    Pipeline: size check -> sanitize and render each message from Markdown to
    HTML -> render the page template -> load it in a pooled browser page ->
    wait briefly for fonts/highlighting -> print to an A4 PDF.

    Returns:
        A ``Response`` carrying the PDF bytes as an attachment.

    Raises:
        HTTPException: 413 when the combined content is too large, 408 when
            the font/highlight wait times out, 500 for browser or other
            errors.
    """
    # Validate content size. Content is measured as text because
    # sanitize_content() also coerces it with str(); len() on a non-string
    # value would otherwise raise before any error handling is in place.
    total_content = sum(len(str(msg.get('content', ''))) for msg in request.messages)
    if not validate_content_size(total_content):
        raise HTTPException(
            status_code=413,
            detail=f"Content too large. Max allowed: {MAX_CONTENT_LENGTH} characters"
        )

    page = None
    try:
        # 1. Process messages to HTML
        processed_msgs = []
        for msg in request.messages:
            # Order matters: sanitize the RAW input first, then repair
            # tables, then render Markdown.
            raw_content = sanitize_content(msg.get('content', ''))
            fixed_content = fix_markdown_tables(raw_content)
            html_content = markdown.markdown(
                fixed_content,
                extensions=['fenced_code', 'tables', 'sane_lists', 'nl2br']
            )
            processed_msgs.append({
                'role': msg.get('role', 'unknown'),
                'content_html': html_content
            })

        # 2. Prepare HTML template
        font_family = get_font_for_language(request.language)
        template = jinja_env.from_string(PDF_HTML_TEMPLATE)
        full_html = template.render(
            messages=processed_msgs,
            document_title="NeuralStream AI",
            date=datetime.now().strftime("%B %d, %Y"),
            language=request.language,
            font_family=font_family,
            text_direction=get_text_direction(request.language),
            text_alignment=get_text_alignment(request.language),
            title=f"Chat Export {datetime.now().strftime('%m/%d/%Y')}"
        )

        # 3. Generate PDF with Playwright
        page = await browser_pool.get_page()
        # Playwright timeouts are in milliseconds; derive them from the
        # shared configuration value instead of repeating a literal.
        timeout_ms = PDF_GENERATION_TIMEOUT * 1000
        page.set_default_timeout(timeout_ms)
        page.set_default_navigation_timeout(timeout_ms)
        await page.set_content(full_html, wait_until='load')

        # 4. Wait for fonts and code highlighting (bounded, best effort)
        await asyncio.wait_for(
            page.evaluate('''async () => {
                try {
                    await Promise.race([
                        document.fonts ? document.fonts.ready : Promise.resolve(),
                        new Promise(resolve => setTimeout(resolve, 1500))
                    ]);
                    if (window.hljs) await new Promise(resolve => setTimeout(resolve, 150));
                } catch (e) { console.log('Font loading error:', e); }
            }'''),
            timeout=5.0
        )

        # 5. Hydrate links in code blocks and protect short blocks from
        # page splits. The script is a raw string so the regex backslashes
        # reach the browser unchanged without invalid-escape warnings.
        await page.evaluate(r'''() => {
            // A. LINK HYDRATION
            const codeElements = document.querySelectorAll('pre code');
            codeElements.forEach(block => {
                const urlRegex = /(https?:\/\/[^\s<"']+)/g;
                block.innerHTML = block.innerHTML.replace(urlRegex, (url) => {
                    return `<a href="${url}" style="text-decoration:underline; color:inherit; pointer-events:all;">${url}</a>`;
                });
            });
            // B. SMART LAYOUT PROTECTION
            // Heuristic: If a block is shorter than ~1/3 of a page (approx 350px),
            // assume it's a diagram or snippet that should NOT split.
            const preBlocks = document.querySelectorAll('pre');
            preBlocks.forEach(pre => {
                if (pre.offsetHeight < 350) {
                    pre.style.pageBreakInside = 'avoid';
                    pre.style.breakInside = 'avoid'; // Modern standard
                }
            });
        }''')

        # 6. Generate PDF
        pdf_bytes = await page.pdf(
            format="A4",
            margin={"top": "20mm", "bottom": "20mm", "left": "20mm", "right": "20mm"},
            print_background=True,
            display_header_footer=True,
            footer_template='<div style="font-size:9px; margin:0 auto; color:#666; text-align:center;">Page <span class="pageNumber"></span> of <span class="totalPages"></span></div>',
            header_template='<div></div>',
            prefer_css_page_size=True
        )

        filename = generate_filename(request.language)
        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers={
                # Quote the filename so it is parsed as a single value.
                "Content-Disposition": f'attachment; filename="{filename}"',
                "Cache-Control": "no-cache, no-store, must-revalidate",
                "Pragma": "no-cache",
                "Expires": "0"
            }
        )
    except asyncio.TimeoutError:
        raise HTTPException(
            status_code=408,
            detail="PDF generation timed out. The document may be too complex."
        )
    except PlaywrightError as e:
        raise HTTPException(
            status_code=500,
            detail=f"Browser error during PDF generation: {str(e)[:100]}"
        )
    except Exception as e:
        print(f"PDF Export API Error: {str(e)}")
        import traceback
        traceback.print_exc()
        raise HTTPException(
            status_code=500,
            detail="Internal server error during PDF generation. Please try again."
        )
    finally:
        # Always release the page, even on failure; closing is best effort.
        if page:
            with suppress(Exception):
                await page.close()
| # ==================== ANT-EDITOR DOCUMENT EXPORT ==================== | |
class EditorExportRequest(BaseModel):
    """Request model for AbWrite document PDF export"""

    html: str = Field(..., description="HTML content from TipTap editor.getHTML()")
    title: str = Field(default="Untitled Document", description="Document title")
    language: str = Field(default="en", description="ISO 639-1 language code for font selection")
    password: Optional[str] = Field(default=None, description="Optional password to encrypt the PDF")
    watermark_text: Optional[str] = Field(default=None, description="Optional watermark text to overlay on every page")

    # The validators must be registered with @field_validator (and be
    # classmethods); as plain methods pydantic never calls them.
    @field_validator('html')
    @classmethod
    def validate_html(cls, v: str) -> str:
        """Reject empty or whitespace-only HTML and HTML above the size cap."""
        if not v or not v.strip():
            raise ValueError('HTML content cannot be empty')
        # 10MB limit — documents with embedded base64 images are large
        if len(v) > 10_000_000:
            raise ValueError('HTML content too large (max 10MB)')
        return v

    @field_validator('language')
    @classmethod
    def validate_language(cls, v: str) -> str:
        """Return the first two characters of the code, lower-cased.

        Longer tags are accepted and truncated (e.g. "en-US" becomes "en").
        """
        if not isinstance(v, str) or len(v) < 2:
            raise ValueError('Language must be a valid ISO code')
        return v.lower()[:2]
| EDITOR_PDF_TEMPLATE = """ | |
| <!DOCTYPE html> | |
| <html lang="{{ language }}"> | |
| <head> | |
| <meta charset="UTF-8"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
| <title>{{ title }}</title> | |
| {% if not is_system_font %} | |
| <link rel="preconnect" href="https://fonts.googleapis.com"> | |
| <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin> | |
| <link href="https://fonts.googleapis.com/css2?family={{ font_url }}&display=swap" rel="stylesheet"> | |
| {% endif %} | |
| <!-- Load all editor Google Fonts for user-applied font-family inline styles --> | |
| <link href="https://fonts.googleapis.com/css2?family=Amatic+SC:wght@400;700&family=Bebas+Neue&family=Caveat:wght@400;700&family=Comic+Neue:wght@400;700&family=Cookie&family=Courier+Prime:wght@400;700&family=Dancing+Script:wght@400;700&family=Great+Vibes&family=Indie+Flower&family=Kaushan+Script&family=Lato:wght@300;400;700&family=Libre+Baskerville:ital,wght@0,400;0,700;1,400&family=Lora:ital,wght@0,400;0,600;1,400&family=Merriweather:wght@300;700&family=Montserrat:wght@400;600;800&family=Open+Sans:wght@400;600&family=Oswald:wght@400;500&family=Pacifico&family=Patrick+Hand&family=Permanent+Marker&family=Playfair+Display:wght@400;700&family=Poppins:wght@300;400;600&family=Roboto:wght@300;400;700&family=Sacramento&family=Shadows+Into+Light&family=Tinos:ital,wght@0,400;0,700;1,400&display=swap" rel="stylesheet"> | |
| <style> | |
| *, *::before, *::after { | |
| box-sizing: border-box; | |
| margin: 0; | |
| padding: 0; | |
| } | |
| body { | |
| /* Georgia first = matches the editor's default document font. | |
| Language-specific Noto font as fallback = auto-activates for non-Latin | |
| scripts (Hindi, Arabic, etc.) because Georgia lacks those glyphs. */ | |
| font-family: 'Georgia', '{{ font_name }}', 'Times New Roman', serif; | |
| color: #1a1a2e; | |
| direction: {{ text_direction }}; | |
| font-size: 13pt; | |
| line-height: 1.6; | |
| -webkit-print-color-adjust: exact !important; | |
| print-color-adjust: exact !important; | |
| -webkit-font-smoothing: antialiased; | |
| -moz-osx-font-smoothing: grayscale; | |
| text-rendering: optimizeLegibility; | |
| font-feature-settings: 'kern' 1, 'liga' 1; | |
| } | |
| /* ── Headings (matched to editor) ── */ | |
| h1 { | |
| font-size: 26pt; | |
| font-weight: 700; | |
| margin: 5pt 0 3pt 0; | |
| line-height: 1.2; | |
| color: #1a1a2e; | |
| letter-spacing: -0.015em; | |
| } | |
| h2 { | |
| font-size: 20pt; | |
| font-weight: 600; | |
| margin: 6pt 0 3pt 0; | |
| line-height: 1.25; | |
| color: #1a1a2e; | |
| letter-spacing: -0.01em; | |
| } | |
| h3 { | |
| font-size: 16pt; | |
| font-weight: 600; | |
| margin: 6pt 0 2pt 0; | |
| line-height: 1.3; | |
| color: #2a2a3e; | |
| } | |
| h4 { | |
| font-size: 13pt; | |
| font-weight: 600; | |
| margin: 6pt 0 1pt 0; | |
| color: #2a2a3e; | |
| } | |
| h5 { | |
| font-size: 11pt; | |
| font-weight: 600; | |
| margin: 6pt 0 1pt 0; | |
| color: #3a3a4e; | |
| text-transform: uppercase; | |
| letter-spacing: 0.04em; | |
| } | |
| h6 { | |
| font-size: 10pt; | |
| font-weight: 600; | |
| margin: 6pt 0 1pt 0; | |
| color: #4a4a5e; | |
| text-transform: uppercase; | |
| letter-spacing: 0.05em; | |
| } | |
| h1, h2, h3, h4, h5, h6 { page-break-after: avoid; } | |
| /* ── Paragraphs ── */ | |
| p { | |
| margin: 0 0 6pt 0; | |
| orphans: 3; | |
| widows: 3; | |
| } | |
| p:last-child { margin-bottom: 0; } | |
| /* ── Links (matched to editor #4a69bd) ── */ | |
| a { | |
| color: #4a69bd; | |
| text-decoration: underline; | |
| text-decoration-color: rgba(74, 105, 189, 0.4); | |
| text-underline-offset: 2px; | |
| } | |
| /* ── Lists ── */ | |
| ul, ol { padding-left: 22pt; margin: 5pt 0; } | |
| /* Nested unordered list styles (disc → circle → square) */ | |
| ul { list-style: disc; } | |
| ul ul { list-style: circle; } | |
| ul ul ul { list-style: square; } | |
| /* Nested ordered list styles (decimal → lower-alpha → lower-roman) */ | |
| ol { list-style: decimal; } | |
| ol ol { list-style: lower-alpha; } | |
| ol ol ol { list-style: lower-roman; } | |
| li { margin: 2pt 0; } | |
| li p { margin: 0; } | |
| ul[data-type="taskList"] { list-style: none; padding-left: 0; } | |
| ul[data-type="taskList"] li { display: flex; align-items: flex-start; gap: 6pt; } | |
| ul[data-type="taskList"] li[data-checked="true"] p { | |
| text-decoration: line-through; | |
| color: #999; | |
| } | |
| /* ── Tables (matched to editor) ── */ | |
| table { | |
| width: 100%; | |
| border-collapse: collapse; | |
| margin: 10pt 0; | |
| font-size: 12pt; | |
| page-break-inside: auto; | |
| table-layout: fixed; | |
| word-wrap: break-word; | |
| } | |
| th, td { | |
| border: 1px solid #ddd; | |
| padding: 6pt 9pt; | |
| text-align: {{ text_alignment }}; | |
| vertical-align: top; | |
| } | |
| th { | |
| background: #f5f5fa; | |
| font-weight: 600; | |
| text-align: left; | |
| } | |
| tr { page-break-inside: avoid; } | |
| /* ── Code blocks (dark theme, matched to editor) ── */ | |
| pre { | |
| background: #1e1e2e; | |
| color: #e8e8ed; | |
| border-radius: 6pt; | |
| padding: 12pt; | |
| margin: 10pt 0; | |
| overflow-x: auto; | |
| white-space: pre-wrap; | |
| word-wrap: break-word; | |
| font-size: 11pt; | |
| line-height: 1.5; | |
| page-break-inside: auto; | |
| orphans: 3; | |
| widows: 3; | |
| } | |
| pre code { | |
| display: block; | |
| background: transparent; | |
| padding: 0; | |
| font-family: 'Consolas', 'Fira Code', ui-monospace, SFMono-Regular, Menlo, Monaco, monospace; | |
| font-size: 11pt; | |
| color: #e8e8ed; | |
| white-space: pre-wrap; | |
| word-wrap: break-word; | |
| } | |
| /* ── Inline code (matched to editor) ── */ | |
| code { | |
| font-family: 'Consolas', 'Fira Code', ui-monospace, SFMono-Regular, Menlo, Monaco, monospace; | |
| font-size: 0.9em; | |
| background: #f0f0f5; | |
| padding: 2pt 4pt; | |
| border-radius: 3pt; | |
| color: #d63384; | |
| } | |
| /* ── Blockquotes (matched to editor — purple accent, italic) ── */ | |
| blockquote { | |
| border-left: 3pt solid #6c5ce7; | |
| padding: 6pt 0 6pt 12pt; | |
| margin: 10pt 0; | |
| color: #444; | |
| font-style: italic; | |
| background: rgba(108, 92, 231, 0.03); | |
| } | |
| /* ── Horizontal rules ── */ | |
| hr { | |
| border: none; | |
| border-top: 1pt solid #e0e0e5; | |
| margin: 18pt 0; | |
| } | |
| /* ── Images ── */ | |
| img { | |
| max-width: 100%; | |
| height: auto; | |
| display: block; | |
| margin: 8pt auto; | |
| border-radius: 4pt; | |
| page-break-inside: avoid; | |
| } | |
| /* ── Image-Text Block (float-based text wrapping) ── */ | |
| .image-text-block { | |
| overflow: hidden; | |
| margin: 8pt 0; | |
| page-break-inside: auto; | |
| } | |
| .image-text-block-image { | |
| max-width: 45%; | |
| line-height: 0; | |
| } | |
| .image-text-block[data-image-position="left"] .image-text-block-image { | |
| float: left; | |
| margin: 0 16px 8px 0; | |
| } | |
| .image-text-block[data-image-position="right"] .image-text-block-image { | |
| float: right; | |
| margin: 0 0 8px 16px; | |
| } | |
| .image-text-block-image img { max-width: 100%; height: auto; margin: 0; } | |
| /* Flip support */ | |
| .image-text-block[data-flip-h="true"] .image-text-block-image img { transform: scaleX(-1); } | |
| .image-text-block[data-flip-v="true"] .image-text-block-image img { transform: scaleY(-1); } | |
| .image-text-block[data-flip-h="true"][data-flip-v="true"] .image-text-block-image img { transform: scaleX(-1) scaleY(-1); } | |
| .image-text-block-content { } | |
| /* ── Resizable Image Wrapper (from editor's ResizableImage extension) ── */ | |
| .resizable-image-wrapper { | |
| width: fit-content; | |
| max-width: 100%; | |
| margin: 8pt 0; | |
| page-break-inside: avoid; | |
| } | |
| .resizable-image-wrapper img { | |
| display: block; | |
| max-width: 100%; | |
| height: auto; | |
| border-radius: 4pt; | |
| margin: 0; /* parent wrapper handles margins */ | |
| } | |
| /* Preserve explicit width/height from editor resize handles */ | |
| .resizable-image-wrapper img[width] { | |
| width: attr(width px); | |
| } | |
| /* Alignment variants */ | |
| .resizable-image-align-left { margin-left: 0; margin-right: auto; } | |
| .resizable-image-align-center { margin-left: auto; margin-right: auto; } | |
| .resizable-image-align-right { margin-left: auto; margin-right: 0; } | |
| /* Flip support for standalone images */ | |
| .resizable-image-wrapper[data-flip-h="true"] img { transform: scaleX(-1); } | |
| .resizable-image-wrapper[data-flip-v="true"] img { transform: scaleY(-1); } | |
| .resizable-image-wrapper[data-flip-h="true"][data-flip-v="true"] img { transform: scaleX(-1) scaleY(-1); } | |
| /* ── Highlights (warm yellow, matched to editor) ── */ | |
| mark { | |
| background: #fef3c7; | |
| padding: 1pt 2pt; | |
| border-radius: 2pt; | |
| } | |
| /* ── Text formatting ── */ | |
| s { text-decoration: line-through; color: #999; } | |
| sub { font-size: 0.75em; } | |
| sup { font-size: 0.75em; } | |
| /* ── PDF page setup ── */ | |
| @page { | |
| size: A4; | |
| margin: 25mm 20mm 25mm 20mm; | |
| } | |
| @media print { | |
| body { | |
| -webkit-print-color-adjust: exact; | |
| print-color-adjust: exact; | |
| } | |
| pre { | |
| background: #1e1e2e !important; | |
| color: #e8e8ed !important; | |
| } | |
| } | |
| </style> | |
| </head> | |
| <body> | |
| {% if watermark_text %} | |
| <div style=" | |
| position: fixed; | |
| top: 0; left: 0; right: 0; bottom: 0; | |
| display: flex; | |
| align-items: center; | |
| justify-content: center; | |
| pointer-events: none; | |
| z-index: 9999; | |
| "> | |
| <div style=" | |
| font-size: 72pt; | |
| font-weight: 800; | |
| color: rgba(0, 0, 0, 0.06); | |
| transform: rotate(-35deg); | |
| white-space: nowrap; | |
| user-select: none; | |
| letter-spacing: 0.08em; | |
| text-transform: uppercase; | |
| ">{{ watermark_text }}</div> | |
| </div> | |
| {% endif %} | |
| {{ content | safe }} | |
| <script> | |
| // Wait for fonts to load | |
| (function() { | |
| if (document.fonts && document.fonts.ready) { | |
| Promise.race([ | |
| document.fonts.ready, | |
| new Promise(resolve => setTimeout(resolve, 3000)) | |
| ]).catch(() => {}); | |
| } | |
| })(); | |
| </script> | |
| </body> | |
| </html> | |
| """ | |
async def editor_export_pdf(request: EditorExportRequest):
    """Export Ant-Editor document content to PDF via Playwright.

    The request HTML is rendered inside ``EDITOR_PDF_TEMPLATE``, loaded into a
    page obtained from ``browser_pool`` and printed as A4. When
    ``request.password`` is truthy the resulting PDF is re-saved encrypted,
    using that value as both owner and user password.

    Args:
        request: Export payload; this function reads ``html``, ``title``,
            ``language``, ``watermark_text`` and ``password`` from it.

    Returns:
        A ``Response`` with the PDF bytes, an attachment
        ``Content-Disposition`` header and caching disabled.

    Raises:
        HTTPException: 408 when one of the ``asyncio.wait_for`` waits times
            out; 500 with a truncated message on a Playwright error; 500 with
            a generic message on any other failure.
    """
    page = None
    try:
        # 1. Determine font settings (falls back to the 'en' entry)
        lang = request.language
        font_info = MULTILINGUAL_FONTS.get(lang, MULTILINGUAL_FONTS['en'])
        font_name = font_info[0]
        font_url = font_info[1]
        # The third element is optional in the font table entries.
        is_system_font = font_info[2] if len(font_info) > 2 else False

        # 2. Render HTML template
        template = jinja_env.from_string(EDITOR_PDF_TEMPLATE)
        full_html = template.render(
            content=request.html,
            title=request.title,
            language=lang,
            font_name=font_name,
            font_url=font_url,
            is_system_font=is_system_font,
            text_direction=get_text_direction(lang),
            text_alignment=get_text_alignment(lang),
            watermark_text=request.watermark_text or '',
        )

        # 3. Generate PDF via Playwright
        page = await browser_pool.get_page()
        page.set_default_timeout(30000)
        page.set_default_navigation_timeout(30000)
        await page.set_content(full_html, wait_until='networkidle')

        # 4. Wait for fonts to load (2 s cap in the page, 5 s cap on the call)
        await asyncio.wait_for(
            page.evaluate('''async () => {
                try {
                    await Promise.race([
                        document.fonts ? document.fonts.ready : Promise.resolve(),
                        new Promise(resolve => setTimeout(resolve, 2000))
                    ]);
                } catch (e) {}
            }'''),
            timeout=5.0
        )

        # 5. Wait for all images to finish loading (base64 + external URLs)
        #    A failed image resolves as well, so one broken <img> does not
        #    block the export.
        await asyncio.wait_for(
            page.evaluate('''() => {
                const imgs = document.querySelectorAll('img');
                return Promise.all(Array.from(imgs).map(img => {
                    if (img.complete) return Promise.resolve();
                    return new Promise(resolve => {
                        img.onload = resolve;
                        img.onerror = resolve;
                    });
                }));
            }'''),
            timeout=10.0
        )

        # 6. Apply explicit width from HTML attributes (CSS attr() fallback)
        await page.evaluate('''() => {
            document.querySelectorAll('.resizable-image-wrapper img').forEach(img => {
                const w = img.getAttribute('width');
                if (w) {
                    img.style.width = w + (w.includes('%') ? '' : 'px');
                    img.style.maxWidth = '100%';
                    img.style.height = 'auto';
                }
            });
        }''')

        # 7. Smart layout: keep short code blocks together
        await page.evaluate('''() => {
            const preBlocks = document.querySelectorAll('pre');
            preBlocks.forEach(pre => {
                if (pre.offsetHeight < 400) {
                    pre.style.pageBreakInside = 'avoid';
                    pre.style.breakInside = 'avoid';
                }
            });
        }''')

        # 8. Generate PDF bytes
        pdf_bytes = await page.pdf(
            format="A4",
            scale=1,
            margin={"top": "25mm", "bottom": "25mm", "left": "20mm", "right": "20mm"},
            print_background=True,
            display_header_footer=True,
            footer_template='<div style="font-size:9px; margin:0 auto; color:#999; text-align:center; width:100%;">Page <span class="pageNumber"></span> of <span class="totalPages"></span></div>',
            header_template='<div></div>',
            prefer_css_page_size=True,
        )

        # 9. Encrypt PDF with password if provided
        if request.password:
            pdf_bytes = _encrypt_editor_pdf(pdf_bytes, request.password)

        # 10. Build the download headers from the document title
        cd_value = _editor_content_disposition(request.title)

        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers={
                "Content-Disposition": cd_value,
                "Cache-Control": "no-cache, no-store, must-revalidate",
            }
        )
    except asyncio.TimeoutError:
        raise HTTPException(status_code=408, detail="PDF generation timed out")
    except PlaywrightError as e:
        # Truncate: Playwright messages can be long.
        raise HTTPException(status_code=500, detail=f"Browser error: {str(e)[:200]}")
    except Exception as e:
        print(f"Editor PDF Export Error: {str(e)}")
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail="PDF generation failed")
    finally:
        # Always hand the page back, even on failure; errors while closing
        # are deliberately ignored.
        if page:
            with suppress(Exception):
                await page.close()


def _encrypt_editor_pdf(pdf_bytes, password):
    """Return ``pdf_bytes`` re-saved with ``password`` as owner and user password."""
    # Imported lazily (as before) so pikepdf is only loaded when a password
    # is actually supplied.
    import pikepdf

    encrypted_buf = io.BytesIO()
    # The context manager closes the source document even if save() raises;
    # the previous explicit close() was skipped on that path.
    with pikepdf.open(io.BytesIO(pdf_bytes)) as src:
        src.save(
            encrypted_buf,
            encryption=pikepdf.Encryption(
                owner=password,
                user=password,
                R=6,  # AES-256
            ),
        )
    return encrypted_buf.getvalue()


def _editor_content_disposition(title):
    """Build the attachment ``Content-Disposition`` value for ``title``."""
    # ASCII-only fallback name for HTTP header safety: re.ASCII makes \w match
    # only [a-zA-Z0-9_], so nothing that cannot be encoded as latin-1 survives.
    safe_title = re.sub(r'[^\w\s-]', '', title, flags=re.ASCII)[:50].strip() or 'Document'
    safe_title = re.sub(r'\s+', '_', safe_title)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f"{safe_title}_{timestamp}.pdf"

    # RFC 5987 filename* is always appended (not only for non-ASCII titles):
    # it carries the percent-encoded Unicode name, while the quoted ASCII
    # filename above remains as the fallback.
    unicode_title = re.sub(r'[<>:"/\\|?*]', '', title)[:80].strip() or 'Document'
    unicode_title = re.sub(r'\s+', '_', unicode_title)
    unicode_filename = f"{unicode_title}_{timestamp}.pdf"
    encoded_unicode = quote(unicode_filename, safe='')

    return f'attachment; filename="{filename}"' + f"; filename*=UTF-8''{encoded_unicode}"
| # Health check endpoint | |
async def health():
    """Report liveness for load balancers and monitoring probes.

    Returns:
        A dict with a fixed ``status`` and ``service`` plus the current local
        time as an ISO-8601 ``timestamp``.
    """
    checked_at = datetime.now().isoformat()
    payload = dict(
        status="healthy",
        timestamp=checked_at,
        service="pdf-exporter",
    )
    return payload
| # Test endpoint (for debugging) | |
async def export_html_endpoint(request: ExportRequest):
    """Render the chat messages to HTML and return the page itself (debug aid).

    Each message's ``content`` is converted from Markdown, the result is
    rendered through ``PDF_HTML_TEMPLATE``, and the HTML is returned as
    ``text/html`` with an ``X-Debug`` header instead of being printed to PDF.
    """
    # Missing keys fall back to an empty body / the 'unknown' role.
    rendered_messages = [
        {
            'role': entry.get('role', 'unknown'),
            'content_html': markdown.markdown(
                str(entry.get('content', '')),
                extensions=['fenced_code', 'tables', 'sane_lists'],
            ),
        }
        for entry in request.messages
    ]

    lang = request.language
    font_family = get_font_for_language(lang)

    page_html = jinja_env.from_string(PDF_HTML_TEMPLATE).render(
        messages=rendered_messages,
        document_title="NeuralStream AI (HTML Preview)",
        date=datetime.now().strftime("%B %d, %Y"),
        language=lang,
        font_family=font_family,
        text_direction=get_text_direction(lang),
        text_alignment=get_text_alignment(lang),
        title="Chat HTML Preview",
    )

    debug_headers = {"X-Debug": "html-preview"}
    return Response(content=page_html, media_type="text/html", headers=debug_headers)
| # Startup/Shutdown events | |
async def startup_event():
    """Initialize browser pool on startup to warm up resources.

    Requests one page from ``browser_pool`` so launch problems surface at
    deploy time, then closes that page again. Any failure is printed and
    swallowed so the server still starts.
    """
    print("🚀 Pre-warming browser pool...")
    try:
        # Launch browser immediately so we catch any errors at deploy time
        # instead of failing on the first user request.
        warmup_page = await browser_pool.get_page()
    except Exception as e:
        print(f"⚠️ Warning: Failed to pre-warm browser pool: {e}")
        # We don't raise here so the server still starts; individual requests will retry.
        return

    # The page was only needed to force the launch. Close it the way
    # editor_export_pdf closes its page, so it is not left open for the
    # lifetime of the process; a failure to close is not worth reporting.
    with suppress(Exception):
        await warmup_page.close()
    print("✅ Browser pool warmed up successfully")
async def shutdown_event():
    """Close the shared browser pool when the application shuts down."""
    pool_shutdown = browser_pool.close()
    await pool_shutdown
if __name__ == "__main__":
    # Script entry point: apply the Windows Proactor loop policy (the file
    # notes it is required for Playwright), then serve the app with uvicorn.
    running_on_windows = sys.platform == 'win32'
    if running_on_windows:
        asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

    print("🚀 Starting NeuralStream PDF Backend...")
    server_options = {"host": "0.0.0.0", "port": 7860, "reload": False}
    uvicorn.run(app, **server_options)