import sys
import asyncio
import os
from contextlib import suppress
from typing import Optional
import time
import uuid
import re

# Set Windows event loop policy for Playwright compatibility
if sys.platform == 'win32':
    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

from fastapi import FastAPI, HTTPException, Body
from fastapi.responses import Response
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field, field_validator
from typing import List, Dict, Any
import markdown
from jinja2 import Template, Environment, select_autoescape
from playwright.async_api import async_playwright, Browser, BrowserContext, Page, Error as PlaywrightError
from datetime import datetime
import re
from urllib.parse import quote
import gc
import io
import uvicorn

# ==================== APP INITIALIZATION ====================
app = FastAPI(
    title="Chat PDF Export Service",
    description="Production-grade API for exporting chat conversations to PDF",
    version="1.0.0"
)

# Add CORS for web clients
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Configure for your domain in production
    allow_credentials=True,
    allow_methods=["GET", "POST"],
    allow_headers=["*"],
)

# ==================== GLOBAL CONFIGURATION ====================
MAX_CONTENT_LENGTH = 50_000  # Max total characters across all messages
PDF_GENERATION_TIMEOUT = 30  # seconds
MAX_REQUESTS_PER_CONNECTION = 100  # After this, browser is restarted


# ==================== PYDANTIC MODELS ====================
class ExportRequest(BaseModel):
    """Request body for the chat export endpoints.

    `messages` must hold at least one dict with "role" and "content" keys;
    `language` is a two-letter code that is normalised to lower case.
    """

    messages: List[dict] = Field(..., min_length=1)
    language: str = Field(default="en", description="ISO 639-1 language code")
    font_family: Optional[str] = Field(default=None, description="Custom font family")

    @field_validator('messages')
    @classmethod
    def validate_messages(cls, v: list) -> list:
        """Reject messages that are not dicts or lack "role"/"content"."""
        for msg in v:
            if not isinstance(msg, dict):
                raise ValueError('Each message must be a dictionary')
            if 'role' not in msg or 'content' not in msg:
                raise ValueError('Message must have "role" and "content" keys')
        return v

    @field_validator('language')
    @classmethod
    def validate_language(cls, v: str) -> str:
        """Require exactly two ASCII letters and return them lower-cased.

        The code ends up in the download filename (an HTTP header value), so
        arbitrary two-character input such as control characters or
        non-latin-1 text must not pass validation.
        """
        if not isinstance(v, str) or len(v) != 2 or not (v.isascii() and v.isalpha()):
            raise ValueError('Language must be a 2-letter ISO code')
        return v.lower()


# ==================== PLAYWRIGHT BROWSER MANAGEMENT ====================
class PlaywrightBrowserPool:
    """Owns one long-lived Chromium browser/context and hands out pages.

    The browser is launched lazily, relaunched when it is found disconnected,
    and recycled after MAX_REQUESTS_PER_CONNECTION pages have been served.
    """

    def __init__(self):
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        self.playwright = None
        self.request_count = 0
        self._lock = asyncio.Lock()
        self._last_maintenance = time.time()

    async def get_page(self) -> Page:
        """Return a new page, (re)starting the browser first if needed.

        The caller is responsible for closing the returned page.
        """
        async with self._lock:
            if self.browser and not self.browser.is_connected():
                # Browser crashed (OOM, timeout, etc.). Do a full restart so the
                # old Playwright driver is stopped as well instead of leaking.
                print("[browser_pool] Browser disconnected — restarting…")
                await self._restart_browser()
            elif not self.browser or not self.context:
                await self._create_browser()

            # Perform maintenance every N requests.
            # NOTE(review): the lock only guards this method, so pages handed
            # out earlier may still be in use when the browser is recycled.
            if self.request_count > MAX_REQUESTS_PER_CONNECTION:
                await self._restart_browser()
                self.request_count = 0

            try:
                page = await self.context.new_page()
            except Exception:
                # Browser died between the check and page creation — restart
                print("[browser_pool] Failed to create page — restarting browser…")
                await self._restart_browser()
                page = await self.context.new_page()

            self.request_count += 1
            return page

    async def _create_browser(self):
        """Start Playwright, launch Chromium and open the shared context.

        If the launch fails, the partially started driver is torn down before
        the exception propagates, so retries do not accumulate processes.
        """
        self.playwright = await async_playwright().start()
        try:
            self.browser = await self.playwright.chromium.launch(
                headless=True,
                args=[
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-gpu',
                    '--no-zygote',  # Critical for Docker: skip forking zygote process
                    '--single-process',  # Critical for Docker: run everything in one process
                    # NOTE(review): this disables same-origin checks for every
                    # document rendered here, including caller-supplied HTML.
                    '--disable-web-security',
                    '--disable-features=VizDisplayCompositor',
                    '--font-render-hinting=none',  # Prevents blurry PDF text
                    '--disable-lcd-text',  # Disable subpixel AA (fuzz in PDFs)
                    '--enable-font-antialiasing',
                    '--force-color-profile=srgb',
                ]
            )
            self.context = await self.browser.new_context()
        except Exception:
            await self._dispose()
            raise

        self._last_maintenance = time.time()
        # Read `process` defensively: a missing attribute must not turn a
        # successful launch into a failure just for the sake of a log line.
        proc = getattr(self.browser, 'process', None)
        pid = getattr(proc, 'pid', '?') if proc else '?'
        print(f"[browser_pool] Browser launched successfully (pid={pid})")

    async def _dispose(self):
        """Best-effort close of browser and driver; always clears the handles."""
        with suppress(Exception):
            if self.browser:
                await self.browser.close()
        with suppress(Exception):
            if self.playwright:
                await self.playwright.stop()
        self.browser = None
        self.context = None
        self.playwright = None

    async def _restart_browser(self):
        """Restart browser to free memory and resources"""
        await self._dispose()
        gc.collect()
        await self._create_browser()

    async def close(self):
        """Close the browser and stop Playwright, leaving no stale handles."""
        await self._dispose()


# Initialize global browser pool
browser_pool = PlaywrightBrowserPool()

# ==================== MULTILINGUAL FONT MAPPING ====================
MULTILINGUAL_FONTS = {
    # (display_name, google_font_url_param, is_system_font)
    # is_system_font=True means no Google Font link needed (the font is pre-installed)
    'en': ('Georgia', '', True),
    'hi': ('Noto Sans Devanagari', 'Noto+Sans+Devanagari:wght@400;600;700', False),
    'ar': ('Noto Sans Arabic', 'Noto+Sans+Arabic:wght@400;600;700', False),
    'zh': ('Noto Sans SC', 'Noto+Sans+SC:wght@400;600;700', False),
    'ja': ('Noto Sans JP', 'Noto+Sans+JP:wght@400;600;700', False),
    'ko': ('Noto Sans KR', 'Noto+Sans+KR:wght@400;600;700', False),
    'th': ('Noto Sans Thai', 'Noto+Sans+Thai:wght@400;600;700', False),
    'he': ('Noto Serif Hebrew', 'Noto+Serif+Hebrew:wght@400;600;700', False),
    'bn': ('Noto Sans Bengali', 'Noto+Sans+Bengali:wght@400;600;700', False),
    'ta': ('Noto Sans Tamil', 'Noto+Sans+Tamil:wght@400;600;700', False),
    'te': ('Noto Serif Telugu', 'Noto+Serif+Telugu:wght@400;600;700', False),
    'ml': ('Noto Serif Malayalam', 'Noto+Serif+Malayalam:wght@400;600;700', False),
    'ru': ('Georgia', '', True),
    'ur': ('Noto Nastaliq Urdu', 'Noto+Nastaliq+Urdu', False),
}


def get_font_for_language(lang: str) -> str:
    """Return the Google Fonts URL parameter for `lang`.

    The result is the second element of the MULTILINGUAL_FONTS entry, which is
    an empty string for system fonts and for unknown languages.
    """
    lang = lang.lower()
    info = MULTILINGUAL_FONTS.get(lang, ('Georgia', '', True))
    return info[1]


# ==================== HTML TEMPLATE - FIXED VERSION ====================
# NOTE(review): as it stands this template contains no HTML markup and does not
# reference language, font_family, text_direction, text_alignment or
# document_title, although the endpoints pass them to render(). Confirm against
# the intended template before relying on the PDF layout.
PDF_HTML_TEMPLATE = """ {{ title }}
{{ date }}
{% for msg in messages %}
{{ msg.content_html | safe }}
{% endfor %} """

# Create Jinja2 environment for security
jinja_env = Environment(autoescape=select_autoescape(['html', 'xml']))

# Languages rendered right-to-left.
_RTL_LANGUAGES = frozenset({'ar', 'he', 'ur', 'fa'})


def get_text_direction(lang: str) -> str:
    """Return 'rtl' for right-to-left languages, otherwise 'ltr'."""
    return 'rtl' if lang in _RTL_LANGUAGES else 'ltr'


def get_text_alignment(lang: str) -> str:
    """Return 'right' for right-to-left languages, otherwise 'left'."""
    return 'right' if lang in _RTL_LANGUAGES else 'left'


# ==================== UTILITY FUNCTIONS ====================
def sanitize_content(content: str) -> str:
    """
    Context-Aware Gatekeeper: Sanitizes dangerous HTML from narrative text
    but PRESERVES it inside code blocks.

    Strategy: Mask Code -> Sanitize Text -> Unmask Code

    Args:
        content: Raw message content; coerced with str() and stripped.

    Returns:
        The content with dangerous tags and attributes removed outside of
        fenced and inline code, and code spans restored unchanged.
    """
    content = str(content).strip()
    if not content:
        return content

    # Storage for the safe code blocks we temporarily hide
    placeholders = {}

    def mask_match(match):
        """Generate a unique token for code blocks to preserve them"""
        token = f"__SAFE_CODE_BLOCK_{uuid.uuid4().hex}__"
        placeholders[token] = match.group(0)
        return token

    # --- PHASE 1: MASKING (Protect Valid Data) ---
    # Pattern A: Fenced Code Blocks (``` ... ```)
    content = re.sub(r'(```[\s\S]*?```)', mask_match, content)
    # Pattern B: Inline Code (` ... `)
    # We exclude newlines inside inline code to avoid over-matching broken syntax
    content = re.sub(r'(`[^`\n]+`)', mask_match, content)

    # --- PHASE 2: FILTRATION (Neutralize Threats in Narrative) ---
    # 1. Remove dangerous tags completely
    # We INCLUDE 'link' and 'meta' because if they appear outside code blocks,
    # they are likely injection attacks (CSS injection or redirects).
    dangerous_tags = ['script', 'iframe', 'object', 'embed', 'applet', 'form', 'link', 'meta']
    for tag in dangerous_tags:
        # Remove the element together with its body. The closing tag must be
        # part of the pattern, otherwise the lazy `.*?` matches nothing and the
        # body survives. `\b` keeps e.g. "form" from matching <format>.
        pattern = rf'<{tag}\b[^>]*>.*?</{tag}\s*>'
        content = re.sub(pattern, '', content, flags=re.IGNORECASE | re.DOTALL)
        # Remove any remaining lone opening, self-closing or closing tag.
        single_pattern = rf'</?{tag}\b[^>]*>'
        content = re.sub(single_pattern, '', content, flags=re.IGNORECASE)

    # 2. Neutralize dangerous attributes in the tags that remain
    dangerous_attrs = [
        # Event handlers (onclick, onload, onmouseover, etc.)
        r'\son[a-z]+\s*=\s*["\'][^"\']*["\']',
        # Javascript protocol in href/src
        r'\s(href|src)\s*=\s*["\'][^"\']*javascript:[^"\']*["\']',
        # Data URI exploits in href/src (Base64 HTML injection)
        r'\s(href|src)\s*=\s*["\'][^"\']*data:[^"\']*["\']',
    ]
    for attr_pattern in dangerous_attrs:
        content = re.sub(attr_pattern, '', content, flags=re.IGNORECASE)

    # --- PHASE 3: UNMASKING (Restore Valid Data) ---
    for token, original_code in placeholders.items():
        content = content.replace(token, original_code)

    return content


def fix_markdown_tables(content: str) -> str:
    """
    Insert a blank line before a Markdown table that directly follows text.

    Matches "text\\n| Header |\\n|---|" and adds the extra newline so the
    table header starts its own block.
    """
    content = re.sub(r'(?<=\S)\n(\|.*\|.*\n\|[- :|]+\|)', r'\n\n\1', content)
    return content


def generate_filename(language: str) -> str:
    """Generate a timestamped export filename that ends with the language code."""
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    return f"NeuralStream_Export_{timestamp}_{language}.pdf"


def validate_content_size(content_length: int) -> bool:
    """Validate that content size is within acceptable limits"""
    return content_length <= MAX_CONTENT_LENGTH


# ==================== API ENDPOINT ====================
@app.post("/api/export/pdf")
async def export_pdf_endpoint(request: ExportRequest):
    """Export a chat conversation to PDF.

    Raises:
        HTTPException: 413 when the combined content exceeds
            MAX_CONTENT_LENGTH, 408 on asyncio timeouts, 500 on browser or
            unexpected errors.
    """
    # Validate content size. Content is stringified first because message
    # values are not guaranteed to be strings and len() would raise on them.
    total_content = sum(len(str(msg.get('content', ''))) for msg in request.messages)
    if not validate_content_size(total_content):
        raise HTTPException(
            status_code=413,
            detail=f"Content too large. Max allowed: {MAX_CONTENT_LENGTH} characters"
        )

    page = None
    try:
        # 1. Process messages to HTML
        processed_msgs = []
        for msg in request.messages:
            # Sanitize RAW input first (The Gatekeeper)
            raw_content = sanitize_content(msg.get('content', ''))
            # Fix tables
            fixed_content = fix_markdown_tables(raw_content)
            # Render Markdown
            html_content = markdown.markdown(
                fixed_content,
                extensions=['fenced_code', 'tables', 'sane_lists', 'nl2br']
            )
            processed_msgs.append({
                'role': msg.get('role', 'unknown'),
                'content_html': html_content
            })

        # 2. Prepare HTML template
        font_family = get_font_for_language(request.language)
        template = jinja_env.from_string(PDF_HTML_TEMPLATE)
        full_html = template.render(
            messages=processed_msgs,
            document_title="NeuralStream AI",
            date=datetime.now().strftime("%B %d, %Y"),
            language=request.language,
            font_family=font_family,
            text_direction=get_text_direction(request.language),
            text_alignment=get_text_alignment(request.language),
            title=f"Chat Export {datetime.now().strftime('%m/%d/%Y')}"
        )

        # 3. Generate PDF with Playwright
        page = await browser_pool.get_page()
        timeout_ms = PDF_GENERATION_TIMEOUT * 1000
        page.set_default_timeout(timeout_ms)
        page.set_default_navigation_timeout(timeout_ms)
        await page.set_content(full_html, wait_until='load')

        # 4. Wait for fonts and code highlighting
        await asyncio.wait_for(
            page.evaluate('''async () => {
                try {
                    await Promise.race([
                        document.fonts ? document.fonts.ready : Promise.resolve(),
                        new Promise(resolve => setTimeout(resolve, 1500))
                    ]);
                    if (window.hljs) await new Promise(resolve => setTimeout(resolve, 150));
                } catch (e) {
                    console.log('Font loading error:', e);
                }
            }'''),
            timeout=5.0
        )

        # 5. Hydrate links in code blocks and protect short blocks from splitting.
        # Raw string: the JavaScript regex contains backslash sequences that
        # are not valid Python escapes.
        await page.evaluate(r'''() => {
            // A. LINK HYDRATION: wrap plain-text URLs in anchors
            const codeElements = document.querySelectorAll('pre code');
            codeElements.forEach(block => {
                const urlRegex = /(https?:\/\/[^\s<"']+)/g;
                block.innerHTML = block.innerHTML.replace(urlRegex, (url) => {
                    return `<a href="${url}">${url}</a>`;
                });
            });

            // B. SMART LAYOUT PROTECTION
            // Heuristic: If a block is shorter than ~1/3 of a page (approx 350px),
            // assume it's a diagram or snippet that should NOT split.
            const preBlocks = document.querySelectorAll('pre');
            preBlocks.forEach(pre => {
                if (pre.offsetHeight < 350) {
                    pre.style.pageBreakInside = 'avoid';
                    pre.style.breakInside = 'avoid'; // Modern standard
                }
            });
        }''')

        # 6. Generate PDF
        # NOTE(review): footer/header markup uses Playwright's pageNumber and
        # totalPages classes around the "Page … of …" text; confirm the styling.
        pdf_bytes = await page.pdf(
            format="A4",
            margin={"top": "20mm", "bottom": "20mm", "left": "20mm", "right": "20mm"},
            print_background=True,
            display_header_footer=True,
            footer_template=(
                '<div style="font-size:9px; width:100%; text-align:center;">'
                'Page <span class="pageNumber"></span> of <span class="totalPages"></span>'
                '</div>'
            ),
            header_template='<div></div>',
            prefer_css_page_size=True
        )

        filename = generate_filename(request.language)
        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers={
                "Content-Disposition": f"attachment; filename={filename}",
                "Cache-Control": "no-cache, no-store, must-revalidate",
                "Pragma": "no-cache",
                "Expires": "0"
            }
        )
    except asyncio.TimeoutError:
        raise HTTPException(
            status_code=408,
            detail="PDF generation timed out. The document may be too complex."
        )
    except PlaywrightError as e:
        raise HTTPException(
            status_code=500,
            detail=f"Browser error during PDF generation: {str(e)[:100]}"
        )
    except Exception as e:
        print(f"PDF Export API Error: {str(e)}")
        import traceback
        traceback.print_exc()
        raise HTTPException(
            status_code=500,
            detail="Internal server error during PDF generation. Please try again."
        )
    finally:
        # Always release the page, even when rendering failed.
        if page:
            with suppress(Exception):
                await page.close()


# ==================== ANT-EDITOR DOCUMENT EXPORT ====================
class EditorExportRequest(BaseModel):
    """Request model for AbWrite document PDF export"""

    html: str = Field(..., description="HTML content from TipTap editor.getHTML()")
    title: str = Field(default="Untitled Document", description="Document title")
    language: str = Field(default="en", description="ISO 639-1 language code for font selection")
    password: Optional[str] = Field(default=None, description="Optional password to encrypt the PDF")
    watermark_text: Optional[str] = Field(default=None, description="Optional watermark text to overlay on every page")

    @field_validator('html')
    @classmethod
    def validate_html(cls, v: str) -> str:
        """Reject empty HTML and HTML longer than 10,000,000 characters."""
        if not v or not v.strip():
            raise ValueError('HTML content cannot be empty')
        # 10MB limit — documents with embedded base64 images are large
        if len(v) > 10_000_000:
            raise ValueError('HTML content too large (max 10MB)')
        return v

    @field_validator('language')
    @classmethod
    def validate_language(cls, v: str) -> str:
        """Require at least two characters; keep the lower-cased first two."""
        if not isinstance(v, str) or len(v) < 2:
            raise ValueError('Language must be a valid ISO code')
        return v.lower()[:2]
# NOTE(review): as it stands this template contains no HTML markup (no font
# link inside the is_system_font branch, no styling around the watermark).
# Confirm against the intended template before relying on the PDF layout.
EDITOR_PDF_TEMPLATE = """ {{ title }} {% if not is_system_font %} {% endif %} {% if watermark_text %}
{{ watermark_text }}
{% endif %} {{ content | safe }} """


@app.post("/api/editor/export-pdf")
async def editor_export_pdf(request: EditorExportRequest):
    """Export Ant-Editor document content to PDF via Playwright.

    The document HTML is rendered into EDITOR_PDF_TEMPLATE, printed to an A4
    PDF, optionally encrypted with the request password, and returned as an
    attachment.

    Raises:
        HTTPException: 408 on asyncio timeouts, 500 on browser or unexpected
            errors.
    """
    page = None
    try:
        # 1. Determine font settings (unknown languages fall back to 'en')
        lang = request.language
        font_info = MULTILINGUAL_FONTS.get(lang, MULTILINGUAL_FONTS['en'])
        font_name = font_info[0]
        font_url = font_info[1]
        is_system_font = font_info[2] if len(font_info) > 2 else False

        # 2. Render HTML template
        # SECURITY(review): request.html is inserted with `| safe`, so
        # caller-supplied markup and scripts run in the rendering browser.
        # Sanitise or restrict network access if callers are untrusted.
        template = jinja_env.from_string(EDITOR_PDF_TEMPLATE)
        full_html = template.render(
            content=request.html,
            title=request.title,
            language=lang,
            font_name=font_name,
            font_url=font_url,
            is_system_font=is_system_font,
            text_direction=get_text_direction(lang),
            text_alignment=get_text_alignment(lang),
            watermark_text=request.watermark_text or '',
        )

        # 3. Generate PDF via Playwright
        page = await browser_pool.get_page()
        timeout_ms = PDF_GENERATION_TIMEOUT * 1000
        page.set_default_timeout(timeout_ms)
        page.set_default_navigation_timeout(timeout_ms)
        await page.set_content(full_html, wait_until='networkidle')

        # 4. Wait for fonts to load (capped at 2s inside the page, 5s overall)
        await asyncio.wait_for(
            page.evaluate('''async () => {
                try {
                    await Promise.race([
                        document.fonts ? document.fonts.ready : Promise.resolve(),
                        new Promise(resolve => setTimeout(resolve, 2000))
                    ]);
                } catch (e) {}
            }'''),
            timeout=5.0
        )

        # 5. Wait for all images to finish loading (base64 + external URLs)
        await asyncio.wait_for(
            page.evaluate('''() => {
                const imgs = document.querySelectorAll('img');
                return Promise.all(Array.from(imgs).map(img => {
                    if (img.complete) return Promise.resolve();
                    return new Promise(resolve => {
                        img.onload = resolve;
                        img.onerror = resolve;
                    });
                }));
            }'''),
            timeout=10.0
        )

        # 6. Apply explicit width from HTML attributes (CSS attr() fallback)
        await page.evaluate('''() => {
            document.querySelectorAll('.resizable-image-wrapper img').forEach(img => {
                const w = img.getAttribute('width');
                if (w) {
                    img.style.width = w + (w.includes('%') ? '' : 'px');
                    img.style.maxWidth = '100%';
                    img.style.height = 'auto';
                }
            });
        }''')

        # 7. Smart layout: keep short code blocks together
        await page.evaluate('''() => {
            const preBlocks = document.querySelectorAll('pre');
            preBlocks.forEach(pre => {
                if (pre.offsetHeight < 400) {
                    pre.style.pageBreakInside = 'avoid';
                    pre.style.breakInside = 'avoid';
                }
            });
        }''')

        # 8. Generate PDF bytes
        # NOTE(review): footer/header markup uses Playwright's pageNumber and
        # totalPages classes around the "Page … of …" text; confirm the styling.
        pdf_bytes = await page.pdf(
            format="A4",
            scale=1,
            margin={"top": "25mm", "bottom": "25mm", "left": "20mm", "right": "20mm"},
            print_background=True,
            display_header_footer=True,
            footer_template=(
                '<div style="font-size:9px; width:100%; text-align:center;">'
                'Page <span class="pageNumber"></span> of <span class="totalPages"></span>'
                '</div>'
            ),
            header_template='<div></div>',
            prefer_css_page_size=True,
        )

        # 9. Encrypt PDF with password if provided
        if request.password:
            import pikepdf

            encrypted_buf = io.BytesIO()
            # Context manager closes the PDF even if save() raises.
            with pikepdf.open(io.BytesIO(pdf_bytes)) as src:
                src.save(
                    encrypted_buf,
                    encryption=pikepdf.Encryption(
                        owner=request.password,
                        user=request.password,
                        R=6,  # AES-256
                    ),
                )
            pdf_bytes = encrypted_buf.getvalue()

        # 10. Generate safe filename — ASCII-only for HTTP header safety
        # re.ASCII ensures \w matches only [a-zA-Z0-9_], preventing
        # Unicode chars that Starlette can't encode as latin-1 headers.
        safe_title = re.sub(r'[^\w\s-]', '', request.title, flags=re.ASCII)[:50].strip() or 'Document'
        safe_title = re.sub(r'\s+', '_', safe_title)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f"{safe_title}_{timestamp}.pdf"

        # Build Content-Disposition with RFC 5987 filename* for Unicode display.
        # filename* is always appended; the ASCII filename stays as the fallback.
        cd_value = f'attachment; filename="{filename}"'
        unicode_title = re.sub(r'[<>:"/\\|?*]', '', request.title)[:80].strip() or 'Document'
        unicode_title = re.sub(r'\s+', '_', unicode_title)
        unicode_filename = f"{unicode_title}_{timestamp}.pdf"
        encoded_unicode = quote(unicode_filename, safe='')
        cd_value += f"; filename*=UTF-8''{encoded_unicode}"

        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers={
                "Content-Disposition": cd_value,
                "Cache-Control": "no-cache, no-store, must-revalidate",
            }
        )
    except asyncio.TimeoutError:
        raise HTTPException(status_code=408, detail="PDF generation timed out")
    except PlaywrightError as e:
        raise HTTPException(status_code=500, detail=f"Browser error: {str(e)[:200]}")
    except Exception as e:
        print(f"Editor PDF Export Error: {str(e)}")
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail="PDF generation failed")
    finally:
        # Always release the page, even when rendering failed.
        if page:
            with suppress(Exception):
                await page.close()


# Health check endpoint
@app.get("/health")
async def health():
    """Health check endpoint for load balancers and monitoring"""
    return {
        "status": "healthy",
        "timestamp": datetime.now().isoformat(),
        "service": "pdf-exporter"
    }


# Test endpoint (for debugging)
@app.post("/api/export/html")
async def export_html_endpoint(request: ExportRequest):
    """Return the rendered HTML for debugging purposes.

    Messages go through the same steps as the PDF export (sanitize, table
    fix-up, identical Markdown extensions), so the preview matches what gets
    printed and raw request HTML is not echoed back as text/html.
    """
    processed_msgs = []
    for msg in request.messages:
        cleaned = sanitize_content(msg.get('content', ''))
        html_content = markdown.markdown(
            fix_markdown_tables(cleaned),
            extensions=['fenced_code', 'tables', 'sane_lists', 'nl2br']
        )
        processed_msgs.append({
            'role': msg.get('role', 'unknown'),
            'content_html': html_content
        })

    font_family = get_font_for_language(request.language)
    template = jinja_env.from_string(PDF_HTML_TEMPLATE)
    full_html = template.render(
        messages=processed_msgs,
        document_title="NeuralStream AI (HTML Preview)",
        date=datetime.now().strftime("%B %d, %Y"),
        language=request.language,
        font_family=font_family,
        text_direction=get_text_direction(request.language),
        text_alignment=get_text_alignment(request.language),
        title="Chat HTML Preview"
    )
    return Response(
        content=full_html,
        media_type="text/html",
        headers={"X-Debug": "html-preview"}
    )


# Startup/Shutdown events
@app.on_event("startup")
async def startup_event():
    """Initialize browser pool on startup to warm up resources"""
    print("🚀 Pre-warming browser pool...")
    try:
        # Launch browser immediately so we catch any errors at deploy time
        # instead of failing on the first user request.
        warmup_page = await browser_pool.get_page()
        # The page only exists to force the launch; close it so it does not
        # stay open for the lifetime of the browser.
        with suppress(Exception):
            await warmup_page.close()
        print("✅ Browser pool warmed up successfully")
    except Exception as e:
        print(f"⚠️ Warning: Failed to pre-warm browser pool: {e}")
        # We don't raise here so the server still starts; individual requests will retry.
@app.on_event("shutdown")
async def shutdown_event():
    """Release the shared browser pool when the application stops."""
    await browser_pool.close()


if __name__ == "__main__":
    # Playwright needs the Proactor event loop on Windows, so force that
    # policy before the server creates its loop.
    running_on_windows = sys.platform == 'win32'
    if running_on_windows:
        proactor_policy = asyncio.WindowsProactorEventLoopPolicy()
        asyncio.set_event_loop_policy(proactor_policy)

    print("🚀 Starting NeuralStream PDF Backend...")
    uvicorn.run(app, host="0.0.0.0", port=7860, reload=False)