import sys
import asyncio
import os
from contextlib import suppress
from typing import Optional
import time
import uuid
import re
# On Windows, switch asyncio to the Proactor event loop policy for Playwright
# compatibility. This runs at import time, before the FastAPI/Playwright
# imports that follow, so the policy is in place before any event loop exists.
if sys.platform == 'win32':
    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
from fastapi import FastAPI, HTTPException, Body
from fastapi.responses import Response
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field, field_validator
from typing import List, Dict, Any
import markdown
from jinja2 import Template, Environment, select_autoescape
from playwright.async_api import async_playwright, Browser, BrowserContext, Page, Error as PlaywrightError
from datetime import datetime
import re
from urllib.parse import quote
import gc
import io
import uvicorn
# ==================== APP INITIALIZATION ====================
# ASGI application object; the keyword arguments are the service metadata
# handed to FastAPI's constructor.
app = FastAPI(
    version="1.0.0",
    title="Chat PDF Export Service",
    description="Production-grade API for exporting chat conversations to PDF",
)

# Cross-origin access for browser-based clients: GET and POST only, any
# request header, credentials enabled.
# NOTE(review): every origin is allowed while credentials are enabled —
# replace the wildcard with the real front-end origin(s) before production.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["GET", "POST"],
    allow_credentials=True,
    allow_origins=["*"],
)
# ==================== GLOBAL CONFIGURATION ====================
# Upper bound on content length (the original note calls this "50kb").
# NOTE(review): no use of this constant is visible in this part of the file;
# confirm whether it is compared against characters or bytes.
MAX_CONTENT_LENGTH = 50 * 1000

# Time budget for producing one PDF, in seconds.
PDF_GENERATION_TIMEOUT = 30

# Once a browser instance has served more than this many pages it is restarted.
MAX_REQUESTS_PER_CONNECTION = 100
# ==================== PYDANTIC MODELS ====================
class ExportRequest(BaseModel):
    """Request body for exporting a chat conversation to PDF.

    Attributes:
        messages: Non-empty list of message dictionaries; every entry must
            carry both a ``role`` and a ``content`` key.
        language: Two-letter language code; normalized to lower case.
        font_family: Optional custom font family name.
    """

    messages: List[dict] = Field(..., min_length=1)
    language: str = Field(default="en", description="ISO 639-1 language code")
    font_family: Optional[str] = Field(default=None, description="Custom font family")

    @field_validator('messages')
    @classmethod
    def validate_messages(cls, v: list) -> list:
        """Reject any message that is not a dict with ``role`` and ``content``.

        Raises:
            ValueError: If an entry is not a dictionary or lacks a required key.
        """
        for msg in v:
            if not isinstance(msg, dict):
                raise ValueError('Each message must be a dictionary')
            if 'role' not in msg or 'content' not in msg:
                raise ValueError('Message must have "role" and "content" keys')
        return v

    @field_validator('language')
    @classmethod
    def validate_language(cls, v: str) -> str:
        """Accept exactly two ASCII letters and return them lower-cased.

        Length alone is not enough: two digits, punctuation marks or control
        characters are also two characters long, and the validated code is
        later used to build the download filename.

        Raises:
            ValueError: If the value is not a string of two ASCII letters.
        """
        if (
            not isinstance(v, str)
            or len(v) != 2
            or not (v.isascii() and v.isalpha())
        ):
            raise ValueError('Language must be a 2-letter ISO code')
        return v.lower()
# ==================== PLAYWRIGHT BROWSER MANAGEMENT ====================
class PlaywrightBrowserPool:
    """Own one long-lived headless Chromium instance and hand out pages from it.

    Despite the name, this manages a single Playwright driver, a single
    browser and a single shared browser context. Pages are created on demand
    by :meth:`get_page`; closing each page is left to the caller.

    The browser is (re)launched lazily: on first use, when it is found
    disconnected, when page creation fails, and once more than
    ``MAX_REQUESTS_PER_CONNECTION`` pages have been served by the current
    instance.
    """

    def __init__(self):
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        self.playwright = None
        # Number of pages handed out by the current browser instance.
        self.request_count = 0
        # Serializes launch/restart decisions and page creation.
        self._lock = asyncio.Lock()
        # Wall-clock time of construction, then of the most recent launch.
        self._last_maintenance = time.time()

    async def get_page(self) -> Page:
        """Return a new page, launching or restarting the browser if needed.

        Returns:
            A freshly created page in the shared browser context.

        Raises:
            Exception: Whatever Playwright raises if the browser cannot be
                launched, or if page creation still fails after one restart.
        """
        async with self._lock:
            # NOTE(review): a restart closes the whole browser, which also
            # closes pages handed out earlier that may still be rendering —
            # confirm callers tolerate this or track in-flight pages.
            if self.browser is not None and not self.browser.is_connected():
                # Browser crashed (OOM, timeout, etc.). Restart through the
                # full teardown path so the old driver process is stopped
                # instead of being orphaned.
                print("[browser_pool] Browser disconnected — restarting…")
                await self._restart_browser()
            elif self.browser is None or self.context is None:
                await self._create_browser()
            elif self.request_count > MAX_REQUESTS_PER_CONNECTION:
                # Periodic recycle to free memory held by a long-lived browser.
                await self._restart_browser()

            try:
                page = await self.context.new_page()
            except Exception:
                # Browser died between the check and page creation — restart
                print("[browser_pool] Failed to create page — restarting browser…")
                await self._restart_browser()
                page = await self.context.new_page()

            self.request_count += 1
            return page

    async def _shutdown(self):
        """Best-effort teardown of the browser and driver; always clears state."""
        browser, playwright = self.browser, self.playwright
        # Clear the references first so a failure below cannot leave the pool
        # pointing at half-closed objects.
        self.browser = None
        self.context = None
        self.playwright = None
        if browser is not None:
            with suppress(Exception):
                await browser.close()
        if playwright is not None:
            with suppress(Exception):
                await playwright.stop()

    async def _create_browser(self):
        """Launch a fresh Playwright driver, browser and context.

        Anything still held from an earlier launch is shut down first, and a
        launch that fails part-way tears down what it started before
        re-raising, so no driver or browser process is left orphaned. A
        successful launch resets ``request_count``.
        """
        await self._shutdown()

        playwright = await async_playwright().start()
        browser = None
        try:
            browser = await playwright.chromium.launch(
                headless=True,
                args=[
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-gpu',
                    '--no-zygote',  # Critical for Docker: skip forking zygote process
                    '--single-process',  # Critical for Docker: run everything in one process
                    '--disable-web-security',
                    '--disable-features=VizDisplayCompositor',
                    '--font-render-hinting=none',  # Prevents blurry PDF text
                    '--disable-lcd-text',  # Disable subpixel AA (fuzz in PDFs)
                    '--enable-font-antialiasing',
                    '--force-color-profile=srgb',
                ]
            )
            context = await browser.new_context()
        except BaseException:
            # BaseException so that task cancellation also cleans up.
            if browser is not None:
                with suppress(Exception):
                    await browser.close()
            with suppress(Exception):
                await playwright.stop()
            raise

        self.playwright = playwright
        self.browser = browser
        self.context = context
        self.request_count = 0
        self._last_maintenance = time.time()

        # Look the pid up defensively: a missing `process` attribute must not
        # turn a successful launch into a failed request.
        pid = getattr(getattr(browser, 'process', None), 'pid', '?')
        print(f"[browser_pool] Browser launched successfully (pid={pid})")

    async def _restart_browser(self):
        """Restart browser to free memory and resources."""
        await self._shutdown()
        gc.collect()
        await self._create_browser()

    async def close(self):
        """Shut down the browser and driver and reset the pool to empty."""
        await self._shutdown()
# Module-level pool instance shared by the whole process. Constructing it only
# initializes bookkeeping state; the browser itself is launched lazily by the
# first get_page() call.
browser_pool = PlaywrightBrowserPool()
# ==================== MULTILINGUAL FONT MAPPING ====================
# Per-language font table, keyed by lower-case two-letter language code.
MULTILINGUAL_FONTS = {
    # (display_name, google_font_url_param, is_system_font)
    # is_system_font=True means no Google Font link needed (the font is pre-installed)
    'en': ('Georgia', '', True),
    'hi': ('Noto Sans Devanagari', 'Noto+Sans+Devanagari:wght@400;600;700', False),
    'ar': ('Noto Sans Arabic', 'Noto+Sans+Arabic:wght@400;600;700', False),
    'zh': ('Noto Sans SC', 'Noto+Sans+SC:wght@400;600;700', False),
    'ja': ('Noto Sans JP', 'Noto+Sans+JP:wght@400;600;700', False),
    'ko': ('Noto Sans KR', 'Noto+Sans+KR:wght@400;600;700', False),
    'th': ('Noto Sans Thai', 'Noto+Sans+Thai:wght@400;600;700', False),
    'he': ('Noto Serif Hebrew', 'Noto+Serif+Hebrew:wght@400;600;700', False),
    'bn': ('Noto Sans Bengali', 'Noto+Sans+Bengali:wght@400;600;700', False),
    'ta': ('Noto Sans Tamil', 'Noto+Sans+Tamil:wght@400;600;700', False),
    'te': ('Noto Serif Telugu', 'Noto+Serif+Telugu:wght@400;600;700', False),
    'ml': ('Noto Serif Malayalam', 'Noto+Serif+Malayalam:wght@400;600;700', False),
    'ru': ('Georgia', '', True),
    'ur': ('Noto Nastaliq Urdu', 'Noto+Nastaliq+Urdu', False),
    # Persian is laid out right-to-left by the direction/alignment helpers in
    # this file, so it needs an Arabic-script font rather than the Georgia
    # fallback; it reuses the same font entry as 'ar'.
    'fa': ('Noto Sans Arabic', 'Noto+Sans+Arabic:wght@400;600;700', False),
}
def get_font_for_language(lang: str) -> str:
    """Return the Google Fonts URL parameter registered for ``lang``.

    The lookup in ``MULTILINGUAL_FONTS`` is case-insensitive. The result is
    the second element of the table entry (not the display name), which is an
    empty string for system fonts and for languages missing from the table.
    """
    code = lang.lower()
    if code in MULTILINGUAL_FONTS:
        return MULTILINGUAL_FONTS[code][1]
    # Unknown language: same result as the ('Georgia', '', True) fallback.
    return ''
# ==================== HTML TEMPLATE - FIXED VERSION ====================
# Jinja2 template source for the exported chat PDF. It interpolates `title`,
# `document_title` and `date`, then emits each message's pre-rendered
# `content_html` through the `safe` filter — that HTML is therefore NOT
# autoescaped and must already be sanitized when it reaches the template.
PDF_HTML_TEMPLATE = """
{{ title }}
{{ document_title }}
{{ date }}
{% for msg in messages %}
{{ msg.content_html | safe }}
{% endfor %}
"""
# Shared Jinja2 environment with autoescaping configured through
# select_autoescape for 'html'/'xml', so interpolated values are escaped
# unless a template explicitly marks them safe.
jinja_env = Environment(autoescape=select_autoescape(['html', 'xml']))
def get_text_direction(lang: str) -> str:
    """Return the text direction for a language code.

    Args:
        lang: Two-letter language code; matched case-insensitively, in line
            with the font lookup in this file.

    Returns:
        ``'rtl'`` for Arabic, Hebrew, Urdu and Persian (``ar``, ``he``,
        ``ur``, ``fa``); ``'ltr'`` for everything else.
    """
    return 'rtl' if lang.lower() in ('ar', 'he', 'ur', 'fa') else 'ltr'
def get_text_alignment(lang: str) -> str:
    """Return the horizontal text alignment for a language code.

    Args:
        lang: Two-letter language code; matched case-insensitively, in line
            with the font lookup in this file.

    Returns:
        ``'right'`` for Arabic, Hebrew, Urdu and Persian (``ar``, ``he``,
        ``ur``, ``fa``); ``'left'`` for everything else.
    """
    return 'right' if lang.lower() in ('ar', 'he', 'ur', 'fa') else 'left'
# ==================== UTILITY FUNCTIONS ====================
def sanitize_content(content: str) -> str:
"""
Context-Aware Gatekeeper:
Sanitizes dangerous HTML from narrative text but PRESERVES it inside code blocks.
Strategy: Mask Code -> Sanitize Text -> Unmask Code
"""
content = str(content).strip()
if not content:
return content
# Storage for the safe code blocks we temporarily hide
placeholders = {}
def mask_match(match):
"""Generate a unique token for code blocks to preserve them"""
token = f"__SAFE_CODE_BLOCK_{uuid.uuid4().hex}__"
placeholders[token] = match.group(0)
return token
# --- PHASE 1: MASKING (Protect Valid Data) ---
# This guarantees that educational content (like inside code) is NEVER touched.
# Pattern A: Fenced Code Blocks (``` ... ```)
content = re.sub(r'(```[\s\S]*?```)', mask_match, content)
# Pattern B: Inline Code (` ... `)
# We exclude newlines inside inline code to avoid over-matching broken syntax
content = re.sub(r'(`[^`\n]+`)', mask_match, content)
# --- PHASE 2: FILTRATION (Neutralize Threats in Narrative) ---
# 1. Remove dangerous tags completely
# We INCLUDE 'link' and 'meta' because if they appear outside code blocks,
# they are likely injection attacks (CSS injection or redirects).
dangerous_tags = ['script', 'iframe', 'object', 'embed', 'applet', 'form', 'link', 'meta']
for tag in dangerous_tags:
# Remove tag and its full content (e.g. )
pattern = f'<{tag}[^>]*>.*?{tag}>'
content = re.sub(pattern, '', content, flags=re.IGNORECASE | re.DOTALL)
# Handle self-closing tags or single tags (e.g. )
single_pattern = f'<{tag}[^>]*>'
content = re.sub(single_pattern, '', content, flags=re.IGNORECASE)
# 2. Neutralize dangerous attributes in remaining allowed tags (like
',
header_template='',
prefer_css_page_size=True
)
filename = generate_filename(request.language)
return Response(
content=pdf_bytes,
media_type="application/pdf",
headers={
"Content-Disposition": f"attachment; filename={filename}",
"Cache-Control": "no-cache, no-store, must-revalidate",
"Pragma": "no-cache",
"Expires": "0"
}
)
except asyncio.TimeoutError:
raise HTTPException(
status_code=408,
detail="PDF generation timed out. The document may be too complex."
)
except PlaywrightError as e:
raise HTTPException(
status_code=500,
detail=f"Browser error during PDF generation: {str(e)[:100]}"
)
except Exception as e:
print(f"PDF Export API Error: {str(e)}")
import traceback
traceback.print_exc()
raise HTTPException(
status_code=500,
detail="Internal server error during PDF generation. Please try again."
)
finally:
if page:
with suppress(Exception):
await page.close()
# ==================== ANT-EDITOR DOCUMENT EXPORT ====================
class EditorExportRequest(BaseModel):
    """Request model for AbWrite document PDF export.

    Attributes:
        html: Non-blank HTML content, at most 10,000,000 characters.
        title: Document title.
        language: Language code; reduced to its lower-cased two-letter prefix.
        password: Optional password to encrypt the PDF.
        watermark_text: Optional watermark text to overlay on every page.
    """

    html: str = Field(..., description="HTML content from TipTap editor.getHTML()")
    title: str = Field(default="Untitled Document", description="Document title")
    language: str = Field(default="en", description="ISO 639-1 language code for font selection")
    password: Optional[str] = Field(default=None, description="Optional password to encrypt the PDF")
    watermark_text: Optional[str] = Field(default=None, description="Optional watermark text to overlay on every page")

    @field_validator('html')
    @classmethod
    def validate_html(cls, v: str) -> str:
        """Reject blank or oversized HTML and return the value unchanged.

        Raises:
            ValueError: If the content is empty/whitespace-only or longer
                than 10,000,000 characters.
        """
        if not v or not v.strip():
            raise ValueError('HTML content cannot be empty')
        # 10MB limit — documents with embedded base64 images are large
        if len(v) > 10_000_000:
            raise ValueError('HTML content too large (max 10MB)')
        return v

    @field_validator('language')
    @classmethod
    def validate_language(cls, v: str) -> str:
        """Return the lower-cased two-letter prefix of the language value.

        Longer tags are accepted and truncated (``"en-US"`` becomes ``"en"``),
        but the retained prefix must consist of ASCII letters so that digits,
        punctuation or control characters are not passed on as a language code.

        Raises:
            ValueError: If the value is shorter than two characters or its
                first two characters are not ASCII letters.
        """
        if not isinstance(v, str) or len(v) < 2:
            raise ValueError('Language must be a valid ISO code')
        code = v.lower()[:2]
        if not (code.isascii() and code.isalpha()):
            raise ValueError('Language must be a valid ISO code')
        return code
EDITOR_PDF_TEMPLATE = """
{{ title }}
{% if not is_system_font %}
{% endif %}
{% if watermark_text %}