# gem / app.py
# Rajhuggingface4253's picture
# Update app.py
# ca43214 verified
import sys
import asyncio
import os
from contextlib import suppress
from typing import Optional
import time
import uuid
import re
# Set Windows event loop policy for Playwright compatibility
if sys.platform == 'win32':
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
from fastapi import FastAPI, HTTPException, Body
from fastapi.responses import Response
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field, field_validator
from typing import List, Dict, Any
import markdown
from jinja2 import Template, Environment, select_autoescape
from playwright.async_api import async_playwright, Browser, BrowserContext, Page, Error as PlaywrightError
from datetime import datetime
import re
from urllib.parse import quote
import gc
import io
import uvicorn
# ==================== APP INITIALIZATION ====================
# FastAPI application instance; the route decorators further down register on it.
app = FastAPI(
    title="Chat PDF Export Service",
    description="Production-grade API for exporting chat conversations to PDF",
    version="1.0.0"
)
# Add CORS for web clients
# NOTE(review): a wildcard origin together with allow_credentials=True is the
# most permissive combination possible — narrow allow_origins before this
# service is put behind cookies or other credentialed auth.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Configure for your domain in production
    allow_credentials=True,
    allow_methods=["GET", "POST"],
    allow_headers=["*"],
)
# ==================== GLOBAL CONFIGURATION ====================
# Ceiling compared against the summed len() of all message contents in the chat
# export endpoint, i.e. a character count rather than a byte count.
MAX_CONTENT_LENGTH = 50_000  # 50kb max content length
PDF_GENERATION_TIMEOUT = 30  # seconds
# Compared with the pool's request counter in PlaywrightBrowserPool.get_page.
MAX_REQUESTS_PER_CONNECTION = 100  # After this, browser is restarted
# ==================== PYDANTIC MODELS ====================
class ExportRequest(BaseModel):
    # Request body for the chat-to-PDF export endpoint.
    # (Documented with comments rather than a class docstring so the model's
    # published schema stays exactly as it was.)

    # Conversation turns; at least one is required and each must carry the
    # "role" and "content" keys (enforced by validate_messages below).
    messages: List[dict] = Field(..., min_length=1)
    # Two-letter code; normalised to lower case by validate_language.
    language: str = Field(default="en", description="ISO 639-1 language code")
    # NOTE(review): not read by the export endpoint visible in this file.
    font_family: Optional[str] = Field(default=None, description="Custom font family")

    @field_validator('messages')
    @classmethod
    def validate_messages(cls, v: list) -> list:
        """Reject any message that is not a dict with "role" and "content" keys."""
        for msg in v:
            if not isinstance(msg, dict):
                raise ValueError('Each message must be a dictionary')
            if 'role' not in msg or 'content' not in msg:
                raise ValueError('Message must have "role" and "content" keys')
        return v

    @field_validator('language')
    @classmethod
    def validate_language(cls, v: str) -> str:
        """Accept exactly two ASCII letters and return them lower-cased.

        The value ends up inside the download filename that is sent in the
        Content-Disposition header, so anything other than plain letters
        (digits, control characters, non-Latin-1 text) is rejected here
        instead of failing later while the response is being built.
        """
        if (
            not isinstance(v, str)
            or len(v) != 2
            or not (v.isascii() and v.isalpha())
        ):
            raise ValueError('Language must be a 2-letter ISO code')
        return v.lower()
# ==================== PLAYWRIGHT BROWSER MANAGEMENT ====================
class PlaywrightBrowserPool:
    """Owns a single shared Chromium instance and hands out pages from it.

    One Playwright driver, one browser and one browser context are kept alive
    between requests. Callers obtain a page with ``get_page()`` and are
    responsible for closing it. Page hand-out is serialised by an asyncio lock;
    the lock does not cover the time a caller spends using the page.
    """

    def __init__(self):
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        self.playwright = None
        # Pages handed out since the current browser was launched.
        self.request_count = 0
        self._lock = asyncio.Lock()
        self._last_maintenance = time.time()

    async def get_page(self) -> Page:
        """Return a fresh page, (re)launching the browser when necessary.

        Raises whatever the browser launch or the second ``new_page()`` attempt
        raises if recovery fails.
        """
        async with self._lock:
            # Check if browser is alive; restart if it crashed (OOM, timeout, etc.)
            if self.browser and not self.browser.is_connected():
                print("[browser_pool] Browser disconnected — restarting…")
                # Full teardown (not just dropping the references) so the old
                # Playwright driver is stopped instead of being leaked.
                await self._shutdown()
            if not self.browser or not self.context:
                await self._create_browser()
            # Perform maintenance every N requests — but only while no page is
            # open, because closing the browser would destroy pages that other
            # requests are still rendering. Under constant load the recycle is
            # simply postponed until the context is idle.
            if (
                self.request_count >= MAX_REQUESTS_PER_CONNECTION
                and not self.context.pages
            ):
                await self._restart_browser()
            try:
                page = await self.context.new_page()
            except Exception:
                # Browser died between the check and page creation — restart
                print("[browser_pool] Failed to create page — restarting browser…")
                await self._restart_browser()
                page = await self.context.new_page()
            self.request_count += 1
            return page

    async def _create_browser(self):
        """Start Playwright (if needed), launch Chromium and open a context.

        On failure everything that was started is torn down again before the
        exception propagates, so a retry begins from a clean state.
        """
        if self.playwright is None:
            self.playwright = await async_playwright().start()
        try:
            self.browser = await self.playwright.chromium.launch(
                headless=True,
                args=[
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-gpu',
                    '--no-zygote',  # Critical for Docker: skip forking zygote process
                    '--single-process',  # Critical for Docker: run everything in one process
                    # NOTE(review): this disables the same-origin policy for
                    # every document rendered, including user-supplied HTML.
                    '--disable-web-security',
                    '--disable-features=VizDisplayCompositor',
                    '--font-render-hinting=none',  # Prevents blurry PDF text
                    '--disable-lcd-text',  # Disable subpixel AA (fuzz in PDFs)
                    '--enable-font-antialiasing',
                    '--force-color-profile=srgb',
                ]
            )
            self.context = await self.browser.new_context()
        except Exception:
            await self._shutdown()
            raise
        # A new browser starts a new maintenance window.
        self.request_count = 0
        self._last_maintenance = time.time()
        # Look the process handle up defensively: it is not part of the
        # documented Python Browser API, and a log line must never be able to
        # abort an otherwise successful launch.
        process = getattr(self.browser, 'process', None)
        pid = getattr(process, 'pid', '?') if process else '?'
        print(f"[browser_pool] Browser launched successfully (pid={pid})")

    async def _shutdown(self):
        """Best-effort close of browser and driver; always clears the handles."""
        with suppress(Exception):
            if self.browser:
                await self.browser.close()
        with suppress(Exception):
            if self.playwright:
                await self.playwright.stop()
        self.browser = None
        self.context = None
        self.playwright = None

    async def _restart_browser(self):
        """Restart browser to free memory and resources"""
        # Close existing browser + Playwright server gracefully
        await self._shutdown()
        gc.collect()
        await self._create_browser()

    async def close(self):
        """Clean up browser instances"""
        # Handles are reset as well, so a later get_page() relaunches cleanly.
        await self._shutdown()
# Initialize global browser pool
# Created at import time; no browser is launched until the first get_page() call.
browser_pool = PlaywrightBrowserPool()
# ==================== MULTILINGUAL FONT MAPPING ====================
# Per-language font table, keyed by lower-case two-letter language code.
# Languages without an entry fall back to whatever default the caller passes
# to .get(); note that 'fa' is treated as right-to-left elsewhere in this file
# but has no entry here.
MULTILINGUAL_FONTS = {
    # (display_name, google_font_url_param, is_system_font)
    # is_system_font=True means no Google Font link needed (the font is pre-installed)
    'en': ('Georgia', '', True),
    'hi': ('Noto Sans Devanagari', 'Noto+Sans+Devanagari:wght@400;600;700', False),
    'ar': ('Noto Sans Arabic', 'Noto+Sans+Arabic:wght@400;600;700', False),
    'zh': ('Noto Sans SC', 'Noto+Sans+SC:wght@400;600;700', False),
    'ja': ('Noto Sans JP', 'Noto+Sans+JP:wght@400;600;700', False),
    'ko': ('Noto Sans KR', 'Noto+Sans+KR:wght@400;600;700', False),
    'th': ('Noto Sans Thai', 'Noto+Sans+Thai:wght@400;600;700', False),
    'he': ('Noto Serif Hebrew', 'Noto+Serif+Hebrew:wght@400;600;700', False),
    'bn': ('Noto Sans Bengali', 'Noto+Sans+Bengali:wght@400;600;700', False),
    'ta': ('Noto Sans Tamil', 'Noto+Sans+Tamil:wght@400;600;700', False),
    'te': ('Noto Serif Telugu', 'Noto+Serif+Telugu:wght@400;600;700', False),
    'ml': ('Noto Serif Malayalam', 'Noto+Serif+Malayalam:wght@400;600;700', False),
    'ru': ('Georgia', '', True),
    'ur': ('Noto Nastaliq Urdu', 'Noto+Nastaliq+Urdu', False),
}
def get_font_for_language(lang: str) -> str:
    """Return the Google Fonts URL parameter registered for *lang*.

    The lookup is case-insensitive. System-font languages and unknown codes
    yield an empty string.
    """
    fallback = ('Georgia', '', True)
    _display_name, google_param, _is_system = MULTILINGUAL_FONTS.get(lang.lower(), fallback)
    return google_param
# ==================== HTML TEMPLATE - FIXED VERSION ====================
PDF_HTML_TEMPLATE = """
<!DOCTYPE html>
<html lang="{{ language }}">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{{ title }}</title>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family={{ font_family }}&display=swap" rel="stylesheet">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/github-dark.min.css">
<style>
* {
font-kerning: normal;
text-rendering: optimizeLegibility;
-webkit-font-smoothing: antialiased;
-webkit-print-color-adjust: exact !important;
print-color-adjust: exact !important;
box-sizing: border-box;
}
body {
font-family: '{{ font_family.split(":")[0] | default("Inter") }}', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
max-width: 100%;
margin: 0 auto;
padding: 0;
color: #000000;
direction: {{ text_direction }};
font-size: 11pt;
line-height: 1.6;
}
/* HEADER STYLING */
.header {
text-align: center;
border-bottom: 2pt solid #000;
padding-bottom: 15pt;
margin-bottom: 20pt;
page-break-after: avoid;
}
.logo {
font-size: 18pt;
font-weight: 800;
color: #000;
letter-spacing: -0.5px;
}
.date {
color: #6b7280;
font-size: 10pt;
margin-top: 5pt;
}
/* MESSAGE CONTAINERS - FIXED SPACING */
.message {
margin-bottom: 16pt;
page-break-inside: auto;
}
h1, h2, h3, h4, h5, h6, .user .content {
page-break-after: avoid;
}
.message:last-child {
margin-bottom: 0;
}
/* USER MESSAGE - QUESTION HEADER */
.user .content {
font-weight: 700;
font-size: 12pt;
color: #000000;
margin: 0 0 8pt 0;
padding: 0 0 8pt 0;
border-bottom: 1px solid #e5e7eb;
background: none;
border-left: none;
}
img, svg {
max-width: 100%;
height: auto;
display: block;
margin: 12pt auto;
page-break-inside: avoid;
}
/* SVG DIAGRAM STYLING */
.svg-diagram-container,
.mermaid-diagram-container {
max-width: 100%;
margin: 16pt 0;
padding: 12pt;
background: #f8f9fa;
border: 1px solid #e9ecef;
border-radius: 6pt;
page-break-inside: avoid;
overflow: hidden;
}
.svg-diagram-container svg,
.mermaid-diagram-container svg {
max-width: 100%;
max-height: 600px; /* Professional dimension limit */
height: auto;
margin: 0 auto;
display: block;
}
/* INLINE SVG FROM MARKDOWN */
svg:not([class]) {
max-width: 100%;
height: auto;
page-break-inside: avoid;
}
/* ASSISTANT MESSAGE - ANSWER BODY */
.assistant .content {
font-weight: 400;
color: #000000;
padding: 0;
margin: 0;
font-size: 11pt;
}
/* CODE BLOCKS - FIXED WRAPPING ISSUES */
pre {
background: #f8f9fa;
border: 1px solid #e9ecef;
border-radius: 6pt;
margin: 12pt 0;
padding: 12pt;
page-break-inside: auto;
orphans: 3;
widows: 3;
overflow-x: auto;
white-space: pre-wrap;
word-wrap: break-word;
font-size: 10pt;
}
pre code {
display: block;
padding: 0;
background: transparent;
color: #000000;
font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
font-size: 10pt;
line-height: 1.5;
white-space: pre-wrap;
word-wrap: break-word;
}
/* INLINE CODE */
code {
font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
font-size: 10pt;
background-color: #f3f4f6;
color: #000000;
padding: 2px 4px;
border-radius: 4px;
white-space: normal;
}
/* TABLES */
table {
width: 100%;
border-collapse: collapse;
margin: 12pt 0;
font-size: 10pt;
page-break-inside: auto;
table-layout: fixed;
word-wrap: break-word;
}
td {
word-break: break-word;
overflow-wrap: break-word;
}
th, td {
border: 1px solid #d1d5db;
padding: 8pt;
text-align: {{ text_alignment }};
}
th {
background: #f9fafb;
font-weight: 600;
}
/* LISTS */
ul, ol {
padding-left: 30px;
margin: 8pt 0;
}
/* Nested unordered list style differentiation */
ul { list-style: disc; }
ul ul { list-style: circle; }
ul ul ul { list-style: square; }
/* Nested ordered list style differentiation */
ol { list-style: decimal; }
ol ol { list-style: lower-alpha; }
ol ol ol { list-style: lower-roman; }
li {
margin: 4pt 0;
}
/* PARAGRAPHS AND TEXT ELEMENTS */
p {
margin: 8pt 0;
}
h1, h2, h3, h4, h5, h6 {
margin: 16pt 0 8pt 0;
font-weight: 600;
line-height: 1.3;
}
h1 { font-size: 16pt; }
h2 { font-size: 14pt; }
h3 { font-size: 13pt; }
/* PDF PAGE SETUP */
@page {
size: A4;
margin: 20mm;
}
@media print {
body {
-webkit-print-color-adjust: exact;
print-color-adjust: exact;
}
}
</style>
</head>
<body>
<div class="header">
<div class="logo">{{ document_title }}</div>
<div class="date">{{ date }}</div>
</div>
{% for msg in messages %}
<div class="message {{ msg.role }}">
<div class="content">
{{ msg.content_html | safe }}
</div>
</div>
{% endfor %}
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/highlight.min.js"></script>
<script>
// Enhanced loading strategy for highlight.js
(function() {
function initHighlighting() {
if (window.hljs) {
try {
hljs.highlightAll();
} catch (e) {
console.log('Highlight.js error:', e);
}
} else {
console.log('Highlight.js not available, skipping syntax highlighting.');
}
}
// Try to load with timeout
Promise.race([
new Promise(function(resolve) {
if (document.fonts && document.fonts.ready) {
document.fonts.ready.then(resolve).catch(resolve);
} else {
resolve();
}
}),
new Promise(resolve => setTimeout(resolve, 3000))
]).then(initHighlighting);
})();
</script>
</body>
</html>
"""
# Create Jinja2 environment for security
# Both PDF templates in this file are compiled through this environment with
# from_string(); values rendered without the `safe` filter go through autoescaping.
jinja_env = Environment(autoescape=select_autoescape(['html', 'xml']))
def get_text_direction(lang: str) -> str:
    """Return the CSS ``direction`` value for *lang*: 'rtl' or 'ltr'.

    The comparison is exact, so callers must pass a lower-case code.
    """
    rtl_codes = ('ar', 'he', 'ur', 'fa')
    return 'rtl' if lang in rtl_codes else 'ltr'
def get_text_alignment(lang: str) -> str:
    """Return the CSS ``text-align`` value for *lang*: 'right' or 'left'.

    The comparison is exact, so callers must pass a lower-case code.
    """
    rtl_codes = ('ar', 'he', 'ur', 'fa')
    return 'right' if lang in rtl_codes else 'left'
# ==================== UTILITY FUNCTIONS ====================
def sanitize_content(content: str) -> str:
    """Strip dangerous HTML from narrative text while leaving code untouched.

    Strategy: mask code spans -> sanitize the remaining text -> unmask.

    Args:
        content: Raw message text (Markdown, possibly with inline HTML). Any
            value is accepted; it is converted with ``str()`` and stripped.

    Returns:
        The text with script-like tags, event-handler attributes and
        ``javascript:``/``data:`` URLs in link attributes removed. Fenced code
        blocks and inline code spans are returned verbatim.

    This is a regex-based, best-effort filter, not a full HTML sanitizer.
    NOTE(review): masked code is restored unchanged, which is only safe if the
    Markdown renderer escapes it — confirm for backticks nested inside raw
    HTML blocks.
    """
    import html  # local import: the module is not imported at file level

    content = str(content).strip()
    if not content:
        return content

    # Storage for the code spans that are temporarily hidden.
    placeholders = {}

    def mask_match(match):
        """Swap a code span for a unique token and remember the original."""
        token = f"__SAFE_CODE_BLOCK_{uuid.uuid4().hex}__"
        placeholders[token] = match.group(0)
        return token

    # --- PHASE 1: MASKING (protect code) ---
    # Pattern A: fenced code blocks (``` ... ```)
    content = re.sub(r'(```[\s\S]*?```)', mask_match, content)
    # Pattern B: inline code (` ... `); newlines are excluded so that broken
    # syntax cannot swallow whole paragraphs.
    content = re.sub(r'(`[^`\n]+`)', mask_match, content)

    # --- PHASE 2: FILTRATION (neutralise threats in narrative text) ---
    # 1. Remove dangerous tags. 'link' and 'meta' are included because outside
    #    code they are most likely CSS injection or redirects.
    dangerous_tags = ['script', 'iframe', 'object', 'embed', 'applet', 'form', 'link', 'meta']
    # Repeat until nothing changes: removing an inner tag can splice a new one
    # together (e.g. "<scr<script>ipt>" collapses to "<script>").
    previous = None
    while previous != content:
        previous = content
        for tag in dangerous_tags:
            # Tag together with its content (e.g. <script>...</script>)
            pattern = f'<{tag}[^>]*>.*?</{tag}>'
            content = re.sub(pattern, '', content, flags=re.IGNORECASE | re.DOTALL)
            # Self-closing or unpaired tags (e.g. <link ... />)
            single_pattern = f'<{tag}[^>]*>'
            content = re.sub(single_pattern, '', content, flags=re.IGNORECASE)

    # 2a. Inside complete opening tags, walk the attributes one by one. Quoted
    #     values are consumed as a unit, so mixed quotes (onclick="a('b')") and
    #     unquoted values (onerror=alert(1)) are both handled.
    tag_re = re.compile(r'''<([a-zA-Z][^\s/>]*)((?:"[^"]*"|'[^']*'|[^>"'])*)>''')
    attr_re = re.compile(r'''[\s/]+([^\s"'<>/=]+)(?:\s*=\s*("[^"]*"|'[^']*'|[^\s>]*))?''')
    url_attrs = {'href', 'src', 'xlink:href', 'formaction'}

    def clean_attr(match):
        """Drop event handlers and script/data URLs; keep everything else."""
        name = match.group(1).lower()
        if name.startswith('on'):
            return ''
        raw_value = match.group(2)
        if name in url_attrs and raw_value:
            # Decode entities and remove whitespace/control characters so that
            # obfuscated schemes ("java&#115;cript:", "java\tscript:") are seen.
            decoded = html.unescape(raw_value.strip('"\''))
            compact = re.sub(r'[\s\x00-\x1f]+', '', decoded).lower()
            if 'javascript:' in compact or 'data:' in compact:
                return ''
        return match.group(0)

    def clean_tag(match):
        return f"<{match.group(1)}{attr_re.sub(clean_attr, match.group(2))}>"

    content = tag_re.sub(clean_tag, content)

    # 2b. The original text-wide patterns are kept as a second net for
    #     fragments that do not form a complete tag.
    dangerous_attrs = [
        # Event handlers (onclick, onload, onmouseover, etc.)
        r'\son[a-z]+\s*=\s*["\'][^"\']*["\']',
        # Javascript protocol in href/src
        r'\s(href|src)\s*=\s*["\'][^"\']*javascript:[^"\']*["\']',
        # Data URI exploits in href/src (Base64 HTML injection)
        r'\s(href|src)\s*=\s*["\'][^"\']*data:[^"\']*["\']',
    ]
    for attr_pattern in dangerous_attrs:
        content = re.sub(attr_pattern, '', content, flags=re.IGNORECASE)

    # --- PHASE 3: UNMASKING (restore code) ---
    # Newest token first: an inline span masked in pattern B may itself contain
    # a token created by pattern A, which only becomes visible once the outer
    # span has been restored.
    for token, original_code in reversed(list(placeholders.items())):
        content = content.replace(token, original_code)
    return content
def fix_markdown_tables(content: str) -> str:
    """Insert the blank line Markdown needs in front of a table header.

    A header row is recognised as a line starting with ``|`` that contains a
    second ``|`` and is immediately followed by a separator row made of
    ``-``, ``:``, ``|`` and spaces. When such a header directly follows a
    non-empty line, one extra newline is inserted between them. Text that
    already has the blank line is returned unchanged.

    The preceding line may end in whitespace (for example the two trailing
    spaces of a Markdown hard break); only an empty preceding line counts as
    "gap already present".
    """
    # (?<=[^\n]) = the character before the newline is anything except another
    # newline, i.e. the previous line is not empty.
    return re.sub(r'(?<=[^\n])\n(\|.*\|.*\n\|[- :|]+\|)', r'\n\n\1', content)
def generate_filename(language: str) -> str:
    """Build the download name ``NeuralStream_Export_<YYYYmmdd_HHMMSS>_<language>.pdf``.

    The timestamp is the current local time; *language* is inserted as given.
    """
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    return f"NeuralStream_Export_{stamp}_{language}.pdf"
def validate_content_size(content_length: int) -> bool:
    """Return True when *content_length* does not exceed MAX_CONTENT_LENGTH."""
    within_limit = content_length <= MAX_CONTENT_LENGTH
    return within_limit
# ==================== API ENDPOINT ====================
@app.post("/api/export/pdf")
async def export_pdf_endpoint(request: ExportRequest):
    """Export chat conversation to PDF with comprehensive multilingual support"""
    # The docstring above is kept verbatim because FastAPI publishes it as the
    # operation description.
    #
    # Pipeline: size check -> sanitize + Markdown-render every message ->
    # render the Jinja template -> load it in a pooled browser page ->
    # wait for fonts/highlighting -> post-process in the page -> print to PDF.
    #
    # Error mapping: 413 content too large, 408 asyncio timeout,
    # 500 Playwright error or any other failure.

    # Validate content size. Contents are measured through str() — the same
    # conversion sanitize_content applies — so a non-string content value
    # cannot raise here, outside the error handling below.
    total_content = sum(len(str(msg.get('content', ''))) for msg in request.messages)
    if not validate_content_size(total_content):
        raise HTTPException(
            status_code=413,
            detail=f"Content too large. Max allowed: {MAX_CONTENT_LENGTH} characters"
        )

    # Playwright expects milliseconds.
    timeout_ms = PDF_GENERATION_TIMEOUT * 1000
    page = None
    try:
        # 1. Process messages to HTML
        processed_msgs = []
        for msg in request.messages:
            # Order matters: sanitize the RAW input first, then repair tables,
            # then render Markdown.
            raw_content = sanitize_content(msg.get('content', ''))
            fixed_content = fix_markdown_tables(raw_content)
            html_content = markdown.markdown(
                fixed_content,
                extensions=['fenced_code', 'tables', 'sane_lists', 'nl2br']
            )
            processed_msgs.append({
                'role': msg.get('role', 'unknown'),
                'content_html': html_content
            })

        # 2. Prepare HTML template
        # NOTE(review): PDF_HTML_TEMPLATE derives the CSS family name from this
        # value with split(":")[0], which keeps the '+' separators (e.g.
        # 'Noto+Sans+Devanagari'), and it is '' for system-font languages —
        # confirm the intended font is really applied.
        font_family = get_font_for_language(request.language)
        template = jinja_env.from_string(PDF_HTML_TEMPLATE)
        full_html = template.render(
            messages=processed_msgs,
            document_title="NeuralStream AI",
            date=datetime.now().strftime("%B %d, %Y"),
            language=request.language,
            font_family=font_family,
            text_direction=get_text_direction(request.language),
            text_alignment=get_text_alignment(request.language),
            title=f"Chat Export {datetime.now().strftime('%m/%d/%Y')}"
        )

        # 3. Generate PDF with Playwright
        page = await browser_pool.get_page()
        page.set_default_timeout(timeout_ms)
        page.set_default_navigation_timeout(timeout_ms)
        await page.set_content(full_html, wait_until='load')

        # 4. Wait for fonts and code highlighting (bounded inside the page by
        #    1.5 s and from Python by 5 s).
        await asyncio.wait_for(
            page.evaluate(r'''async () => {
                try {
                    await Promise.race([
                        document.fonts ? document.fonts.ready : Promise.resolve(),
                        new Promise(resolve => setTimeout(resolve, 1500))
                    ]);
                    if (window.hljs) await new Promise(resolve => setTimeout(resolve, 150));
                } catch (e) { console.log('Font loading error:', e); }
            }'''),
            timeout=5.0
        )

        # 5. Hydrate links in code blocks and protect short blocks from page
        #    splits. Raw string: the script contains regex escapes (\/ and \s)
        #    that are not valid Python string escapes.
        await page.evaluate(r'''() => {
            // A. LINK HYDRATION
            const codeElements = document.querySelectorAll('pre code');
            codeElements.forEach(block => {
                const urlRegex = /(https?:\/\/[^\s<"']+)/g;
                block.innerHTML = block.innerHTML.replace(urlRegex, (url) => {
                    return `<a href="${url}" style="text-decoration:underline; color:inherit; pointer-events:all;">${url}</a>`;
                });
            });
            // B. SMART LAYOUT PROTECTION
            // Heuristic: If a block is shorter than ~1/3 of a page (approx 350px),
            // assume it's a diagram or snippet that should NOT split.
            const preBlocks = document.querySelectorAll('pre');
            preBlocks.forEach(pre => {
                if (pre.offsetHeight < 350) {
                    pre.style.pageBreakInside = 'avoid';
                    pre.style.breakInside = 'avoid'; // Modern standard
                }
            });
        }''')

        # 6. Generate PDF
        pdf_bytes = await page.pdf(
            format="A4",
            margin={"top": "20mm", "bottom": "20mm", "left": "20mm", "right": "20mm"},
            print_background=True,
            display_header_footer=True,
            footer_template='<div style="font-size:9px; margin:0 auto; color:#666; text-align:center;">Page <span class="pageNumber"></span> of <span class="totalPages"></span></div>',
            header_template='<div></div>',
            prefer_css_page_size=True
        )
        filename = generate_filename(request.language)
        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers={
                "Content-Disposition": f"attachment; filename={filename}",
                "Cache-Control": "no-cache, no-store, must-revalidate",
                "Pragma": "no-cache",
                "Expires": "0"
            }
        )
    except asyncio.TimeoutError:
        raise HTTPException(
            status_code=408,
            detail="PDF generation timed out. The document may be too complex."
        )
    except PlaywrightError as e:
        raise HTTPException(
            status_code=500,
            detail=f"Browser error during PDF generation: {str(e)[:100]}"
        )
    except Exception as e:
        print(f"PDF Export API Error: {str(e)}")
        import traceback
        traceback.print_exc()
        raise HTTPException(
            status_code=500,
            detail="Internal server error during PDF generation. Please try again."
        )
    finally:
        # Always give the page back, whatever happened above.
        if page:
            with suppress(Exception):
                await page.close()
# ==================== ANT-EDITOR DOCUMENT EXPORT ====================
class EditorExportRequest(BaseModel):
    """Request model for AbWrite document PDF export"""
    # (Class docstring kept verbatim so the published schema does not change.)

    html: str = Field(..., description="HTML content from TipTap editor.getHTML()")
    title: str = Field(default="Untitled Document", description="Document title")
    language: str = Field(default="en", description="ISO 639-1 language code for font selection")
    password: Optional[str] = Field(default=None, description="Optional password to encrypt the PDF")
    watermark_text: Optional[str] = Field(default=None, description="Optional watermark text to overlay on every page")

    @field_validator('html')
    @classmethod
    def validate_html(cls, v: str) -> str:
        """Require non-blank HTML of at most 10,000,000 characters."""
        if not (v and v.strip()):
            raise ValueError('HTML content cannot be empty')
        # Generous ceiling because embedded base64 images make documents large.
        # The limit is applied to len(v), i.e. characters.
        too_large = len(v) > 10_000_000
        if too_large:
            raise ValueError('HTML content too large (max 10MB)')
        return v

    @field_validator('language')
    @classmethod
    def validate_language(cls, v: str) -> str:
        """Reduce a code of two or more characters to its lower-cased first two."""
        if isinstance(v, str) and len(v) >= 2:
            return v.lower()[:2]
        raise ValueError('Language must be a valid ISO code')
EDITOR_PDF_TEMPLATE = """
<!DOCTYPE html>
<html lang="{{ language }}">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{{ title }}</title>
{% if not is_system_font %}
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family={{ font_url }}&display=swap" rel="stylesheet">
{% endif %}
<!-- Load all editor Google Fonts for user-applied font-family inline styles -->
<link href="https://fonts.googleapis.com/css2?family=Amatic+SC:wght@400;700&family=Bebas+Neue&family=Caveat:wght@400;700&family=Comic+Neue:wght@400;700&family=Cookie&family=Courier+Prime:wght@400;700&family=Dancing+Script:wght@400;700&family=Great+Vibes&family=Indie+Flower&family=Kaushan+Script&family=Lato:wght@300;400;700&family=Libre+Baskerville:ital,wght@0,400;0,700;1,400&family=Lora:ital,wght@0,400;0,600;1,400&family=Merriweather:wght@300;700&family=Montserrat:wght@400;600;800&family=Open+Sans:wght@400;600&family=Oswald:wght@400;500&family=Pacifico&family=Patrick+Hand&family=Permanent+Marker&family=Playfair+Display:wght@400;700&family=Poppins:wght@300;400;600&family=Roboto:wght@300;400;700&family=Sacramento&family=Shadows+Into+Light&family=Tinos:ital,wght@0,400;0,700;1,400&display=swap" rel="stylesheet">
<style>
*, *::before, *::after {
box-sizing: border-box;
margin: 0;
padding: 0;
}
body {
/* Georgia first = matches the editor's default document font.
Language-specific Noto font as fallback = auto-activates for non-Latin
scripts (Hindi, Arabic, etc.) because Georgia lacks those glyphs. */
font-family: 'Georgia', '{{ font_name }}', 'Times New Roman', serif;
color: #1a1a2e;
direction: {{ text_direction }};
font-size: 13pt;
line-height: 1.6;
-webkit-print-color-adjust: exact !important;
print-color-adjust: exact !important;
-webkit-font-smoothing: antialiased;
-moz-osx-font-smoothing: grayscale;
text-rendering: optimizeLegibility;
font-feature-settings: 'kern' 1, 'liga' 1;
}
/* ── Headings (matched to editor) ── */
h1 {
font-size: 26pt;
font-weight: 700;
margin: 5pt 0 3pt 0;
line-height: 1.2;
color: #1a1a2e;
letter-spacing: -0.015em;
}
h2 {
font-size: 20pt;
font-weight: 600;
margin: 6pt 0 3pt 0;
line-height: 1.25;
color: #1a1a2e;
letter-spacing: -0.01em;
}
h3 {
font-size: 16pt;
font-weight: 600;
margin: 6pt 0 2pt 0;
line-height: 1.3;
color: #2a2a3e;
}
h4 {
font-size: 13pt;
font-weight: 600;
margin: 6pt 0 1pt 0;
color: #2a2a3e;
}
h5 {
font-size: 11pt;
font-weight: 600;
margin: 6pt 0 1pt 0;
color: #3a3a4e;
text-transform: uppercase;
letter-spacing: 0.04em;
}
h6 {
font-size: 10pt;
font-weight: 600;
margin: 6pt 0 1pt 0;
color: #4a4a5e;
text-transform: uppercase;
letter-spacing: 0.05em;
}
h1, h2, h3, h4, h5, h6 { page-break-after: avoid; }
/* ── Paragraphs ── */
p {
margin: 0 0 6pt 0;
orphans: 3;
widows: 3;
}
p:last-child { margin-bottom: 0; }
/* ── Links (matched to editor #4a69bd) ── */
a {
color: #4a69bd;
text-decoration: underline;
text-decoration-color: rgba(74, 105, 189, 0.4);
text-underline-offset: 2px;
}
/* ── Lists ── */
ul, ol { padding-left: 22pt; margin: 5pt 0; }
/* Nested unordered list styles (disc → circle → square) */
ul { list-style: disc; }
ul ul { list-style: circle; }
ul ul ul { list-style: square; }
/* Nested ordered list styles (decimal → lower-alpha → lower-roman) */
ol { list-style: decimal; }
ol ol { list-style: lower-alpha; }
ol ol ol { list-style: lower-roman; }
li { margin: 2pt 0; }
li p { margin: 0; }
ul[data-type="taskList"] { list-style: none; padding-left: 0; }
ul[data-type="taskList"] li { display: flex; align-items: flex-start; gap: 6pt; }
ul[data-type="taskList"] li[data-checked="true"] p {
text-decoration: line-through;
color: #999;
}
/* ── Tables (matched to editor) ── */
table {
width: 100%;
border-collapse: collapse;
margin: 10pt 0;
font-size: 12pt;
page-break-inside: auto;
table-layout: fixed;
word-wrap: break-word;
}
th, td {
border: 1px solid #ddd;
padding: 6pt 9pt;
text-align: {{ text_alignment }};
vertical-align: top;
}
th {
background: #f5f5fa;
font-weight: 600;
text-align: left;
}
tr { page-break-inside: avoid; }
/* ── Code blocks (dark theme, matched to editor) ── */
pre {
background: #1e1e2e;
color: #e8e8ed;
border-radius: 6pt;
padding: 12pt;
margin: 10pt 0;
overflow-x: auto;
white-space: pre-wrap;
word-wrap: break-word;
font-size: 11pt;
line-height: 1.5;
page-break-inside: auto;
orphans: 3;
widows: 3;
}
pre code {
display: block;
background: transparent;
padding: 0;
font-family: 'Consolas', 'Fira Code', ui-monospace, SFMono-Regular, Menlo, Monaco, monospace;
font-size: 11pt;
color: #e8e8ed;
white-space: pre-wrap;
word-wrap: break-word;
}
/* ── Inline code (matched to editor) ── */
code {
font-family: 'Consolas', 'Fira Code', ui-monospace, SFMono-Regular, Menlo, Monaco, monospace;
font-size: 0.9em;
background: #f0f0f5;
padding: 2pt 4pt;
border-radius: 3pt;
color: #d63384;
}
/* ── Blockquotes (matched to editor — purple accent, italic) ── */
blockquote {
border-left: 3pt solid #6c5ce7;
padding: 6pt 0 6pt 12pt;
margin: 10pt 0;
color: #444;
font-style: italic;
background: rgba(108, 92, 231, 0.03);
}
/* ── Horizontal rules ── */
hr {
border: none;
border-top: 1pt solid #e0e0e5;
margin: 18pt 0;
}
/* ── Images ── */
img {
max-width: 100%;
height: auto;
display: block;
margin: 8pt auto;
border-radius: 4pt;
page-break-inside: avoid;
}
/* ── Image-Text Block (float-based text wrapping) ── */
.image-text-block {
overflow: hidden;
margin: 8pt 0;
page-break-inside: auto;
}
.image-text-block-image {
max-width: 45%;
line-height: 0;
}
.image-text-block[data-image-position="left"] .image-text-block-image {
float: left;
margin: 0 16px 8px 0;
}
.image-text-block[data-image-position="right"] .image-text-block-image {
float: right;
margin: 0 0 8px 16px;
}
.image-text-block-image img { max-width: 100%; height: auto; margin: 0; }
/* Flip support */
.image-text-block[data-flip-h="true"] .image-text-block-image img { transform: scaleX(-1); }
.image-text-block[data-flip-v="true"] .image-text-block-image img { transform: scaleY(-1); }
.image-text-block[data-flip-h="true"][data-flip-v="true"] .image-text-block-image img { transform: scaleX(-1) scaleY(-1); }
.image-text-block-content { }
/* ── Resizable Image Wrapper (from editor's ResizableImage extension) ── */
.resizable-image-wrapper {
width: fit-content;
max-width: 100%;
margin: 8pt 0;
page-break-inside: avoid;
}
.resizable-image-wrapper img {
display: block;
max-width: 100%;
height: auto;
border-radius: 4pt;
margin: 0; /* parent wrapper handles margins */
}
/* Preserve explicit width/height from editor resize handles */
.resizable-image-wrapper img[width] {
width: attr(width px);
}
/* Alignment variants */
.resizable-image-align-left { margin-left: 0; margin-right: auto; }
.resizable-image-align-center { margin-left: auto; margin-right: auto; }
.resizable-image-align-right { margin-left: auto; margin-right: 0; }
/* Flip support for standalone images */
.resizable-image-wrapper[data-flip-h="true"] img { transform: scaleX(-1); }
.resizable-image-wrapper[data-flip-v="true"] img { transform: scaleY(-1); }
.resizable-image-wrapper[data-flip-h="true"][data-flip-v="true"] img { transform: scaleX(-1) scaleY(-1); }
/* ── Highlights (warm yellow, matched to editor) ── */
mark {
background: #fef3c7;
padding: 1pt 2pt;
border-radius: 2pt;
}
/* ── Text formatting ── */
s { text-decoration: line-through; color: #999; }
sub { font-size: 0.75em; }
sup { font-size: 0.75em; }
/* ── PDF page setup ── */
@page {
size: A4;
margin: 25mm 20mm 25mm 20mm;
}
@media print {
body {
-webkit-print-color-adjust: exact;
print-color-adjust: exact;
}
pre {
background: #1e1e2e !important;
color: #e8e8ed !important;
}
}
</style>
</head>
<body>
{% if watermark_text %}
<div style="
position: fixed;
top: 0; left: 0; right: 0; bottom: 0;
display: flex;
align-items: center;
justify-content: center;
pointer-events: none;
z-index: 9999;
">
<div style="
font-size: 72pt;
font-weight: 800;
color: rgba(0, 0, 0, 0.06);
transform: rotate(-35deg);
white-space: nowrap;
user-select: none;
letter-spacing: 0.08em;
text-transform: uppercase;
">{{ watermark_text }}</div>
</div>
{% endif %}
{{ content | safe }}
<script>
// Wait for fonts to load
(function() {
if (document.fonts && document.fonts.ready) {
Promise.race([
document.fonts.ready,
new Promise(resolve => setTimeout(resolve, 3000))
]).catch(() => {});
}
})();
</script>
</body>
</html>
"""
@app.post("/api/editor/export-pdf")
async def editor_export_pdf(request: EditorExportRequest):
"""Export Ant-Editor document content to PDF via Playwright"""
page = None
try:
# 1. Determine font settings
lang = request.language
font_info = MULTILINGUAL_FONTS.get(lang, MULTILINGUAL_FONTS['en'])
font_name = font_info[0]
font_url = font_info[1]
is_system_font = font_info[2] if len(font_info) > 2 else False
# 2. Render HTML template
template = jinja_env.from_string(EDITOR_PDF_TEMPLATE)
full_html = template.render(
content=request.html,
title=request.title,
language=lang,
font_name=font_name,
font_url=font_url,
is_system_font=is_system_font,
text_direction=get_text_direction(lang),
text_alignment=get_text_alignment(lang),
watermark_text=request.watermark_text or '',
)
# 3. Generate PDF via Playwright
page = await browser_pool.get_page()
page.set_default_timeout(30000)
page.set_default_navigation_timeout(30000)
await page.set_content(full_html, wait_until='networkidle')
# 4. Wait for fonts to load
await asyncio.wait_for(
page.evaluate('''async () => {
try {
await Promise.race([
document.fonts ? document.fonts.ready : Promise.resolve(),
new Promise(resolve => setTimeout(resolve, 2000))
]);
} catch (e) {}
}'''),
timeout=5.0
)
# 5. Wait for all images to finish loading (base64 + external URLs)
await asyncio.wait_for(
page.evaluate('''() => {
const imgs = document.querySelectorAll('img');
return Promise.all(Array.from(imgs).map(img => {
if (img.complete) return Promise.resolve();
return new Promise(resolve => {
img.onload = resolve;
img.onerror = resolve;
});
}));
}'''),
timeout=10.0
)
# 6. Apply explicit width from HTML attributes (CSS attr() fallback)
await page.evaluate('''() => {
document.querySelectorAll('.resizable-image-wrapper img').forEach(img => {
const w = img.getAttribute('width');
if (w) {
img.style.width = w + (w.includes('%') ? '' : 'px');
img.style.maxWidth = '100%';
img.style.height = 'auto';
}
});
}''')
# 7. Smart layout: keep short code blocks together
await page.evaluate('''() => {
const preBlocks = document.querySelectorAll('pre');
preBlocks.forEach(pre => {
if (pre.offsetHeight < 400) {
pre.style.pageBreakInside = 'avoid';
pre.style.breakInside = 'avoid';
}
});
}''')
# 8. Generate PDF bytes
pdf_bytes = await page.pdf(
format="A4",
scale=1,
margin={"top": "25mm", "bottom": "25mm", "left": "20mm", "right": "20mm"},
print_background=True,
display_header_footer=True,
footer_template='<div style="font-size:9px; margin:0 auto; color:#999; text-align:center; width:100%;">Page <span class="pageNumber"></span> of <span class="totalPages"></span></div>',
header_template='<div></div>',
prefer_css_page_size=True,
)
# 9. Encrypt PDF with password if provided
if request.password:
import pikepdf
src = pikepdf.open(io.BytesIO(pdf_bytes))
encrypted_buf = io.BytesIO()
src.save(
encrypted_buf,
encryption=pikepdf.Encryption(
owner=request.password,
user=request.password,
R=6, # AES-256
),
)
src.close()
pdf_bytes = encrypted_buf.getvalue()
# 10. Generate safe filename — ASCII-only for HTTP header safety
# re.ASCII ensures \w matches only [a-zA-Z0-9_], preventing
# Unicode chars that Starlette can't encode as latin-1 headers.
safe_title = re.sub(r'[^\w\s-]', '', request.title, flags=re.ASCII)[:50].strip() or 'Document'
safe_title = re.sub(r'\s+', '_', safe_title)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f"{safe_title}_{timestamp}.pdf"
# Build Content-Disposition with RFC 5987 filename* for Unicode display
from urllib.parse import quote
cd_value = f'attachment; filename="{filename}"'
# Always add filename* (RFC 5987) so browsers can show the real Unicode
# name, while the ASCII filename= above stays as the safe fallback
unicode_title = re.sub(r'[<>:"/\\|?*]', '', request.title)[:80].strip() or 'Document'
unicode_title = re.sub(r'\s+', '_', unicode_title)
unicode_filename = f"{unicode_title}_{timestamp}.pdf"
encoded_unicode = quote(unicode_filename, safe='')
cd_value += f"; filename*=UTF-8''{encoded_unicode}"
return Response(
content=pdf_bytes,
media_type="application/pdf",
headers={
"Content-Disposition": cd_value,
"Cache-Control": "no-cache, no-store, must-revalidate",
}
)
except asyncio.TimeoutError:
raise HTTPException(status_code=408, detail="PDF generation timed out")
except PlaywrightError as e:
raise HTTPException(status_code=500, detail=f"Browser error: {str(e)[:200]}")
except Exception as e:
print(f"Editor PDF Export Error: {str(e)}")
import traceback
traceback.print_exc()
raise HTTPException(status_code=500, detail="PDF generation failed")
finally:
if page:
with suppress(Exception):
await page.close()
# Health check endpoint
@app.get("/health")
async def health():
    """Liveness probe for load balancers and monitoring.

    Returns a small JSON object with a fixed status, the current local
    time in ISO 8601 form, and the service name.
    """
    checked_at = datetime.now().isoformat()
    payload = {
        "status": "healthy",
        "timestamp": checked_at,
        "service": "pdf-exporter",
    }
    return payload
# Test endpoint (for debugging)
@app.post("/api/export/html")
async def export_html_endpoint(request: ExportRequest):
    """Debug helper: render the chat template and return the HTML itself.

    Every message's ``content`` is converted from Markdown to HTML, then the
    messages are rendered through ``PDF_HTML_TEMPLATE`` using the font, text
    direction and alignment derived from ``request.language``. The rendered
    markup is returned as ``text/html`` with an ``X-Debug`` header.
    """

    def _to_template_message(raw_msg):
        # Markdown -> HTML for one message; a missing role becomes 'unknown'.
        body_html = markdown.markdown(
            str(raw_msg.get('content', '')),
            extensions=['fenced_code', 'tables', 'sane_lists'],
        )
        return {
            'role': raw_msg.get('role', 'unknown'),
            'content_html': body_html,
        }

    rendered_messages = [_to_template_message(m) for m in request.messages]

    lang = request.language
    resolved_font = get_font_for_language(lang)
    page_template = jinja_env.from_string(PDF_HTML_TEMPLATE)

    # Same keyword set the template receives in the original call.
    render_context = {
        'messages': rendered_messages,
        'document_title': "NeuralStream AI (HTML Preview)",
        'date': datetime.now().strftime("%B %d, %Y"),
        'language': lang,
        'font_family': resolved_font,
        'text_direction': get_text_direction(lang),
        'text_alignment': get_text_alignment(lang),
        'title': "Chat HTML Preview",
    }
    page_html = page_template.render(**render_context)

    return Response(
        content=page_html,
        media_type="text/html",
        headers={"X-Debug": "html-preview"},
    )
# Startup/Shutdown events
@app.on_event("startup")
async def startup_event():
    """Pre-warm the browser pool when the application starts.

    Requests one page from ``browser_pool`` so that browser problems show up
    at deploy time instead of on the first user request. The warm-up page is
    closed again right away so it does not stay open for the lifetime of the
    process.

    Failures are logged and swallowed on purpose: the server still starts and
    individual requests will retry acquiring a page.
    """
    print("🚀 Pre-warming browser pool...")
    try:
        # Acquiring a page is what forces the pool to bring up its resources.
        warmup_page = await browser_pool.get_page()
    except Exception as e:
        print(f"⚠️ Warning: Failed to pre-warm browser pool: {e}")
        # We don't raise here so the server still starts; individual requests will retry.
        return

    # Release the warm-up page; mirror the endpoints' best-effort close so a
    # failing close never blocks startup.
    if warmup_page:
        with suppress(Exception):
            await warmup_page.close()
    print("✅ Browser pool warmed up successfully")
@app.on_event("shutdown")
async def shutdown_event():
    """Close the browser pool when the application shuts down."""
    # Awaited so the pool's close() completes before this handler returns.
    await browser_pool.close()
if __name__ == "__main__":
    # Playwright requires the Proactor event loop on Windows, so set the
    # policy before uvicorn creates its loop.
    running_on_windows = sys.platform == 'win32'
    if running_on_windows:
        proactor_policy = asyncio.WindowsProactorEventLoopPolicy()
        asyncio.set_event_loop_policy(proactor_policy)

    print("🚀 Starting NeuralStream PDF Backend...")
    # Serve on all interfaces, port 7860, with auto-reload disabled.
    uvicorn.run(app, host="0.0.0.0", port=7860, reload=False)