handbook_engine / app /services /pdf_renderer.py
internationalscholarsprogram's picture
audit: enforce design rules and enhance handbook formatting
88646e1
"""Playwright-based PDF renderer β€” Chromium headless PDF export.
Replaces WeasyPrint. Uses Playwright to launch headless Chromium,
load the fully-rendered HTML, wait for fonts/images/layout, and
export a print-quality PDF.
"""
from __future__ import annotations
import asyncio
import logging
import os
import tempfile
from pathlib import Path
from typing import Optional
logger = logging.getLogger(__name__)

# Singleton browser instance for reuse across requests.
_browser = None
# Playwright driver that launched ``_browser``.  It is kept at module level so
# it can be stopped on relaunch/shutdown instead of being leaked.
_playwright = None
# Serialises launch and shutdown so concurrent callers never race on the
# globals above.
_browser_lock = asyncio.Lock()


async def _get_browser():
    """Get or create a persistent Chromium browser instance.

    The browser is launched lazily on first use and reused while it stays
    connected.  If the cached browser has disconnected, the Playwright driver
    that owned it is stopped (best effort) and a fresh driver + browser pair
    is started.

    Returns:
        The connected Playwright ``Browser`` object.

    Raises:
        Whatever ``async_playwright().start()`` or ``chromium.launch()``
        raises; in the latter case the freshly started driver is stopped
        before the exception propagates.
    """
    global _browser, _playwright
    async with _browser_lock:
        if _browser is not None and _browser.is_connected():
            return _browser

        # Imported lazily so merely importing this module does not require
        # Playwright to be importable.
        from playwright.async_api import async_playwright

        # The previous browser is gone; stop the driver that launched it so
        # relaunching does not accumulate driver instances.
        if _playwright is not None:
            stale_driver, _playwright = _playwright, None
            try:
                await stale_driver.stop()
            except Exception:
                logger.warning(
                    "Failed to stop stale Playwright driver", exc_info=True
                )

        driver = await async_playwright().start()
        try:
            browser = await driver.chromium.launch(
                headless=True,
                args=[
                    "--no-sandbox",
                    "--disable-setuid-sandbox",
                    "--disable-dev-shm-usage",
                    "--disable-gpu",
                    "--font-render-hinting=none",
                ],
            )
        except BaseException:
            # Launch failed (or was cancelled): do not leave the driver
            # running with nothing referencing it.
            await driver.stop()
            raise

        _playwright = driver
        _browser = browser
        logger.info("Chromium browser launched for PDF rendering")
        return _browser


async def shutdown_browser():
    """Gracefully close the browser on application shutdown.

    Closes the shared browser if it is still connected, then stops the
    Playwright driver that launched it.  Safe to call when no browser was
    ever started.
    """
    global _browser, _playwright
    async with _browser_lock:
        try:
            if _browser and _browser.is_connected():
                await _browser.close()
                logger.info("Chromium browser closed")
        finally:
            # Reset state even if close() raised, and always stop the driver
            # so it does not outlive the application.
            _browser = None
            if _playwright is not None:
                driver, _playwright = _playwright, None
                await driver.stop()
async def render_pdf_from_html(
    html_content: str,
    *,
    format: str = "A4",
    print_background: bool = True,
    prefer_css_page_size: bool = True,
    wait_timeout: int = 30000,
) -> bytes:
    """Render HTML string to PDF bytes using Playwright Chromium.

    Generates a base PDF (content only, no decorative header/label),
    then creates a one-page overlay with the header image and right-side
    label, and stamps the overlay onto content pages (page 3 → last
    content page) using pypdf. Pages 1-2 (cover/TOC) and trailing
    full-page image pages get no overlay.

    The number of leading pages skipped is the count of ``.cover-page``
    elements in the document and the number of trailing pages skipped is the
    count of ``.fullpage-img-wrap`` elements.  If the document has neither a
    ``.page-header img`` nor a ``.hb-right-label img``, the base PDF is
    returned unchanged.

    Args:
        html_content: Complete HTML document string.
        format: Page format (default A4).
        print_background: Include background colors/images.
        prefer_css_page_size: Use @page CSS rules for sizing.
        wait_timeout: Max time (ms) to wait for page load.

    Returns:
        PDF file bytes.
    """
    browser = await _get_browser()
    context = await browser.new_context(
        viewport={"width": 794, "height": 1123},  # A4 at 96dpi
        device_scale_factor=2,
        java_script_enabled=True,
    )
    try:
        # Created inside the try so the context is closed even if opening the
        # page fails.
        page = await context.new_page()

        # Write HTML to a temp file so Chromium can load local file:// resources
        tmp = tempfile.NamedTemporaryFile(
            mode="w",
            suffix=".html",
            delete=False,
            encoding="utf-8",
        )
        tmp_path = tmp.name
        try:
            # The write happens inside the cleanup scope so a failed write
            # (e.g. an encoding error) does not leave the file behind.
            with tmp:
                tmp.write(html_content)

            file_url = Path(tmp_path).as_uri()
            await page.goto(file_url, wait_until="networkidle", timeout=wait_timeout)

            # Wait for fonts and images to be fully loaded
            await page.evaluate("() => document.fonts.ready")
            await page.evaluate("""
                () => {
                    const images = Array.from(document.querySelectorAll('img'));
                    return Promise.all(images.map(img => {
                        if (img.complete) return Promise.resolve();
                        return new Promise(r => {
                            img.addEventListener('load', r);
                            img.addEventListener('error', r);
                        });
                    }));
                }
            """)
            # Short fixed pause after fonts/images report ready.
            await page.wait_for_timeout(500)

            # ── Collect info from DOM before hiding elements ──
            header_src = await page.evaluate("""
                () => {
                    const img = document.querySelector('.page-header img');
                    return img ? img.src : '';
                }
            """)
            label_src = await page.evaluate("""
                () => {
                    const img = document.querySelector('.hb-right-label img');
                    return img ? img.src : '';
                }
            """)
            num_bottom_pages = await page.evaluate("""
                () => document.querySelectorAll('.fullpage-img-wrap').length
            """)
            # Cover page count: cover + TOC image (each is a .cover-page)
            num_cover_pages = await page.evaluate("""
                () => document.querySelectorAll('.cover-page').length
            """)
            logger.info(
                "Overlay info: header=%s, label=%s, covers=%d, bottoms=%d",
                bool(header_src), bool(label_src),
                num_cover_pages, num_bottom_pages,
            )

            # ── Hide header, footer, and label from the base PDF ──
            await page.evaluate("""
                () => {
                    document.querySelectorAll('.page-header, .page-footer, .hb-right-label')
                        .forEach(el => el.style.display = 'none');
                }
            """)

            # ── Render BASE PDF (no header, no label) ──
            base_pdf = await page.pdf(
                format=format,
                print_background=print_background,
                prefer_css_page_size=prefer_css_page_size,
                margin={
                    "top": "2.54cm",
                    "right": "2.54cm",
                    "bottom": "2.54cm",
                    "left": "2.54cm",
                },
                display_header_footer=True,
                # Empty header template; the footer carries the page number.
                header_template='<span></span>',
                footer_template=(
                    '<div style="width:100%;text-align:center;font-size:10px;'
                    'font-family:Century Gothic,Segoe UI,sans-serif;color:#1C75BC;'
                    'padding:0 0 4px 0;">'
                    '<span class="pageNumber"></span></div>'
                ),
            )
            logger.info("Base PDF rendered, size=%d bytes", len(base_pdf))
        finally:
            os.unlink(tmp_path)

        # ── Build overlay (header + label) and stamp onto content pages ──
        if not header_src and not label_src:
            logger.info("No header or label to overlay, returning base PDF")
            return base_pdf

        # The same page is reused to render the overlay document.
        overlay_pdf = await _build_overlay_pdf(
            page, header_src, label_src, format, wait_timeout
        )
        merged = _stamp_overlay(
            base_pdf, overlay_pdf,
            skip_front=num_cover_pages,
            skip_back=num_bottom_pages,
        )
        logger.info("Final PDF with overlay, size=%d bytes", len(merged))
        return merged
    finally:
        await context.close()
async def _build_overlay_pdf(
    page, header_src: str, label_src: str,
    format: str, timeout: int,
) -> bytes:
    """Render a single-page transparent overlay PDF with header + label.

    Builds a minimal HTML document containing the header image (fixed to the
    top, 2.54cm tall) and/or the right-side label image (fixed to the right
    edge), loads it in ``page`` from a temporary file, and exports it with
    zero margins and no Chromium header/footer.

    Args:
        page: Playwright page used to load and print the overlay document.
            It is navigated away from whatever it was showing.
        header_src: URL of the header image, or an empty string to omit it.
        label_src: URL of the right-side label image, or an empty string to
            omit it.
        format: Page format passed to ``page.pdf``.  The overlay stylesheet
            also declares ``@page{size:A4}`` and the export sets
            ``prefer_css_page_size=True``.
        timeout: Max time (ms) to wait for the overlay document to load.

    Returns:
        Overlay PDF file bytes.
    """
    # Local import: the image URLs come from the rendered DOM and may contain
    # characters (e.g. quotes in inline ``data:`` SVG URIs) that would break
    # out of the ``src="..."`` attribute if inserted verbatim.
    from html import escape

    parts = []
    if header_src:
        safe_header_src = escape(header_src, quote=True)
        parts.append(
            f'<div style="position:fixed;top:0;left:0;width:100%;height:2.54cm;'
            f'margin:0;padding:0;overflow:hidden;z-index:1;">'
            f'<img src="{safe_header_src}" style="display:block;width:100%;'
            f'height:100%;object-fit:fill;margin:0;padding:0;" /></div>'
        )
    if label_src:
        safe_label_src = escape(label_src, quote=True)
        parts.append(
            f'<div style="position:fixed;top:3.14cm;right:0;width:1.65cm;'
            f'height:23.42cm;z-index:2;overflow:hidden;">'
            f'<img src="{safe_label_src}" style="display:block;width:100%;'
            f'height:100%;object-fit:fill;" /></div>'
        )

    overlay_html = (
        '<!doctype html><html><head><meta charset="utf-8">'
        '<style>'
        '@page{size:A4;margin:0}'
        'html,body{margin:0;padding:0;background:transparent}'
        '</style></head><body>'
        + '\n'.join(parts)
        # Full-page spacer so the document has in-flow content to lay out.
        + '<div style="height:297mm;width:210mm;"></div>'
        '</body></html>'
    )

    tmp = tempfile.NamedTemporaryFile(
        mode="w", suffix=".html", delete=False, encoding="utf-8",
    )
    tmp_path = tmp.name
    try:
        # Write inside the cleanup scope so a failed write does not leave the
        # temp file behind.
        with tmp:
            tmp.write(overlay_html)

        await page.goto(
            Path(tmp_path).as_uri(),
            wait_until="networkidle",
            timeout=timeout,
        )
        # Wait for fonts and for every <img> to finish loading (or fail).
        await page.evaluate("() => document.fonts.ready")
        await page.evaluate("""
            () => {
                const images = Array.from(document.querySelectorAll('img'));
                return Promise.all(images.map(img => {
                    if (img.complete) return Promise.resolve();
                    return new Promise(r => {
                        img.addEventListener('load', r);
                        img.addEventListener('error', r);
                    });
                }));
            }
        """)
        await page.wait_for_timeout(300)

        overlay_bytes = await page.pdf(
            format=format,
            print_background=True,
            prefer_css_page_size=True,
            margin={"top": "0", "right": "0", "bottom": "0", "left": "0"},
            display_header_footer=False,
        )
        logger.info("Overlay PDF rendered, size=%d bytes", len(overlay_bytes))
        return overlay_bytes
    finally:
        os.unlink(tmp_path)
def _stamp_overlay(
    base_pdf: bytes,
    overlay_pdf: bytes,
    skip_front: int = 2,
    skip_back: int = 4,
) -> bytes:
    """Stamp the first overlay page onto the content pages of the base PDF.

    The first ``skip_front`` pages and the last ``skip_back`` pages are copied
    through unchanged; every page in between has page 0 of ``overlay_pdf``
    merged on top of it.

    Args:
        base_pdf: Bytes of the PDF to decorate.
        overlay_pdf: Bytes of the overlay PDF; only its first page is used.
        skip_front: Number of leading pages to leave untouched.
        skip_back: Number of trailing pages to leave untouched.

    Returns:
        Bytes of the resulting PDF.
    """
    import io
    from pypdf import PdfReader, PdfWriter

    source = PdfReader(io.BytesIO(base_pdf))
    stamp_reader = PdfReader(io.BytesIO(overlay_pdf))
    stamp = stamp_reader.pages[0]
    output = PdfWriter()

    page_count = len(source.pages)
    # Indices skip_front .. page_count - skip_back - 1 (inclusive) get stamped.
    stamped_indices = range(skip_front, page_count - skip_back)

    for index, sheet in enumerate(source.pages):
        if index in stamped_indices:
            sheet.merge_page(stamp)
        output.add_page(sheet)

    sink = io.BytesIO()
    output.write(sink)
    return sink.getvalue()