"""Playwright-based PDF renderer — Chromium headless PDF export.

Replaces WeasyPrint. Uses Playwright to launch headless Chromium,
load the fully-rendered HTML, wait for fonts/images/layout, and
export a print-quality PDF.
"""
from __future__ import annotations
import asyncio
import logging
import os
import tempfile
from pathlib import Path
from typing import Optional
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Singleton browser instance for reuse across requests.
# Starts as None; _get_browser() assigns it and shutdown_browser() resets it.
_browser = None
# Held by _get_browser() and shutdown_browser() while they inspect or
# replace _browser.
_browser_lock = asyncio.Lock()
# Playwright driver handle that launched ``_browser``. It is kept so the
# driver can be stopped on relaunch/shutdown instead of being orphaned.
_playwright = None


async def _get_browser():
    """Return the shared Chromium browser, launching it if necessary.

    The browser is created on first use and relaunched when the stored
    instance is missing or reports that it is no longer connected. The
    check and the launch both happen while holding ``_browser_lock``.

    Returns:
        The connected Playwright browser stored in ``_browser``.
    """
    global _browser, _playwright
    async with _browser_lock:
        if _browser is not None and _browser.is_connected():
            return _browser

        # Function-scope import: Playwright is only loaded when a browser
        # actually has to be launched.
        from playwright.async_api import async_playwright

        # A driver left over from a disconnected browser would otherwise be
        # leaked on every relaunch; stopping it is best-effort.
        if _playwright is not None:
            stale, _playwright = _playwright, None
            try:
                await stale.stop()
            except Exception:
                logger.warning(
                    "Could not stop stale Playwright driver", exc_info=True
                )

        driver = await async_playwright().start()
        try:
            _browser = await driver.chromium.launch(
                headless=True,
                args=[
                    "--no-sandbox",
                    "--disable-setuid-sandbox",
                    "--disable-dev-shm-usage",
                    "--disable-gpu",
                    "--font-render-hinting=none",
                ],
            )
        except BaseException:
            # Do not leave a running driver behind when the launch fails
            # (or is cancelled); the error is re-raised unchanged.
            await driver.stop()
            raise
        _playwright = driver
        logger.info("Chromium browser launched for PDF rendering")
        return _browser


async def shutdown_browser():
    """Close the shared browser and stop its Playwright driver.

    Intended for application shutdown. Does nothing harmful when no browser
    was ever launched or when the browser is already disconnected.
    """
    global _browser, _playwright
    async with _browser_lock:
        # Detach the globals first so they are reset even if closing fails.
        browser, _browser = _browser, None
        driver, _playwright = _playwright, None
        try:
            if browser is not None and browser.is_connected():
                await browser.close()
                logger.info("Chromium browser closed")
        finally:
            if driver is not None:
                await driver.stop()
async def render_pdf_from_html(
    html_content: str,
    *,
    format: str = "A4",
    print_background: bool = True,
    prefer_css_page_size: bool = True,
    wait_timeout: int = 30000,
) -> bytes:
    """Render HTML string to PDF bytes using Playwright Chromium.

    Generates a base PDF with the ``.page-header``, ``.page-footer`` and
    ``.hb-right-label`` elements hidden, then builds a one-page overlay from
    the header image and right-side label image and stamps it onto the
    content pages with pypdf. The leading ``.cover-page`` pages and the
    trailing ``.fullpage-img-wrap`` pages get no overlay. When the document
    has neither a header image nor a label image, the base PDF is returned
    as-is.

    Args:
        html_content: Complete HTML document string.
        format: Page format (default A4).
        print_background: Include background colors/images.
        prefer_css_page_size: Use @page CSS rules for sizing.
        wait_timeout: Max time (ms) to wait for page load.

    Returns:
        PDF file bytes.

    Raises:
        Nothing is caught here: errors from Playwright (for example a
        navigation timeout) or from pypdf propagate to the caller.
    """
    browser = await _get_browser()
    context = await browser.new_context(
        viewport={"width": 794, "height": 1123},  # A4 at 96dpi
        device_scale_factor=4,  # 4x for maximum text clarity
        java_script_enabled=True,
    )
    try:
        # Created inside the try so the context is closed even if opening
        # the page fails.
        page = await context.new_page()

        tmp_path = None
        try:
            # Write HTML to a temp file so Chromium can load local file:// resources
            with tempfile.NamedTemporaryFile(
                mode="w",
                suffix=".html",
                delete=False,
                encoding="utf-8",
            ) as tmp:
                # Remember the path before writing so a failed write still
                # gets cleaned up below.
                tmp_path = tmp.name
                tmp.write(html_content)

            file_url = Path(tmp_path).as_uri()
            await page.goto(file_url, wait_until="networkidle", timeout=wait_timeout)

            # Wait for fonts and images to be fully loaded
            await page.evaluate("() => document.fonts.ready")
            await page.evaluate("""
                () => {
                    const images = Array.from(document.querySelectorAll('img'));
                    return Promise.all(images.map(img => {
                        if (img.complete) return Promise.resolve();
                        return new Promise(r => {
                            img.addEventListener('load', r);
                            img.addEventListener('error', r);
                        });
                    }));
                }
            """)
            await page.wait_for_timeout(500)

            # -- Collect info from DOM before hiding elements --
            header_src = await page.evaluate("""
                () => {
                    const img = document.querySelector('.page-header img');
                    return img ? img.src : '';
                }
            """)
            label_src = await page.evaluate("""
                () => {
                    const img = document.querySelector('.hb-right-label img');
                    return img ? img.src : '';
                }
            """)
            num_bottom_pages = await page.evaluate("""
                () => document.querySelectorAll('.fullpage-img-wrap').length
            """)
            # Cover page count: cover + TOC image (each is a .cover-page)
            num_cover_pages = await page.evaluate("""
                () => document.querySelectorAll('.cover-page').length
            """)
            logger.info(
                "Overlay info: header=%s, label=%s, covers=%d, bottoms=%d",
                bool(header_src), bool(label_src),
                num_cover_pages, num_bottom_pages,
            )

            # -- Hide header, footer, and label from the base PDF --
            await page.evaluate("""
                () => {
                    document.querySelectorAll('.page-header, .page-footer, .hb-right-label')
                        .forEach(el => el.style.display = 'none');
                }
            """)

            # -- Render BASE PDF (no header, no label) --
            base_pdf = await page.pdf(
                format=format,
                print_background=print_background,
                prefer_css_page_size=prefer_css_page_size,
                margin={
                    "top": "2.54cm",
                    "right": "2.54cm",
                    "bottom": "2.54cm",
                    "left": "2.54cm",
                },
                display_header_footer=True,
                header_template='<span></span>',
                footer_template=(
                    '<div style="width:100%;text-align:center;font-size:9px;'
                    'font-family:Century Gothic,Segoe UI,sans-serif;color:#0263A3;'
                    'padding:0 0 6px 0;letter-spacing:0.5px;">'
                    '<span style="font-weight:700;" class="pageNumber"></span></div>'
                ),
            )
            logger.info("Base PDF rendered, size=%d bytes", len(base_pdf))
        finally:
            if tmp_path is not None:
                os.unlink(tmp_path)

        # -- Build overlay (header + label) and stamp onto content pages --
        if not header_src and not label_src:
            logger.info("No header or label to overlay, returning base PDF")
            return base_pdf

        # The same page is reused; it navigates away from the source document.
        overlay_pdf = await _build_overlay_pdf(
            page, header_src, label_src, format, wait_timeout
        )
        merged = _stamp_overlay(
            base_pdf, overlay_pdf,
            skip_front=num_cover_pages,
            skip_back=num_bottom_pages,
        )
        logger.info("Final PDF with overlay, size=%d bytes", len(merged))
        return merged
    finally:
        await context.close()
async def _build_overlay_pdf(
    page, header_src: str, label_src: str,
    format: str, timeout: int,
) -> bytes:
    """Render a single-page transparent overlay PDF with header + label.

    Builds a minimal HTML document that places the header image across the
    top 2.54cm of the page and the label image in a fixed strip on the
    right, loads it in ``page`` via a temporary file, and prints it with
    zero margins.

    Args:
        page: Playwright page used for rendering; it is navigated to the
            overlay document.
        header_src: URL of the header image, or an empty string for none.
        label_src: URL of the right-side label image, or an empty string
            for none.
        format: Page format passed to ``page.pdf``.
        timeout: Max time (ms) to wait for the overlay document to load.

    Returns:
        The overlay PDF bytes.
    """
    from html import escape

    parts = []
    if header_src:
        # Escape the URL: an unescaped quote or ampersand (possible in
        # data: URLs and query strings) would corrupt the attribute.
        header_attr = escape(header_src, quote=True)
        parts.append(
            f'<div style="position:fixed;top:0;left:0;width:100%;height:2.54cm;'
            f'margin:0;padding:0;overflow:hidden;z-index:1;">'
            f'<img src="{header_attr}" style="display:block;width:100%;'
            f'height:100%;object-fit:fill;margin:0;padding:0;" /></div>'
        )
    if label_src:
        label_attr = escape(label_src, quote=True)
        parts.append(
            f'<div style="position:fixed;top:3.14cm;right:0;width:2.05cm;'
            f'height:21.76cm;z-index:2;overflow:hidden;">'
            f'<img src="{label_attr}" style="display:block;width:100%;'
            f'height:100%;object-fit:fill;" /></div>'
        )

    overlay_html = (
        '<!doctype html><html><head><meta charset="utf-8">'
        '<style>'
        '@page{size:A4;margin:0}'
        'html,body{margin:0;padding:0;background:transparent}'
        '</style></head><body>'
        + '\n'.join(parts)
        # Spacer sized to one A4 page, alongside the fixed-position images.
        + '<div style="height:297mm;width:210mm;"></div>'
        '</body></html>'
    )

    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".html", delete=False, encoding="utf-8",
        ) as tmp:
            # Remember the path before writing so a failed write still gets
            # cleaned up below.
            tmp_path = tmp.name
            tmp.write(overlay_html)

        await page.goto(
            Path(tmp_path).as_uri(),
            wait_until="networkidle",
            timeout=timeout,
        )
        # Wait for fonts and for every <img> to either load or fail.
        await page.evaluate("() => document.fonts.ready")
        await page.evaluate("""
            () => {
                const images = Array.from(document.querySelectorAll('img'));
                return Promise.all(images.map(img => {
                    if (img.complete) return Promise.resolve();
                    return new Promise(r => {
                        img.addEventListener('load', r);
                        img.addEventListener('error', r);
                    });
                }));
            }
        """)
        await page.wait_for_timeout(300)

        overlay_bytes = await page.pdf(
            format=format,
            print_background=True,
            prefer_css_page_size=True,
            margin={"top": "0", "right": "0", "bottom": "0", "left": "0"},
            display_header_footer=False,
        )
        logger.info("Overlay PDF rendered, size=%d bytes", len(overlay_bytes))
        return overlay_bytes
    finally:
        if tmp_path is not None:
            os.unlink(tmp_path)
def _stamp_overlay(
    base_pdf: bytes,
    overlay_pdf: bytes,
    skip_front: int = 2,
    skip_back: int = 4,
) -> bytes:
    """Stamp the overlay's first page onto the content pages of ``base_pdf``.

    The first ``skip_front`` pages and the last ``skip_back`` pages are
    copied through unchanged; every page in between has the overlay merged
    onto it.

    Args:
        base_pdf: Bytes of the PDF to stamp.
        overlay_pdf: Bytes of a PDF whose first page is the overlay.
        skip_front: Number of leading pages to leave untouched.
        skip_back: Number of trailing pages to leave untouched.

    Returns:
        Bytes of the resulting PDF.
    """
    import io
    from pypdf import PdfReader, PdfWriter

    source = PdfReader(io.BytesIO(base_pdf))
    stamp_reader = PdfReader(io.BytesIO(overlay_pdf))
    stamp = stamp_reader.pages[0]

    page_count = len(source.pages)
    # Half-open range of page indexes that receive the overlay; it is empty
    # when the skipped front and back pages cover the whole document.
    stamped = range(skip_front, page_count - skip_back)

    out = PdfWriter()
    for index, current in enumerate(source.pages):
        if index in stamped:
            current.merge_page(stamp)
        out.add_page(current)

    with io.BytesIO() as sink:
        out.write(sink)
        return sink.getvalue()
|