internationalscholarsprogram committed on
Commit
dbfc012
·
1 Parent(s): 68be85f

feat: overlay-based header+label on content pages only (pypdf merge)

Browse files

- Render base PDF with no header/label via Playwright
- Build single-page overlay PDF with header image + label
- Stamp overlay onto content pages (skip cover/toc + final image pages)
- Header flush at top edge, label at right edge (1.65cm x 23.42cm)
- Pages 1-2: clean (no header, no label)
- Final image pages: clean (no header, no label)
- Added pypdf>=4.0.0 dependency

app/services/pdf_renderer.py CHANGED
@@ -63,6 +63,12 @@ async def render_pdf_from_html(
63
  ) -> bytes:
64
  """Render HTML string to PDF bytes using Playwright Chromium.
65
 
 
 
 
 
 
 
66
  Args:
67
  html_content: Complete HTML document string.
68
  format: Page format (default A4).
@@ -96,91 +102,59 @@ async def render_pdf_from_html(
96
  file_url = Path(tmp_path).as_uri()
97
  await page.goto(file_url, wait_until="networkidle", timeout=wait_timeout)
98
 
99
- # Wait for fonts to be fully loaded
100
- await page.evaluate("""
101
- () => document.fonts.ready
102
- """)
103
-
104
- # Wait for all images to complete loading
105
  await page.evaluate("""
106
  () => {
107
  const images = Array.from(document.querySelectorAll('img'));
108
  return Promise.all(images.map(img => {
109
  if (img.complete) return Promise.resolve();
110
- return new Promise((resolve) => {
111
- img.addEventListener('load', resolve);
112
- img.addEventListener('error', resolve);
113
  });
114
  }));
115
  }
116
  """)
117
-
118
- # Small delay for final layout settle
119
  await page.wait_for_timeout(500)
120
 
121
- # Debug: verify label element presence
122
- label_info = await page.evaluate("""
123
- () => {
124
- const el = document.querySelector('.hb-right-label');
125
- if (!el) return 'NO .hb-right-label element found';
126
- const img = el.querySelector('img');
127
- const src = img ? img.src.substring(0, 80) : 'NO img';
128
- const loaded = img ? img.complete : false;
129
- const natW = img ? img.naturalWidth : 0;
130
- const natH = img ? img.naturalHeight : 0;
131
- return `label OK: loaded=${loaded}, natural=${natW}x${natH}, src=${src}...`;
132
- }
133
- """)
134
- logger.info("Label check: %s", label_info)
135
-
136
- # Extract header image src from DOM for Playwright header_template
137
  header_src = await page.evaluate("""
138
  () => {
139
  const img = document.querySelector('.page-header img');
140
  return img ? img.src : '';
141
  }
142
  """)
143
-
144
- # Build header template — image scales to fit the top margin zone.
145
- # Playwright margin.top = 2.54cm; header image fills that entire zone.
146
- if header_src:
147
- header_tpl = (
148
- '<div style="width:100%;height:100%;margin:0;padding:0;overflow:hidden;">'
149
- f'<img src="{header_src}" '
150
- 'style="display:block;width:100%;height:100%;margin:0;padding:0;object-fit:fill;" />'
151
- '</div>'
152
- )
153
- else:
154
- header_tpl = '<span></span>'
155
-
156
- # Hide in-page header and footer divs before PDF generation.
157
- # .hb-right-label stays visible (position:fixed, repeats every page).
158
- await page.evaluate("""
159
  () => {
160
- document.querySelectorAll('.page-header, .page-footer')
161
- .forEach(el => el.style.display = 'none');
162
  }
163
  """)
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
- # Hide the label on cover and fullpage image pages.
166
- # These pages expand to fill the full page so the label
167
- # should not be visible on them.
168
  await page.evaluate("""
169
  () => {
170
- document.querySelectorAll('.cover-page, .fullpage-img-wrap').forEach(el => {
171
- const overlay = document.createElement('div');
172
- overlay.style.cssText = 'position:absolute;top:0;right:0;width:2cm;height:100%;background:#fff;z-index:20;';
173
- el.style.position = 'relative';
174
- el.appendChild(overlay);
175
- });
176
  }
177
  """)
178
 
179
- # NON-NEGOTIABLE: 2.54 cm margins on all four sides.
180
- # Header: rendered by header_template inside the top margin zone.
181
- # Footer: page number rendered by footer_template in the bottom margin.
182
- # Content: sits inside the margin box — no CSS padding needed.
183
- pdf_bytes = await page.pdf(
184
  format=format,
185
  print_background=print_background,
186
  prefer_css_page_size=prefer_css_page_size,
@@ -191,7 +165,7 @@ async def render_pdf_from_html(
191
  "left": "2.54cm",
192
  },
193
  display_header_footer=True,
194
- header_template=header_tpl,
195
  footer_template=(
196
  '<div style="width:100%;text-align:center;font-size:10px;'
197
  'font-family:Century Gothic,Segoe UI,sans-serif;color:#333;'
@@ -199,12 +173,132 @@ async def render_pdf_from_html(
199
  '<span class="pageNumber"></span></div>'
200
  ),
201
  )
202
-
203
- logger.info("PDF rendered via Playwright, size=%d bytes", len(pdf_bytes))
204
- return pdf_bytes
205
 
206
  finally:
207
  os.unlink(tmp_path)
208
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  finally:
210
  await context.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  ) -> bytes:
64
  """Render HTML string to PDF bytes using Playwright Chromium.
65
 
66
+ Generates a base PDF (content only, no decorative header/label),
67
+ then creates a one-page overlay with the header image and right-side
68
+ label, and stamps the overlay onto content pages (page 3 → last
69
+ content page) using pypdf. Pages 1-2 (cover/TOC) and trailing
70
+ full-page image pages get no overlay.
71
+
72
  Args:
73
  html_content: Complete HTML document string.
74
  format: Page format (default A4).
 
102
  file_url = Path(tmp_path).as_uri()
103
  await page.goto(file_url, wait_until="networkidle", timeout=wait_timeout)
104
 
105
+ # Wait for fonts and images to be fully loaded
106
+ await page.evaluate("() => document.fonts.ready")
 
 
 
 
107
  await page.evaluate("""
108
  () => {
109
  const images = Array.from(document.querySelectorAll('img'));
110
  return Promise.all(images.map(img => {
111
  if (img.complete) return Promise.resolve();
112
+ return new Promise(r => {
113
+ img.addEventListener('load', r);
114
+ img.addEventListener('error', r);
115
  });
116
  }));
117
  }
118
  """)
 
 
119
  await page.wait_for_timeout(500)
120
 
121
+ # ── Collect info from DOM before hiding elements ──
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  header_src = await page.evaluate("""
123
  () => {
124
  const img = document.querySelector('.page-header img');
125
  return img ? img.src : '';
126
  }
127
  """)
128
+ label_src = await page.evaluate("""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  () => {
130
+ const img = document.querySelector('.hb-right-label img');
131
+ return img ? img.src : '';
132
  }
133
  """)
134
+ num_bottom_pages = await page.evaluate("""
135
+ () => document.querySelectorAll('.fullpage-img-wrap').length
136
+ """)
137
+ # Cover page count: cover + TOC image (each is a .cover-page)
138
+ num_cover_pages = await page.evaluate("""
139
+ () => document.querySelectorAll('.cover-page').length
140
+ """)
141
+
142
+ logger.info(
143
+ "Overlay info: header=%s, label=%s, covers=%d, bottoms=%d",
144
+ bool(header_src), bool(label_src),
145
+ num_cover_pages, num_bottom_pages,
146
+ )
147
 
148
+ # ── Hide header, footer, and label from the base PDF ──
 
 
149
  await page.evaluate("""
150
  () => {
151
+ document.querySelectorAll('.page-header, .page-footer, .hb-right-label')
152
+ .forEach(el => el.style.display = 'none');
 
 
 
 
153
  }
154
  """)
155
 
156
+ # ── Render BASE PDF (no header, no label) ──
157
+ base_pdf = await page.pdf(
 
 
 
158
  format=format,
159
  print_background=print_background,
160
  prefer_css_page_size=prefer_css_page_size,
 
165
  "left": "2.54cm",
166
  },
167
  display_header_footer=True,
168
+ header_template='<span></span>',
169
  footer_template=(
170
  '<div style="width:100%;text-align:center;font-size:10px;'
171
  'font-family:Century Gothic,Segoe UI,sans-serif;color:#333;'
 
173
  '<span class="pageNumber"></span></div>'
174
  ),
175
  )
176
+ logger.info("Base PDF rendered, size=%d bytes", len(base_pdf))
 
 
177
 
178
  finally:
179
  os.unlink(tmp_path)
180
 
181
+ # ── Build overlay (header + label) and stamp onto content pages ──
182
+ if not header_src and not label_src:
183
+ logger.info("No header or label to overlay, returning base PDF")
184
+ return base_pdf
185
+
186
+ overlay_pdf = await _build_overlay_pdf(
187
+ page, header_src, label_src, format, wait_timeout
188
+ )
189
+
190
+ merged = _stamp_overlay(
191
+ base_pdf, overlay_pdf,
192
+ skip_front=num_cover_pages,
193
+ skip_back=num_bottom_pages,
194
+ )
195
+ logger.info("Final PDF with overlay, size=%d bytes", len(merged))
196
+ return merged
197
+
198
  finally:
199
  await context.close()
200
+
201
+
202
def _overlay_markup(header_src: str, label_src: str) -> str:
    """Return the HTML document used to render the one-page overlay.

    Args:
        header_src: URL of the header image; an empty string omits the header.
        label_src: URL of the right-side label image; an empty string omits
            the label.

    Returns:
        A complete HTML document string containing one fixed-position
        ``<div><img></div>`` per non-empty source, followed by a spacer div.
    """
    from html import escape

    layers = []
    if header_src:
        # The URL is escaped because it is read back from the DOM and may
        # contain quotes, angle brackets or ampersands (e.g. an inline SVG
        # data: URI); unescaped, those would terminate the src attribute.
        layers.append(
            '<div style="position:fixed;top:0;left:0;width:100%;height:2.54cm;'
            'margin:0;padding:0;overflow:hidden;z-index:1;">'
            f'<img src="{escape(header_src, quote=True)}" '
            'style="display:block;width:100%;'
            'height:100%;object-fit:fill;margin:0;padding:0;" /></div>'
        )
    if label_src:
        layers.append(
            '<div style="position:fixed;top:0;right:0;width:1.65cm;'
            'height:23.42cm;z-index:2;overflow:hidden;">'
            f'<img src="{escape(label_src, quote=True)}" '
            'style="display:block;width:100%;'
            'height:100%;object-fit:fill;" /></div>'
        )

    return (
        '<!doctype html><html><head><meta charset="utf-8">'
        '<style>'
        '@page{size:A4;margin:0}'
        'html,body{margin:0;padding:0;background:transparent}'
        '</style></head><body>'
        + '\n'.join(layers)
        # Spacer div sized 210mm x 297mm so the body has explicit content.
        + '<div style="height:297mm;width:210mm;"></div>'
        '</body></html>'
    )


async def _build_overlay_pdf(
    page, header_src: str, label_src: str,
    format: str, timeout: int,
) -> bytes:
    """Render a single-page overlay PDF with the header and/or label image.

    The overlay HTML is written to a temporary file, loaded into ``page``
    (which therefore navigates away from whatever it was showing), printed
    to PDF with zero margins and no header/footer, and the temporary file is
    removed afterwards.

    Args:
        page: Browser page object providing ``goto``, ``evaluate``,
            ``wait_for_timeout`` and ``pdf`` coroutines.
        header_src: URL of the header image; empty string to omit it.
        label_src: URL of the right-side label image; empty string to omit it.
        format: Page format passed through to ``page.pdf``.
        timeout: Navigation timeout passed to ``page.goto``.

    Returns:
        The rendered overlay PDF as bytes.

    NOTE(review): the overlay HTML declares ``@page{size:A4}`` and the PDF is
    printed with ``prefer_css_page_size=True``, so ``format`` appears to have
    no effect for non-A4 values -- confirm against the Playwright docs before
    relying on other formats.
    """
    overlay_html = _overlay_markup(header_src, label_src)

    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".html", delete=False, encoding="utf-8",
    ) as tmp:
        tmp.write(overlay_html)
        tmp_path = tmp.name

    try:
        await page.goto(
            Path(tmp_path).as_uri(),
            wait_until="networkidle",
            timeout=timeout,
        )
        # Wait for fonts, then for every <img> to either load or fail, so the
        # PDF is not printed while images are still pending.
        await page.evaluate("() => document.fonts.ready")
        await page.evaluate("""
            () => {
                const images = Array.from(document.querySelectorAll('img'));
                return Promise.all(images.map(img => {
                    if (img.complete) return Promise.resolve();
                    return new Promise(r => {
                        img.addEventListener('load', r);
                        img.addEventListener('error', r);
                    });
                }));
            }
        """)
        await page.wait_for_timeout(300)

        overlay_bytes = await page.pdf(
            format=format,
            print_background=True,
            prefer_css_page_size=True,
            margin={"top": "0", "right": "0", "bottom": "0", "left": "0"},
            display_header_footer=False,
        )
        logger.info("Overlay PDF rendered, size=%d bytes", len(overlay_bytes))
        return overlay_bytes
    finally:
        # The temp file was created with delete=False, so remove it here.
        os.unlink(tmp_path)
272
+
273
+
274
def _stamp_overlay(
    base_pdf: bytes,
    overlay_pdf: bytes,
    skip_front: int = 2,
    skip_back: int = 4,
) -> bytes:
    """Merge the first overlay page onto the content pages of the base PDF.

    Pages ``0..skip_front-1`` and ``total-skip_back..total-1`` are left
    untouched; every page in between gets the overlay stamped on top.

    Args:
        base_pdf: Bytes of the PDF to stamp.
        overlay_pdf: Bytes of the overlay PDF; only its first page is used.
        skip_front: Number of leading pages to leave clean. Negative values
            are treated as 0.
        skip_back: Number of trailing pages to leave clean. Negative values
            are treated as 0.

    Returns:
        The stamped PDF as bytes. ``base_pdf`` is returned unchanged when
        there is nothing to stamp (empty or page-less overlay, or no page
        falls between the skipped front and back ranges).
    """
    if not overlay_pdf:
        # No overlay bytes at all: parsing them would only raise.
        return base_pdf

    import io
    from pypdf import PdfReader, PdfWriter

    source = PdfReader(io.BytesIO(base_pdf))
    stamp_pages = PdfReader(io.BytesIO(overlay_pdf)).pages
    if len(stamp_pages) == 0:
        return base_pdf
    stamp = stamp_pages[0]

    total = len(source.pages)
    first_content = max(skip_front, 0)        # e.g. page index 2
    end_content = total - max(skip_back, 0)   # exclusive upper bound
    if first_content >= end_content:
        # Every page is a skipped one; avoid re-serialising the document.
        return base_pdf

    writer = PdfWriter()
    for index, source_page in enumerate(source.pages):
        # Merge on the writer's copy of the page rather than mutating the
        # reader's page, following the pattern in pypdf's stamping example.
        added = writer.add_page(source_page)
        if first_content <= index < end_content:
            added.merge_page(stamp)

    buf = io.BytesIO()
    writer.write(buf)
    return buf.getvalue()
app/static/css/print.css CHANGED
@@ -65,7 +65,7 @@ body {
65
  width: 100%;
66
  max-width: 100%;
67
  margin: 0;
68
- padding: 0 1.85cm 0 0;
69
  position: relative;
70
  z-index: 1;
71
  overflow: visible;
@@ -90,28 +90,23 @@ body {
90
 
91
  /* ------------------------------
92
  DECORATIVE RIGHT-SIDE LABEL
93
- position:fixed repeats on every printed page.
94
- Placed at right:0 inside the content area.
95
- page-content gets padding-right to prevent overlap.
96
- Original: 43.28cm × 3.31cm → scaled 54% × 50%
97
- = 23.42cm × 1.65cm
98
  ------------------------------ */
99
  .hb-right-label {
100
- position: fixed;
101
- top: 0;
102
- right: 0;
103
- width: 1.65cm;
104
- height: 23.42cm;
105
- z-index: 10;
106
- pointer-events: none;
107
  overflow: hidden;
 
108
  }
109
 
110
  .hb-right-label img {
111
  display: block;
112
- width: 100%;
113
- height: 100%;
114
- object-fit: fill;
115
  }
116
 
117
  /* ------------------------------
 
65
  width: 100%;
66
  max-width: 100%;
67
  margin: 0;
68
+ padding: 0;
69
  position: relative;
70
  z-index: 1;
71
  overflow: visible;
 
90
 
91
  /* ------------------------------
92
  DECORATIVE RIGHT-SIDE LABEL
93
+ Hidden in the base HTML. Rendered as a PDF overlay
94
+ by pdf_renderer.py onto content pages only.
95
+ Kept here so the <img> loads for src extraction.
96
+ Original: 43.28cm × 3.31cm → 54% × 50% = 23.42cm × 1.65cm
 
97
  ------------------------------ */
98
  .hb-right-label {
99
+ position: absolute;
100
+ top: -9999px;
101
+ left: -9999px;
102
+ width: 1px;
103
+ height: 1px;
 
 
104
  overflow: hidden;
105
+ pointer-events: none;
106
  }
107
 
108
  .hb-right-label img {
109
  display: block;
 
 
 
110
  }
111
 
112
  /* ------------------------------
requirements.txt CHANGED
@@ -6,4 +6,5 @@ httpx>=0.27.0
6
  jinja2>=3.1.0
7
  markupsafe>=2.1.0
8
  playwright>=1.40.0
 
9
  python-dotenv>=1.0.0
 
6
  jinja2>=3.1.0
7
  markupsafe>=2.1.0
8
  playwright>=1.40.0
9
+ pypdf>=4.0.0
10
  python-dotenv>=1.0.0