handbook-engine / app /services /html_builder.py
internationalscholarsprogram's picture
Initial deploy: ISP Handbook PDF engine
2deab8c verified
"""HTML builder β€” assembles the full ISP Handbook HTML document.
Uses Jinja2 templates for HTML generation. Data preparation logic is
preserved from the original string-concatenation approach. The output
is a self-contained HTML suitable for Playwright Chromium PDF export.
"""
from __future__ import annotations
import base64
import logging
import mimetypes
import os
import re
from pathlib import Path
from typing import Any
from jinja2 import Environment, FileSystemLoader, select_autoescape
from markupsafe import Markup
from app.core.config import get_settings
from app.core.fonts import font_face_css, select_font_family
from app.services.normalizer import normalize_section, normalize_university
from app.services.renderers import (
fetch_image_data_uri,
render_global_blocks,
sort_toc,
_extract_university_funding,
)
from app.services.utils import (
format_money_figures,
get_any,
h,
handbook_anchor,
hb_slug,
is_truthy,
sort_sections_stable,
)
logger = logging.getLogger(__name__)
# Jinja2 environment — templates live alongside the app package
_APP_DIR = Path(__file__).resolve().parent.parent
_TEMPLATES_DIR = _APP_DIR / "templates"


def _get_jinja_env() -> Environment:
    """Build a fresh Jinja2 environment rooted at the templates directory.

    Autoescaping is selected for ``html`` templates, and block tags do not
    leave stray whitespace behind (``trim_blocks`` / ``lstrip_blocks``).
    """
    template_loader = FileSystemLoader(str(_TEMPLATES_DIR))
    escape_policy = select_autoescape(["html"])
    return Environment(
        loader=template_loader,
        autoescape=escape_policy,
        trim_blocks=True,
        lstrip_blocks=True,
    )
def _static_base_url() -> str:
"""Return absolute file:// URL to the static directory."""
static_dir = Path(__file__).resolve().parent.parent / "static"
return static_dir.as_uri()
def _unused_pdf_override_css(font_stack: str) -> str:
"""Legacy inline PDF override CSS β€” kept for reference only.
All styling now lives in static/css/print.css for Chromium rendering.
"""
return ""
# Section class map: global section key -> CSS class applied to its wrapper.
SECTION_CLASS_MAP = {
    "overview": "sec-overview",
    "how_the_program_works": "sec-how",
    "qualification_requirements": "sec-qualification",
    "enrolment_steps": "sec-steps",
    "withdrawal_refund_policy": "sec-policy",
    "refund_guidelines": "sec-refund",
    "program_contributions": "sec-contributions",
    "program_features_breakdown": "sec-breakdown",
    "funding_options_available": "sec-funding",
    "summary_of_universities": "sec-summary",
    "summary_of_universities_cosigner": "sec-summary-cosigner",
}

# Section keys flagged with ``page_break`` when building the template data.
# Every key that has a dedicated CSS class is also a page-break key, so the
# set is derived from the map; switch back to an explicit set literal if the
# two ever need to diverge.
PAGE_BREAK_KEYS = set(SECTION_CLASS_MAP)
def _collect_program_option_inconsistencies(value: Any, path: str, hits: list[str]) -> None:
"""Collect paths where only REGULAR or PRIME appears."""
if isinstance(value, dict):
for k, v in value.items():
_collect_program_option_inconsistencies(v, f"{path}.{k}" if path else str(k), hits)
return
if isinstance(value, list):
for i, v in enumerate(value):
_collect_program_option_inconsistencies(v, f"{path}[{i}]", hits)
return
if value is None:
return
text = str(value)
has_regular = bool(re.search(r"\bREGULAR\b", text, flags=re.IGNORECASE))
has_prime = bool(re.search(r"\bPRIME\b", text, flags=re.IGNORECASE))
if has_regular ^ has_prime:
hits.append(path)
def _prepare_university_data(
uni_raw: dict[str, Any],
allow_remote: bool,
include_inactive_programs: bool,
debug: bool,
stats: dict[str, Any],
) -> dict[str, Any]:
"""Prepare a single university's template data.
Extracts overview, campus image, benefits, programs, and extra sections
from the raw sections list. This moves the logic that was in
render_university_section into a data-preparation step so that the
Jinja2 template handles the HTML.
"""
uni_name = uni_raw["name"]
sections = uni_raw.get("sections", [])
is_first = uni_raw.get("_is_first", False)
stats["universities"] = stats.get("universities", 0) + 1
# Build section map; merge duplicate "programs"
sec_map: dict[str, dict] = {}
for s in sections:
if not isinstance(s, dict):
continue
k = str(s.get("section_key", ""))
if not k:
continue
if k == "programs" and k in sec_map:
existing = sec_map["programs"].get("section_json", {})
incoming = s.get("section_json", {})
if not isinstance(existing, dict):
existing = {}
if not isinstance(incoming, dict):
incoming = {}
a = existing.get("programs", [])
b = incoming.get("programs", [])
if not isinstance(a, list):
a = []
if not isinstance(b, list):
b = []
existing["programs"] = a + b
sec_map["programs"]["section_json"] = existing
continue
sec_map[k] = s
# Campus image
img_section = sec_map.get("campus_image") or sec_map.get("image")
campus_image = ""
campus_caption = ""
if img_section:
j = img_section.get("section_json", {})
if isinstance(j, dict):
campus_url = str(j.get("image_url", "")).strip()
campus_caption = str(j.get("caption", "")).strip()
if allow_remote and campus_url:
embedded = fetch_image_data_uri(campus_url)
if embedded:
campus_image = embedded
stats["images_embedded"] = stats.get("images_embedded", 0) + 1
else:
stats["images_placeholder"] = stats.get("images_placeholder", 0) + 1
else:
stats["images_placeholder"] = stats.get("images_placeholder", 0) + 1
# Overview and website
resolved_website = (uni_raw.get("website") or "").strip()
overview_data = None
if "overview" in sec_map:
overview_json = sec_map["overview"].get("section_json", {})
if not isinstance(overview_json, dict):
overview_json = {}
site_from_overview = get_any(
overview_json,
["university_website", "university_website_url", "website", "site", "url", "homepage", "web_url"],
)
if not resolved_website and site_from_overview:
resolved_website = site_from_overview
overview_data = {
"founded": get_any(overview_json, ["founded", "Founded"]),
"total_students": get_any(overview_json, ["total_students", "Total Students"]),
"undergraduates": get_any(overview_json, ["undergraduates", "Undergraduate Students", "undergraduate_students"]),
"postgraduates": get_any(overview_json, ["postgraduate_students", "Postgraduate Students"]),
"acceptance_rate": get_any(overview_json, ["acceptance_rate", "Acceptance Rate"]),
"location": get_any(overview_json, ["location", "Location"]),
"tuition": format_money_figures(str(get_any(overview_json, [
"tuition_out_of_state_yearly",
"Yearly Out of State Tuition Fees",
"Yearly Out-of-State Tuition Fees",
"Yearly Tuition Fees",
"Yearly Out-of-State Tuition Fees:",
]) or "")) or None,
}
if resolved_website:
stats["university_links"] = stats.get("university_links", 0) + 1
stats["website_rows"] = stats.get("website_rows", 0) + 1
# Benefits
# Benefits + Funding
benefits = []
funding_heading = "Funding Available"
funding_items: list[str] = []
if "benefits" in sec_map:
j = sec_map["benefits"].get("section_json", {})
if not isinstance(j, dict):
j = {}
raw_benefits = j.get("benefits", [])
if isinstance(raw_benefits, list):
benefits = [str(b).strip() for b in raw_benefits if str(b).strip()]
else:
benefits = []
funding_heading, funding_items = _extract_university_funding(
j,
{
"school_category": uni_raw.get("school_category"),
"status": "in" if is_truthy(uni_raw.get("is_active", True)) else "out",
},
)
# Programs
programs = None
if "programs" in sec_map:
j = sec_map["programs"].get("section_json", {})
if not isinstance(j, dict):
j = {}
programs_raw = j.get("programs", [])
if not isinstance(programs_raw, list):
programs_raw = []
if not include_inactive_programs:
programs_raw = [
p for p in programs_raw
if isinstance(p, dict) and is_truthy(
p.get("program_active", p.get("is_active", p.get("active", 1)))
)
]
programs = []
seen_names = set()
for p in programs_raw:
if not isinstance(p, dict):
continue
program_name = str(p.get("program_name", "")).strip()
# Deduplicate by lowercase program name
key = program_name.lower()
if key in seen_names:
continue
seen_names.add(key)
link = str(p.get("program_link", "")).strip()
if not link and isinstance(p.get("program_links"), dict):
link = str(p["program_links"].get("web_link", "")).strip()
programs.append({
"name": program_name,
"link": link,
"designation": str(p.get("designation", "")),
"entrance": str(p.get("entrance_exam", p.get("entrance_examination", ""))),
})
# Extra sections
skip_keys = {"campus_image", "image", "overview", "benefits", "programs"}
extra_sections = []
for s in sections:
if not isinstance(s, dict):
continue
k = str(s.get("section_key", ""))
if not k or k in skip_keys:
continue
title = str(s.get("section_title", ""))
j = s.get("section_json", {})
if not isinstance(j, dict):
j = {}
rendered = render_global_blocks(k, title, j, debug)
extra_sections.append({"rendered_html": Markup(rendered)})
classes = ["uni"]
if not is_first:
classes.append("page-break")
return {
"name": uni_name,
"anchor": uni_raw.get("anchor"),
"sort_order": uni_raw.get("sort_order"),
"website": resolved_website,
"classes": classes,
"overview": overview_data,
"campus_image": campus_image,
"campus_caption": campus_caption,
"benefits": benefits,
"funding_heading": funding_heading,
"funding_items": funding_items,
"programs": programs,
"extra_sections": extra_sections,
}
def _coerce_sort_order(value: Any) -> int | None:
    """Return *value* as an int when it is a plain base-10 integer, else None."""
    if value is None:
        return None
    text = str(value)
    # fullmatch rejects inputs such as "--5" that would make int() raise.
    if re.fullmatch(r"-?\d+", text) is None:
        return None
    return int(text)


def _existing_file_uri(path: Any) -> str:
    """Return a ``file://`` URI for *path* if it is an existing file, else ""."""
    if path and os.path.isfile(str(path)):
        # resolve() first: Path.as_uri() raises ValueError on relative paths.
        return Path(str(path)).resolve().as_uri()
    return ""


def _inline_image_data_uri(path: Any, default_mime: str) -> str:
    """Return *path* as a base64 ``data:`` URI if it is an existing file, else ""."""
    if not (path and os.path.isfile(path)):
        return ""
    mime = mimetypes.guess_type(path)[0] or default_mime
    with open(path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("ascii")
    return f"data:{mime};base64,{encoded}"


def build_handbook_html(
    globals_data: list[dict[str, Any]],
    by_uni: dict[int, dict[str, Any]],
    images: dict[str, Any],
    allow_remote: bool,
    include_inactive_programs: bool = False,
    debug: bool = False,
) -> str:
    """Build the full handbook HTML document using Jinja2 templates.

    Prepares the data (images, table of contents, global sections and
    per-university data) and renders ``handbook.html`` with it.

    Args:
        globals_data: Global (non-university) section records; must include
            ``table_of_contents``, ``overview`` and ``how_the_program_works``.
        by_uni: University records keyed by id. Inactive and non-dict entries
            are skipped.
        images: Local image paths under ``coverImage``, ``tocImage``,
            ``headerImage``, ``labelImage`` and ``bottomPages`` (a list).
            Paths that are not existing files are ignored.
        allow_remote: Forwarded to the per-university preparation helpers.
        include_inactive_programs: Forwarded to the per-university helpers.
        debug: Forwarded to the renderers and exposed to the template.

    Returns:
        The rendered HTML document.

    Raises:
        RuntimeError: If any required global section is missing.
    """
    env = _get_jinja_env()
    template = env.get_template("handbook.html")
    font_meta = select_font_family()
    font_css = font_face_css(font_meta)
    # Base URL for static assets (CSS, images, etc.)
    base_url = _static_base_url()
    stats: dict[str, Any] = {
        "universities": 0,
        "images_embedded": 0,
        "images_placeholder": 0,
        "program_links_total": 0,
        "program_missing_links_total": 0,
        "missing_program_links": {},
        "university_links": 0,
        "website_rows": 0,
        "program_option_warnings": [],
    }

    # -- Cover and TOC images are referenced by file:// URI --
    cover_image = _existing_file_uri(images.get("coverImage", ""))
    toc_image = _existing_file_uri(images.get("tocImage", ""))

    # -- Header image (repeating page header), embedded as a data URI --
    header_image = _inline_image_data_uri(images.get("headerImage", ""), "image/jpeg")

    # -- Label image (repeating right-side label), embedded as a data URI --
    label_path = images.get("labelImage", "")
    label_image = _inline_image_data_uri(label_path, "image/png")
    if not label_image:
        logger.warning("Label image not found locally: %s", label_path)

    # -- Prepare active universities --
    active_universities: list[dict[str, Any]] = []
    for uid, uni in by_uni.items():
        if not isinstance(uni, dict):
            continue
        if not is_truthy(uni.get("is_active", True)):
            continue
        # "x or default" instead of str(x): a NULL column must not turn into
        # the literal text "None" in the rendered handbook.
        name = str(uni.get("university_name") or f"University #{uid}")
        anchor = handbook_anchor("uni", name, int(uid))
        raw_sections = uni.get("sections")
        active_universities.append({
            "id": int(uid),
            "anchor": anchor,
            "name": name,
            "sections": raw_sections if isinstance(raw_sections, list) else [],
            "website": str(uni.get("website") or ""),
            "sort_order": _coerce_sort_order(uni.get("sort_order")),
            "school_category": str(uni.get("school_category") or "").strip(),
            "tier": uni.get("tier"),
            "tier_label": str(uni.get("tier_label") or "").strip(),
        })

    # Stable tier ordering: integer tiers ascending, everything else last,
    # then alphabetical by name, then by id.
    def _tier_sort(u: dict) -> tuple:
        t = u.get("tier")
        rank = t if isinstance(t, int) else 99
        return (rank, (u.get("name") or "").lower(), u.get("id", 0))

    active_universities.sort(key=_tier_sort)

    # -- Normalise globals --
    globals_data = sort_sections_stable(globals_data)
    required_keys = [
        "table_of_contents",
        "overview",
        "how_the_program_works",
    ]
    existing_keys = {str(g.get("section_key", "")).lower() for g in globals_data if isinstance(g, dict)}
    missing = [k for k in required_keys if k not in existing_keys]
    if missing:
        msg = f"Handbook required sections missing: {','.join(missing)}"
        logger.error(msg)
        raise RuntimeError(msg)

    general_sections: list[dict[str, Any]] = []
    toc_sort_order = None
    toc_title = "Table of Contents"
    for idx, g in enumerate(globals_data):
        if not isinstance(g, dict):
            continue
        key_raw = str(g.get("section_key", ""))
        key = key_raw.lower()
        sort_order = _coerce_sort_order(g.get("sort_order"))
        # Only the first TOC record configures the TOC; it is not itself
        # listed as a general section.
        if key == "table_of_contents" and toc_sort_order is None:
            toc_sort_order = sort_order if sort_order is not None else (idx + 1)
            toc_title = str(g.get("section_title", "Table of Contents"))
            continue
        section_hits: list[str] = []
        _collect_program_option_inconsistencies(
            g.get("section_json", {}),
            f"global.{key_raw}",
            section_hits,
        )
        for hit in section_hits:
            if hit not in stats["program_option_warnings"]:
                stats["program_option_warnings"].append(hit)
        anchor = handbook_anchor("g", str(g.get("section_title", g.get("section_key", "section"))), idx)
        general_sections.append({
            "anchor": anchor,
            "data": g,
            "sort_order": sort_order,
        })

    # -- Build TOC items --
    toc_items: list[dict[str, Any]] = []
    for gs in general_sections:
        # Prefer the JSON-level title (display-ready) over the DB section_title.
        gs_json = gs["data"].get("section_json", {})
        json_title = ""
        if isinstance(gs_json, dict):
            # str(... or ""): a None or non-string title must not crash .strip().
            json_title = str(gs_json.get("title") or "").strip()
        if json_title:
            title = json_title
        else:
            title = str(gs["data"].get("section_title", gs["data"].get("section_key", "Section")))
        toc_items.append({
            "title": title,
            "target": "#" + gs["anchor"],
            "level": 0,
            "bold": True,
            "sort": gs["sort_order"],
        })
    for u in active_universities:
        toc_items.append({
            "title": u["name"],
            "target": "#" + u["anchor"],
            "level": 1,
            "bold": False,
            "sort": u.get("sort_order"),
        })

    # -- Prepare sorted TOC items for template --
    sorted_toc = sort_toc(list(toc_items))
    toc_items_sorted = []
    for e in sorted_toc:
        if not isinstance(e, dict):
            continue
        title = str(e.get("title", "")).strip()
        if not title:
            continue
        level = max(0, min(3, int(e.get("level", 0))))
        bold = bool(e.get("bold", False))
        upper = bool(e.get("upper", False))
        # Top-level entries are always bold and upper-cased.
        if level == 0:
            bold = True
            upper = True
        display_title = title.upper() if upper else title
        page = str(e.get("page", "")).strip()
        toc_items_sorted.append({
            "title": title,
            "display_title": display_title,
            "target": str(e.get("target", e.get("anchor", ""))).strip(),
            "level": level,
            "bold": bold,
            "upper": upper,
            "page": page,
        })

    # -- Prepare general sections with rendered HTML and typed blocks --
    template_sections = []
    for gs in general_sections:
        data = gs["data"]
        section_key = str(data.get("section_key", ""))
        section_title = str(data.get("section_title", ""))
        key_lower = section_key.lower()
        sec_class = SECTION_CLASS_MAP.get(key_lower)
        if sec_class is None:
            sec_class = "sec-" + re.sub(r"[^a-z0-9]+", "-", key_lower)
        section_json = data.get("section_json", {})
        if not isinstance(section_json, dict):
            section_json = {}
        # Typed blocks for the new rendering path
        blocks = normalize_section(
            section_key,
            section_title,
            section_json,
            debug=debug,
        )
        # Legacy HTML fallback
        section_html = render_global_blocks(
            section_key,
            section_title,
            section_json,
            debug,
        )
        if not section_html.strip() and not blocks:
            logger.warning(
                "Empty section render key=%s sort_order=%s",
                data.get("section_key"),
                data.get("sort_order"),
            )
        template_sections.append({
            "anchor": gs["anchor"],
            "data": data,
            "page_break": key_lower in PAGE_BREAK_KEYS,
            "sec_class": sec_class,
            "blocks": blocks,
            "rendered_html": Markup(section_html),
        })

    # -- Prepare university data for templates (both old + new paths) --
    university_template_data = []
    university_block_data = []
    # Tier labels already emitted, so each tier heading is inserted once.
    _seen_tier_labels: set[str] = set()
    for idx, uni_raw in enumerate(active_universities):
        uni_raw["_is_first"] = (idx == 0)
        # Mark the first university of each tier label as a group start.
        current_tier_label = str(uni_raw.get("tier_label", "")).strip()
        if current_tier_label and current_tier_label not in _seen_tier_labels:
            _seen_tier_labels.add(current_tier_label)
            uni_raw["_tier_group_start"] = True
            uni_raw["_tier_group_label"] = f"{current_tier_label} Schools"
        uni_hits: list[str] = []
        _collect_program_option_inconsistencies(
            uni_raw.get("sections", []),
            f"university.{uni_raw.get('name', idx)}",
            uni_hits,
        )
        for hit in uni_hits:
            if hit not in stats["program_option_warnings"]:
                stats["program_option_warnings"].append(hit)
        # Legacy path
        uni_data = _prepare_university_data(
            uni_raw, allow_remote, include_inactive_programs, debug, stats,
        )
        # Carry tier metadata to template data
        uni_data["tier"] = uni_raw.get("tier")
        uni_data["tier_label"] = uni_raw.get("tier_label", "")
        uni_data["tier_group_start"] = uni_raw.get("_tier_group_start", False)
        uni_data["tier_group_label"] = uni_raw.get("_tier_group_label", "")
        university_template_data.append(uni_data)
        # New block path
        uni_block = normalize_university(
            uni_raw, allow_remote, include_inactive_programs, debug, stats,
        )
        university_block_data.append(uni_block)

    # -- Bottom pages --
    bottom_pages_urls = []
    raw_bottom = images.get("bottomPages", [])
    if isinstance(raw_bottom, list):
        for img_path in raw_bottom:
            page_uri = _existing_file_uri(img_path)
            if page_uri:
                bottom_pages_urls.append(page_uri)

    # -- Render template --
    if stats["program_option_warnings"]:
        logger.warning(
            "Program option consistency warnings (missing REGULAR or PRIME pair): %s",
            stats["program_option_warnings"],
        )
    html = template.render(
        font_css=Markup(font_css),
        base_url=base_url,
        extra_css="",
        header_image=header_image,
        label_image=label_image,
        cover_image=cover_image,
        toc_image=toc_image,
        toc_items=toc_items,
        toc_items_sorted=toc_items_sorted,
        toc_title=toc_title,
        toc_sort_order=toc_sort_order,
        general_sections=template_sections,
        summary_block=None,
        universities=university_template_data,
        university_blocks=university_block_data,
        bottom_pages=bottom_pages_urls,
        debug=debug,
        stats=stats,
    )
    return html