"""HTML builder — assembles the full ISP Handbook HTML document.
Uses Jinja2 templates for HTML generation. Data preparation logic is
preserved from the original string-concatenation approach. The output
is a self-contained HTML suitable for Playwright Chromium PDF export.
"""
from __future__ import annotations
import base64
import logging
import mimetypes
import os
import re
from pathlib import Path
from typing import Any
from jinja2 import Environment, FileSystemLoader, select_autoescape
from markupsafe import Markup
from app.core.config import get_settings
from app.core.fonts import font_face_css, select_font_family
from app.services.normalizer import normalize_section, normalize_university
from app.services.renderers import (
fetch_image_data_uri,
render_global_blocks,
sort_toc,
)
from app.services.utils import (
format_money_figures,
get_any,
h,
handbook_anchor,
hb_slug,
is_truthy,
sort_sections_stable,
)
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Jinja2 environment — templates live alongside the app package:
# the "templates" directory sits in the parent of this file's own directory.
_TEMPLATES_DIR = Path(__file__).resolve().parent.parent / "templates"
def _get_jinja_env() -> Environment:
    """Build a fresh Jinja2 environment rooted at ``_TEMPLATES_DIR``.

    Autoescaping is selected for ``html`` templates, and block tags are
    trimmed / left-stripped so template control lines do not leave stray
    whitespace in the output. A new environment is created on every call.
    """
    template_loader = FileSystemLoader(str(_TEMPLATES_DIR))
    escape_policy = select_autoescape(["html"])
    return Environment(
        loader=template_loader,
        autoescape=escape_policy,
        trim_blocks=True,
        lstrip_blocks=True,
    )
def _static_base_url() -> str:
    """Return the absolute ``file://`` URL of the ``static`` directory.

    The directory is located relative to this module: it is the ``static``
    folder inside the parent of this file's own directory.
    """
    module_path = Path(__file__).resolve()
    package_root = module_path.parent.parent
    return package_root.joinpath("static").as_uri()
def _unused_pdf_override_css(font_stack: str) -> str:
    """Return the legacy inline PDF override CSS, which is now always empty.

    Kept for reference only: styling lives in static/css/print.css for
    Chromium rendering. ``font_stack`` is accepted but not used.
    """
    legacy_css = ""
    return legacy_css
# Maps a lower-cased global section key to the CSS class used for that section.
# Keys missing from this map get a class derived from the key at render time.
SECTION_CLASS_MAP: dict[str, str] = dict(
    overview="sec-overview",
    how_program_works_and_qualification_requirements="sec-qualification",
    enrolment_steps="sec-steps",
    withdrawal_late_payment_refund_policy="sec-policy",
    refund_guidelines="sec-refund",
    program_contributions="sec-contributions",
    funding_options_available="sec-funding",
    summary_of_universities="sec-summary",
)
# Lower-cased global section keys whose rendered section is flagged with
# page_break=True when passed to the template.
PAGE_BREAK_KEYS: set[str] = {
    "enrolment_steps",
    "funding_options_available",
    "how_program_works_and_qualification_requirements",
    "overview",
    "program_contributions",
    "refund_guidelines",
    "summary_of_universities",
    "withdrawal_late_payment_refund_policy",
}
def _prepare_university_data(
uni_raw: dict[str, Any],
allow_remote: bool,
include_inactive_programs: bool,
debug: bool,
stats: dict[str, Any],
) -> dict[str, Any]:
"""Prepare a single university's template data.

Extracts overview, campus image, benefits, programs, and extra sections
from the raw sections list. This moves the logic that was in
render_university_section into a data-preparation step so that the
Jinja2 template handles the HTML.

Args:
    uni_raw: University record. Reads "name" (required), "sections",
        "_is_first", "website", "anchor" and "sort_order".
    allow_remote: When true and a campus image URL is present, the image
        is fetched through fetch_image_data_uri and embedded.
    include_inactive_programs: When false, programs whose
        "program_active" / "is_active" / "active" flag is not truthy are
        dropped (a missing flag defaults to 1).
    debug: Forwarded to render_global_blocks for the extra sections.
    stats: Counter dict updated in place ("universities",
        "images_embedded", "images_placeholder", "university_links",
        "website_rows").

Returns:
    A dict with the keys "name", "anchor", "sort_order", "website",
    "classes", "overview", "campus_image", "campus_caption", "benefits",
    "programs" and "extra_sections". "overview", "benefits" and
    "programs" stay None when the matching section is absent.
"""
uni_name = uni_raw["name"]
sections = uni_raw.get("sections", [])
is_first = uni_raw.get("_is_first", False)
# One more university processed.
stats["universities"] = stats.get("universities", 0) + 1
# Build section map; merge duplicate "programs"
sec_map: dict[str, dict] = {}
for s in sections:
if not isinstance(s, dict):
continue
k = str(s.get("section_key", ""))
if not k:
continue
# A second (or later) "programs" section is folded into the first one by
# concatenating the two program lists.
# NOTE(review): when the first section's section_json is a dict, `existing`
# is that very dict, so this merge mutates the caller's data in place.
if k == "programs" and k in sec_map:
existing = sec_map["programs"].get("section_json", {})
incoming = s.get("section_json", {})
if not isinstance(existing, dict):
existing = {}
if not isinstance(incoming, dict):
incoming = {}
a = existing.get("programs", [])
b = incoming.get("programs", [])
if not isinstance(a, list):
a = []
if not isinstance(b, list):
b = []
existing["programs"] = a + b
sec_map["programs"]["section_json"] = existing
continue
# For every other key the most recent section replaces any earlier one.
sec_map[k] = s
# Campus image
# "campus_image" takes precedence over the generic "image" section.
img_section = sec_map.get("campus_image") or sec_map.get("image")
campus_image = ""
campus_caption = ""
if img_section:
j = img_section.get("section_json", {})
if isinstance(j, dict):
campus_url = str(j.get("image_url", "")).strip()
campus_caption = str(j.get("caption", "")).strip()
# The image is only fetched when remote access is allowed and a URL exists;
# a successful fetch bumps images_embedded, the else branches below bump
# images_placeholder.
# NOTE(review): indentation is missing in this copy, so which `if` the
# second `else` pairs with cannot be read off the text — TODO confirm.
if allow_remote and campus_url:
embedded = fetch_image_data_uri(campus_url)
if embedded:
campus_image = embedded
stats["images_embedded"] = stats.get("images_embedded", 0) + 1
else:
stats["images_placeholder"] = stats.get("images_placeholder", 0) + 1
else:
stats["images_placeholder"] = stats.get("images_placeholder", 0) + 1
# Overview and website
# The university-level website wins; the overview section is only a fallback.
resolved_website = (uni_raw.get("website") or "").strip()
overview_data = None
if "overview" in sec_map:
overview_json = sec_map["overview"].get("section_json", {})
if not isinstance(overview_json, dict):
overview_json = {}
site_from_overview = get_any(
overview_json,
["university_website", "university_website_url", "website", "site", "url", "homepage", "web_url"],
)
if not resolved_website and site_from_overview:
resolved_website = site_from_overview
# Each overview field is looked up under several alternative key spellings.
overview_data = {
"founded": get_any(overview_json, ["founded", "Founded"]),
"total_students": get_any(overview_json, ["total_students", "Total Students"]),
"undergraduates": get_any(overview_json, ["undergraduates", "Undergraduate Students", "undergraduate_students"]),
"postgraduates": get_any(overview_json, ["postgraduate_students", "Postgraduate Students"]),
"acceptance_rate": get_any(overview_json, ["acceptance_rate", "Acceptance Rate"]),
"location": get_any(overview_json, ["location", "Location"]),
"tuition": get_any(overview_json, [
"tuition_out_of_state_yearly",
"Yearly Out of State Tuition Fees",
"Yearly Out-of-State Tuition Fees",
"Yearly Tuition Fees",
"Yearly Out-of-State Tuition Fees:",
]),
}
if resolved_website:
stats["university_links"] = stats.get("university_links", 0) + 1
stats["website_rows"] = stats.get("website_rows", 0) + 1
# Benefits
# None means "no benefits section"; an empty list means the section exists
# but holds no usable entries.
benefits = None
if "benefits" in sec_map:
j = sec_map["benefits"].get("section_json", {})
if not isinstance(j, dict):
j = {}
raw_benefits = j.get("benefits", [])
if isinstance(raw_benefits, list):
benefits = [str(b).strip() for b in raw_benefits if str(b).strip()]
else:
benefits = []
# Programs
programs = None
if "programs" in sec_map:
j = sec_map["programs"].get("section_json", {})
if not isinstance(j, dict):
j = {}
programs_raw = j.get("programs", [])
if not isinstance(programs_raw, list):
programs_raw = []
# Activity flag lookup order: program_active, then is_active, then active;
# a program without any of them is treated as active (default 1).
if not include_inactive_programs:
programs_raw = [
p for p in programs_raw
if isinstance(p, dict) and is_truthy(
p.get("program_active", p.get("is_active", p.get("active", 1)))
)
]
programs = []
seen_names = set()
for p in programs_raw:
if not isinstance(p, dict):
continue
program_name = str(p.get("program_name", "")).strip()
# Deduplicate by lowercase program name
# (programs with an empty name all share the key "" and so collapse to one).
key = program_name.lower()
if key in seen_names:
continue
seen_names.add(key)
# Prefer "program_link"; fall back to program_links["web_link"].
link = str(p.get("program_link", "")).strip()
if not link and isinstance(p.get("program_links"), dict):
link = str(p["program_links"].get("web_link", "")).strip()
# Build career HTML
career = p.get("career_pathways", [])
career_html = ""
if isinstance(career, list):
career_items = [str(x).strip() for x in career if str(x).strip()]
if career_items:
# NOTE(review): the string literals in this branch and in the multi-line
# branch further down look like they lost their HTML markup (list wrapper
# and item tags) when this copy was produced; as written they span lines.
# Restore them from version control — TODO confirm.
career_html = '
'
for ci in career_items:
career_html += f"- {h(ci)}
"
career_html += "
"
else:
# Non-list value: treat it as text and split on line breaks.
raw = str(career).strip()
if raw:
import re as _re
lines = [l.strip() for l in _re.split(r"[\r\n]+", raw) if l.strip()]
if len(lines) > 1:
career_html = ''
for line in lines:
career_html += f"- {h(line)}
"
career_html += "
"
else:
career_html = h(raw)
# Fallback content when no career information was produced.
if not career_html:
career_html = " "
programs.append({
"name": program_name,
"link": link,
"designation": str(p.get("designation", "")),
"entrance": str(p.get("entrance_exam", p.get("entrance_examination", ""))),
# Wrapped in Markup so the template's autoescaping leaves it untouched.
"career_html": Markup(career_html),
"funding": str(p.get("funding_category", "")),
})
# Extra sections
# Everything not handled above is rendered through render_global_blocks,
# in the original section order (duplicates included).
skip_keys = {"campus_image", "image", "overview", "benefits", "programs"}
extra_sections = []
for s in sections:
if not isinstance(s, dict):
continue
k = str(s.get("section_key", ""))
if not k or k in skip_keys:
continue
title = str(s.get("section_title", ""))
j = s.get("section_json", {})
if not isinstance(j, dict):
j = {}
rendered = render_global_blocks(k, title, j, debug)
extra_sections.append({"rendered_html": Markup(rendered)})
# Every university except the first gets the "page-break" class.
classes = ["uni"]
if not is_first:
classes.append("page-break")
return {
"name": uni_name,
"anchor": uni_raw.get("anchor"),
"sort_order": uni_raw.get("sort_order"),
"website": resolved_website,
"classes": classes,
"overview": overview_data,
"campus_image": campus_image,
"campus_caption": campus_caption,
"benefits": benefits,
"programs": programs,
"extra_sections": extra_sections,
}
# NOTE(review): hard-coded remote fallback for the right-side label image. It
# is used whenever the local label file is missing, regardless of the
# allow_remote flag — confirm this is intended.
_FALLBACK_LABEL_IMAGE_URL = "https://finsapdev.qhtestingserver.com/MODEL_APIS/handbook/images/label.jpeg"


def _coerce_sort_order(value: Any) -> int | None:
    """Return *value* as an int when it looks like a whole number, else None.

    Accepts the same inputs as the previous inline check (``None`` -> None,
    text that is digits after stripping leading "-" -> int), but values that
    pass that check and still cannot be converted (e.g. "--5" or "²") now
    yield None instead of raising ValueError.
    """
    if value is None:
        return None
    if not str(value).lstrip("-").isdigit():
        return None
    try:
        return int(value)
    except (TypeError, ValueError):
        return None


def _local_file_uri(path: Any) -> str:
    """Return a ``file://`` URI for *path* if it is an existing file, else ""."""
    if path and os.path.isfile(path):
        # absolute() leaves absolute paths untouched and makes relative ones
        # usable: as_uri() raises ValueError on a relative path.
        return Path(path).absolute().as_uri()
    return ""


def _local_file_data_uri(path: Any) -> str:
    """Return *path* inlined as a base64 ``data:`` URI, or "" if no such file.

    The MIME type is guessed from the file name; "image/jpeg" is used when
    it cannot be guessed.
    """
    if not (path and os.path.isfile(path)):
        return ""
    mime = mimetypes.guess_type(path)[0] or "image/jpeg"
    with open(path, "rb") as fh:
        encoded = base64.b64encode(fh.read()).decode()
    return f"data:{mime};base64,{encoded}"


def build_handbook_html(
    globals_data: list[dict[str, Any]],
    by_uni: dict[int, dict[str, Any]],
    images: dict[str, Any],
    allow_remote: bool,
    include_inactive_programs: bool = False,
    debug: bool = False,
) -> str:
    """Build the full handbook HTML document using Jinja2 templates.

    Prepares cover/TOC/header/label images, the list of active universities,
    the general (global) sections, the table-of-contents entries and the
    optional "summary_of_universities" block, then renders ``handbook.html``.
    Both the legacy pre-rendered HTML and the typed block data are passed to
    the template for sections and universities.

    Args:
        globals_data: Global section records (dicts with "section_key",
            "section_title", "section_json", "sort_order"). Non-dict entries
            are skipped.
        by_uni: University records keyed by id. Non-dict entries and
            universities whose "is_active" is not truthy are skipped.
        images: Local image paths under "coverImage", "tocImage",
            "headerImage", "labelImage" and a list under "bottomPages".
            Paths that are not existing files are ignored.
        allow_remote: Forwarded to the university preparation helpers.
        include_inactive_programs: Forwarded to the university preparation
            helpers.
        debug: Forwarded to the section renderers and to the template.

    Returns:
        The rendered HTML document.

    Raises:
        RuntimeError: If any of the required global sections
            ("table_of_contents", "overview",
            "how_program_works_and_qualification_requirements") is missing.
    """
    env = _get_jinja_env()
    template = env.get_template("handbook.html")
    font_meta = select_font_family()
    font_css = font_face_css(font_meta)
    # Base URL for static assets (CSS, images, etc.)
    base_url = _static_base_url()
    # Counters updated in place by the university preparation helpers.
    stats: dict[str, Any] = {
        "universities": 0,
        "images_embedded": 0,
        "images_placeholder": 0,
        "program_links_total": 0,
        "program_missing_links_total": 0,
        "missing_program_links": {},
        "university_links": 0,
        "website_rows": 0,
    }

    # ── Cover / TOC images: referenced by file:// URI ──
    cover_image = _local_file_uri(images.get("coverImage", ""))
    toc_image = _local_file_uri(images.get("tocImage", ""))
    # ── Header / label images (repeating page furniture): inlined as data URIs ──
    header_image = _local_file_data_uri(images.get("headerImage", ""))
    # Fallback to remote URL when local file is unavailable
    label_image = _local_file_data_uri(images.get("labelImage", "")) or _FALLBACK_LABEL_IMAGE_URL

    # ── Prepare active universities ──
    active_universities: list[dict[str, Any]] = []
    for uid, uni in by_uni.items():
        if not isinstance(uni, dict):
            continue
        if not is_truthy(uni.get("is_active", True)):
            continue
        # A stored None must not become the literal text "None".
        raw_name = uni.get("university_name")
        name = str(raw_name) if raw_name is not None else f"University #{uid}"
        raw_website = uni.get("website")
        website = str(raw_website) if raw_website is not None else ""
        uni_sections = uni.get("sections")
        active_universities.append({
            "id": int(uid),
            "anchor": handbook_anchor("uni", name, int(uid)),
            "name": name,
            "sections": uni_sections if isinstance(uni_sections, list) else [],
            "website": website,
            "sort_order": _coerce_sort_order(uni.get("sort_order")),
        })

    # ── Normalise globals ──
    globals_data = sort_sections_stable(globals_data)
    required_keys = [
        "table_of_contents",
        "overview",
        "how_program_works_and_qualification_requirements",
    ]
    existing_keys = {
        str(g.get("section_key", "")).lower()
        for g in globals_data
        if isinstance(g, dict)
    }
    missing = [k for k in required_keys if k not in existing_keys]
    if missing:
        msg = f"Handbook required sections missing: {','.join(missing)}"
        logger.error(msg)
        raise RuntimeError(msg)

    general_sections: list[dict[str, Any]] = []
    summary_block: dict[str, Any] | None = None
    toc_sort_order = None
    toc_title = "Table of Contents"
    for idx, g in enumerate(globals_data):
        if not isinstance(g, dict):
            continue
        key = str(g.get("section_key", "")).lower()
        sort_order = _coerce_sort_order(g.get("sort_order"))
        # Only the first table_of_contents entry is consumed here; any later
        # one falls through and is treated as an ordinary general section.
        if key == "table_of_contents" and toc_sort_order is None:
            toc_sort_order = sort_order if sort_order is not None else (idx + 1)
            toc_title = str(g.get("section_title", "Table of Contents"))
            continue
        if key == "summary_of_universities":
            # A later summary section replaces an earlier one.
            summary_block = {
                "anchor": handbook_anchor("summary", "summary-of-universities", idx),
                "data": g,
                "sort_order": sort_order,
            }
            continue
        anchor = handbook_anchor(
            "g",
            str(g.get("section_title", g.get("section_key", "section"))),
            idx,
        )
        general_sections.append({
            "anchor": anchor,
            "data": g,
            "sort_order": sort_order,
        })

    # ── Build TOC items ──
    toc_items: list[dict[str, Any]] = []
    for gs in general_sections:
        title = str(gs["data"].get("section_title", gs["data"].get("section_key", "Section")))
        toc_items.append({
            "title": title,
            "target": "#" + gs["anchor"],
            "level": 0,
            "bold": True,
            "sort": gs["sort_order"],
        })
    if summary_block:
        title = str(summary_block["data"].get("section_title", "Summary of Universities"))
        toc_items.append({
            "title": title,
            "target": "#" + summary_block["anchor"],
            "level": 0,
            "bold": True,
            "sort": summary_block["sort_order"],
        })
    for u in active_universities:
        toc_items.append({
            "title": u["name"],
            "target": "#" + u["anchor"],
            "level": 1,
            "bold": False,
            "sort": u.get("sort_order"),
        })

    # ── Prepare sorted TOC items for template ──
    sorted_toc = sort_toc(list(toc_items))
    toc_items_sorted = []
    for e in sorted_toc:
        if not isinstance(e, dict):
            continue
        title = str(e.get("title", "")).strip()
        if not title:
            continue
        level = max(0, min(3, int(e.get("level", 0))))
        bold = bool(e.get("bold", False))
        upper = bool(e.get("upper", False))
        # Top-level entries are always bold and upper-cased.
        if level == 0:
            bold = True
            upper = True
        display_title = title.upper() if upper else title
        page = str(e.get("page", "")).strip()
        toc_items_sorted.append({
            "title": title,
            "display_title": display_title,
            "target": str(e.get("target", e.get("anchor", ""))).strip(),
            "level": level,
            "bold": bold,
            "upper": upper,
            "page": page,
        })

    # ── Prepare general sections with rendered HTML and typed blocks ──
    template_sections = []
    for gs in general_sections:
        data = gs["data"]
        section_key = str(data.get("section_key", ""))
        section_title = str(data.get("section_title", ""))
        key_lower = section_key.lower()
        sec_class = SECTION_CLASS_MAP.get(key_lower)
        if sec_class is None:
            # Unknown keys get a class derived from the key itself.
            sec_class = "sec-" + re.sub(r"[^a-z0-9]+", "-", key_lower)
        section_json = data.get("section_json", {})
        if not isinstance(section_json, dict):
            section_json = {}
        # Typed blocks for the new rendering path
        blocks = normalize_section(
            section_key,
            section_title,
            section_json,
            debug=debug,
        )
        # Legacy HTML fallback
        section_html = render_global_blocks(
            section_key,
            section_title,
            section_json,
            debug,
        )
        if not section_html.strip() and not blocks:
            logger.warning(
                "Empty section render key=%s sort_order=%s",
                data.get("section_key"),
                data.get("sort_order"),
            )
        template_sections.append({
            "anchor": gs["anchor"],
            "data": data,
            "page_break": key_lower in PAGE_BREAK_KEYS,
            "sec_class": sec_class,
            "blocks": blocks,
            "rendered_html": Markup(section_html),
        })

    # ── Prepare summary block ──
    summary_template = None
    if summary_block:
        data = summary_block["data"]
        section_key = str(data.get("section_key", ""))
        section_title = str(data.get("section_title", ""))
        section_json = data.get("section_json", {})
        if not isinstance(section_json, dict):
            section_json = {}
        # Typed blocks for summary
        summary_blocks = normalize_section(
            section_key,
            section_title,
            section_json,
            universities=active_universities,
            debug=debug,
        )
        summary_html = render_global_blocks(
            section_key,
            section_title,
            section_json,
            debug,
            universities=active_universities,
        )
        summary_template = {
            "anchor": summary_block["anchor"],
            "data": data,
            "blocks": summary_blocks,
            "rendered_html": Markup(summary_html),
        }

    # ── Prepare university data for templates (both old + new paths) ──
    university_template_data = []
    university_block_data = []
    for idx, uni_raw in enumerate(active_universities):
        # Read by _prepare_university_data to decide on the page-break class.
        uni_raw["_is_first"] = (idx == 0)
        # Legacy path
        uni_data = _prepare_university_data(
            uni_raw, allow_remote, include_inactive_programs, debug, stats,
        )
        university_template_data.append(uni_data)
        # New block path
        uni_block = normalize_university(
            uni_raw, allow_remote, include_inactive_programs, debug, stats,
        )
        university_block_data.append(uni_block)

    # ── Bottom pages ──
    bottom_pages_urls = []
    raw_bottom = images.get("bottomPages", [])
    if isinstance(raw_bottom, list):
        for img_path in raw_bottom:
            page_uri = _local_file_uri(str(img_path))
            if page_uri:
                bottom_pages_urls.append(page_uri)

    # ── Render template ──
    html = template.render(
        font_css=Markup(font_css),
        base_url=base_url,
        extra_css="",
        header_image=header_image,
        label_image=label_image,
        cover_image=cover_image,
        toc_image=toc_image,
        toc_items=toc_items,
        toc_items_sorted=toc_items_sorted,
        toc_title=toc_title,
        toc_sort_order=toc_sort_order,
        general_sections=template_sections,
        summary_block=summary_template,
        universities=university_template_data,
        university_blocks=university_block_data,
        bottom_pages=bottom_pages_urls,
        debug=debug,
        stats=stats,
    )
    return html