"""HTML builder — assembles the full ISP Handbook HTML document. Uses Jinja2 templates for HTML generation. Data preparation logic is preserved from the original string-concatenation approach. The output is a self-contained HTML suitable for Playwright Chromium PDF export. """ from __future__ import annotations import base64 import logging import mimetypes import os import re from pathlib import Path from typing import Any from jinja2 import Environment, FileSystemLoader, select_autoescape from markupsafe import Markup from app.core.config import get_settings from app.core.fonts import font_face_css, select_font_family from app.services.normalizer import normalize_section, normalize_university from app.services.renderers import ( fetch_image_data_uri, render_global_blocks, sort_toc, ) from app.services.utils import ( format_money_figures, get_any, h, handbook_anchor, hb_slug, is_truthy, sort_sections_stable, ) logger = logging.getLogger(__name__) # Jinja2 environment — templates live alongside the app package _TEMPLATES_DIR = Path(__file__).resolve().parent.parent / "templates" def _get_jinja_env() -> Environment: """Create a Jinja2 environment pointing to our templates directory.""" env = Environment( loader=FileSystemLoader(str(_TEMPLATES_DIR)), autoescape=select_autoescape(["html"]), trim_blocks=True, lstrip_blocks=True, ) return env def _static_base_url() -> str: """Return absolute file:// URL to the static directory.""" static_dir = Path(__file__).resolve().parent.parent / "static" return static_dir.as_uri() def _unused_pdf_override_css(font_stack: str) -> str: """Legacy inline PDF override CSS — kept for reference only. All styling now lives in static/css/print.css for Chromium rendering. """ return "" # Section class map SECTION_CLASS_MAP = { "overview": "sec-overview", "how_program_works_and_qualification_requirements": "sec-qualification", "enrolment_steps": "sec-steps", "withdrawal_late_payment_refund_policy": "sec-policy", "refund_guidelines": "sec-refund", "program_contributions": "sec-contributions", "funding_options_available": "sec-funding", "summary_of_universities": "sec-summary", } PAGE_BREAK_KEYS = { "overview", "how_program_works_and_qualification_requirements", "enrolment_steps", "withdrawal_late_payment_refund_policy", "refund_guidelines", "program_contributions", "funding_options_available", "summary_of_universities", } def _prepare_university_data( uni_raw: dict[str, Any], allow_remote: bool, include_inactive_programs: bool, debug: bool, stats: dict[str, Any], ) -> dict[str, Any]: """Prepare a single university's template data. Extracts overview, campus image, benefits, programs, and extra sections from the raw sections list. This moves the logic that was in render_university_section into a data-preparation step so that the Jinja2 template handles the HTML. """ uni_name = uni_raw["name"] sections = uni_raw.get("sections", []) is_first = uni_raw.get("_is_first", False) stats["universities"] = stats.get("universities", 0) + 1 # Build section map; merge duplicate "programs" sec_map: dict[str, dict] = {} for s in sections: if not isinstance(s, dict): continue k = str(s.get("section_key", "")) if not k: continue if k == "programs" and k in sec_map: existing = sec_map["programs"].get("section_json", {}) incoming = s.get("section_json", {}) if not isinstance(existing, dict): existing = {} if not isinstance(incoming, dict): incoming = {} a = existing.get("programs", []) b = incoming.get("programs", []) if not isinstance(a, list): a = [] if not isinstance(b, list): b = [] existing["programs"] = a + b sec_map["programs"]["section_json"] = existing continue sec_map[k] = s # Campus image img_section = sec_map.get("campus_image") or sec_map.get("image") campus_image = "" campus_caption = "" if img_section: j = img_section.get("section_json", {}) if isinstance(j, dict): campus_url = str(j.get("image_url", "")).strip() campus_caption = str(j.get("caption", "")).strip() if allow_remote and campus_url: embedded = fetch_image_data_uri(campus_url) if embedded: campus_image = embedded stats["images_embedded"] = stats.get("images_embedded", 0) + 1 else: stats["images_placeholder"] = stats.get("images_placeholder", 0) + 1 else: stats["images_placeholder"] = stats.get("images_placeholder", 0) + 1 # Overview and website resolved_website = (uni_raw.get("website") or "").strip() overview_data = None if "overview" in sec_map: overview_json = sec_map["overview"].get("section_json", {}) if not isinstance(overview_json, dict): overview_json = {} site_from_overview = get_any( overview_json, ["university_website", "university_website_url", "website", "site", "url", "homepage", "web_url"], ) if not resolved_website and site_from_overview: resolved_website = site_from_overview overview_data = { "founded": get_any(overview_json, ["founded", "Founded"]), "total_students": get_any(overview_json, ["total_students", "Total Students"]), "undergraduates": get_any(overview_json, ["undergraduates", "Undergraduate Students", "undergraduate_students"]), "postgraduates": get_any(overview_json, ["postgraduate_students", "Postgraduate Students"]), "acceptance_rate": get_any(overview_json, ["acceptance_rate", "Acceptance Rate"]), "location": get_any(overview_json, ["location", "Location"]), "tuition": get_any(overview_json, [ "tuition_out_of_state_yearly", "Yearly Out of State Tuition Fees", "Yearly Out-of-State Tuition Fees", "Yearly Tuition Fees", "Yearly Out-of-State Tuition Fees:", ]), } if resolved_website: stats["university_links"] = stats.get("university_links", 0) + 1 stats["website_rows"] = stats.get("website_rows", 0) + 1 # Benefits benefits = None if "benefits" in sec_map: j = sec_map["benefits"].get("section_json", {}) if not isinstance(j, dict): j = {} raw_benefits = j.get("benefits", []) if isinstance(raw_benefits, list): benefits = [str(b).strip() for b in raw_benefits if str(b).strip()] else: benefits = [] # Programs programs = None if "programs" in sec_map: j = sec_map["programs"].get("section_json", {}) if not isinstance(j, dict): j = {} programs_raw = j.get("programs", []) if not isinstance(programs_raw, list): programs_raw = [] if not include_inactive_programs: programs_raw = [ p for p in programs_raw if isinstance(p, dict) and is_truthy( p.get("program_active", p.get("is_active", p.get("active", 1))) ) ] programs = [] seen_names = set() for p in programs_raw: if not isinstance(p, dict): continue program_name = str(p.get("program_name", "")).strip() # Deduplicate by lowercase program name key = program_name.lower() if key in seen_names: continue seen_names.add(key) link = str(p.get("program_link", "")).strip() if not link and isinstance(p.get("program_links"), dict): link = str(p["program_links"].get("web_link", "")).strip() # Build career HTML career = p.get("career_pathways", []) career_html = "" if isinstance(career, list): career_items = [str(x).strip() for x in career if str(x).strip()] if career_items: career_html = '" else: raw = str(career).strip() if raw: import re as _re lines = [l.strip() for l in _re.split(r"[\r\n]+", raw) if l.strip()] if len(lines) > 1: career_html = '" else: career_html = h(raw) if not career_html: career_html = " " programs.append({ "name": program_name, "link": link, "designation": str(p.get("designation", "")), "entrance": str(p.get("entrance_exam", p.get("entrance_examination", ""))), "career_html": Markup(career_html), "funding": str(p.get("funding_category", "")), }) # Extra sections skip_keys = {"campus_image", "image", "overview", "benefits", "programs"} extra_sections = [] for s in sections: if not isinstance(s, dict): continue k = str(s.get("section_key", "")) if not k or k in skip_keys: continue title = str(s.get("section_title", "")) j = s.get("section_json", {}) if not isinstance(j, dict): j = {} rendered = render_global_blocks(k, title, j, debug) extra_sections.append({"rendered_html": Markup(rendered)}) classes = ["uni"] if not is_first: classes.append("page-break") return { "name": uni_name, "anchor": uni_raw.get("anchor"), "sort_order": uni_raw.get("sort_order"), "website": resolved_website, "classes": classes, "overview": overview_data, "campus_image": campus_image, "campus_caption": campus_caption, "benefits": benefits, "programs": programs, "extra_sections": extra_sections, } def build_handbook_html( globals_data: list[dict[str, Any]], by_uni: dict[int, dict[str, Any]], images: dict[str, Any], allow_remote: bool, include_inactive_programs: bool = False, debug: bool = False, ) -> str: """Build the full handbook HTML document using Jinja2 templates. Preserves the same data preparation logic from the original version. Rendering is delegated to Jinja2 templates with Playwright-compatible HTML/CSS output. """ env = _get_jinja_env() template = env.get_template("handbook.html") font_meta = select_font_family() font_css = font_face_css(font_meta) # Base URL for static assets (CSS, images, etc.) base_url = _static_base_url() stats: dict[str, Any] = { "universities": 0, "images_embedded": 0, "images_placeholder": 0, "program_links_total": 0, "program_missing_links_total": 0, "missing_program_links": {}, "university_links": 0, "website_rows": 0, } # ── Cover Image ── cover_image = images.get("coverImage", "") if cover_image and os.path.isfile(cover_image): cover_image = Path(cover_image).as_uri() else: cover_image = "" # ── TOC Image ── toc_image = images.get("tocImage", "") if toc_image and os.path.isfile(toc_image): toc_image = Path(toc_image).as_uri() else: toc_image = "" # ── Header Image (repeating page header) ── header_image = images.get("headerImage", "") if header_image and os.path.isfile(header_image): mime = mimetypes.guess_type(header_image)[0] or "image/jpeg" with open(header_image, "rb") as f: header_image = f"data:{mime};base64,{base64.b64encode(f.read()).decode()}" else: header_image = "" # ── Label Image (repeating right-side label) ── label_image = images.get("labelImage", "") if label_image and os.path.isfile(label_image): mime = mimetypes.guess_type(label_image)[0] or "image/jpeg" with open(label_image, "rb") as f: label_image = f"data:{mime};base64,{base64.b64encode(f.read()).decode()}" else: # Fallback to remote URL when local file is unavailable label_image = "https://finsapdev.qhtestingserver.com/MODEL_APIS/handbook/images/label.jpeg" # ── Prepare active universities ── active_universities: list[dict[str, Any]] = [] for uid, uni in by_uni.items(): if not isinstance(uni, dict): continue if not is_truthy(uni.get("is_active", True)): continue name = str(uni.get("university_name", f"University #{uid}")) anchor = handbook_anchor("uni", name, int(uid)) active_universities.append({ "id": int(uid), "anchor": anchor, "name": name, "sections": uni.get("sections", []) if isinstance(uni.get("sections"), list) else [], "website": str(uni.get("website", "")), "sort_order": int(uni["sort_order"]) if uni.get("sort_order") is not None and str(uni.get("sort_order", "")).lstrip("-").isdigit() else None, }) # ── Normalise globals ── globals_data = sort_sections_stable(globals_data) required_keys = [ "table_of_contents", "overview", "how_program_works_and_qualification_requirements", ] existing_keys = {str(g.get("section_key", "")).lower() for g in globals_data if isinstance(g, dict)} missing = [k for k in required_keys if k not in existing_keys] if missing: msg = f"Handbook required sections missing: {','.join(missing)}" logger.error(msg) raise RuntimeError(msg) general_sections: list[dict[str, Any]] = [] summary_block: dict[str, Any] | None = None toc_sort_order = None toc_title = "Table of Contents" for idx, g in enumerate(globals_data): if not isinstance(g, dict): continue key_raw = str(g.get("section_key", "")) key = key_raw.lower() sort_order = int(g["sort_order"]) if g.get("sort_order") is not None and str(g.get("sort_order", "")).lstrip("-").isdigit() else None if key == "table_of_contents" and toc_sort_order is None: toc_sort_order = sort_order if sort_order is not None else (idx + 1) toc_title = str(g.get("section_title", "Table of Contents")) continue if key == "summary_of_universities": summary_block = { "anchor": handbook_anchor("summary", "summary-of-universities", idx), "data": g, "sort_order": sort_order, } continue anchor = handbook_anchor("g", str(g.get("section_title", g.get("section_key", "section"))), idx) general_sections.append({ "anchor": anchor, "data": g, "sort_order": sort_order, }) # ── Build TOC items ── toc_items: list[dict[str, Any]] = [] for gs in general_sections: title = str(gs["data"].get("section_title", gs["data"].get("section_key", "Section"))) toc_items.append({ "title": title, "target": "#" + gs["anchor"], "level": 0, "bold": True, "sort": gs["sort_order"], }) if summary_block: title = str(summary_block["data"].get("section_title", "Summary of Universities")) toc_items.append({ "title": title, "target": "#" + summary_block["anchor"], "level": 0, "bold": True, "sort": summary_block["sort_order"], }) for u in active_universities: toc_items.append({ "title": u["name"], "target": "#" + u["anchor"], "level": 1, "bold": False, "sort": u.get("sort_order"), }) # ── Prepare sorted TOC items for template ── sorted_toc = sort_toc(list(toc_items)) toc_items_sorted = [] for e in sorted_toc: if not isinstance(e, dict): continue title = str(e.get("title", "")).strip() if not title: continue level = max(0, min(3, int(e.get("level", 0)))) bold = bool(e.get("bold", False)) upper = bool(e.get("upper", False)) if level == 0: bold = True upper = True display_title = title.upper() if upper else title page = str(e.get("page", "")).strip() toc_items_sorted.append({ "title": title, "display_title": display_title, "target": str(e.get("target", e.get("anchor", ""))).strip(), "level": level, "bold": bold, "upper": upper, "page": page, }) # ── Prepare general sections with rendered HTML and typed blocks ── template_sections = [] for gs in general_sections: data = gs["data"] key_lower = str(data.get("section_key", "")).lower() sec_class = SECTION_CLASS_MAP.get(key_lower) if sec_class is None: sec_class = "sec-" + re.sub(r"[^a-z0-9]+", "-", key_lower) section_json = data.get("section_json", {}) if not isinstance(section_json, dict): section_json = {} # Typed blocks for the new rendering path blocks = normalize_section( str(data.get("section_key", "")), str(data.get("section_title", "")), section_json, debug=debug, ) # Legacy HTML fallback section_html = render_global_blocks( str(data.get("section_key", "")), str(data.get("section_title", "")), section_json, debug, ) if not section_html.strip() and not blocks: logger.warning( "Empty section render key=%s sort_order=%s", data.get("section_key"), data.get("sort_order"), ) template_sections.append({ "anchor": gs["anchor"], "data": data, "page_break": key_lower in PAGE_BREAK_KEYS, "sec_class": sec_class, "blocks": blocks, "rendered_html": Markup(section_html), }) # ── Prepare summary block ── summary_template = None if summary_block: data = summary_block["data"] section_json = data.get("section_json", {}) if not isinstance(section_json, dict): section_json = {} # Typed blocks for summary summary_blocks = normalize_section( str(data.get("section_key", "")), str(data.get("section_title", "")), section_json, universities=active_universities, debug=debug, ) summary_html = render_global_blocks( str(data.get("section_key", "")), str(data.get("section_title", "")), section_json, debug, universities=active_universities, ) summary_template = { "anchor": summary_block["anchor"], "data": data, "blocks": summary_blocks, "rendered_html": Markup(summary_html), } # ── Prepare university data for templates (both old + new paths) ── university_template_data = [] university_block_data = [] for idx, uni_raw in enumerate(active_universities): uni_raw["_is_first"] = (idx == 0) # Legacy path uni_data = _prepare_university_data( uni_raw, allow_remote, include_inactive_programs, debug, stats, ) university_template_data.append(uni_data) # New block path uni_block = normalize_university( uni_raw, allow_remote, include_inactive_programs, debug, stats, ) university_block_data.append(uni_block) # ── Bottom pages ── bottom_pages_urls = [] raw_bottom = images.get("bottomPages", []) if isinstance(raw_bottom, list): for img_path in raw_bottom: if os.path.isfile(str(img_path)): bottom_pages_urls.append(Path(str(img_path)).as_uri()) # ── Render template ── html = template.render( font_css=Markup(font_css), base_url=base_url, extra_css="", header_image=header_image, label_image=label_image, cover_image=cover_image, toc_image=toc_image, toc_items=toc_items, toc_items_sorted=toc_items_sorted, toc_title=toc_title, toc_sort_order=toc_sort_order, general_sections=template_sections, summary_block=summary_template, universities=university_template_data, university_blocks=university_block_data, bottom_pages=bottom_pages_urls, debug=debug, stats=stats, ) return html