Spaces:

internationalscholarsprogram
/

handbook_engine

Running

App Files Files Community

handbook_engine / app /services /html_builder.py

internationalscholarsprogram

feat: right-side label on content pages only (pages 3+, excludes cover/fullpage)

205c317 1 day ago

raw

history blame contribute delete

21.8 kB

	"""HTML builder — assembles the full ISP Handbook HTML document.

	Uses Jinja2 templates for HTML generation. Data preparation logic is
	preserved from the original string-concatenation approach. The output
	is a self-contained HTML suitable for Playwright Chromium PDF export.
	"""

	from __future__ import annotations

	import base64
	import logging
	import mimetypes
	import os
	import re
	from pathlib import Path
	from typing import Any

	from jinja2 import Environment, FileSystemLoader, select_autoescape
	from markupsafe import Markup

	from app.core.config import get_settings
	from app.core.fonts import font_face_css, select_font_family
	from app.services.normalizer import normalize_section, normalize_university
	from app.services.renderers import (
	fetch_image_data_uri,
	render_global_blocks,
	sort_toc,
	)
	from app.services.utils import (
	format_money_figures,
	get_any,
	h,
	handbook_anchor,
	hb_slug,
	is_truthy,
	sort_sections_stable,
	)

	# Module-level logger named after this module.
	logger = logging.getLogger(__name__)

	# Directory the Jinja2 loader reads from: "templates" under the directory
	# two levels above this file (the parent of this file's own directory).
	_TEMPLATES_DIR = Path(__file__).resolve().parent.parent / "templates"


	def _get_jinja_env() -> Environment:
	    """Build a fresh Jinja2 environment that loads from ``_TEMPLATES_DIR``.

	    Autoescaping is enabled for ``html`` templates, and block tags have their
	    trailing newline and leading whitespace trimmed.
	    """
	    options = {
	        "loader": FileSystemLoader(str(_TEMPLATES_DIR)),
	        "autoescape": select_autoescape(["html"]),
	        "trim_blocks": True,
	        "lstrip_blocks": True,
	    }
	    return Environment(**options)


	def _static_base_url() -> str:
	"""Return absolute file:// URL to the static directory."""
	static_dir = Path(__file__).resolve().parent.parent / "static"
	return static_dir.as_uri()


	def _unused_pdf_override_css(font_stack: str) -> str:
	"""Legacy inline PDF override CSS — kept for reference only.
	All styling now lives in static/css/print.css for Chromium rendering.
	"""
	return ""


	# Section class map: lower-cased global section_key -> CSS class name.
	# Keys not listed here get a generated "sec-<slug>" class in
	# build_handbook_html.
	SECTION_CLASS_MAP = {
	"overview": "sec-overview",
	"how_program_works_and_qualification_requirements": "sec-qualification",
	"enrolment_steps": "sec-steps",
	"withdrawal_late_payment_refund_policy": "sec-policy",
	"refund_guidelines": "sec-refund",
	"program_contributions": "sec-contributions",
	"funding_options_available": "sec-funding",
	"summary_of_universities": "sec-summary",
	}

	# Lower-cased global section keys whose template entry is flagged with
	# page_break=True. Currently the same key set as SECTION_CLASS_MAP.
	PAGE_BREAK_KEYS = {
	"overview",
	"how_program_works_and_qualification_requirements",
	"enrolment_steps",
	"withdrawal_late_payment_refund_policy",
	"refund_guidelines",
	"program_contributions",
	"funding_options_available",
	"summary_of_universities",
	}


	def _prepare_university_data(
	uni_raw: dict[str, Any],
	allow_remote: bool,
	include_inactive_programs: bool,
	debug: bool,
	stats: dict[str, Any],
	) -> dict[str, Any]:
	"""Prepare a single university's template data.

	Extracts overview, campus image, benefits, programs, and extra sections
	from the raw sections list. This moves the logic that was in
	render_university_section into a data-preparation step so that the
	Jinja2 template handles the HTML.
	"""
	uni_name = uni_raw["name"]
	sections = uni_raw.get("sections", [])
	is_first = uni_raw.get("_is_first", False)

	stats["universities"] = stats.get("universities", 0) + 1

	# Build section map; merge duplicate "programs"
	sec_map: dict[str, dict] = {}
	for s in sections:
	if not isinstance(s, dict):
	continue
	k = str(s.get("section_key", ""))
	if not k:
	continue
	if k == "programs" and k in sec_map:
	existing = sec_map["programs"].get("section_json", {})
	incoming = s.get("section_json", {})
	if not isinstance(existing, dict):
	existing = {}
	if not isinstance(incoming, dict):
	incoming = {}
	a = existing.get("programs", [])
	b = incoming.get("programs", [])
	if not isinstance(a, list):
	a = []
	if not isinstance(b, list):
	b = []
	existing["programs"] = a + b
	sec_map["programs"]["section_json"] = existing
	continue
	sec_map[k] = s

	# Campus image
	img_section = sec_map.get("campus_image") or sec_map.get("image")
	campus_image = ""
	campus_caption = ""
	if img_section:
	j = img_section.get("section_json", {})
	if isinstance(j, dict):
	campus_url = str(j.get("image_url", "")).strip()
	campus_caption = str(j.get("caption", "")).strip()
	if allow_remote and campus_url:
	embedded = fetch_image_data_uri(campus_url)
	if embedded:
	campus_image = embedded
	stats["images_embedded"] = stats.get("images_embedded", 0) + 1
	else:
	stats["images_placeholder"] = stats.get("images_placeholder", 0) + 1
	else:
	stats["images_placeholder"] = stats.get("images_placeholder", 0) + 1

	# Overview and website
	resolved_website = (uni_raw.get("website") or "").strip()
	overview_data = None

	if "overview" in sec_map:
	overview_json = sec_map["overview"].get("section_json", {})
	if not isinstance(overview_json, dict):
	overview_json = {}

	site_from_overview = get_any(
	overview_json,
	["university_website", "university_website_url", "website", "site", "url", "homepage", "web_url"],
	)
	if not resolved_website and site_from_overview:
	resolved_website = site_from_overview

	overview_data = {
	"founded": get_any(overview_json, ["founded", "Founded"]),
	"total_students": get_any(overview_json, ["total_students", "Total Students"]),
	"undergraduates": get_any(overview_json, ["undergraduates", "Undergraduate Students", "undergraduate_students"]),
	"postgraduates": get_any(overview_json, ["postgraduate_students", "Postgraduate Students"]),
	"acceptance_rate": get_any(overview_json, ["acceptance_rate", "Acceptance Rate"]),
	"location": get_any(overview_json, ["location", "Location"]),
	"tuition": get_any(overview_json, [
	"tuition_out_of_state_yearly",
	"Yearly Out of State Tuition Fees",
	"Yearly Out-of-State Tuition Fees",
	"Yearly Tuition Fees",
	"Yearly Out-of-State Tuition Fees:",
	]),
	}

	if resolved_website:
	stats["university_links"] = stats.get("university_links", 0) + 1
	stats["website_rows"] = stats.get("website_rows", 0) + 1

	# Benefits
	benefits = None
	if "benefits" in sec_map:
	j = sec_map["benefits"].get("section_json", {})
	if not isinstance(j, dict):
	j = {}
	raw_benefits = j.get("benefits", [])
	if isinstance(raw_benefits, list):
	benefits = [str(b).strip() for b in raw_benefits if str(b).strip()]
	else:
	benefits = []

	# Programs
	programs = None
	if "programs" in sec_map:
	j = sec_map["programs"].get("section_json", {})
	if not isinstance(j, dict):
	j = {}
	programs_raw = j.get("programs", [])
	if not isinstance(programs_raw, list):
	programs_raw = []

	if not include_inactive_programs:
	programs_raw = [
	p for p in programs_raw
	if isinstance(p, dict) and is_truthy(
	p.get("program_active", p.get("is_active", p.get("active", 1)))
	)
	]

	programs = []
	seen_names = set()
	for p in programs_raw:
	if not isinstance(p, dict):
	continue
	program_name = str(p.get("program_name", "")).strip()
	# Deduplicate by lowercase program name
	key = program_name.lower()
	if key in seen_names:
	continue
	seen_names.add(key)
	link = str(p.get("program_link", "")).strip()
	if not link and isinstance(p.get("program_links"), dict):
	link = str(p["program_links"].get("web_link", "")).strip()

	# Build career HTML
	career = p.get("career_pathways", [])
	career_html = ""
	if isinstance(career, list):
	career_items = [str(x).strip() for x in career if str(x).strip()]
	if career_items:
	career_html = '<ul class="career-list">'
	for ci in career_items:
	career_html += f"<li>{h(ci)}</li>"
	career_html += "</ul>"
	else:
	raw = str(career).strip()
	if raw:
	import re as _re
	lines = [l.strip() for l in _re.split(r"[\r\n]+", raw) if l.strip()]
	if len(lines) > 1:
	career_html = '<ul class="career-list">'
	for line in lines:
	career_html += f"<li>{h(line)}</li>"
	career_html += "</ul>"
	else:
	career_html = h(raw)

	if not career_html:
	career_html = " "

	programs.append({
	"name": program_name,
	"link": link,
	"designation": str(p.get("designation", "")),
	"entrance": str(p.get("entrance_exam", p.get("entrance_examination", ""))),
	"career_html": Markup(career_html),
	"funding": str(p.get("funding_category", "")),
	})

	# Extra sections
	skip_keys = {"campus_image", "image", "overview", "benefits", "programs"}
	extra_sections = []
	for s in sections:
	if not isinstance(s, dict):
	continue
	k = str(s.get("section_key", ""))
	if not k or k in skip_keys:
	continue
	title = str(s.get("section_title", ""))
	j = s.get("section_json", {})
	if not isinstance(j, dict):
	j = {}
	rendered = render_global_blocks(k, title, j, debug)
	extra_sections.append({"rendered_html": Markup(rendered)})

	classes = ["uni"]
	if not is_first:
	classes.append("page-break")

	return {
	"name": uni_name,
	"anchor": uni_raw.get("anchor"),
	"sort_order": uni_raw.get("sort_order"),
	"website": resolved_website,
	"classes": classes,
	"overview": overview_data,
	"campus_image": campus_image,
	"campus_caption": campus_caption,
	"benefits": benefits,
	"programs": programs,
	"extra_sections": extra_sections,
	}


	def _coerce_sort_order(value: Any) -> int | None:
	    """Return ``value`` as an int when it looks like a whole number, else ``None``.

	    Accepts ints and digit strings with an optional leading minus sign.
	    Anything ``int()`` still rejects (for example "--5" or "²") yields
	    ``None`` instead of raising.
	    """
	    if value is None:
	        return None
	    if not str(value).lstrip("-").isdigit():
	        return None
	    try:
	        return int(value)
	    except (TypeError, ValueError):
	        return None


	def _existing_file_uri(path: Any) -> str:
	    """Return a ``file://`` URI for ``path`` if it is an existing file, else ``""``.

	    The path is resolved first because ``Path.as_uri()`` raises ``ValueError``
	    for relative paths.
	    """
	    if not path:
	        return ""
	    candidate = str(path)
	    if not os.path.isfile(candidate):
	        return ""
	    return Path(candidate).resolve().as_uri()


	def _existing_file_data_uri(path: Any) -> str:
	    """Return ``path`` as a base64 ``data:`` URI if it is an existing file, else ``""``.

	    The MIME type is guessed from the file name and defaults to
	    ``image/jpeg``.
	    """
	    if not path:
	        return ""
	    candidate = str(path)
	    if not os.path.isfile(candidate):
	        return ""
	    mime = mimetypes.guess_type(candidate)[0] or "image/jpeg"
	    with open(candidate, "rb") as fh:
	        payload = base64.b64encode(fh.read()).decode("ascii")
	    return f"data:{mime};base64,{payload}"


	def build_handbook_html(
	    globals_data: list[dict[str, Any]],
	    by_uni: dict[int, dict[str, Any]],
	    images: dict[str, Any],
	    allow_remote: bool,
	    include_inactive_programs: bool = False,
	    debug: bool = False,
	) -> str:
	    """Build the full handbook HTML document using Jinja2 templates.

	    Prepares the cover/TOC/header/label images, the table of contents, the
	    global sections, the optional summary block and per-university data, then
	    renders them through the ``handbook.html`` template.

	    Args:
	        globals_data: Global section dicts; each is read for ``section_key``,
	            ``section_title``, ``section_json`` and ``sort_order``.
	        by_uni: Mapping of university id to university dict; each is read for
	            ``is_active``, ``university_name``, ``sections``, ``website`` and
	            ``sort_order``. Inactive universities are skipped.
	        images: Image paths, read for ``coverImage``, ``tocImage``,
	            ``headerImage``, ``labelImage`` and ``bottomPages`` (a list).
	            Paths that are not existing files are ignored.
	        allow_remote: Forwarded to the university preparation helpers.
	        include_inactive_programs: Forwarded to the university preparation
	            helpers.
	        debug: Forwarded to the section renderers and to the template.

	    Returns:
	        The rendered HTML document as a string.

	    Raises:
	        RuntimeError: If any of the required global sections
	            (``table_of_contents``, ``overview``,
	            ``how_program_works_and_qualification_requirements``) is missing.
	    """
	    env = _get_jinja_env()
	    template = env.get_template("handbook.html")

	    font_meta = select_font_family()
	    font_css = font_face_css(font_meta)

	    # Base URL for static assets (CSS, images, etc.)
	    base_url = _static_base_url()

	    stats: dict[str, Any] = {
	        "universities": 0,
	        "images_embedded": 0,
	        "images_placeholder": 0,
	        "program_links_total": 0,
	        "program_missing_links_total": 0,
	        "missing_program_links": {},
	        "university_links": 0,
	        "website_rows": 0,
	    }

	    # ── Cover / TOC images: referenced by file:// URI ──
	    cover_image = _existing_file_uri(images.get("coverImage", ""))
	    toc_image = _existing_file_uri(images.get("tocImage", ""))

	    # ── Header image (repeating page header): embedded as a data URI ──
	    header_image = _existing_file_data_uri(images.get("headerImage", ""))

	    # ── Label image (repeating right-side label) ──
	    label_image = _existing_file_data_uri(images.get("labelImage", ""))
	    if not label_image:
	        # Fallback to remote URL when local file is unavailable.
	        # NOTE(review): this fallback is used even when allow_remote is
	        # False and points at what looks like a dev/testing host — confirm
	        # whether it should be gated or moved to configuration.
	        label_image = "https://finsapdev.qhtestingserver.com/MODEL_APIS/handbook/images/label.jpeg"

	    # ── Prepare active universities ──
	    active_universities: list[dict[str, Any]] = []
	    for uid, uni in by_uni.items():
	        if not isinstance(uni, dict):
	            continue
	        if not is_truthy(uni.get("is_active", True)):
	            continue
	        raw_name = uni.get("university_name")
	        name = f"University #{uid}" if raw_name is None else str(raw_name)
	        raw_sections = uni.get("sections")
	        raw_website = uni.get("website")
	        active_universities.append({
	            "id": int(uid),
	            "anchor": handbook_anchor("uni", name, int(uid)),
	            "name": name,
	            "sections": raw_sections if isinstance(raw_sections, list) else [],
	            # A null website must stay empty, not become the text "None".
	            "website": "" if raw_website is None else str(raw_website),
	            "sort_order": _coerce_sort_order(uni.get("sort_order")),
	        })

	    # ── Normalise globals ──
	    globals_data = sort_sections_stable(globals_data)

	    required_keys = [
	        "table_of_contents",
	        "overview",
	        "how_program_works_and_qualification_requirements",
	    ]
	    existing_keys = {
	        str(g.get("section_key", "")).lower()
	        for g in globals_data
	        if isinstance(g, dict)
	    }
	    missing = [k for k in required_keys if k not in existing_keys]
	    if missing:
	        msg = f"Handbook required sections missing: {','.join(missing)}"
	        logger.error(msg)
	        raise RuntimeError(msg)

	    general_sections: list[dict[str, Any]] = []
	    summary_block: dict[str, Any] | None = None
	    toc_sort_order = None
	    toc_title = "Table of Contents"

	    for idx, g in enumerate(globals_data):
	        if not isinstance(g, dict):
	            continue
	        key = str(g.get("section_key", "")).lower()
	        sort_order = _coerce_sort_order(g.get("sort_order"))

	        # Only the first table_of_contents entry supplies the TOC
	        # title/position; later ones fall through as general sections.
	        if key == "table_of_contents" and toc_sort_order is None:
	            toc_sort_order = sort_order if sort_order is not None else (idx + 1)
	            toc_title = str(g.get("section_title", "Table of Contents"))
	            continue

	        if key == "summary_of_universities":
	            summary_block = {
	                "anchor": handbook_anchor("summary", "summary-of-universities", idx),
	                "data": g,
	                "sort_order": sort_order,
	            }
	            continue

	        anchor = handbook_anchor(
	            "g", str(g.get("section_title", g.get("section_key", "section"))), idx
	        )
	        general_sections.append({
	            "anchor": anchor,
	            "data": g,
	            "sort_order": sort_order,
	        })

	    # ── Build TOC items ──
	    toc_items: list[dict[str, Any]] = []
	    for gs in general_sections:
	        title = str(gs["data"].get("section_title", gs["data"].get("section_key", "Section")))
	        toc_items.append({
	            "title": title,
	            "target": "#" + gs["anchor"],
	            "level": 0,
	            "bold": True,
	            "sort": gs["sort_order"],
	        })

	    if summary_block:
	        title = str(summary_block["data"].get("section_title", "Summary of Universities"))
	        toc_items.append({
	            "title": title,
	            "target": "#" + summary_block["anchor"],
	            "level": 0,
	            "bold": True,
	            "sort": summary_block["sort_order"],
	        })

	    for u in active_universities:
	        toc_items.append({
	            "title": u["name"],
	            "target": "#" + u["anchor"],
	            "level": 1,
	            "bold": False,
	            "sort": u.get("sort_order"),
	        })

	    # ── Prepare sorted TOC items for template ──
	    sorted_toc = sort_toc(list(toc_items))
	    toc_items_sorted = []
	    for e in sorted_toc:
	        if not isinstance(e, dict):
	            continue
	        title = str(e.get("title", "")).strip()
	        if not title:
	            continue
	        level = max(0, min(3, int(e.get("level", 0))))
	        bold = bool(e.get("bold", False))
	        upper = bool(e.get("upper", False))
	        # Top-level entries are always bold and upper-cased.
	        if level == 0:
	            bold = True
	            upper = True
	        display_title = title.upper() if upper else title
	        page = str(e.get("page", "")).strip()

	        toc_items_sorted.append({
	            "title": title,
	            "display_title": display_title,
	            "target": str(e.get("target", e.get("anchor", ""))).strip(),
	            "level": level,
	            "bold": bold,
	            "upper": upper,
	            "page": page,
	        })

	    # ── Prepare general sections with rendered HTML and typed blocks ──
	    template_sections = []
	    for gs in general_sections:
	        data = gs["data"]
	        key_lower = str(data.get("section_key", "")).lower()

	        sec_class = SECTION_CLASS_MAP.get(key_lower)
	        if sec_class is None:
	            sec_class = "sec-" + re.sub(r"[^a-z0-9]+", "-", key_lower)

	        section_json = data.get("section_json", {})
	        if not isinstance(section_json, dict):
	            section_json = {}

	        # Typed blocks for the new rendering path
	        blocks = normalize_section(
	            str(data.get("section_key", "")),
	            str(data.get("section_title", "")),
	            section_json,
	            debug=debug,
	        )

	        # Legacy HTML fallback
	        section_html = render_global_blocks(
	            str(data.get("section_key", "")),
	            str(data.get("section_title", "")),
	            section_json,
	            debug,
	        )

	        if not section_html.strip() and not blocks:
	            logger.warning(
	                "Empty section render key=%s sort_order=%s",
	                data.get("section_key"),
	                data.get("sort_order"),
	            )

	        template_sections.append({
	            "anchor": gs["anchor"],
	            "data": data,
	            "page_break": key_lower in PAGE_BREAK_KEYS,
	            "sec_class": sec_class,
	            "blocks": blocks,
	            "rendered_html": Markup(section_html),
	        })

	    # ── Prepare summary block ──
	    summary_template = None
	    if summary_block:
	        data = summary_block["data"]
	        section_json = data.get("section_json", {})
	        if not isinstance(section_json, dict):
	            section_json = {}

	        # Typed blocks for summary
	        summary_blocks = normalize_section(
	            str(data.get("section_key", "")),
	            str(data.get("section_title", "")),
	            section_json,
	            universities=active_universities,
	            debug=debug,
	        )

	        summary_html = render_global_blocks(
	            str(data.get("section_key", "")),
	            str(data.get("section_title", "")),
	            section_json,
	            debug,
	            universities=active_universities,
	        )

	        summary_template = {
	            "anchor": summary_block["anchor"],
	            "data": data,
	            "blocks": summary_blocks,
	            "rendered_html": Markup(summary_html),
	        }

	    # ── Prepare university data for templates (both old + new paths) ──
	    university_template_data = []
	    university_block_data = []
	    for idx, uni_raw in enumerate(active_universities):
	        uni_raw["_is_first"] = (idx == 0)
	        # Legacy path
	        uni_data = _prepare_university_data(
	            uni_raw, allow_remote, include_inactive_programs, debug, stats,
	        )
	        university_template_data.append(uni_data)
	        # New block path
	        uni_block = normalize_university(
	            uni_raw, allow_remote, include_inactive_programs, debug, stats,
	        )
	        university_block_data.append(uni_block)

	    # ── Bottom pages ──
	    bottom_pages_urls = []
	    raw_bottom = images.get("bottomPages", [])
	    if isinstance(raw_bottom, list):
	        for img_path in raw_bottom:
	            page_uri = _existing_file_uri(img_path)
	            if page_uri:
	                bottom_pages_urls.append(page_uri)

	    # ── Render template ──
	    html = template.render(
	        font_css=Markup(font_css),
	        base_url=base_url,
	        extra_css="",
	        header_image=header_image,
	        label_image=label_image,
	        cover_image=cover_image,
	        toc_image=toc_image,
	        toc_items=toc_items,
	        toc_items_sorted=toc_items_sorted,
	        toc_title=toc_title,
	        toc_sort_order=toc_sort_order,
	        general_sections=template_sections,
	        summary_block=summary_template,
	        universities=university_template_data,
	        university_blocks=university_block_data,
	        bottom_pages=bottom_pages_urls,
	        debug=debug,
	        stats=stats,
	    )

	    return html