Spaces:

internationalscholarsprogram
/

handbook-engine

Sleeping

App Files Files Community

handbook-engine / app /services /html_builder.py

internationalscholarsprogram

Initial deploy: ISP Handbook PDF engine

2deab8c verified about 1 month ago

raw

history blame contribute delete

23.2 kB

	"""HTML builder — assembles the full ISP Handbook HTML document.

	Uses Jinja2 templates for HTML generation. Data preparation logic is
	preserved from the original string-concatenation approach. The output
	is a self-contained HTML suitable for Playwright Chromium PDF export.
	"""

	from __future__ import annotations

	import base64
	import logging
	import mimetypes
	import os
	import re
	from pathlib import Path
	from typing import Any

	from jinja2 import Environment, FileSystemLoader, select_autoescape
	from markupsafe import Markup

	from app.core.config import get_settings
	from app.core.fonts import font_face_css, select_font_family
	from app.services.normalizer import normalize_section, normalize_university
	from app.services.renderers import (
	fetch_image_data_uri,
	render_global_blocks,
	sort_toc,
	_extract_university_funding,
	)
	from app.services.utils import (
	format_money_figures,
	get_any,
	h,
	handbook_anchor,
	hb_slug,
	is_truthy,
	sort_sections_stable,
	)

	logger = logging.getLogger(__name__)

	# Jinja2 environment — templates live alongside the app package
	_TEMPLATES_DIR = Path(__file__).resolve().parent.parent / "templates"


	def _get_jinja_env() -> Environment:
	    """Build the Jinja2 environment used to render the handbook templates.

	    Templates are loaded from ``_TEMPLATES_DIR``. Autoescaping is selected
	    for ``html`` templates, and ``trim_blocks`` / ``lstrip_blocks`` are
	    enabled so block tags do not leave stray whitespace in the output.
	    """
	    template_loader = FileSystemLoader(str(_TEMPLATES_DIR))
	    return Environment(
	        loader=template_loader,
	        autoescape=select_autoescape(["html"]),
	        trim_blocks=True,
	        lstrip_blocks=True,
	    )


	def _static_base_url() -> str:
	    """Return the ``file://`` URL of the ``static`` directory.

	    The directory is resolved relative to this module's file: it is the
	    ``static`` folder that sits beside this module's parent directory.
	    """
	    module_dir = Path(__file__).resolve().parent
	    return (module_dir.parent / "static").as_uri()


	def _unused_pdf_override_css(font_stack: str) -> str:
	    """Return an empty string; retained only as a legacy placeholder.

	    Per the original note, the inline PDF override CSS has been superseded
	    by ``static/css/print.css`` for Chromium rendering, so ``font_stack``
	    is accepted but ignored.
	    """
	    return ""


	# Maps a lower-cased global section key to the CSS class applied to that
	# section's wrapper. Keys without an entry get a generated "sec-<key>" class
	# in build_handbook_html.
	SECTION_CLASS_MAP: dict[str, str] = {
	    "overview": "sec-overview",
	    "how_the_program_works": "sec-how",
	    "qualification_requirements": "sec-qualification",
	    "enrolment_steps": "sec-steps",
	    "withdrawal_refund_policy": "sec-policy",
	    "refund_guidelines": "sec-refund",
	    "program_contributions": "sec-contributions",
	    "program_features_breakdown": "sec-breakdown",
	    "funding_options_available": "sec-funding",
	    "summary_of_universities": "sec-summary",
	    "summary_of_universities_cosigner": "sec-summary-cosigner",
	}

	# Section keys flagged with page_break=True for the template. This is
	# currently exactly the set of keys that have a dedicated CSS class above;
	# spell the keys out explicitly here if the two ever need to diverge.
	PAGE_BREAK_KEYS: set[str] = set(SECTION_CLASS_MAP)


	# Whole-word, case-insensitive matchers for the two program option names.
	_REGULAR_WORD_RE = re.compile(r"\bREGULAR\b", flags=re.IGNORECASE)
	_PRIME_WORD_RE = re.compile(r"\bPRIME\b", flags=re.IGNORECASE)


	def _collect_program_option_inconsistencies(value: Any, path: str, hits: list[str]) -> None:
	    """Record the paths of leaf values that mention only one of REGULAR / PRIME.

	    Dicts and lists are walked recursively; every other non-``None`` value
	    is converted with ``str()`` and searched for the whole words "REGULAR"
	    and "PRIME" (case-insensitive). When exactly one of the two is present
	    the leaf's path is appended to ``hits``.

	    Args:
	        value: Arbitrary nested data (dict / list / scalar / ``None``).
	        path: Path of ``value`` so far. Dict keys are joined with ``.``
	            (no leading dot when ``path`` is empty); list indices are
	            appended as ``[i]``.
	        hits: Output list, extended in place.
	    """
	    if value is None:
	        return

	    if isinstance(value, dict):
	        for key, child in value.items():
	            child_path = f"{path}.{key}" if path else str(key)
	            _collect_program_option_inconsistencies(child, child_path, hits)
	        return

	    if isinstance(value, list):
	        for index, child in enumerate(value):
	            _collect_program_option_inconsistencies(child, f"{path}[{index}]", hits)
	        return

	    text = str(value)
	    mentions_regular = _REGULAR_WORD_RE.search(text) is not None
	    mentions_prime = _PRIME_WORD_RE.search(text) is not None
	    # Exactly one of the two present -> inconsistent.
	    if mentions_regular != mentions_prime:
	        hits.append(path)


	def _as_text(value: Any) -> str:
	    """Return ``str(value)``, except that ``None`` becomes ``""`` rather than ``"None"``."""
	    return "" if value is None else str(value)


	def _build_section_map(sections: Any) -> dict[str, dict]:
	    """Index section dicts by ``section_key``, merging duplicate "programs" sections.

	    Non-dict entries and entries with an empty key are skipped. For any
	    duplicated key other than "programs" the last section wins. When
	    "programs" appears more than once, the ``programs`` lists of the
	    sections are concatenated in order; the merged result is stored in
	    fresh dicts so the caller's section objects are never modified.
	    """
	    sec_map: dict[str, dict] = {}
	    for section in sections:
	        if not isinstance(section, dict):
	            continue
	        key = str(section.get("section_key", ""))
	        if not key:
	            continue

	        if key == "programs" and key in sec_map:
	            first = sec_map["programs"]
	            existing = first.get("section_json", {})
	            incoming = section.get("section_json", {})
	            if not isinstance(existing, dict):
	                existing = {}
	            if not isinstance(incoming, dict):
	                incoming = {}
	            current_programs = existing.get("programs", [])
	            extra_programs = incoming.get("programs", [])
	            if not isinstance(current_programs, list):
	                current_programs = []
	            if not isinstance(extra_programs, list):
	                extra_programs = []

	            # Copy instead of writing into the input dicts: the same raw
	            # sections are reused by the caller after this function runs.
	            merged_json = dict(existing)
	            merged_json["programs"] = current_programs + extra_programs
	            merged_section = dict(first)
	            merged_section["section_json"] = merged_json
	            sec_map["programs"] = merged_section
	            continue

	        sec_map[key] = section
	    return sec_map


	def _prepare_university_data(
	    uni_raw: dict[str, Any],
	    allow_remote: bool,
	    include_inactive_programs: bool,
	    debug: bool,
	    stats: dict[str, Any],
	) -> dict[str, Any]:
	    """Prepare a single university's template data.

	    Extracts overview, campus image, benefits/funding, programs, and extra
	    sections from the raw sections list so that the Jinja2 template only
	    has to handle the HTML.

	    Args:
	        uni_raw: University record. ``name`` is required; ``sections``,
	            ``website``, ``anchor``, ``sort_order``, ``school_category``,
	            ``is_active`` and ``_is_first`` are read when present.
	        allow_remote: When true, the campus image URL is passed to
	            ``fetch_image_data_uri`` and a truthy result is embedded.
	        include_inactive_programs: When false, programs whose
	            ``program_active`` / ``is_active`` / ``active`` flag is not
	            truthy are dropped (a missing flag counts as active).
	        debug: Forwarded to ``render_global_blocks`` for extra sections.
	        stats: Counter dict updated in place (``universities``,
	            ``images_embedded``, ``images_placeholder``,
	            ``university_links``, ``website_rows``).

	    Returns:
	        Dict of template variables: ``name``, ``anchor``, ``sort_order``,
	        ``website``, ``classes``, ``overview``, ``campus_image``,
	        ``campus_caption``, ``benefits``, ``funding_heading``,
	        ``funding_items``, ``programs`` and ``extra_sections``.
	        ``overview`` and ``programs`` are ``None`` when the corresponding
	        section is absent.

	    Raises:
	        KeyError: If ``uni_raw`` has no ``name`` key.
	    """
	    uni_name = uni_raw["name"]
	    sections = uni_raw.get("sections") or []
	    is_first = uni_raw.get("_is_first", False)

	    stats["universities"] = stats.get("universities", 0) + 1

	    # Build section map; duplicate "programs" sections are merged.
	    sec_map = _build_section_map(sections)

	    # Campus image
	    img_section = sec_map.get("campus_image") or sec_map.get("image")
	    campus_image = ""
	    campus_caption = ""
	    if img_section:
	        image_json = img_section.get("section_json", {})
	        if isinstance(image_json, dict):
	            # _as_text keeps JSON nulls from turning into the text "None".
	            campus_url = _as_text(image_json.get("image_url")).strip()
	            campus_caption = _as_text(image_json.get("caption")).strip()
	            embedded = None
	            if allow_remote and campus_url:
	                embedded = fetch_image_data_uri(campus_url)
	            if embedded:
	                campus_image = embedded
	                stats["images_embedded"] = stats.get("images_embedded", 0) + 1
	            else:
	                stats["images_placeholder"] = stats.get("images_placeholder", 0) + 1

	    # Overview and website
	    resolved_website = str(uni_raw.get("website") or "").strip()
	    overview_data = None

	    if "overview" in sec_map:
	        overview_json = sec_map["overview"].get("section_json", {})
	        if not isinstance(overview_json, dict):
	            overview_json = {}

	        site_from_overview = get_any(
	            overview_json,
	            ["university_website", "university_website_url", "website", "site", "url", "homepage", "web_url"],
	        )
	        # The record-level website wins; the overview only fills a gap.
	        if not resolved_website and site_from_overview:
	            resolved_website = site_from_overview

	        overview_data = {
	            "founded": get_any(overview_json, ["founded", "Founded"]),
	            "total_students": get_any(overview_json, ["total_students", "Total Students"]),
	            "undergraduates": get_any(overview_json, ["undergraduates", "Undergraduate Students", "undergraduate_students"]),
	            "postgraduates": get_any(overview_json, ["postgraduate_students", "Postgraduate Students"]),
	            "acceptance_rate": get_any(overview_json, ["acceptance_rate", "Acceptance Rate"]),
	            "location": get_any(overview_json, ["location", "Location"]),
	            "tuition": format_money_figures(str(get_any(overview_json, [
	                "tuition_out_of_state_yearly",
	                "Yearly Out of State Tuition Fees",
	                "Yearly Out-of-State Tuition Fees",
	                "Yearly Tuition Fees",
	                "Yearly Out-of-State Tuition Fees:",
	            ]) or "")) or None,
	        }

	    if resolved_website:
	        stats["university_links"] = stats.get("university_links", 0) + 1
	        stats["website_rows"] = stats.get("website_rows", 0) + 1

	    # Benefits + Funding
	    benefits: list[str] = []
	    funding_heading = "Funding Available"
	    funding_items: list[str] = []

	    if "benefits" in sec_map:
	        benefits_json = sec_map["benefits"].get("section_json", {})
	        if not isinstance(benefits_json, dict):
	            benefits_json = {}

	        raw_benefits = benefits_json.get("benefits", [])
	        if isinstance(raw_benefits, list):
	            cleaned = (_as_text(item).strip() for item in raw_benefits)
	            benefits = [item for item in cleaned if item]

	        funding_heading, funding_items = _extract_university_funding(
	            benefits_json,
	            {
	                "school_category": uni_raw.get("school_category"),
	                "status": "in" if is_truthy(uni_raw.get("is_active", True)) else "out",
	            },
	        )

	    # Programs
	    programs = None
	    if "programs" in sec_map:
	        programs_json = sec_map["programs"].get("section_json", {})
	        if not isinstance(programs_json, dict):
	            programs_json = {}
	        programs_raw = programs_json.get("programs", [])
	        if not isinstance(programs_raw, list):
	            programs_raw = []

	        if not include_inactive_programs:
	            programs_raw = [
	                p for p in programs_raw
	                if isinstance(p, dict) and is_truthy(
	                    p.get("program_active", p.get("is_active", p.get("active", 1)))
	                )
	            ]

	        programs = []
	        seen_names: set[str] = set()
	        for p in programs_raw:
	            if not isinstance(p, dict):
	                continue
	            program_name = _as_text(p.get("program_name")).strip()
	            # Deduplicate by lowercase program name (first occurrence wins).
	            dedupe_key = program_name.lower()
	            if dedupe_key in seen_names:
	                continue
	            seen_names.add(dedupe_key)

	            # A null program_link must not block the program_links fallback.
	            link = _as_text(p.get("program_link")).strip()
	            alt_links = p.get("program_links")
	            if not link and isinstance(alt_links, dict):
	                link = _as_text(alt_links.get("web_link")).strip()

	            entrance = p.get("entrance_exam")
	            if entrance is None:
	                entrance = p.get("entrance_examination")

	            programs.append({
	                "name": program_name,
	                "link": link,
	                "designation": _as_text(p.get("designation")),
	                "entrance": _as_text(entrance),
	            })

	    # Extra sections: everything not consumed above, in original order.
	    skip_keys = {"campus_image", "image", "overview", "benefits", "programs"}
	    extra_sections = []
	    for s in sections:
	        if not isinstance(s, dict):
	            continue
	        k = str(s.get("section_key", ""))
	        if not k or k in skip_keys:
	            continue
	        title = _as_text(s.get("section_title"))
	        extra_json = s.get("section_json", {})
	        if not isinstance(extra_json, dict):
	            extra_json = {}
	        rendered = render_global_blocks(k, title, extra_json, debug)
	        # Wrapped in Markup so the autoescaping template emits it verbatim.
	        extra_sections.append({"rendered_html": Markup(rendered)})

	    # Every university except the first carries the "page-break" class.
	    classes = ["uni"]
	    if not is_first:
	        classes.append("page-break")

	    return {
	        "name": uni_name,
	        "anchor": uni_raw.get("anchor"),
	        "sort_order": uni_raw.get("sort_order"),
	        "website": resolved_website,
	        "classes": classes,
	        "overview": overview_data,
	        "campus_image": campus_image,
	        "campus_caption": campus_caption,
	        "benefits": benefits,
	        "funding_heading": funding_heading,
	        "funding_items": funding_items,
	        "programs": programs,
	        "extra_sections": extra_sections,
	    }


	def _parse_sort_order(value: Any) -> int | None:
	    """Return ``value`` as an int when it looks like a (possibly negative) integer.

	    Anything else — ``None``, floats, free text, or malformed input such
	    as ``"--5"`` — yields ``None`` instead of raising.
	    """
	    if value is None:
	        return None
	    text = str(value)
	    if not text.lstrip("-").isdigit():
	        return None
	    try:
	        return int(text)
	    except ValueError:
	        # Passed the digit check but is still not a valid int literal
	        # (e.g. repeated minus signs).
	        return None


	def _first_present(*candidates: Any) -> str:
	    """Return ``str()`` of the first candidate that is not ``None`` (``""`` if none)."""
	    for candidate in candidates:
	        if candidate is not None:
	            return str(candidate)
	    return ""


	def _local_file_uri(path: Any) -> str:
	    """Return a ``file://`` URI for ``path`` if it is an existing file, else ``""``.

	    The path is made absolute first because ``Path.as_uri()`` raises
	    ``ValueError`` for relative paths.
	    """
	    if path and os.path.isfile(path):
	        return Path(path).absolute().as_uri()
	    return ""


	def _local_file_data_uri(path: Any, default_mime: str) -> str:
	    """Return ``path``'s contents as a base64 ``data:`` URI, or ``""`` if it is not a file.

	    The MIME type is guessed from the file name; ``default_mime`` is used
	    when it cannot be guessed.
	    """
	    if not (path and os.path.isfile(path)):
	        return ""
	    mime = mimetypes.guess_type(path)[0] or default_mime
	    with open(path, "rb") as fh:
	        encoded = base64.b64encode(fh.read()).decode()
	    return f"data:{mime};base64,{encoded}"


	def _extend_unique(target: list[str], items: list[str]) -> None:
	    """Append each item of ``items`` to ``target`` unless it is already there."""
	    for item in items:
	        if item not in target:
	            target.append(item)


	def build_handbook_html(
	    globals_data: list[dict[str, Any]],
	    by_uni: dict[int, dict[str, Any]],
	    images: dict[str, Any],
	    allow_remote: bool,
	    include_inactive_programs: bool = False,
	    debug: bool = False,
	) -> str:
	    """Build the full handbook HTML document using Jinja2 templates.

	    Prepares cover/TOC/header/label images, the active universities
	    (ordered by tier, then name, then id), the global sections and the
	    table of contents, then renders ``handbook.html``.

	    Args:
	        globals_data: Global section records (``section_key``,
	            ``section_title``, ``section_json``, ``sort_order``).
	        by_uni: University records keyed by id. Records whose
	            ``is_active`` is not truthy are skipped.
	        images: Local image paths under ``coverImage``, ``tocImage``,
	            ``headerImage``, ``labelImage`` and ``bottomPages`` (a list).
	            Paths that are not existing files are ignored.
	        allow_remote: Forwarded to the university preparation helpers.
	        include_inactive_programs: Forwarded to the university
	            preparation helpers.
	        debug: Forwarded to the section renderers and to the template.

	    Returns:
	        The rendered HTML document.

	    Raises:
	        RuntimeError: If any of the required global sections
	            (``table_of_contents``, ``overview``,
	            ``how_the_program_works``) is missing.
	    """
	    env = _get_jinja_env()
	    template = env.get_template("handbook.html")

	    font_meta = select_font_family()
	    font_css = font_face_css(font_meta)

	    # Base URL for static assets (CSS, images, etc.)
	    base_url = _static_base_url()

	    stats: dict[str, Any] = {
	        "universities": 0,
	        "images_embedded": 0,
	        "images_placeholder": 0,
	        "program_links_total": 0,
	        "program_missing_links_total": 0,
	        "missing_program_links": {},
	        "university_links": 0,
	        "website_rows": 0,
	        "program_option_warnings": [],
	    }

	    # ── Cover / TOC images: referenced by file:// URL ──
	    cover_image = _local_file_uri(images.get("coverImage", ""))
	    toc_image = _local_file_uri(images.get("tocImage", ""))

	    # ── Header image (repeating page header): embedded as a data URI ──
	    header_image = _local_file_data_uri(images.get("headerImage", ""), "image/jpeg")

	    # ── Label image (repeating right-side label): embedded as a data URI ──
	    label_path = images.get("labelImage", "")
	    label_image = _local_file_data_uri(label_path, "image/png")
	    if not label_image:
	        logger.warning("Label image not found locally: %s", label_path)

	    # ── Prepare active universities (sorted: Tier One first, Tier Two second) ──
	    active_universities: list[dict[str, Any]] = []
	    for uid, uni in by_uni.items():
	        if not isinstance(uni, dict):
	            continue
	        if not is_truthy(uni.get("is_active", True)):
	            continue

	        # A null or blank name falls back to a placeholder instead of
	        # rendering the text "None".
	        raw_name = uni.get("university_name")
	        if raw_name is not None and str(raw_name).strip():
	            name = str(raw_name)
	        else:
	            name = f"University #{uid}"

	        sections = uni.get("sections")
	        if not isinstance(sections, list):
	            sections = []

	        active_universities.append({
	            "id": int(uid),
	            "anchor": handbook_anchor("uni", name, int(uid)),
	            "name": name,
	            "sections": sections,
	            # "or ''" keeps null values from becoming the text "None".
	            "website": str(uni.get("website") or ""),
	            "sort_order": _parse_sort_order(uni.get("sort_order")),
	            "school_category": str(uni.get("school_category") or "").strip(),
	            "tier": uni.get("tier"),
	            "tier_label": str(uni.get("tier_label") or "").strip(),
	        })

	    # Stable tier ordering: Tier One (non_cosigner) → Tier Two (cosigner) → others, then alphabetical
	    def _tier_sort(u: dict) -> tuple:
	        t = u.get("tier")
	        rank = t if isinstance(t, int) else 99
	        return (rank, (u.get("name") or "").lower(), u.get("id", 0))

	    active_universities.sort(key=_tier_sort)

	    # ── Normalise globals ──
	    globals_data = sort_sections_stable(globals_data)

	    required_keys = [
	        "table_of_contents",
	        "overview",
	        "how_the_program_works",
	    ]
	    existing_keys = {str(g.get("section_key", "")).lower() for g in globals_data if isinstance(g, dict)}
	    missing = [k for k in required_keys if k not in existing_keys]
	    if missing:
	        msg = f"Handbook required sections missing: {','.join(missing)}"
	        logger.error(msg)
	        raise RuntimeError(msg)

	    general_sections: list[dict[str, Any]] = []
	    toc_sort_order = None
	    toc_title = "Table of Contents"

	    for idx, g in enumerate(globals_data):
	        if not isinstance(g, dict):
	            continue
	        key_raw = str(g.get("section_key", ""))
	        key = key_raw.lower()
	        sort_order = _parse_sort_order(g.get("sort_order"))

	        # Only the first table_of_contents record configures the TOC; any
	        # later one is treated like an ordinary section.
	        if key == "table_of_contents" and toc_sort_order is None:
	            toc_sort_order = sort_order if sort_order is not None else (idx + 1)
	            toc_title = _first_present(g.get("section_title"), "Table of Contents")
	            continue

	        section_hits: list[str] = []
	        _collect_program_option_inconsistencies(
	            g.get("section_json", {}),
	            f"global.{key_raw}",
	            section_hits,
	        )
	        _extend_unique(stats["program_option_warnings"], section_hits)

	        anchor = handbook_anchor(
	            "g",
	            _first_present(g.get("section_title"), g.get("section_key"), "section"),
	            idx,
	        )
	        general_sections.append({
	            "anchor": anchor,
	            "data": g,
	            "sort_order": sort_order,
	        })

	    # ── Build TOC items ──
	    toc_items: list[dict[str, Any]] = []
	    for gs in general_sections:
	        # Prefer the JSON-level title (display-ready) over the DB section_title
	        gs_json = gs["data"].get("section_json", {})
	        json_title = gs_json.get("title") if isinstance(gs_json, dict) else None
	        # isinstance guard: a null or non-string title must not crash on .strip().
	        if isinstance(json_title, str) and json_title.strip():
	            title = json_title.strip()
	        else:
	            title = _first_present(
	                gs["data"].get("section_title"),
	                gs["data"].get("section_key"),
	                "Section",
	            )
	        toc_items.append({
	            "title": title,
	            "target": "#" + gs["anchor"],
	            "level": 0,
	            "bold": True,
	            "sort": gs["sort_order"],
	        })

	    for u in active_universities:
	        toc_items.append({
	            "title": u["name"],
	            "target": "#" + u["anchor"],
	            "level": 1,
	            "bold": False,
	            "sort": u.get("sort_order"),
	        })

	    # ── Prepare sorted TOC items for template ──
	    sorted_toc = sort_toc(list(toc_items))
	    toc_items_sorted = []
	    for e in sorted_toc:
	        if not isinstance(e, dict):
	            continue
	        title = str(e.get("title", "")).strip()
	        if not title:
	            continue
	        level = max(0, min(3, int(e.get("level", 0))))
	        bold = bool(e.get("bold", False))
	        upper = bool(e.get("upper", False))
	        # Top-level entries are always bold and upper-cased.
	        if level == 0:
	            bold = True
	            upper = True
	        display_title = title.upper() if upper else title
	        page = str(e.get("page", "")).strip()

	        toc_items_sorted.append({
	            "title": title,
	            "display_title": display_title,
	            "target": str(e.get("target", e.get("anchor", ""))).strip(),
	            "level": level,
	            "bold": bold,
	            "upper": upper,
	            "page": page,
	        })

	    # ── Prepare general sections with rendered HTML and typed blocks ──
	    template_sections = []
	    for gs in general_sections:
	        data = gs["data"]
	        section_key = str(data.get("section_key", ""))
	        section_title = _first_present(data.get("section_title"))
	        key_lower = section_key.lower()

	        sec_class = SECTION_CLASS_MAP.get(key_lower)
	        if sec_class is None:
	            sec_class = "sec-" + re.sub(r"[^a-z0-9]+", "-", key_lower)

	        section_json = data.get("section_json", {})
	        if not isinstance(section_json, dict):
	            section_json = {}

	        # Typed blocks for the new rendering path
	        blocks = normalize_section(
	            section_key,
	            section_title,
	            section_json,
	            debug=debug,
	        )

	        # Legacy HTML fallback
	        section_html = render_global_blocks(
	            section_key,
	            section_title,
	            section_json,
	            debug,
	        )

	        if not section_html.strip() and not blocks:
	            logger.warning(
	                "Empty section render key=%s sort_order=%s",
	                data.get("section_key"),
	                data.get("sort_order"),
	            )

	        template_sections.append({
	            "anchor": gs["anchor"],
	            "data": data,
	            "page_break": key_lower in PAGE_BREAK_KEYS,
	            "sec_class": sec_class,
	            "blocks": blocks,
	            "rendered_html": Markup(section_html),
	        })

	    # ── Prepare university data for templates (both old + new paths) ──
	    university_template_data = []
	    university_block_data = []
	    # Tier labels already seen, so each tier heading is emitted only once.
	    seen_tier_labels: set[str] = set()

	    for idx, uni_raw in enumerate(active_universities):
	        uni_raw["_is_first"] = (idx == 0)

	        # Mark the first university of each tier label as a group start.
	        current_tier_label = str(uni_raw.get("tier_label", "")).strip()
	        if current_tier_label and current_tier_label not in seen_tier_labels:
	            seen_tier_labels.add(current_tier_label)
	            uni_raw["_tier_group_start"] = True
	            uni_raw["_tier_group_label"] = f"{current_tier_label} Schools"

	        uni_hits: list[str] = []
	        _collect_program_option_inconsistencies(
	            uni_raw.get("sections", []),
	            f"university.{uni_raw.get('name', idx)}",
	            uni_hits,
	        )
	        _extend_unique(stats["program_option_warnings"], uni_hits)

	        # Legacy path
	        uni_data = _prepare_university_data(
	            uni_raw, allow_remote, include_inactive_programs, debug, stats,
	        )
	        # Carry tier metadata to template data
	        uni_data["tier"] = uni_raw.get("tier")
	        uni_data["tier_label"] = uni_raw.get("tier_label", "")
	        uni_data["tier_group_start"] = uni_raw.get("_tier_group_start", False)
	        uni_data["tier_group_label"] = uni_raw.get("_tier_group_label", "")
	        university_template_data.append(uni_data)

	        # New block path
	        uni_block = normalize_university(
	            uni_raw, allow_remote, include_inactive_programs, debug, stats,
	        )
	        university_block_data.append(uni_block)

	    # ── Bottom pages ──
	    bottom_pages_urls = []
	    raw_bottom = images.get("bottomPages", [])
	    if isinstance(raw_bottom, list):
	        for img_path in raw_bottom:
	            page_uri = _local_file_uri(str(img_path))
	            if page_uri:
	                bottom_pages_urls.append(page_uri)

	    # ── Render template ──
	    if stats["program_option_warnings"]:
	        logger.warning(
	            "Program option consistency warnings (missing REGULAR or PRIME pair): %s",
	            stats["program_option_warnings"],
	        )

	    html = template.render(
	        font_css=Markup(font_css),
	        base_url=base_url,
	        extra_css="",
	        header_image=header_image,
	        label_image=label_image,
	        cover_image=cover_image,
	        toc_image=toc_image,
	        toc_items=toc_items,
	        toc_items_sorted=toc_items_sorted,
	        toc_title=toc_title,
	        toc_sort_order=toc_sort_order,
	        general_sections=template_sections,
	        summary_block=None,
	        universities=university_template_data,
	        university_blocks=university_block_data,
	        bottom_pages=bottom_pages_urls,
	        debug=debug,
	        stats=stats,
	    )

	    return html