Spaces:

internationalscholarsprogram
/

handbook_engine

Running

File size: 21,787 Bytes

"""HTML builder — assembles the full ISP Handbook HTML document.

Uses Jinja2 templates for HTML generation. Data preparation logic is
preserved from the original string-concatenation approach. The output
is a self-contained HTML suitable for Playwright Chromium PDF export.
"""

from __future__ import annotations

import base64
import logging
import mimetypes
import os
import re
from pathlib import Path
from typing import Any

from jinja2 import Environment, FileSystemLoader, select_autoescape
from markupsafe import Markup

from app.core.config import get_settings
from app.core.fonts import font_face_css, select_font_family
from app.services.normalizer import normalize_section, normalize_university
from app.services.renderers import (
    fetch_image_data_uri,
    render_global_blocks,
    sort_toc,
)
from app.services.utils import (
    format_money_figures,
    get_any,
    h,
    handbook_anchor,
    hb_slug,
    is_truthy,
    sort_sections_stable,
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Directory holding the Jinja2 templates: "<parent of this file's directory>/templates".
_TEMPLATES_DIR = Path(__file__).resolve().parent.parent.joinpath("templates")


def _get_jinja_env() -> Environment:
    """Build a fresh Jinja2 environment for the handbook templates.

    Templates are loaded from ``_TEMPLATES_DIR``. Autoescaping is configured
    through ``select_autoescape(["html"])``, and both ``trim_blocks`` and
    ``lstrip_blocks`` are switched on.
    """
    template_loader = FileSystemLoader(str(_TEMPLATES_DIR))
    return Environment(
        loader=template_loader,
        autoescape=select_autoescape(["html"]),
        trim_blocks=True,
        lstrip_blocks=True,
    )


def _static_base_url() -> str:
    """Return the ``file://`` URL of the ``static`` directory.

    The directory is located two levels above this file (the parent of the
    directory containing this module), mirroring how the templates
    directory is located.
    """
    module_path = Path(__file__).resolve()
    static_root = module_path.parent.parent.joinpath("static")
    return static_root.as_uri()


def _unused_pdf_override_css(font_stack: str) -> str:
    """Return an empty string; stub left over from the inline PDF override CSS.

    ``font_stack`` is accepted but ignored. Per the original note, all
    styling now lives in static/css/print.css for Chromium rendering.
    """
    no_override_css = ""
    return no_override_css


# Section class map: lowercase global ``section_key`` -> CSS class placed on
# that section. build_handbook_html falls back to "sec-<slugified key>" for
# keys that are not listed here.
SECTION_CLASS_MAP: dict[str, str] = {
    "overview": "sec-overview",
    "how_program_works_and_qualification_requirements": "sec-qualification",
    "enrolment_steps": "sec-steps",
    "withdrawal_late_payment_refund_policy": "sec-policy",
    "refund_guidelines": "sec-refund",
    "program_contributions": "sec-contributions",
    "funding_options_available": "sec-funding",
    "summary_of_universities": "sec-summary",
}

# Lowercase global section keys whose template entry gets ``page_break=True``.
# NOTE: currently the same keys as SECTION_CLASS_MAP, but maintained
# separately so the two settings can diverge.
PAGE_BREAK_KEYS: set[str] = {
    "overview",
    "how_program_works_and_qualification_requirements",
    "enrolment_steps",
    "withdrawal_late_payment_refund_policy",
    "refund_guidelines",
    "program_contributions",
    "funding_options_available",
    "summary_of_universities",
}


def _as_text(value: Any) -> str:
    """Coerce *value* to ``str``, mapping ``None`` to an empty string.

    A plain ``str(None)`` would leak the literal text "None" into the
    rendered handbook whenever a JSON field is null.
    """
    return "" if value is None else str(value)


def _bump(stats: dict[str, Any], key: str) -> None:
    """Increment the counter ``stats[key]``, treating a missing key as 0."""
    stats[key] = stats.get(key, 0) + 1


def _section_json(section: dict[str, Any]) -> dict[str, Any]:
    """Return ``section["section_json"]`` when it is a dict, else ``{}``."""
    payload = section.get("section_json", {})
    return payload if isinstance(payload, dict) else {}


def _index_sections(sections: list[Any]) -> dict[str, dict[str, Any]]:
    """Index section dicts by ``section_key``.

    For ordinary keys the last section with a given key wins. Repeated
    "programs" sections are merged by concatenating their ``programs``
    lists. The merge is done on copies so the caller's section dicts are
    never modified (the same raw data is handed to other consumers).
    """
    sec_map: dict[str, dict[str, Any]] = {}
    for section in sections:
        if not isinstance(section, dict):
            continue
        key = str(section.get("section_key", ""))
        if not key:
            continue
        if key == "programs" and key in sec_map:
            first = sec_map[key]
            merged_json = dict(_section_json(first))
            current = merged_json.get("programs", [])
            incoming = _section_json(section).get("programs", [])
            if not isinstance(current, list):
                current = []
            if not isinstance(incoming, list):
                incoming = []
            merged_json["programs"] = current + incoming
            # Replace the map entry with a shallow copy instead of editing
            # the original section dict in place.
            sec_map[key] = {**first, "section_json": merged_json}
            continue
        sec_map[key] = section
    return sec_map


def _career_pathways_html(career: Any) -> str:
    """Render a program's ``career_pathways`` value as an HTML fragment.

    A list becomes a ``<ul class="career-list">`` of its non-blank items.
    Any other value is treated as text: several non-blank lines become a
    list, a single line is returned escaped. Text is escaped with ``h``.
    ``None`` or otherwise empty input yields ``"&nbsp;"``.
    """
    if career is None:
        return "&nbsp;"
    if isinstance(career, list):
        entries = [
            text for text in (_as_text(item).strip() for item in career) if text
        ]
    else:
        raw = str(career).strip()
        if not raw:
            return "&nbsp;"
        entries = [
            part.strip() for part in re.split(r"[\r\n]+", raw) if part.strip()
        ]
        if len(entries) <= 1:
            return h(raw) or "&nbsp;"
    if not entries:
        return "&nbsp;"
    items = "".join(f"<li>{h(entry)}</li>" for entry in entries)
    return f'<ul class="career-list">{items}</ul>'


def _prepare_university_data(
    uni_raw: dict[str, Any],
    allow_remote: bool,
    include_inactive_programs: bool,
    debug: bool,
    stats: dict[str, Any],
) -> dict[str, Any]:
    """Prepare a single university's template data.

    Extracts overview, campus image, benefits, programs, and extra sections
    from the raw sections list so that the Jinja2 template only has to
    handle the HTML.

    Args:
        uni_raw: University record. ``name`` is required; ``sections``,
            ``website``, ``anchor``, ``sort_order`` and ``_is_first`` are
            read when present. The record and its sections are not modified.
        allow_remote: When true, the campus image URL is fetched and
            embedded via ``fetch_image_data_uri``.
        include_inactive_programs: When false, programs whose
            ``program_active`` / ``is_active`` / ``active`` flag is not
            truthy are dropped.
        debug: Forwarded to ``render_global_blocks`` for extra sections.
        stats: Mutable counters. This function increments ``universities``,
            ``images_embedded``, ``images_placeholder``, ``university_links``
            and ``website_rows``.

    Returns:
        A dict with keys ``name``, ``anchor``, ``sort_order``, ``website``,
        ``classes``, ``overview``, ``campus_image``, ``campus_caption``,
        ``benefits``, ``programs`` and ``extra_sections``. ``overview``,
        ``benefits`` and ``programs`` are ``None`` when the university has no
        section of that kind.

    Raises:
        KeyError: If ``uni_raw`` has no ``name`` key.
    """
    uni_name = uni_raw["name"]
    sections = uni_raw.get("sections", [])
    if not isinstance(sections, list):
        sections = []
    is_first = uni_raw.get("_is_first", False)

    _bump(stats, "universities")

    sec_map = _index_sections(sections)

    # Campus image: "campus_image" takes precedence over the generic "image".
    campus_image = ""
    campus_caption = ""
    img_section = sec_map.get("campus_image") or sec_map.get("image")
    if img_section:
        img_json = img_section.get("section_json", {})
        if isinstance(img_json, dict):
            campus_url = _as_text(img_json.get("image_url")).strip()
            campus_caption = _as_text(img_json.get("caption")).strip()
            embedded = ""
            if allow_remote and campus_url:
                embedded = fetch_image_data_uri(campus_url)
            if embedded:
                campus_image = embedded
                _bump(stats, "images_embedded")
            else:
                # No URL, remote fetching disabled, or the fetch came back empty.
                _bump(stats, "images_placeholder")

    # Overview and website: the record's own website wins; the overview
    # section is only consulted when the record has none.
    resolved_website = str(uni_raw.get("website") or "").strip()
    overview_data = None

    if "overview" in sec_map:
        overview_json = _section_json(sec_map["overview"])

        site_from_overview = get_any(
            overview_json,
            ["university_website", "university_website_url", "website", "site", "url", "homepage", "web_url"],
        )
        if not resolved_website and site_from_overview:
            resolved_website = site_from_overview

        overview_data = {
            "founded": get_any(overview_json, ["founded", "Founded"]),
            "total_students": get_any(overview_json, ["total_students", "Total Students"]),
            "undergraduates": get_any(overview_json, ["undergraduates", "Undergraduate Students", "undergraduate_students"]),
            "postgraduates": get_any(overview_json, ["postgraduate_students", "Postgraduate Students"]),
            "acceptance_rate": get_any(overview_json, ["acceptance_rate", "Acceptance Rate"]),
            "location": get_any(overview_json, ["location", "Location"]),
            "tuition": get_any(overview_json, [
                "tuition_out_of_state_yearly",
                "Yearly Out of State Tuition Fees",
                "Yearly Out-of-State Tuition Fees",
                "Yearly Tuition Fees",
                "Yearly Out-of-State Tuition Fees:",
            ]),
        }

    if resolved_website:
        _bump(stats, "university_links")
        _bump(stats, "website_rows")

    # Benefits: None means "no benefits section"; [] means "section, no items".
    benefits = None
    if "benefits" in sec_map:
        raw_benefits = _section_json(sec_map["benefits"]).get("benefits", [])
        if isinstance(raw_benefits, list):
            benefits = [
                text
                for text in (_as_text(item).strip() for item in raw_benefits)
                if text
            ]
        else:
            benefits = []

    # Programs
    programs = None
    if "programs" in sec_map:
        programs_raw = _section_json(sec_map["programs"]).get("programs", [])
        if not isinstance(programs_raw, list):
            programs_raw = []

        programs = []
        seen_names: set[str] = set()
        for p in programs_raw:
            if not isinstance(p, dict):
                continue
            if not include_inactive_programs and not is_truthy(
                p.get("program_active", p.get("is_active", p.get("active", 1)))
            ):
                continue

            program_name = _as_text(p.get("program_name")).strip()
            # Deduplicate by lowercase program name; the first occurrence wins.
            dedupe_key = program_name.lower()
            if dedupe_key in seen_names:
                continue
            seen_names.add(dedupe_key)

            link = _as_text(p.get("program_link")).strip()
            alt_links = p.get("program_links")
            if not link and isinstance(alt_links, dict):
                link = _as_text(alt_links.get("web_link")).strip()

            programs.append({
                "name": program_name,
                "link": link,
                "designation": _as_text(p.get("designation")),
                "entrance": _as_text(p.get("entrance_exam", p.get("entrance_examination", ""))),
                # Already escaped by _career_pathways_html, so mark it safe.
                "career_html": Markup(_career_pathways_html(p.get("career_pathways", []))),
                "funding": _as_text(p.get("funding_category")),
            })

    # Extra sections: everything not handled above, in original order.
    skip_keys = {"campus_image", "image", "overview", "benefits", "programs"}
    extra_sections = []
    for section in sections:
        if not isinstance(section, dict):
            continue
        key = str(section.get("section_key", ""))
        if not key or key in skip_keys:
            continue
        title = _as_text(section.get("section_title"))
        rendered = render_global_blocks(key, title, _section_json(section), debug)
        extra_sections.append({"rendered_html": Markup(rendered)})

    # Every university except the first gets the "page-break" class.
    classes = ["uni"]
    if not is_first:
        classes.append("page-break")

    return {
        "name": uni_name,
        "anchor": uni_raw.get("anchor"),
        "sort_order": uni_raw.get("sort_order"),
        "website": resolved_website,
        "classes": classes,
        "overview": overview_data,
        "campus_image": campus_image,
        "campus_caption": campus_caption,
        "benefits": benefits,
        "programs": programs,
        "extra_sections": extra_sections,
    }


# Remote label image used when no local label file is available.
# NOTE(review): this points at what looks like a dev/testing host and is used
# regardless of ``allow_remote`` — confirm this is intended for production.
_LABEL_IMAGE_FALLBACK_URL = "https://finsapdev.qhtestingserver.com/MODEL_APIS/handbook/images/label.jpeg"


def _local_file_uri(path: Any) -> str:
    """Return a ``file://`` URI for *path* if it is an existing file, else ``""``."""
    if not path:
        return ""
    path_str = str(path)
    if not os.path.isfile(path_str):
        return ""
    # Path.as_uri() raises ValueError for relative paths, so make it absolute.
    return Path(path_str).resolve().as_uri()


def _local_file_data_uri(path: Any) -> str:
    """Return the file at *path* as a base64 ``data:`` URI, else ``""``.

    The MIME type is guessed from the file name, defaulting to "image/jpeg".
    """
    if not path:
        return ""
    path_str = str(path)
    if not os.path.isfile(path_str):
        return ""
    mime = mimetypes.guess_type(path_str)[0] or "image/jpeg"
    with open(path_str, "rb") as fh:
        encoded = base64.b64encode(fh.read()).decode()
    return f"data:{mime};base64,{encoded}"


def _parse_sort_order(value: Any) -> "int | None":
    """Return *value* as an int when it looks like an integer, else ``None``."""
    if value is None:
        return None
    if not str(value).lstrip("-").isdigit():
        return None
    try:
        return int(value)
    except (TypeError, ValueError):
        # Values such as "--5" or "²" pass the digit check but are not
        # valid integer literals.
        return None


def build_handbook_html(
    globals_data: list[dict[str, Any]],
    by_uni: dict[int, dict[str, Any]],
    images: dict[str, Any],
    allow_remote: bool,
    include_inactive_programs: bool = False,
    debug: bool = False,
) -> str:
    """Build the full handbook HTML document using Jinja2 templates.

    Data is prepared for both the legacy (pre-rendered HTML) path and the
    typed-block path, then handed to the ``handbook.html`` template.

    Args:
        globals_data: Global (non-university) section dicts; each is read
            for ``section_key``, ``section_title``, ``section_json`` and
            ``sort_order``.
        by_uni: University records keyed by id. Records whose ``is_active``
            flag is not truthy are skipped.
        images: Local image paths under ``coverImage``, ``tocImage``,
            ``headerImage``, ``labelImage`` and the list ``bottomPages``.
            Paths that are not existing files are ignored.
        allow_remote: Forwarded to the per-university preparation steps.
        include_inactive_programs: Forwarded to the per-university steps.
        debug: Forwarded to the renderers/normalizers and the template.

    Returns:
        The rendered HTML document.

    Raises:
        RuntimeError: If any of the required global sections
            (``table_of_contents``, ``overview``,
            ``how_program_works_and_qualification_requirements``) is missing.
    """
    env = _get_jinja_env()
    template = env.get_template("handbook.html")

    font_meta = select_font_family()
    font_css = font_face_css(font_meta)

    # Base URL for static assets (CSS, images, etc.)
    base_url = _static_base_url()

    stats: dict[str, Any] = {
        "universities": 0,
        "images_embedded": 0,
        "images_placeholder": 0,
        "program_links_total": 0,
        "program_missing_links_total": 0,
        "missing_program_links": {},
        "university_links": 0,
        "website_rows": 0,
    }

    # ── Cover / TOC images are referenced by file:// URI ──
    cover_image = _local_file_uri(images.get("coverImage", ""))
    toc_image = _local_file_uri(images.get("tocImage", ""))

    # ── Header / label images repeat on every page and are embedded inline ──
    header_image = _local_file_data_uri(images.get("headerImage", ""))
    # Fallback to remote URL when local file is unavailable
    label_image = (
        _local_file_data_uri(images.get("labelImage", ""))
        or _LABEL_IMAGE_FALLBACK_URL
    )

    # ── Prepare active universities ──
    active_universities: list[dict[str, Any]] = []
    for uid, uni in by_uni.items():
        if not isinstance(uni, dict):
            continue
        if not is_truthy(uni.get("is_active", True)):
            continue
        uni_id = int(uid)
        raw_name = uni.get("university_name")
        # A null name falls back to the placeholder instead of the text "None".
        name = f"University #{uid}" if raw_name is None else str(raw_name)
        raw_sections = uni.get("sections")
        active_universities.append({
            "id": uni_id,
            "anchor": handbook_anchor("uni", name, uni_id),
            "name": name,
            "sections": raw_sections if isinstance(raw_sections, list) else [],
            # A null website must stay empty so the overview-section website
            # can still be used as a fallback downstream.
            "website": str(uni.get("website") or ""),
            "sort_order": _parse_sort_order(uni.get("sort_order")),
        })

    # ── Normalise globals ──
    globals_data = sort_sections_stable(globals_data)

    required_keys = [
        "table_of_contents",
        "overview",
        "how_program_works_and_qualification_requirements",
    ]
    existing_keys = {str(g.get("section_key", "")).lower() for g in globals_data if isinstance(g, dict)}
    missing = [k for k in required_keys if k not in existing_keys]
    if missing:
        msg = f"Handbook required sections missing: {','.join(missing)}"
        logger.error(msg)
        raise RuntimeError(msg)

    general_sections: list[dict[str, Any]] = []
    summary_block: dict[str, Any] | None = None
    toc_sort_order = None
    toc_title = "Table of Contents"

    for idx, g in enumerate(globals_data):
        if not isinstance(g, dict):
            continue
        key = str(g.get("section_key", "")).lower()
        sort_order = _parse_sort_order(g.get("sort_order"))

        # Only the first table_of_contents entry configures the TOC.
        if key == "table_of_contents" and toc_sort_order is None:
            toc_sort_order = sort_order if sort_order is not None else (idx + 1)
            raw_toc_title = g.get("section_title")
            toc_title = "Table of Contents" if raw_toc_title is None else str(raw_toc_title)
            continue

        if key == "summary_of_universities":
            summary_block = {
                "anchor": handbook_anchor("summary", "summary-of-universities", idx),
                "data": g,
                "sort_order": sort_order,
            }
            continue

        anchor = handbook_anchor("g", str(g.get("section_title", g.get("section_key", "section"))), idx)
        general_sections.append({
            "anchor": anchor,
            "data": g,
            "sort_order": sort_order,
        })

    # ── Build TOC items ──
    toc_items: list[dict[str, Any]] = []
    for gs in general_sections:
        title = str(gs["data"].get("section_title", gs["data"].get("section_key", "Section")))
        toc_items.append({
            "title": title,
            "target": "#" + gs["anchor"],
            "level": 0,
            "bold": True,
            "sort": gs["sort_order"],
        })

    if summary_block:
        title = str(summary_block["data"].get("section_title", "Summary of Universities"))
        toc_items.append({
            "title": title,
            "target": "#" + summary_block["anchor"],
            "level": 0,
            "bold": True,
            "sort": summary_block["sort_order"],
        })

    for u in active_universities:
        toc_items.append({
            "title": u["name"],
            "target": "#" + u["anchor"],
            "level": 1,
            "bold": False,
            "sort": u.get("sort_order"),
        })

    # ── Prepare sorted TOC items for template ──
    sorted_toc = sort_toc(list(toc_items))
    toc_items_sorted = []
    for e in sorted_toc:
        if not isinstance(e, dict):
            continue
        title = str(e.get("title", "")).strip()
        if not title:
            continue
        level = max(0, min(3, int(e.get("level", 0))))
        bold = bool(e.get("bold", False))
        upper = bool(e.get("upper", False))
        # Top-level entries are always bold and upper-cased.
        if level == 0:
            bold = True
            upper = True
        display_title = title.upper() if upper else title
        page = str(e.get("page", "")).strip()

        toc_items_sorted.append({
            "title": title,
            "display_title": display_title,
            "target": str(e.get("target", e.get("anchor", ""))).strip(),
            "level": level,
            "bold": bold,
            "upper": upper,
            "page": page,
        })

    # ── Prepare general sections with rendered HTML and typed blocks ──
    template_sections = []
    for gs in general_sections:
        data = gs["data"]
        key_lower = str(data.get("section_key", "")).lower()

        sec_class = SECTION_CLASS_MAP.get(key_lower)
        if sec_class is None:
            sec_class = "sec-" + re.sub(r"[^a-z0-9]+", "-", key_lower)

        section_json = data.get("section_json", {})
        if not isinstance(section_json, dict):
            section_json = {}

        # Typed blocks for the new rendering path
        blocks = normalize_section(
            str(data.get("section_key", "")),
            str(data.get("section_title", "")),
            section_json,
            debug=debug,
        )

        # Legacy HTML fallback
        section_html = render_global_blocks(
            str(data.get("section_key", "")),
            str(data.get("section_title", "")),
            section_json,
            debug,
        )

        if not section_html.strip() and not blocks:
            logger.warning(
                "Empty section render key=%s sort_order=%s",
                data.get("section_key"),
                data.get("sort_order"),
            )

        template_sections.append({
            "anchor": gs["anchor"],
            "data": data,
            "page_break": key_lower in PAGE_BREAK_KEYS,
            "sec_class": sec_class,
            "blocks": blocks,
            "rendered_html": Markup(section_html),
        })

    # ── Prepare summary block ──
    summary_template = None
    if summary_block:
        data = summary_block["data"]
        section_json = data.get("section_json", {})
        if not isinstance(section_json, dict):
            section_json = {}

        # Typed blocks for summary
        summary_blocks = normalize_section(
            str(data.get("section_key", "")),
            str(data.get("section_title", "")),
            section_json,
            universities=active_universities,
            debug=debug,
        )

        summary_html = render_global_blocks(
            str(data.get("section_key", "")),
            str(data.get("section_title", "")),
            section_json,
            debug,
            universities=active_universities,
        )

        summary_template = {
            "anchor": summary_block["anchor"],
            "data": data,
            "blocks": summary_blocks,
            "rendered_html": Markup(summary_html),
        }

    # ── Prepare university data for templates (both old + new paths) ──
    # NOTE(review): both paths receive the same ``stats`` dict; if
    # normalize_university also increments the shared counters, they are
    # counted twice per university — confirm against normalize_university.
    university_template_data = []
    university_block_data = []
    for idx, uni_raw in enumerate(active_universities):
        uni_raw["_is_first"] = (idx == 0)
        # Legacy path
        uni_data = _prepare_university_data(
            uni_raw, allow_remote, include_inactive_programs, debug, stats,
        )
        university_template_data.append(uni_data)
        # New block path
        uni_block = normalize_university(
            uni_raw, allow_remote, include_inactive_programs, debug, stats,
        )
        university_block_data.append(uni_block)

    # ── Bottom pages ──
    bottom_pages_urls = []
    raw_bottom = images.get("bottomPages", [])
    if isinstance(raw_bottom, list):
        for img_path in raw_bottom:
            page_uri = _local_file_uri(img_path)
            if page_uri:
                bottom_pages_urls.append(page_uri)

    # ── Render template ──
    html = template.render(
        font_css=Markup(font_css),
        base_url=base_url,
        extra_css="",
        header_image=header_image,
        label_image=label_image,
        cover_image=cover_image,
        toc_image=toc_image,
        toc_items=toc_items,
        toc_items_sorted=toc_items_sorted,
        toc_title=toc_title,
        toc_sort_order=toc_sort_order,
        general_sections=template_sections,
        summary_block=summary_template,
        universities=university_template_data,
        university_blocks=university_block_data,
        bottom_pages=bottom_pages_urls,
        debug=debug,
        stats=stats,
    )

    return html