Spaces:

internationalscholarsprogram
/

handbook-engine

Sleeping

File size: 37,911 Bytes

2deab8c

"""Normalization layer — converts raw MySQL handbook content into typed render blocks.

Each section_json from the database is parsed into a list of RenderBlock
objects.  Every block has a `block_type` that maps 1-to-1 to a Jinja
partial and a CSS class.  This prevents ad-hoc interpretation of raw
JSON throughout the rendering pipeline.

Block types (from theme.BLOCK_TYPES):
    heading_1, heading_2, paragraph, bullet_list, note, table,
    enrollment_steps, school_profile, university_summary, toc,
    cover, full_page_image
"""

from __future__ import annotations

import re
from urllib.parse import quote_plus
from dataclasses import dataclass, field
from typing import Any

from app.services.renderers import _extract_university_funding
from app.services.utils import (
    ensure_program_options_pair,
    emphasize_keywords,
    format_money_figures,
    get_any,
    h,
    hb_slug,
    is_assoc,
    is_truthy,
    linkify_urls,
)
from app.services.renderers import fetch_image_data_uri


# ───────────────────────────────────────────────────────────────
# Block data-classes
# ───────────────────────────────────────────────────────────────

@dataclass
class RenderBlock:
    """Base typed render block.

    One unit of renderable content produced by the normalisers in this
    module.  Per the module docstring, ``block_type`` is meant to map
    1-to-1 to a Jinja partial and a CSS class.
    """
    # Kind of block, e.g. "heading_1", "paragraph", "bullet_list", "note", "table".
    block_type: str
    # Space-separated CSS class string, e.g. "hb-table hb-table-comparison".
    css_class: str = ""
    # Block-specific payload; which keys are present depends on block_type.
    data: dict[str, Any] = field(default_factory=dict)


# ───────────────────────────────────────────────────────────────
# Section → blocks
# ───────────────────────────────────────────────────────────────

def normalize_section(
    section_key: str,
    section_title: str,
    section_json: dict | list,
    *,
    universities: list[dict] | None = None,
    debug: bool = False,
) -> list[RenderBlock]:
    """Convert a single global section payload into a list of RenderBlocks.

    This is the single translation point between the database schema
    and the rendering layer.

    Args:
        section_key: Section identifier; compared case-insensitively.
        section_title: Title from the database, used when the JSON payload
            carries no ``title`` of its own.
        section_json: Decoded section payload.  Anything that is not a dict
            is treated as an empty payload.
        universities: Accepted for interface compatibility; not used here.
        debug: Accepted for interface compatibility; not used here.

    Returns:
        The blocks for the section.  A ``heading_1`` block comes first when a
        title is available (never for ``table_of_contents``), followed by the
        blocks of the first matching payload shape: steps, bullets, basic
        table, ``table_v2``, ``doc_v1``, or a plain ``text`` paragraph.
    """
    from markupsafe import Markup

    def as_text(value: Any) -> str:
        """Stringify *value*, mapping JSON ``null`` to "" instead of "None"."""
        return "" if value is None else str(value)

    blocks: list[RenderBlock] = []
    key_norm = as_text(section_key).lower().strip()

    if not isinstance(section_json, dict):
        section_json = {}

    layout_norm = as_text(section_json.get("layout")).lower().strip()

    # ── Section heading ──
    # Prefer the JSON-level title (display-ready) over the DB section_title
    title = as_text(section_json.get("title")).strip() or as_text(section_title).strip()
    if title and key_norm != "table_of_contents":
        blocks.append(RenderBlock(
            block_type="heading_1",
            css_class="hb-heading-1",
            data={"text": title},
        ))

    # ── Steps → enrollment_steps ──
    steps = section_json.get("steps")
    if isinstance(steps, list):
        blocks.append(RenderBlock(
            block_type="enrollment_steps",
            css_class="hb-enrollment-steps",
            data={"steps": _normalize_steps(steps)},
        ))
        return blocks

    # ── Bullets ──
    bullets = section_json.get("bullets")
    items = section_json.get("items")
    has_bullets = isinstance(bullets, list)
    has_items = isinstance(items, list)
    if has_bullets or (layout_norm == "bullets_with_note" and has_items):
        # "items" wins over "bullets" whenever it is a list (existing behaviour).
        source = items if has_items else bullets
        stripped = (as_text(entry).strip() for entry in source)
        entries = [_normalize_text_content(text) for text in stripped if text]
        html_items = [Markup(emphasize_keywords(entry)) for entry in entries]
        blocks.append(RenderBlock(
            block_type="bullet_list",
            css_class="hb-bullet-list",
            data={"entries": html_items, "html_entries": True},
        ))
        # A null "note" falls back to "footnote" rather than rendering "None".
        raw_note = section_json.get("note")
        if raw_note is None:
            raw_note = section_json.get("footnote")
        note = _normalize_text_content(as_text(raw_note).strip())
        if note:
            blocks.append(RenderBlock(
                block_type="note",
                css_class="hb-note",
                data={"text": note},
            ))
        return blocks

    # ── Basic table ──
    cols = section_json.get("columns")
    rows = section_json.get("rows")
    if isinstance(cols, list) and isinstance(rows, list):
        blocks.append(_normalize_basic_table(cols, rows))
        return blocks

    # ── table_v2 ──
    if layout_norm == "table_v2":
        blocks.append(_normalize_table_v2(section_json))
        return blocks

    # ── doc_v1 ──
    if layout_norm == "doc_v1" and isinstance(section_json.get("blocks"), list):
        blocks.extend(_normalize_doc_v1(section_json["blocks"], skip_title=title))
        # Post-process breakdown section for Relocation Cost layout
        if key_norm == "program_features_breakdown":
            blocks = _postprocess_breakdown(blocks, section_json["blocks"])
        # Post-process Tier 2 section for sub-bullet styling
        if key_norm == "summary_of_universities_cosigner":
            blocks = _postprocess_tier2(blocks)
        return blocks

    # ── Fallback ──
    if "text" in section_json:
        text = _normalize_text_content(as_text(section_json["text"]))
        if text.strip():
            blocks.append(RenderBlock(
                block_type="paragraph",
                css_class="hb-paragraph",
                data={
                    "text": text,
                    "html": Markup(emphasize_keywords(text)),
                },
            ))

    return blocks


def _normalize_text_content(text: str) -> str:
    """Run the shared handbook text clean-up on *text*.

    Money figures are reformatted first; the result is then passed through
    ``ensure_program_options_pair``.
    """
    with_money = format_money_figures(text)
    return ensure_program_options_pair(with_money)


# ───────────────────────────────────────────────────────────────
# University profile normalisation
# ───────────────────────────────────────────────────────────────

def normalize_university(
    uni_raw: dict[str, Any],
    allow_remote: bool,
    include_inactive_programs: bool,
    debug: bool,
    stats: dict[str, Any],
) -> RenderBlock:
    """Convert raw university data into a school_profile RenderBlock.

    Args:
        uni_raw: Raw university record.  ``name`` is required; ``sections``,
            ``website``, ``anchor``, ``sort_order``, ``school_category`` and
            ``is_active`` are read when present.  The record and its nested
            section payloads are not modified.
        allow_remote: When true, the campus image URL is passed to
            ``fetch_image_data_uri`` so the image can be embedded.
        include_inactive_programs: When false, programs whose active flag
            (``program_active`` / ``is_active`` / ``active``) is not truthy
            are dropped.
        debug: Forwarded to ``normalize_section`` for the extra sections.
        stats: Counter dict updated in place (``universities``,
            ``images_embedded``, ``images_placeholder``,
            ``university_links``, ``website_rows``).

    Returns:
        A ``school_profile`` block whose ``data`` holds the name, website,
        overview, campus image, benefits, funding, programs and the blocks
        of any additional sections.

    Raises:
        KeyError: If ``uni_raw`` has no ``name`` entry.
    """

    def as_text(value: Any) -> str:
        """Stringify *value*, mapping JSON ``null`` to "" instead of "None"."""
        return "" if value is None else str(value)

    def bump(counter: str) -> None:
        """Increment a named counter in ``stats``."""
        stats[counter] = stats.get(counter, 0) + 1

    uni_name = uni_raw["name"]
    sections = uni_raw.get("sections")
    if not isinstance(sections, (list, tuple)):
        sections = []

    bump("universities")

    # Build section map; merge duplicate "programs" sections
    sec_map: dict[str, dict] = {}
    for s in sections:
        if not isinstance(s, dict):
            continue
        k = str(s.get("section_key", ""))
        if not k:
            continue
        if k == "programs" and k in sec_map:
            current = sec_map["programs"]
            existing = current.get("section_json", {})
            incoming = s.get("section_json", {})
            # Work on copies so the caller's section payloads are never
            # mutated (re-running on the same record must not grow the list).
            merged_json = dict(existing) if isinstance(existing, dict) else {}
            a = merged_json.get("programs", [])
            b = incoming.get("programs", []) if isinstance(incoming, dict) else []
            if not isinstance(a, list):
                a = []
            if not isinstance(b, list):
                b = []
            merged_json["programs"] = a + b
            sec_map["programs"] = {**current, "section_json": merged_json}
            continue
        sec_map[k] = s

    # Campus image
    img_section = sec_map.get("campus_image") or sec_map.get("image")
    campus_image = ""
    campus_caption = ""
    if img_section:
        j = img_section.get("section_json", {})
        if isinstance(j, dict):
            campus_url = as_text(j.get("image_url")).strip()
            campus_caption = as_text(j.get("caption")).strip()
            embedded = (
                fetch_image_data_uri(campus_url)
                if allow_remote and campus_url
                else None
            )
            if embedded:
                campus_image = embedded
                bump("images_embedded")
            else:
                bump("images_placeholder")

    # Overview and website
    resolved_website = as_text(uni_raw.get("website") or "").strip()
    overview_data = None

    if "overview" in sec_map:
        overview_json = sec_map["overview"].get("section_json", {})
        if not isinstance(overview_json, dict):
            overview_json = {}

        site_from_overview = get_any(
            overview_json,
            ["university_website", "university_website_url", "website",
             "site", "url", "homepage", "web_url"],
        )
        if not resolved_website and site_from_overview:
            resolved_website = site_from_overview

        overview_data = {
            "founded": get_any(overview_json, ["founded", "Founded"]),
            "total_students": get_any(overview_json, ["total_students", "Total Students"]),
            "undergraduates": get_any(overview_json, [
                "undergraduates", "Undergraduate Students", "undergraduate_students",
            ]),
            "postgraduates": get_any(overview_json, [
                "postgraduate_students", "Postgraduate Students",
            ]),
            "acceptance_rate": get_any(overview_json, ["acceptance_rate", "Acceptance Rate"]),
            "location": get_any(overview_json, ["location", "Location"]),
            "tuition": format_money_figures(str(get_any(overview_json, [
                "tuition_out_of_state_yearly",
                "Yearly Out of State Tuition Fees",
                "Yearly Out-of-State Tuition Fees",
                "Yearly Tuition Fees",
                "Yearly Out-of-State Tuition Fees:",
            ]) or "")) or None,
        }

    if resolved_website:
        bump("university_links")
        bump("website_rows")

    # Benefits + Funding
    benefits: list[str] = []
    funding_heading = "Funding Available"
    funding_items: list[str] = []
    if "benefits" in sec_map:
        j = sec_map["benefits"].get("section_json", {})
        if not isinstance(j, dict):
            j = {}
        raw_benefits = j.get("benefits", [])
        if isinstance(raw_benefits, list):
            stripped = (as_text(b).strip() for b in raw_benefits)
            benefits = [_normalize_text_content(text) for text in stripped if text]

        funding_heading, funding_items = _extract_university_funding(
            j,
            {
                "school_category": uni_raw.get("school_category"),
                "status": "in" if is_truthy(uni_raw.get("is_active", True)) else "out",
            },
        )
        # Normalize money formatting in funding items
        funding_items = [_normalize_text_content(item) for item in funding_items]

    # Programs
    programs = None
    if "programs" in sec_map:
        j = sec_map["programs"].get("section_json", {})
        if not isinstance(j, dict):
            j = {}
        programs_raw = j.get("programs", [])
        if not isinstance(programs_raw, list):
            programs_raw = []

        programs = []
        seen_names: set[str] = set()
        for p in programs_raw:
            if not isinstance(p, dict):
                continue
            if not include_inactive_programs and not is_truthy(
                p.get("program_active", p.get("is_active", p.get("active", 1)))
            ):
                continue
            program_name = _normalize_text_content(as_text(p.get("program_name")).strip())
            # Deduplicate by lowercase program name
            name_key = program_name.lower()
            if name_key in seen_names:
                continue
            seen_names.add(name_key)
            link = as_text(p.get("program_link")).strip()
            if not link and isinstance(p.get("program_links"), dict):
                link = as_text(p["program_links"].get("web_link")).strip()

            programs.append({
                "name": program_name,
                "link": link,
                "designation": _normalize_text_content(as_text(p.get("designation"))),
                "entrance": _normalize_text_content(
                    as_text(p.get("entrance_exam", p.get("entrance_examination", "")))
                ),
            })

    # Extra sections (rendered via global blocks normalizer)
    skip_keys = {"campus_image", "image", "overview", "benefits", "programs"}
    extra_blocks: list[list[RenderBlock]] = []
    for s in sections:
        if not isinstance(s, dict):
            continue
        k = str(s.get("section_key", ""))
        if not k or k in skip_keys:
            continue
        # A NULL title must not become the heading text "None".
        title = as_text(s.get("section_title"))
        j = s.get("section_json", {})
        if not isinstance(j, dict):
            j = {}
        extra_blocks.append(normalize_section(k, title, j, debug=debug))

    classes = ["hb-school-profile", "page-break"]

    return RenderBlock(
        block_type="school_profile",
        css_class=" ".join(classes),
        data={
            "name": uni_name,
            "anchor": uni_raw.get("anchor"),
            "sort_order": uni_raw.get("sort_order"),
            "website": resolved_website,
            "overview": overview_data,
            "campus_image": campus_image,
            "campus_caption": campus_caption,
            "benefits": benefits,
            "funding_heading": funding_heading,
            "funding_items": funding_items,
            "programs": programs,
            "extra_blocks": extra_blocks,
        },
    )


# ───────────────────────────────────────────────────────────────
# Internal helpers
# ───────────────────────────────────────────────────────────────

def _normalize_steps(steps: list) -> list[dict]:
    """Turn raw enrollment steps into the dicts the steps partial consumes.

    Non-dict entries are ignored and do not consume a step number.  Two
    steps get special treatment based on their position:

    - Step 1: a Telegram URL found in the body becomes ``telegram_url``, a
      QR image URL is generated when none was supplied, and the URL plus its
      boilerplate description are removed from the body.
    - Step 2: program-site links get an ``https://`` scheme when missing,
      and a default program-site link is appended when none is present.

    Returns:
        One dict per step with the keys ``number``, ``title``, ``body``,
        ``body_html``, ``links``, ``plain_links``, ``qr_url`` and
        ``telegram_url``.
    """
    from markupsafe import Markup

    normalized: list[dict] = []
    dict_steps = (entry for entry in steps if isinstance(entry, dict))
    for number, step in enumerate(dict_steps, start=1):
        title = str(step.get("title", step.get("step_title", ""))).strip()
        body = _normalize_text_content(
            str(step.get("body", step.get("description", ""))).strip()
        )

        anchors: list[dict] = []
        # NOTE(review): nothing ever appends to plain_links, so it is always
        # empty; it is kept because it is part of the emitted step dict.
        plain_links: list = []

        raw_links = step.get("links", [])
        for entry in raw_links if isinstance(raw_links, list) else []:
            if not isinstance(entry, dict):
                continue
            label = str(entry.get("label", "Link")).strip()
            url = str(entry.get("url", "")).strip()
            if not url:
                continue
            is_telegram = "telegram" in label.lower() or "t.me" in url.lower()
            needs_scheme = (
                number == 2
                and "internationalscholarsprogram.com" in url.lower()
                and not re.match(r"^https?://", url, flags=re.IGNORECASE)
            )
            if needs_scheme:
                url = "https://" + url
            # Every link is a clickable anchor; Telegram links show the full
            # URL as their label so readers can see/type it.
            anchors.append({"label": url if is_telegram else label, "url": url})

        if number == 2:
            has_program_site = any(
                "internationalscholarsprogram.com" in str(anchor.get("url", "")).lower()
                for anchor in anchors
            )
            if not has_program_site:
                anchors.append({
                    "label": "www.internationalscholarsprogram.com",
                    "url": "https://www.internationalscholarsprogram.com",
                })

        qr = str(step.get("qr_url", step.get("qr_image", ""))).strip()
        telegram_url = ""
        if number == 1 and isinstance(body, str):
            found = re.search(
                r"(https?://(?:t\.me|telegram\.me)/[^\s<)]+)", body, flags=re.IGNORECASE
            )
            if found:
                telegram_url = found.group(1)
                if not qr:
                    qr = (
                        "https://api.qrserver.com/v1/create-qr-code/?size=160x160&data="
                        + quote_plus(telegram_url)
                    )
                # Strip the raw telegram URL and the follow-up description from body
                body = re.sub(r"https?://(?:t\.me|telegram\.me)/[^\s<)]+", "", body, flags=re.IGNORECASE)
                body = re.sub(r"This telegram group will help you interact with program administrators and other prospective students where you can ask any questions you may have about the program\.?", "", body, flags=re.IGNORECASE)
                body = re.sub(r"\n{2,}", "\n", body).strip()

        # Pre-format the final body with bold emphasis (keywords, $ amounts).
        body_html = Markup(emphasize_keywords(body)) if body else ""

        normalized.append({
            "number": number,
            "title": title,
            "body": body,
            "body_html": body_html,
            "links": anchors,
            "plain_links": plain_links,
            "qr_url": qr,
            "telegram_url": telegram_url,
        })
    return normalized


def _normalize_basic_table(cols: list, rows: list) -> RenderBlock:
    """Normalise a basic table (columns + rows).

    Args:
        cols: Column labels, in display order.
        rows: Table rows.  A list row is taken cell by cell; a dict row is
            looked up per column.  Rows of any other type are dropped.

    Returns:
        A ``table`` block with ``variant`` "standard" whose ``rows`` are
        lists of cell HTML strings.

    For dict rows the cell key is derived from the column label: first the
    lowercase slug (non-alphanumerics collapsed to "_"), then the slug
    without leading/trailing underscores, then the label itself.  The first
    key present in the row wins; a missing cell renders as empty text.
    """

    def render_cell(value: Any) -> str:
        """Normalise, emphasise and linkify one cell; null becomes ""."""
        text = "" if value is None else str(value)
        return linkify_urls(emphasize_keywords(_normalize_text_content(text)))

    # Candidate keys depend only on the column, so compute them once
    # instead of once per row.
    lookup_keys: list[list[str]] = []
    for col_label in cols:
        label = str(col_label)
        slug = re.sub(r"[^a-z0-9]+", "_", label.lower())
        # dict.fromkeys de-duplicates while keeping priority order.
        lookup_keys.append(list(dict.fromkeys([slug, slug.strip("_"), label])))

    norm_rows = []
    for r in rows:
        if isinstance(r, dict):
            norm_rows.append([
                render_cell(next((r[key] for key in keys if key in r), ""))
                for keys in lookup_keys
            ])
        elif isinstance(r, list):
            norm_rows.append([render_cell(cell) for cell in r])

    return RenderBlock(
        block_type="table",
        css_class="hb-table",
        data={
            "columns": [str(c) for c in cols],
            "rows": norm_rows,
            "variant": "standard",
        },
    )


def _normalize_table_v2(json_data: dict) -> RenderBlock:
    """Normalise table_v2 (comparison table with header groups).

    Args:
        json_data: Section payload with optional ``base_columns``,
            ``header_groups`` (each with ``label`` and ``columns``) and
            ``rows``.  Entries of the wrong type are ignored.

    Returns:
        A ``table`` block with ``variant`` "comparison".  Column keys and
        labels are strings everywhere (``base_columns``, ``header_groups``
        and ``all_columns``) so they agree with the keys used in ``rows``.
        Each row maps column key to emphasised cell HTML; a cell given as a
        dict contributes its ``text`` entry.
    """

    def as_text(value: Any) -> str:
        """Stringify *value*, mapping JSON ``null`` to "" instead of "None"."""
        return "" if value is None else str(value)

    def as_list(value: Any) -> list:
        return value if isinstance(value, list) else []

    def column_specs(raw_cols: Any) -> list[dict]:
        """Reduce raw column entries to string ``key``/``label`` pairs."""
        return [
            {"key": as_text(c.get("key", "")), "label": as_text(c.get("label", ""))}
            for c in as_list(raw_cols)
            if isinstance(c, dict)
        ]

    base_columns = column_specs(json_data.get("base_columns", []))
    header_groups = [
        {
            "label": as_text(g.get("label", "")),
            "columns": column_specs(g.get("columns", [])),
        }
        for g in as_list(json_data.get("header_groups", []))
        if isinstance(g, dict)
    ]

    # Flat column list: base columns first, then each group's columns.
    # Copies keep this list independent of the dicts exposed above.
    all_cols: list[dict] = [dict(c) for c in base_columns]
    for group in header_groups:
        all_cols.extend(dict(c) for c in group["columns"])

    norm_rows = []
    for r in as_list(json_data.get("rows", [])):
        if not isinstance(r, dict):
            continue
        row = {}
        for c in all_cols:
            k = c["key"]
            val = r.get(k, "")
            if isinstance(val, dict):
                val = val.get("text", "")
            row[k] = emphasize_keywords(_normalize_text_content(as_text(val)))
        norm_rows.append(row)

    return RenderBlock(
        block_type="table",
        css_class="hb-table hb-table-comparison",
        data={
            "base_columns": base_columns,
            "header_groups": header_groups,
            "all_columns": all_cols,
            "rows": norm_rows,
            "variant": "comparison",
        },
    )


# ───────────────────────────────────────────────────────────────
# Breakdown section post-processor
# ───────────────────────────────────────────────────────────────

def _postprocess_breakdown(
    blocks: list[RenderBlock],
    raw_blocks: list,
) -> list[RenderBlock]:
    """Rewrite the breakdown section to match the reference layout.

    - "Relocation Cost" becomes a banner heading with page-break-before
    - The relocation table gets a merged right cell (rowspan) with the
      cost-coverage note moved inside it
    - "ISP FINANCING" becomes an inline note with mixed bold/italic
    - "NB: CREDIT FACILITY" is styled green
    - Dollar amounts in parentheticals keep their original $ format

    Args:
        blocks: Already-normalised blocks of the section.
        raw_blocks: The raw ``doc_v1`` blocks the section was built from;
            the relocation table and its note are re-read from here so
            their text is the un-normalised original.

    Returns:
        A new list of blocks; ``blocks`` itself is not modified.
    """
    # Find raw blocks for the relocation cost table (pre-normalised, $ intact)
    raw_reloc_table = None
    raw_note_after_table = None
    found_reloc = False
    for rb in raw_blocks:
        if not isinstance(rb, dict):
            continue
        if rb.get("type") == "subheading" and "relocation" in str(rb.get("text", "")).lower():
            found_reloc = True
            continue
        if found_reloc and rb.get("type") == "table_v1" and raw_reloc_table is None:
            raw_reloc_table = rb
            continue
        if found_reloc and raw_reloc_table and rb.get("type") == "paragraph" and raw_note_after_table is None:
            raw_note_after_table = rb
            break

    result: list[RenderBlock] = []
    total = len(blocks)
    i = 0
    while i < total:
        blk = blocks[i]
        text_lower = str(blk.data.get("text", "")).lower()

        # ── Detect "Relocation Cost" heading ──
        if blk.block_type == "heading_2" and "relocation" in text_lower:
            # Banner heading with page break
            result.append(RenderBlock(
                block_type="heading_2",
                css_class="hb-heading-2 hb-banner-heading page-break",
                data={"text": blk.data["text"]},
            ))
            i += 1

            # Replace the next table with spanning variant that has merged cell
            if i < total and blocks[i].block_type == "table" and raw_reloc_table:
                raw_rows = raw_reloc_table.get("rows", [])
                if not isinstance(raw_rows, list):
                    raw_rows = []
                # Build the note text for the merged right cell
                note_text = ""
                if raw_note_after_table:
                    note_text = str(raw_note_after_table.get("text", ""))

                spanning_rows = _build_relocation_spanning_rows(raw_rows, note_text)
                result.append(RenderBlock(
                    block_type="table",
                    css_class="hb-table hb-relocation-table",
                    data={"rows": spanning_rows, "variant": "spanning"},
                ))
                i += 1  # skip the original table

                # Skip the paragraph that was moved into the merged cell
                if (i < total
                        and blocks[i].block_type == "paragraph"
                        and note_text):
                    i += 1

            # The heading is fully handled either way.  Falling through here
            # would append the original heading a second time and skip the
            # block that follows it.
            continue

        # ── "ISP FINANCING" heading → inline note with mixed formatting ──
        if blk.block_type == "heading_2" and "isp financing" in text_lower:
            # Next block should be the interest rate paragraph
            rate_text = ""
            if i + 1 < total and blocks[i + 1].block_type == "paragraph":
                rate_text = blocks[i + 1].data.get("text", "")
            result.append(RenderBlock(
                block_type="note",
                css_class="hb-note hb-isp-financing",
                data={
                    "parts": [
                        {"text": "ISP FINANCING", "style": "bold"},
                        {"text": " (" + _extract_rate_italic(rate_text) + "): " if rate_text else "", "style": "italic"},
                        {"text": _extract_rate_amount(rate_text), "style": "bold"},
                    ],
                    "inline": True,
                },
            ))
            i += 1  # skip the heading
            if rate_text:
                i += 1  # skip the paragraph
            continue

        # ── "NB: CREDIT FACILITY" note → green styling ──
        if blk.block_type == "note" and "credit facility" in text_lower:
            result.append(RenderBlock(
                block_type="note",
                css_class="hb-note hb-credit-note",
                data=blk.data,
            ))
            i += 1
            continue

        result.append(blk)
        i += 1

    return result


def _build_relocation_spanning_rows(
    raw_rows: list, note_text: str,
) -> list[list[dict]]:
    """Build spanning rows for the relocation cost table.

    Row 0: normal 2-column row; the right cell is rendered in italics.
    Rows 1-7: left cell per row; when ``note_text`` is given, the right
        column is one merged (rowspan) cell on row 1 holding the note.
    Rows 8+: left cell plus an empty right cell.

    Without ``note_text`` there is no merged cell, so every row after row 0
    gets its own empty right cell and the table stays two columns wide.

    Args:
        raw_rows: Raw table rows; each row is expected to be a list whose
            first two items are the left and right cell values.  Rows of
            another shape contribute empty cells.
        note_text: Text for the merged cell; blank-line paragraph breaks
            are rendered as ``<br/><br/>``.

    Returns:
        Rows of cell dicts with ``text``, ``colspan`` and ``rowspan``.
    """
    from markupsafe import Markup

    if not isinstance(raw_rows, (list, tuple)) or not raw_rows:
        return []

    def cell_at(row: Any, pos: int) -> str:
        """Return the cell at *pos* as text, or "" when missing or null."""
        if isinstance(row, (list, tuple)) and len(row) > pos and row[pos] is not None:
            return str(row[pos])
        return ""

    def plain_cell(text: Any) -> dict:
        return {"text": text, "colspan": 1, "rowspan": 1}

    # Row 0 — has "Covered in the contribution"
    first = raw_rows[0]
    rows: list[list[dict]] = [[
        plain_cell(Markup(emphasize_keywords(cell_at(first, 0)))),
        plain_cell(Markup("<em>" + h(cell_at(first, 1)) + "</em>")),
    ]]

    # Rows 1-7: items with dollar amounts that get the merged right cell
    # These are the visa/fee/rent/ticket rows (have parenthetical dollar amounts)
    merged_start = 1
    merged_end = min(8, len(raw_rows))  # Visa Integrity through Air ticket
    has_note = bool(note_text)

    for idx in range(merged_start, len(raw_rows)):
        left = plain_cell(Markup(emphasize_keywords(cell_at(raw_rows[idx], 0))))

        if has_note and idx == merged_start:
            # First merged row gets the rowspan cell
            note_html = note_text.replace("\n\n", "<br/><br/>")
            right = {
                "text": Markup('<em class="hb-merged-note">' + h(note_html).replace("&lt;br/&gt;&lt;br/&gt;", "<br/><br/>") + "</em>"),
                "colspan": 1,
                "rowspan": merged_end - merged_start,
            }
            rows.append([left, right])
        elif has_note and idx < merged_end:
            # Subsequent merged rows — no right cell (covered by rowspan)
            rows.append([left])
        else:
            # No merged cell covers this row — emit an empty right cell
            rows.append([left, plain_cell("")])

    return rows


def _extract_rate_italic(text: str) -> str:
    """Extract the italic portion: 'Interest rate of 12% – 15% Market Rate PA'."""
    # Text is like: "Interest rate of 12% – 15% Market Rate: UP TO USD 10,000"
    m = re.match(r"(Interest rate.*?(?:Market Rate|PA))", text, re.IGNORECASE)
    if m:
        return m.group(1).rstrip(": ")
    # Fallback: everything before the colon
    if ":" in text:
        return text.split(":")[0].strip()
    return text


def _extract_rate_amount(text: str) -> str:
    """Extract the amount portion: 'UP TO USD 10,000'."""
    m = re.search(r"(UP TO.*)", text, re.IGNORECASE)
    if m:
        return m.group(1).strip()
    if ":" in text:
        return text.split(":", 1)[1].strip()
    return ""


# ───────────────────────────────────────────────────────────────
# Tier 2 (cosigner) section post-processor
# ───────────────────────────────────────────────────────────────

def _postprocess_tier2(blocks: list[RenderBlock]) -> list[RenderBlock]:
    """Apply the Tier 2 reference styling to a section's blocks.

    Any ``bullet_list`` that directly follows another ``bullet_list`` (the
    sub-bullets under Sources of Funds) is re-emitted with the checkmark
    class ``hb-sub-bullets`` instead of the default arrows.  All other
    blocks pass through unchanged.
    """
    styled: list[RenderBlock] = []
    previous_type = None
    for block in blocks:
        current_type = block.block_type
        if current_type == "bullet_list" and previous_type == "bullet_list":
            # Sub-bullet list: same payload, checkmark class.
            block = RenderBlock(
                block_type="bullet_list",
                css_class="hb-bullet-list hb-sub-bullets",
                data=block.data,
            )
        styled.append(block)
        previous_type = current_type
    return styled


def _normalize_doc_v1(blocks: list, *, skip_title: str = "") -> list[RenderBlock]:
    """Normalise doc_v1 blocks into typed RenderBlocks.

    Entries that are not dicts are ignored, and so is any block whose
    ``type`` has no branch below (a plain ``heading`` included).  Mapping:

    - ``paragraph``                -> ``paragraph``
    - ``subheading``               -> ``heading_2``
    - ``bullets``                  -> ``bullet_list``
    - ``numbered_list``            -> ``bullet_list`` with ``ordered=True``
    - ``note`` / ``note_inline``   -> ``note``
    - ``table_v1`` / ``table``     -> ``table`` (variant ``standard``)
    - ``table_v3`` / ``table_v4``  -> ``table`` (variant ``spanning``)

    Text blocks and lists that end up empty are dropped; table blocks are
    always emitted, even when they have no rows.  JSON ``null`` values are
    treated as empty text rather than the literal string "None", and
    colspan/rowspan values that are not plain non-negative decimal integers
    fall back to 1.

    Args:
        blocks: Raw doc_v1 block list as decoded from the section JSON.
        skip_title: When set, every heading/subheading block whose text
            matches this title (case-insensitive, surrounding whitespace
            ignored) is dropped to avoid duplicating the section heading
            already emitted by the caller.

    Returns:
        The render blocks, in the same order as the input blocks.
    """
    from markupsafe import Markup

    def as_text(value: Any) -> str:
        # JSON null must not leak into the handbook as the text "None".
        return "" if value is None else str(value)

    def as_list(value: Any) -> list:
        return value if isinstance(value, list) else []

    def clean(value: Any) -> str:
        return _normalize_text_content(as_text(value))

    def cell_html(value: Any) -> str:
        # Table cells stay plain ``str`` (not Markup), exactly as before.
        return emphasize_keywords(clean(value))

    def span(value: Any) -> int:
        # ``isdecimal`` accepts exactly what ``int`` can parse; ``isdigit``
        # also accepts characters such as "²", which made ``int`` raise.
        digits = as_text(value)
        return int(digits) if digits.isdecimal() else 1

    def list_entries(block: dict) -> list:
        # Blank (or null) items are skipped before normalisation.
        stripped = (as_text(item).strip() for item in as_list(block.get("items", [])))
        # NOTE(review): wrapping in Markup assumes emphasize_keywords returns
        # HTML that is already safe to emit — confirm against its definition.
        return [
            Markup(emphasize_keywords(_normalize_text_content(item)))
            for item in stripped
            if item
        ]

    _skip_norm = skip_title.strip().lower() if skip_title else ""
    result: list[RenderBlock] = []
    for b in blocks:
        if not isinstance(b, dict):
            continue
        btype = as_text(b.get("type"))

        # Skip heading/subheading blocks that duplicate the section title
        if _skip_norm and btype in ("heading", "subheading"):
            if as_text(b.get("text")).strip().lower() == _skip_norm:
                continue

        if btype == "paragraph":
            t = clean(b.get("text"))
            if t.strip():
                result.append(RenderBlock(
                    block_type="paragraph",
                    css_class="hb-paragraph",
                    data={
                        "text": t,
                        "html": Markup(emphasize_keywords(t)),
                    },
                ))

        elif btype == "subheading":
            t = clean(b.get("text"))
            if t.strip():
                result.append(RenderBlock(
                    block_type="heading_2",
                    css_class="hb-heading-2",
                    data={"text": t},
                ))

        elif btype == "bullets":
            html_items = list_entries(b)
            if html_items:
                result.append(RenderBlock(
                    block_type="bullet_list",
                    css_class="hb-bullet-list",
                    data={"entries": html_items, "html_entries": True},
                ))

        elif btype == "numbered_list":
            html_items = list_entries(b)
            if html_items:
                result.append(RenderBlock(
                    block_type="bullet_list",
                    css_class="hb-bullet-list hb-numbered-list",
                    data={"entries": html_items, "ordered": True, "html_entries": True},
                ))

        elif btype == "note":
            t = clean(b.get("text"))
            if t.strip():
                result.append(RenderBlock(
                    block_type="note",
                    css_class="hb-note",
                    data={"text": t},
                ))

        elif btype == "note_inline":
            normalized_parts = []
            for p in as_list(b.get("parts", [])):
                if not isinstance(p, dict):
                    continue
                t = clean(p.get("text"))
                if t:
                    normalized_parts.append({
                        "text": t,
                        "style": as_text(p.get("style")),
                    })
            if normalized_parts:
                result.append(RenderBlock(
                    block_type="note",
                    css_class="hb-note",
                    data={"parts": normalized_parts, "inline": True},
                ))

        elif btype == "table_v1":
            # Plain string columns; only list-shaped rows are kept.
            t_cols = as_list(b.get("columns", []))
            norm_rows = [
                [cell_html(cell) for cell in r]
                for r in as_list(b.get("rows", []))
                if isinstance(r, list)
            ]
            result.append(RenderBlock(
                block_type="table",
                css_class="hb-table",
                data={
                    "columns": [as_text(c) for c in t_cols],
                    "rows": norm_rows,
                    "variant": "standard",
                },
            ))

        elif btype == "table":
            # Generic table (columns may be objects or strings, rows may be dicts or lists)
            col_labels = []
            col_keys = []
            for c in as_list(b.get("columns", [])):
                if isinstance(c, dict):
                    col_labels.append(as_text(c.get("label", c.get("key", ""))))
                    col_keys.append(as_text(c.get("key", "")))
                else:
                    label = as_text(c)
                    col_labels.append(label)
                    # String columns get a slug key so dict rows can be looked up.
                    col_keys.append(re.sub(r"[^a-z0-9]+", "_", label.lower()))
            norm_rows = []
            for r in as_list(b.get("rows", [])):
                if isinstance(r, dict):
                    norm_rows.append([cell_html(r.get(k, "")) for k in col_keys])
                elif isinstance(r, list):
                    norm_rows.append([cell_html(cell) for cell in r])
            result.append(RenderBlock(
                block_type="table",
                css_class="hb-table",
                data={"columns": col_labels, "rows": norm_rows, "variant": "standard"},
            ))

        elif btype in ("table_v3", "table_v4"):
            # Spanning tables: every cell becomes {text, colspan, rowspan}.
            norm_rows = []
            for r in as_list(b.get("rows", [])):
                if not isinstance(r, list):
                    continue
                norm_row = []
                for cell in r:
                    if isinstance(cell, dict):
                        norm_row.append({
                            "text": cell_html(cell.get("text", "")),
                            "colspan": span(cell.get("colspan")),
                            "rowspan": span(cell.get("rowspan")),
                        })
                    else:
                        norm_row.append({
                            "text": cell_html(cell),
                            "colspan": 1,
                            "rowspan": 1,
                        })
                norm_rows.append(norm_row)
            result.append(RenderBlock(
                block_type="table",
                css_class="hb-table",
                data={"rows": norm_rows, "variant": "spanning"},
            ))

    return result