| """Normalization layer — converts raw MySQL handbook content into typed render blocks. |
| |
| Each section_json from the database is parsed into a list of RenderBlock |
| objects. Every block has a `block_type` that maps 1-to-1 to a Jinja |
| partial and a CSS class. This prevents ad-hoc interpretation of raw |
| JSON throughout the rendering pipeline. |
| |
| Block types (from theme.BLOCK_TYPES): |
| heading_1, heading_2, paragraph, bullet_list, note, table, |
| enrollment_steps, school_profile, university_summary, toc, |
| cover, full_page_image |
| """ |
|
|
| from __future__ import annotations |
|
|
| import re |
| from urllib.parse import quote_plus |
| from dataclasses import dataclass, field |
| from typing import Any |
|
|
| from app.services.renderers import _extract_university_funding |
| from app.services.utils import ( |
| ensure_program_options_pair, |
| emphasize_keywords, |
| format_money_figures, |
| get_any, |
| h, |
| hb_slug, |
| is_assoc, |
| is_truthy, |
| linkify_urls, |
| ) |
| from app.services.renderers import fetch_image_data_uri |
|
|
|
|
| |
| |
| |
|
|
@dataclass
class RenderBlock:
    """One typed unit of renderable handbook content.

    Fields are accepted positionally in the order ``block_type``,
    ``css_class``, ``data``; only ``block_type`` is required.
    """

    # Identifier of the block kind (for example "paragraph" or "table").
    block_type: str
    # CSS class string attached to the block; empty when not supplied.
    css_class: str = ""
    # Free-form payload for the block; each instance gets its own dict.
    data: dict[str, Any] = field(default_factory=dict)
|
|
|
|
| |
| |
| |
|
|
def normalize_section(
    section_key: str,
    section_title: str,
    section_json: dict | list,
    *,
    universities: list[dict] | None = None,
    debug: bool = False,
) -> list[RenderBlock]:
    """Convert a single global section payload into a list of RenderBlocks.

    This is the single translation point between the database schema
    and the rendering layer.

    A ``heading_1`` block is emitted first (unless the section key is
    ``table_of_contents`` or no title is available), then the payload is
    dispatched on its shape, first match wins:

    1. ``steps`` list            -> ``enrollment_steps``
    2. ``bullets`` / ``items``   -> ``bullet_list`` (+ optional ``note``)
    3. ``columns`` + ``rows``    -> basic ``table``
    4. ``layout == "table_v2"``  -> comparison ``table``
    5. ``layout == "doc_v1"``    -> blocks from ``_normalize_doc_v1``
    6. ``text``                  -> ``paragraph``

    Args:
        section_key: Section identifier; compared lower-cased and stripped.
        section_title: Fallback title used when the payload has no ``title``.
        section_json: Section payload. Anything that is not a dict is
            treated as an empty payload.
        universities: Not read by this function.
        debug: Not read by this function.

    Returns:
        The render blocks for the section (possibly only the heading, or
        empty).
    """
    from markupsafe import Markup

    def _text(value: Any) -> str:
        """Return *value* as stripped text; JSON null becomes ""."""
        return "" if value is None else str(value).strip()

    blocks: list[RenderBlock] = []
    key_norm = section_key.lower().strip()

    if not isinstance(section_json, dict):
        section_json = {}

    layout_norm = _text(section_json.get("layout")).lower()

    # A payload title wins over the DB title; a null payload title must not
    # turn into the literal heading "None".
    title = _text(section_json.get("title")) or _text(section_title)
    if title and key_norm != "table_of_contents":
        blocks.append(RenderBlock(
            block_type="heading_1",
            css_class="hb-heading-1",
            data={"text": title},
        ))

    # 1. Enrollment steps.
    steps = section_json.get("steps")
    if isinstance(steps, list):
        blocks.append(RenderBlock(
            block_type="enrollment_steps",
            css_class="hb-enrollment-steps",
            data={"steps": _normalize_steps(steps)},
        ))
        return blocks

    # 2. Bullet list, optionally followed by a note.
    bullets = section_json.get("bullets")
    items = section_json.get("items")
    has_bullets = isinstance(bullets, list)
    has_items = isinstance(items, list)
    if has_bullets or (layout_norm == "bullets_with_note" and has_items):
        # ``items`` takes precedence over ``bullets`` whenever it is a list.
        source = items if has_items else bullets
        stripped = (_text(entry) for entry in source)
        entries = [_normalize_text_content(entry) for entry in stripped if entry]
        html_items = [Markup(emphasize_keywords(entry)) for entry in entries]
        blocks.append(RenderBlock(
            block_type="bullet_list",
            css_class="hb-bullet-list",
            data={"entries": html_items, "html_entries": True},
        ))

        # ``footnote`` is used when ``note`` is absent or null.
        note_raw = section_json.get("note")
        if note_raw is None:
            note_raw = section_json.get("footnote")
        note = _normalize_text_content(_text(note_raw))
        if note:
            blocks.append(RenderBlock(
                block_type="note",
                css_class="hb-note",
                data={"text": note},
            ))
        return blocks

    # 3. Basic table.
    cols = section_json.get("columns")
    rows = section_json.get("rows")
    if isinstance(cols, list) and isinstance(rows, list):
        blocks.append(_normalize_basic_table(cols, rows))
        return blocks

    # 4. Comparison table.
    if layout_norm == "table_v2":
        blocks.append(_normalize_table_v2(section_json))
        return blocks

    # 5. Structured document, with per-section post-processing.
    if layout_norm == "doc_v1" and isinstance(section_json.get("blocks"), list):
        blocks.extend(_normalize_doc_v1(section_json["blocks"], skip_title=title))

        if key_norm == "program_features_breakdown":
            blocks = _postprocess_breakdown(blocks, section_json["blocks"])

        if key_norm == "summary_of_universities_cosigner":
            blocks = _postprocess_tier2(blocks)
        return blocks

    # 6. Plain text paragraph (the text itself is not stripped).
    raw_text = section_json.get("text")
    if raw_text is not None:
        text = _normalize_text_content(str(raw_text))
        if text.strip():
            blocks.append(RenderBlock(
                block_type="paragraph",
                css_class="hb-paragraph",
                data={
                    "text": text,
                    "html": Markup(emphasize_keywords(text)),
                },
            ))

    return blocks
|
|
|
|
def _normalize_text_content(text: str) -> str:
    """Run the shared handbook text clean-up on *text*.

    The text is passed through ``format_money_figures`` first and the
    result through ``ensure_program_options_pair``.
    """
    with_money = format_money_figures(text)
    return ensure_program_options_pair(with_money)
|
|
|
|
| |
| |
| |
|
|
def normalize_university(
    uni_raw: dict[str, Any],
    allow_remote: bool,
    include_inactive_programs: bool,
    debug: bool,
    stats: dict[str, Any],
) -> RenderBlock:
    """Convert raw university data into a school_profile RenderBlock.

    Args:
        uni_raw: Raw university record. ``name`` is required; ``sections``,
            ``website``, ``anchor``, ``sort_order``, ``school_category`` and
            ``is_active`` are read when present. The record is not modified.
        allow_remote: When true, the campus image URL is fetched through
            ``fetch_image_data_uri`` and embedded.
        include_inactive_programs: When false, programs whose activity flag
            is not truthy are dropped.
        debug: Forwarded to ``normalize_section`` for extra sections.
        stats: Counter dict updated in place (``universities``,
            ``images_embedded``, ``images_placeholder``,
            ``university_links``, ``website_rows``).

    Returns:
        A ``school_profile`` block whose ``data`` carries the profile fields.

    Raises:
        KeyError: If ``uni_raw`` has no ``name`` key.
    """

    def _raw(value: Any) -> str:
        """Return *value* as text; JSON null becomes "" instead of "None"."""
        return "" if value is None else str(value)

    def _first_present(mapping: dict, *keys: str) -> Any:
        """Return the first non-null value stored under *keys*, else None."""
        for candidate in keys:
            value = mapping.get(candidate)
            if value is not None:
                return value
        return None

    def _bump(counter: str) -> None:
        """Increment a named counter in ``stats``."""
        stats[counter] = stats.get(counter, 0) + 1

    uni_name = uni_raw["name"]
    sections = uni_raw.get("sections") or []

    _bump("universities")

    # Index sections by key. The last section wins, except "programs":
    # repeated programs sections are concatenated.
    sec_map: dict[str, dict] = {}
    for section in sections:
        if not isinstance(section, dict):
            continue
        key = _raw(section.get("section_key"))
        if not key:
            continue
        if key == "programs" and key in sec_map:
            merged = sec_map["programs"]
            existing = merged.get("section_json", {})
            incoming = section.get("section_json", {})
            if not isinstance(existing, dict):
                existing = {}
            if not isinstance(incoming, dict):
                incoming = {}
            first = existing.get("programs", [])
            extra = incoming.get("programs", [])
            if not isinstance(first, list):
                first = []
            if not isinstance(extra, list):
                extra = []
            # Build new dicts so the caller's record is left untouched; an
            # in-place merge would accumulate programs on every repeat call.
            sec_map["programs"] = {
                **merged,
                "section_json": {**existing, "programs": first + extra},
            }
            continue
        sec_map[key] = section

    # Campus image: embedded when remote fetching is allowed and succeeds,
    # otherwise counted as a placeholder.
    img_section = sec_map.get("campus_image") or sec_map.get("image")
    campus_image = ""
    campus_caption = ""
    if img_section:
        img_json = img_section.get("section_json", {})
        if isinstance(img_json, dict):
            campus_url = _raw(img_json.get("image_url")).strip()
            campus_caption = _raw(img_json.get("caption")).strip()
            embedded = None
            if allow_remote and campus_url:
                embedded = fetch_image_data_uri(campus_url)
            if embedded:
                campus_image = embedded
                _bump("images_embedded")
            else:
                _bump("images_placeholder")

    # Website and overview facts; the record-level website takes precedence
    # over one found in the overview section.
    resolved_website = (uni_raw.get("website") or "").strip()
    overview_data = None

    if "overview" in sec_map:
        overview_json = sec_map["overview"].get("section_json", {})
        if not isinstance(overview_json, dict):
            overview_json = {}

        site_from_overview = get_any(
            overview_json,
            ["university_website", "university_website_url", "website",
             "site", "url", "homepage", "web_url"],
        )
        if not resolved_website and site_from_overview:
            resolved_website = site_from_overview

        overview_data = {
            "founded": get_any(overview_json, ["founded", "Founded"]),
            "total_students": get_any(overview_json, ["total_students", "Total Students"]),
            "undergraduates": get_any(overview_json, [
                "undergraduates", "Undergraduate Students", "undergraduate_students",
            ]),
            "postgraduates": get_any(overview_json, [
                "postgraduate_students", "Postgraduate Students",
            ]),
            "acceptance_rate": get_any(overview_json, ["acceptance_rate", "Acceptance Rate"]),
            "location": get_any(overview_json, ["location", "Location"]),
            "tuition": format_money_figures(str(get_any(overview_json, [
                "tuition_out_of_state_yearly",
                "Yearly Out of State Tuition Fees",
                "Yearly Out-of-State Tuition Fees",
                "Yearly Tuition Fees",
                "Yearly Out-of-State Tuition Fees:",
            ]) or "")) or None,
        }

    if resolved_website:
        _bump("university_links")
        _bump("website_rows")

    # Benefits and funding.
    benefits: list[str] = []
    funding_heading = "Funding Available"
    funding_items: list[str] = []
    if "benefits" in sec_map:
        benefits_json = sec_map["benefits"].get("section_json", {})
        if not isinstance(benefits_json, dict):
            benefits_json = {}
        raw_benefits = benefits_json.get("benefits", [])
        if isinstance(raw_benefits, list):
            stripped = (_raw(entry).strip() for entry in raw_benefits)
            benefits = [_normalize_text_content(entry) for entry in stripped if entry]

        funding_heading, funding_items = _extract_university_funding(
            benefits_json,
            {
                "school_category": uni_raw.get("school_category"),
                "status": "in" if is_truthy(uni_raw.get("is_active", True)) else "out",
            },
        )
        funding_items = [_normalize_text_content(item) for item in funding_items]

    # Programs: optional activity filter, then de-duplication by
    # case-insensitive name (first occurrence wins).
    programs = None
    if "programs" in sec_map:
        programs_json = sec_map["programs"].get("section_json", {})
        if not isinstance(programs_json, dict):
            programs_json = {}
        programs_raw = programs_json.get("programs", [])
        if not isinstance(programs_raw, list):
            programs_raw = []

        if not include_inactive_programs:
            programs_raw = [
                p for p in programs_raw
                if isinstance(p, dict) and is_truthy(
                    p.get("program_active", p.get("is_active", p.get("active", 1)))
                )
            ]

        programs = []
        seen_names: set[str] = set()
        for program in programs_raw:
            if not isinstance(program, dict):
                continue
            program_name = _normalize_text_content(_raw(program.get("program_name")).strip())
            dedupe_key = program_name.lower()
            if dedupe_key in seen_names:
                continue
            seen_names.add(dedupe_key)

            link = _raw(program.get("program_link")).strip()
            if not link and isinstance(program.get("program_links"), dict):
                link = _raw(program["program_links"].get("web_link")).strip()

            programs.append({
                "name": program_name,
                "link": link,
                "designation": _normalize_text_content(_raw(program.get("designation"))),
                "entrance": _normalize_text_content(
                    _raw(_first_present(program, "entrance_exam", "entrance_examination"))
                ),
            })

    # Every remaining section is normalized generically, in source order.
    skip_keys = {"campus_image", "image", "overview", "benefits", "programs"}
    extra_blocks: list[list[RenderBlock]] = []
    for section in sections:
        if not isinstance(section, dict):
            continue
        key = _raw(section.get("section_key"))
        if not key or key in skip_keys:
            continue
        extra_title = _raw(section.get("section_title"))
        extra_json = section.get("section_json", {})
        if not isinstance(extra_json, dict):
            extra_json = {}
        extra_blocks.append(normalize_section(key, extra_title, extra_json, debug=debug))

    return RenderBlock(
        block_type="school_profile",
        css_class="hb-school-profile page-break",
        data={
            "name": uni_name,
            "anchor": uni_raw.get("anchor"),
            "sort_order": uni_raw.get("sort_order"),
            "website": resolved_website,
            "overview": overview_data,
            "campus_image": campus_image,
            "campus_caption": campus_caption,
            "benefits": benefits,
            "funding_heading": funding_heading,
            "funding_items": funding_items,
            "programs": programs,
            "extra_blocks": extra_blocks,
        },
    )
|
|
|
|
| |
| |
| |
|
|
def _normalize_steps(steps: list) -> list[dict]:
    """Normalise enrollment steps into structured dicts.

    Non-dict entries are skipped and do not consume a step number.

    Special cases keyed on the step number:

    * Step 1: the first Telegram link found in the body becomes
      ``telegram_url``; when no QR image is supplied a QR-code URL is built
      for it, and the link plus its stock description sentence are removed
      from the body.
    * Step 2: links to internationalscholarsprogram.com get an ``https://``
      scheme when missing, and a default link to that site is appended when
      none is present.

    Args:
        steps: Raw step entries.

    Returns:
        One dict per step with keys ``number``, ``title``, ``body``,
        ``body_html``, ``links``, ``plain_links`` (always empty), ``qr_url``
        and ``telegram_url``.
    """
    from markupsafe import Markup

    def _first_text(mapping: dict, *keys: str, default: str = "") -> str:
        """Return the first non-null value under *keys* as stripped text."""
        for candidate in keys:
            value = mapping.get(candidate)
            if value is not None:
                return str(value).strip()
        return default

    isp_domain = "internationalscholarsprogram.com"
    telegram_link_re = re.compile(
        r"(https?://(?:t\.me|telegram\.me)/[^\s<)]+)", flags=re.IGNORECASE
    )

    result = []
    step_num = 0
    for step in steps:
        if not isinstance(step, dict):
            continue
        step_num += 1
        step_title = _first_text(step, "title", "step_title")
        body = _normalize_text_content(_first_text(step, "body", "description"))

        links = []
        raw_links = step.get("links", [])
        if isinstance(raw_links, list):
            for lnk in raw_links:
                if not isinstance(lnk, dict):
                    continue
                label = _first_text(lnk, "label", default="Link")
                url = _first_text(lnk, "url")
                if not url:
                    continue
                low_url = url.lower()
                is_telegram = "telegram" in label.lower() or "t.me" in low_url
                if (step_num == 2
                        and isp_domain in low_url
                        and not re.match(r"^https?://", url, flags=re.IGNORECASE)):
                    url = "https://" + url
                # Telegram links show their URL instead of the label.
                links.append({"label": url if is_telegram else label, "url": url})

        if step_num == 2 and not any(isp_domain in link["url"].lower() for link in links):
            links.append({
                "label": "www.internationalscholarsprogram.com",
                "url": "https://www.internationalscholarsprogram.com",
            })

        qr = _first_text(step, "qr_url", "qr_image")
        telegram_url = ""
        if step_num == 1:
            match = telegram_link_re.search(body)
            if match:
                telegram_url = match.group(1)
                if not qr:
                    qr = (
                        "https://api.qrserver.com/v1/create-qr-code/?size=160x160&data="
                        + quote_plus(telegram_url)
                    )
                # The link is exposed via telegram_url/qr_url, so drop it and
                # its description from the body text.
                body = telegram_link_re.sub("", body)
                body = re.sub(r"This telegram group will help you interact with program administrators and other prospective students where you can ask any questions you may have about the program\.?", "", body, flags=re.IGNORECASE)
                body = re.sub(r"\n{2,}", "\n", body).strip()

        body_html = Markup(emphasize_keywords(body)) if body else ""

        result.append({
            "number": step_num,
            "title": step_title,
            "body": body,
            "body_html": body_html,
            "links": links,
            # Kept in the output for consumers of this key; never populated.
            "plain_links": [],
            "qr_url": qr,
            "telegram_url": telegram_url,
        })
    return result
|
|
|
|
def _normalize_basic_table(cols: list, rows: list) -> RenderBlock:
    """Normalise a basic table (columns + rows).

    List rows are rendered cell by cell. Dict rows are projected onto the
    columns: for each column label the row is searched under, in order, the
    slug of the label (lower-cased, non-alphanumeric runs replaced by
    ``_``), that slug without leading/trailing underscores, and the label
    text itself. Missing and null cells render as empty text. Rows that are
    neither list nor dict are skipped.

    Args:
        cols: Column labels.
        rows: Table rows (lists or dicts).

    Returns:
        A ``table`` block with ``variant`` set to ``"standard"``.
    """

    def _render(cell: Any) -> str:
        """Run one cell through text clean-up, emphasis and link markup."""
        text = "" if cell is None else str(cell)
        return linkify_urls(emphasize_keywords(_normalize_text_content(text)))

    def _candidate_keys(label: Any) -> list[str]:
        """Return the row keys to try for a column label, best guess first."""
        slug = re.sub(r"[^a-z0-9]+", "_", str(label).lower())
        # "Cost (USD)" slugs to "cost_usd_"; also try "cost_usd" and the
        # label text itself. dict.fromkeys de-duplicates, keeping order.
        return list(dict.fromkeys([slug, slug.strip("_"), str(label)]))

    # Computed once per column rather than once per row and column.
    column_keys = [_candidate_keys(label) for label in cols]

    norm_rows = []
    for row in rows:
        if isinstance(row, dict):
            cells = []
            for keys in column_keys:
                hit = next((key for key in keys if key in row), None)
                cells.append(_render(row[hit] if hit is not None else ""))
            norm_rows.append(cells)
        elif isinstance(row, list):
            norm_rows.append([_render(cell) for cell in row])

    return RenderBlock(
        block_type="table",
        css_class="hb-table",
        data={
            "columns": [str(c) for c in cols],
            "rows": norm_rows,
            "variant": "standard",
        },
    )
|
|
|
|
def _normalize_table_v2(json_data: dict) -> RenderBlock:
    """Build the comparison-table block for a ``table_v2`` payload.

    ``base_columns``, ``header_groups`` and ``rows`` are read from
    *json_data*; any of them that is not a list counts as empty, and
    non-dict entries inside them are ignored. Each row is reduced to the
    keys of the flattened column list (base columns first, then every
    group's columns in order); a dict cell contributes its ``text`` value.
    """

    def _list_or_empty(value):
        return value if isinstance(value, list) else []

    def _spec(column: dict) -> dict:
        return {
            "key": str(column.get("key", "")),
            "label": str(column.get("label", "")),
        }

    base_dicts = [
        col for col in _list_or_empty(json_data.get("base_columns", []))
        if isinstance(col, dict)
    ]
    group_dicts = [
        grp for grp in _list_or_empty(json_data.get("header_groups", []))
        if isinstance(grp, dict)
    ]
    raw_rows = _list_or_empty(json_data.get("rows", []))

    # Flatten base + grouped columns and build the group headers in one pass.
    flat_columns: list[dict] = [_spec(col) for col in base_dicts]
    header_groups = []
    for grp in group_dicts:
        members = [
            col for col in _list_or_empty(grp.get("columns", []))
            if isinstance(col, dict)
        ]
        flat_columns.extend(_spec(col) for col in members)
        header_groups.append({
            "label": str(grp.get("label", "")),
            "columns": [_spec(col) for col in members],
        })

    rendered_rows = []
    for raw in raw_rows:
        if not isinstance(raw, dict):
            continue
        rendered = {}
        for spec in flat_columns:
            column_key = spec.get("key", "")
            value = raw.get(column_key, "")
            if isinstance(value, dict):
                value = value.get("text", "")
            rendered[column_key] = emphasize_keywords(_normalize_text_content(str(value)))
        rendered_rows.append(rendered)

    payload = {
        # Base columns are passed through without string coercion.
        "base_columns": [
            {"key": col.get("key", ""), "label": col.get("label", "")}
            for col in base_dicts
        ],
        "header_groups": header_groups,
        "all_columns": flat_columns,
        "rows": rendered_rows,
        "variant": "comparison",
    }
    return RenderBlock(
        block_type="table",
        css_class="hb-table hb-table-comparison",
        data=payload,
    )
|
|
|
|
| |
| |
| |
|
|
def _postprocess_breakdown(
    blocks: list[RenderBlock],
    raw_blocks: list,
) -> list[RenderBlock]:
    """Rewrite the breakdown section to match the reference layout.

    - "Relocation Cost" becomes a banner heading with page-break-before
    - The relocation table gets a merged right cell (rowspan) with the
      cost-coverage note moved inside it
    - "ISP FINANCING" becomes an inline note with mixed bold/italic
    - "NB: CREDIT FACILITY" is styled green
    - Dollar amounts in parentheticals keep their original $ format

    Args:
        blocks: Normalized blocks of the section, in order.
        raw_blocks: The raw doc_v1 blocks the section was built from; the
            relocation table and its note are re-read from here so their
            text is taken before normalization.

    Returns:
        A new list of blocks; untouched blocks are passed through as-is.
    """
    # Pass 1: in the raw blocks, find the first table_v1 after a
    # "relocation" subheading and the first paragraph after that table.
    raw_reloc_table = None
    raw_note_after_table = None
    found_reloc = False
    for rb in raw_blocks:
        if not isinstance(rb, dict):
            continue
        rb_type = rb.get("type")
        if rb_type == "subheading" and "relocation" in str(rb.get("text", "")).lower():
            found_reloc = True
            continue
        if not found_reloc:
            continue
        if rb_type == "table_v1" and raw_reloc_table is None:
            raw_reloc_table = rb
            continue
        if raw_reloc_table and rb_type == "paragraph":
            raw_note_after_table = rb
            break

    # Pass 2: walk the normalized blocks with an explicit index because some
    # rewrites consume more than one block.
    result: list[RenderBlock] = []
    i = 0
    while i < len(blocks):
        blk = blocks[i]
        blk_text = blk.data.get("text", "").lower()

        # Relocation heading -> banner, followed by the spanning table.
        if blk.block_type == "heading_2" and "relocation" in blk_text:
            result.append(RenderBlock(
                block_type="heading_2",
                css_class="hb-heading-2 hb-banner-heading page-break",
                data={"text": blk.data["text"]},
            ))
            i += 1

            if i < len(blocks) and blocks[i].block_type == "table" and raw_reloc_table:
                raw_rows = raw_reloc_table.get("rows", [])
                note_text = ""
                if raw_note_after_table:
                    raw_note = raw_note_after_table.get("text", "")
                    note_text = "" if raw_note is None else str(raw_note)

                spanning_rows = _build_relocation_spanning_rows(raw_rows, note_text)
                result.append(RenderBlock(
                    block_type="table",
                    css_class="hb-table hb-relocation-table",
                    data={"rows": spanning_rows, "variant": "spanning"},
                ))
                i += 1

                # The note now lives inside the table; skip the paragraph
                # that directly follows it so it is not rendered twice.
                if (i < len(blocks)
                        and blocks[i].block_type == "paragraph"
                        and note_text):
                    i += 1
            continue

        # "ISP FINANCING" heading (+ optional rate paragraph) -> inline note.
        if blk.block_type == "heading_2" and "isp financing" in blk_text:
            rate_text = ""
            if i + 1 < len(blocks) and blocks[i + 1].block_type == "paragraph":
                rate_text = blocks[i + 1].data.get("text", "")
            result.append(RenderBlock(
                block_type="note",
                css_class="hb-note hb-isp-financing",
                data={
                    "parts": [
                        {"text": "ISP FINANCING", "style": "bold"},
                        {"text": " (" + _extract_rate_italic(rate_text) + "): " if rate_text else "", "style": "italic"},
                        {"text": _extract_rate_amount(rate_text), "style": "bold"},
                    ],
                    "inline": True,
                },
            ))
            i += 1
            if rate_text:
                # The rate paragraph was folded into the note.
                i += 1
            continue

        # Credit-facility note gets its own CSS class.
        if blk.block_type == "note" and "credit facility" in blk_text:
            result.append(RenderBlock(
                block_type="note",
                css_class="hb-note hb-credit-note",
                data=blk.data,
            ))
            i += 1
            continue

        result.append(blk)
        i += 1

    return result
|
|
|
|
| def _build_relocation_spanning_rows( |
| raw_rows: list, note_text: str, |
| ) -> list[list[dict]]: |
| """Build spanning rows for the relocation cost table. |
| |
| Row 0: normal 2-column (consultation fees | Covered in the contribution) |
| Rows 1-7: left cell per row, right cell merged (rowspan) with italic note |
| Rows 8+: left cell only, empty right |
| """ |
| from markupsafe import Markup |
|
|
| if not raw_rows: |
| return [] |
|
|
| rows: list[list[dict]] = [] |
|
|
| |
| first = raw_rows[0] if raw_rows else ["", ""] |
| rows.append([ |
| {"text": Markup(emphasize_keywords(str(first[0] if len(first) > 0 else ""))), "colspan": 1, "rowspan": 1}, |
| {"text": Markup("<em>" + h(str(first[1] if len(first) > 1 else "")) + "</em>"), "colspan": 1, "rowspan": 1}, |
| ]) |
|
|
| |
| |
| merged_start = 1 |
| merged_end = min(8, len(raw_rows)) |
|
|
| for idx in range(merged_start, len(raw_rows)): |
| cell_text = str(raw_rows[idx][0] if len(raw_rows[idx]) > 0 else "") |
| left = {"text": Markup(emphasize_keywords(cell_text)), "colspan": 1, "rowspan": 1} |
|
|
| if idx == merged_start and note_text: |
| |
| span_count = merged_end - merged_start |
| note_html = note_text.replace("\n\n", "<br/><br/>") |
| right = { |
| "text": Markup('<em class="hb-merged-note">' + h(note_html).replace("<br/><br/>", "<br/><br/>") + "</em>"), |
| "colspan": 1, |
| "rowspan": span_count, |
| } |
| rows.append([left, right]) |
| elif idx < merged_end: |
| |
| rows.append([left]) |
| else: |
| |
| rows.append([ |
| left, |
| {"text": "", "colspan": 1, "rowspan": 1}, |
| ]) |
|
|
| return rows |
|
|
|
|
| def _extract_rate_italic(text: str) -> str: |
| """Extract the italic portion: 'Interest rate of 12% – 15% Market Rate PA'.""" |
| |
| m = re.match(r"(Interest rate.*?(?:Market Rate|PA))", text, re.IGNORECASE) |
| if m: |
| return m.group(1).rstrip(": ") |
| |
| if ":" in text: |
| return text.split(":")[0].strip() |
| return text |
|
|
|
|
| def _extract_rate_amount(text: str) -> str: |
| """Extract the amount portion: 'UP TO USD 10,000'.""" |
| m = re.search(r"(UP TO.*)", text, re.IGNORECASE) |
| if m: |
| return m.group(1).strip() |
| if ":" in text: |
| return text.split(":", 1)[1].strip() |
| return "" |
|
|
|
|
| |
| |
| |
|
|
| def _postprocess_tier2(blocks: list[RenderBlock]) -> list[RenderBlock]: |
| """Style the Tier 2 section to match the reference layout. |
| |
| - Second consecutive bullet_list (sub-bullets under Sources of Funds) |
| gets checkmark styling instead of arrows. |
| """ |
| result: list[RenderBlock] = [] |
| prev_was_bullet = False |
| for blk in blocks: |
| if blk.block_type == "bullet_list": |
| if prev_was_bullet: |
| |
| result.append(RenderBlock( |
| block_type="bullet_list", |
| css_class="hb-bullet-list hb-sub-bullets", |
| data=blk.data, |
| )) |
| else: |
| result.append(blk) |
| prev_was_bullet = True |
| else: |
| prev_was_bullet = False |
| result.append(blk) |
| return result |
|
|
|
|
def _normalize_doc_v1(blocks: list, *, skip_title: str = "") -> list[RenderBlock]:
    """Normalise doc_v1 blocks into typed RenderBlocks.

    Handled block types: ``paragraph``, ``subheading``, ``bullets``,
    ``numbered_list``, ``note``, ``note_inline``, ``table_v1``, ``table``,
    ``table_v3`` and ``table_v4``. Non-dict entries and blocks of any other
    type (including ``heading`` blocks) produce no output. JSON null text
    and cells are treated as empty.

    Args:
        blocks: Raw doc_v1 block dicts.
        skip_title: When set, every heading/subheading block whose text
            matches this title (case-insensitive, surrounding whitespace
            ignored) is dropped to avoid duplicating the section heading
            already emitted by the caller.

    Returns:
        The normalized blocks, in source order.
    """
    from markupsafe import Markup

    def _raw(value: Any) -> str:
        """Return *value* as text; JSON null becomes ""."""
        return "" if value is None else str(value)

    def _clean(value: Any) -> str:
        """Apply the shared text normalization to a raw value."""
        return _normalize_text_content(_raw(value))

    def _cell_html(value: Any) -> str:
        """Normalize a table cell and apply keyword emphasis."""
        return emphasize_keywords(_clean(value))

    def _as_list(value: Any) -> list:
        return value if isinstance(value, list) else []

    def _span(value: Any) -> int:
        """Parse a colspan/rowspan value, defaulting to 1."""
        text = str(value)
        # isdigit() alone also accepts characters such as "²" that int()
        # rejects, so restrict to ASCII digits.
        return int(text) if text.isascii() and text.isdigit() else 1

    def _list_entries(block: dict) -> tuple[list[str], list]:
        """Return (normalized texts, emphasized HTML) for a list block."""
        stripped = (_raw(item).strip() for item in _as_list(block.get("items", [])))
        normalized = [_normalize_text_content(item) for item in stripped if item]
        return normalized, [Markup(emphasize_keywords(item)) for item in normalized]

    _skip_norm = skip_title.strip().lower() if skip_title else ""
    result: list[RenderBlock] = []
    for b in blocks:
        if not isinstance(b, dict):
            continue
        btype = str(b.get("type", ""))

        # Drop headings that repeat the section title.
        if _skip_norm and btype in ("heading", "subheading"):
            if _raw(b.get("text")).strip().lower() == _skip_norm:
                continue

        if btype == "paragraph":
            t = _clean(b.get("text"))
            if t.strip():
                result.append(RenderBlock(
                    block_type="paragraph",
                    css_class="hb-paragraph",
                    data={
                        "text": t,
                        "html": Markup(emphasize_keywords(t)),
                    },
                ))

        elif btype == "subheading":
            t = _clean(b.get("text"))
            if t.strip():
                result.append(RenderBlock(
                    block_type="heading_2",
                    css_class="hb-heading-2",
                    data={"text": t},
                ))

        elif btype == "bullets":
            normalized, html_items = _list_entries(b)
            if normalized:
                result.append(RenderBlock(
                    block_type="bullet_list",
                    css_class="hb-bullet-list",
                    data={"entries": html_items, "html_entries": True},
                ))

        elif btype == "numbered_list":
            normalized, html_items = _list_entries(b)
            if normalized:
                result.append(RenderBlock(
                    block_type="bullet_list",
                    css_class="hb-bullet-list hb-numbered-list",
                    data={"entries": html_items, "ordered": True, "html_entries": True},
                ))

        elif btype == "note":
            t = _clean(b.get("text"))
            if t.strip():
                result.append(RenderBlock(
                    block_type="note",
                    css_class="hb-note",
                    data={"text": t},
                ))

        elif btype == "note_inline":
            normalized_parts = []
            for part in _as_list(b.get("parts", [])):
                if not isinstance(part, dict):
                    continue
                t = _clean(part.get("text"))
                if t:
                    normalized_parts.append({
                        "text": t,
                        "style": _raw(part.get("style")),
                    })
            if normalized_parts:
                result.append(RenderBlock(
                    block_type="note",
                    css_class="hb-note",
                    data={"parts": normalized_parts, "inline": True},
                ))

        elif btype == "table_v1":
            # Plain string columns and list rows; other row shapes are skipped.
            t_cols = _as_list(b.get("columns", []))
            norm_rows = [
                [_cell_html(cell) for cell in row]
                for row in _as_list(b.get("rows", []))
                if isinstance(row, list)
            ]
            result.append(RenderBlock(
                block_type="table",
                css_class="hb-table",
                data={"columns": [str(c) for c in t_cols], "rows": norm_rows, "variant": "standard"},
            ))

        elif btype == "table":
            # Columns may be {key, label} dicts or bare labels; bare labels
            # are slugged to derive the key used to read dict rows.
            col_labels = []
            col_keys = []
            for c in _as_list(b.get("columns", [])):
                if isinstance(c, dict):
                    col_labels.append(_raw(c.get("label", c.get("key", ""))))
                    col_keys.append(_raw(c.get("key", "")))
                else:
                    col_labels.append(str(c))
                    col_keys.append(re.sub(r"[^a-z0-9]+", "_", str(c).lower()))
            norm_rows = []
            for row in _as_list(b.get("rows", [])):
                if isinstance(row, dict):
                    norm_rows.append([_cell_html(row.get(k, "")) for k in col_keys])
                elif isinstance(row, list):
                    norm_rows.append([_cell_html(cell) for cell in row])
            result.append(RenderBlock(
                block_type="table",
                css_class="hb-table",
                data={"columns": col_labels, "rows": norm_rows, "variant": "standard"},
            ))

        elif btype in ("table_v3", "table_v4"):
            # Spanning tables: each cell becomes {text, colspan, rowspan}.
            norm_rows = []
            for row in _as_list(b.get("rows", [])):
                if not isinstance(row, list):
                    continue
                norm_row = []
                for cell in row:
                    if isinstance(cell, dict):
                        norm_row.append({
                            "text": _cell_html(cell.get("text")),
                            "colspan": _span(cell.get("colspan", "")),
                            "rowspan": _span(cell.get("rowspan", "")),
                        })
                    else:
                        norm_row.append({
                            "text": _cell_html(cell),
                            "colspan": 1,
                            "rowspan": 1,
                        })
                norm_rows.append(norm_row)
            result.append(RenderBlock(
                block_type="table",
                css_class="hb-table",
                data={"rows": norm_rows, "variant": "spanning"},
            ))

    return result
|
|