internationalscholarsprogram committed on
Commit
93a688a
·
verified ·
1 Parent(s): 9e6b0ef

Update services: normalizer, html_builder, renderers, utils - fix PDF output to match V5.0 docx

Browse files
app/services/html_builder.py CHANGED
@@ -78,6 +78,7 @@ SECTION_CLASS_MAP = {
78
  "program_features_breakdown": "sec-breakdown",
79
  "funding_options_available": "sec-funding",
80
  "summary_of_universities": "sec-summary",
 
81
  }
82
 
83
  PAGE_BREAK_KEYS = {
@@ -91,6 +92,7 @@ PAGE_BREAK_KEYS = {
91
  "program_features_breakdown",
92
  "funding_options_available",
93
  "summary_of_universities",
 
94
  }
95
 
96
 
@@ -260,39 +262,11 @@ def _prepare_university_data(
260
  if not link and isinstance(p.get("program_links"), dict):
261
  link = str(p["program_links"].get("web_link", "")).strip()
262
 
263
- # Build career HTML
264
- career = p.get("career_pathways", [])
265
- career_html = ""
266
- if isinstance(career, list):
267
- career_items = [str(x).strip() for x in career if str(x).strip()]
268
- if career_items:
269
- career_html = '<ul class="career-list">'
270
- for ci in career_items:
271
- career_html += f"<li>{h(ci)}</li>"
272
- career_html += "</ul>"
273
- else:
274
- raw = str(career).strip()
275
- if raw:
276
- import re as _re
277
- lines = [l.strip() for l in _re.split(r"[\r\n]+", raw) if l.strip()]
278
- if len(lines) > 1:
279
- career_html = '<ul class="career-list">'
280
- for line in lines:
281
- career_html += f"<li>{h(line)}</li>"
282
- career_html += "</ul>"
283
- else:
284
- career_html = h(raw)
285
-
286
- if not career_html:
287
- career_html = "&nbsp;"
288
-
289
  programs.append({
290
  "name": program_name,
291
  "link": link,
292
  "designation": str(p.get("designation", "")),
293
  "entrance": str(p.get("entrance_exam", p.get("entrance_examination", ""))),
294
- "career_html": Markup(career_html),
295
- "funding": str(p.get("funding_category", "")),
296
  })
297
 
298
  # Extra sections
@@ -398,7 +372,7 @@ def build_handbook_html(
398
  # Fallback to remote URL when local file is unavailable
399
  label_image = "https://finsapdev.qhtestingserver.com/MODEL_APIS/handbook/images/label.jpeg"
400
 
401
- # ── Prepare active universities ──
402
  active_universities: list[dict[str, Any]] = []
403
  for uid, uni in by_uni.items():
404
  if not isinstance(uni, dict):
@@ -407,6 +381,9 @@ def build_handbook_html(
407
  continue
408
  name = str(uni.get("university_name", f"University #{uid}"))
409
  anchor = handbook_anchor("uni", name, int(uid))
 
 
 
410
  active_universities.append({
411
  "id": int(uid),
412
  "anchor": anchor,
@@ -414,8 +391,18 @@ def build_handbook_html(
414
  "sections": uni.get("sections", []) if isinstance(uni.get("sections"), list) else [],
415
  "website": str(uni.get("website", "")),
416
  "sort_order": int(uni["sort_order"]) if uni.get("sort_order") is not None and str(uni.get("sort_order", "")).lstrip("-").isdigit() else None,
 
 
 
417
  })
418
 
 
 
 
 
 
 
 
419
  # ── Normalise globals ──
420
  globals_data = sort_sections_stable(globals_data)
421
 
@@ -432,7 +419,6 @@ def build_handbook_html(
432
  raise RuntimeError(msg)
433
 
434
  general_sections: list[dict[str, Any]] = []
435
- summary_block: dict[str, Any] | None = None
436
  toc_sort_order = None
437
  toc_title = "Table of Contents"
438
 
@@ -448,14 +434,6 @@ def build_handbook_html(
448
  toc_title = str(g.get("section_title", "Table of Contents"))
449
  continue
450
 
451
- if key == "summary_of_universities":
452
- summary_block = {
453
- "anchor": handbook_anchor("summary", "summary-of-universities", idx),
454
- "data": g,
455
- "sort_order": sort_order,
456
- }
457
- continue
458
-
459
  section_hits: list[str] = []
460
  _collect_program_option_inconsistencies(
461
  g.get("section_json", {}),
@@ -476,7 +454,12 @@ def build_handbook_html(
476
  # ── Build TOC items ──
477
  toc_items: list[dict[str, Any]] = []
478
  for gs in general_sections:
479
- title = str(gs["data"].get("section_title", gs["data"].get("section_key", "Section")))
 
 
 
 
 
480
  toc_items.append({
481
  "title": title,
482
  "target": "#" + gs["anchor"],
@@ -485,16 +468,6 @@ def build_handbook_html(
485
  "sort": gs["sort_order"],
486
  })
487
 
488
- if summary_block:
489
- title = str(summary_block["data"].get("section_title", "Summary of Universities"))
490
- toc_items.append({
491
- "title": title,
492
- "target": "#" + summary_block["anchor"],
493
- "level": 0,
494
- "bold": True,
495
- "sort": summary_block["sort_order"],
496
- })
497
-
498
  for u in active_universities:
499
  toc_items.append({
500
  "title": u["name"],
@@ -578,44 +551,24 @@ def build_handbook_html(
578
  "rendered_html": Markup(section_html),
579
  })
580
 
581
- # ── Prepare summary block ──
582
- summary_template = None
583
- if summary_block:
584
- data = summary_block["data"]
585
- section_json = data.get("section_json", {})
586
- if not isinstance(section_json, dict):
587
- section_json = {}
588
-
589
- # Typed blocks for summary
590
- summary_blocks = normalize_section(
591
- str(data.get("section_key", "")),
592
- str(data.get("section_title", "")),
593
- section_json,
594
- universities=active_universities,
595
- debug=debug,
596
- )
597
-
598
- summary_html = render_global_blocks(
599
- str(data.get("section_key", "")),
600
- str(data.get("section_title", "")),
601
- section_json,
602
- debug,
603
- universities=active_universities,
604
- )
605
-
606
- summary_template = {
607
- "anchor": summary_block["anchor"],
608
- "data": data,
609
- "blocks": summary_blocks,
610
- "rendered_html": Markup(summary_html),
611
- }
612
-
613
  # ── Prepare university data for templates (both old + new paths) ──
 
614
  university_template_data = []
615
  university_block_data = []
 
 
 
616
  for idx, uni_raw in enumerate(active_universities):
617
  uni_raw["_is_first"] = (idx == 0)
618
 
 
 
 
 
 
 
 
 
619
  uni_hits: list[str] = []
620
  _collect_program_option_inconsistencies(
621
  uni_raw.get("sections", []),
@@ -630,6 +583,11 @@ def build_handbook_html(
630
  uni_data = _prepare_university_data(
631
  uni_raw, allow_remote, include_inactive_programs, debug, stats,
632
  )
 
 
 
 
 
633
  university_template_data.append(uni_data)
634
  # New block path
635
  uni_block = normalize_university(
@@ -665,7 +623,7 @@ def build_handbook_html(
665
  toc_title=toc_title,
666
  toc_sort_order=toc_sort_order,
667
  general_sections=template_sections,
668
- summary_block=summary_template,
669
  universities=university_template_data,
670
  university_blocks=university_block_data,
671
  bottom_pages=bottom_pages_urls,
 
78
  "program_features_breakdown": "sec-breakdown",
79
  "funding_options_available": "sec-funding",
80
  "summary_of_universities": "sec-summary",
81
+ "summary_of_universities_cosigner": "sec-summary-cosigner",
82
  }
83
 
84
  PAGE_BREAK_KEYS = {
 
92
  "program_features_breakdown",
93
  "funding_options_available",
94
  "summary_of_universities",
95
+ "summary_of_universities_cosigner",
96
  }
97
 
98
 
 
262
  if not link and isinstance(p.get("program_links"), dict):
263
  link = str(p["program_links"].get("web_link", "")).strip()
264
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
  programs.append({
266
  "name": program_name,
267
  "link": link,
268
  "designation": str(p.get("designation", "")),
269
  "entrance": str(p.get("entrance_exam", p.get("entrance_examination", ""))),
 
 
270
  })
271
 
272
  # Extra sections
 
372
  # Fallback to remote URL when local file is unavailable
373
  label_image = "https://finsapdev.qhtestingserver.com/MODEL_APIS/handbook/images/label.jpeg"
374
 
375
+ # ── Prepare active universities (sorted: Tier One first, Tier Two second) ──
376
  active_universities: list[dict[str, Any]] = []
377
  for uid, uni in by_uni.items():
378
  if not isinstance(uni, dict):
 
381
  continue
382
  name = str(uni.get("university_name", f"University #{uid}"))
383
  anchor = handbook_anchor("uni", name, int(uid))
384
+ school_category = str(uni.get("school_category", "")).strip()
385
+ tier = uni.get("tier")
386
+ tier_label = str(uni.get("tier_label", "")).strip()
387
  active_universities.append({
388
  "id": int(uid),
389
  "anchor": anchor,
 
391
  "sections": uni.get("sections", []) if isinstance(uni.get("sections"), list) else [],
392
  "website": str(uni.get("website", "")),
393
  "sort_order": int(uni["sort_order"]) if uni.get("sort_order") is not None and str(uni.get("sort_order", "")).lstrip("-").isdigit() else None,
394
+ "school_category": school_category,
395
+ "tier": tier,
396
+ "tier_label": tier_label,
397
  })
398
 
399
+ # Stable tier ordering: Tier One (non_cosigner) → Tier Two (cosigner) → others, then alphabetical
400
+ def _tier_sort(u: dict) -> tuple:
401
+ t = u.get("tier")
402
+ rank = t if isinstance(t, int) else 99
403
+ return (rank, (u.get("name") or "").lower(), u.get("id", 0))
404
+ active_universities.sort(key=_tier_sort)
405
+
406
  # ── Normalise globals ──
407
  globals_data = sort_sections_stable(globals_data)
408
 
 
419
  raise RuntimeError(msg)
420
 
421
  general_sections: list[dict[str, Any]] = []
 
422
  toc_sort_order = None
423
  toc_title = "Table of Contents"
424
 
 
434
  toc_title = str(g.get("section_title", "Table of Contents"))
435
  continue
436
 
 
 
 
 
 
 
 
 
437
  section_hits: list[str] = []
438
  _collect_program_option_inconsistencies(
439
  g.get("section_json", {}),
 
454
  # ── Build TOC items ──
455
  toc_items: list[dict[str, Any]] = []
456
  for gs in general_sections:
457
+ # Prefer the JSON-level title (display-ready) over the DB section_title
458
+ gs_json = gs["data"].get("section_json", {})
459
+ if isinstance(gs_json, dict) and gs_json.get("title", "").strip():
460
+ title = gs_json["title"].strip()
461
+ else:
462
+ title = str(gs["data"].get("section_title", gs["data"].get("section_key", "Section")))
463
  toc_items.append({
464
  "title": title,
465
  "target": "#" + gs["anchor"],
 
468
  "sort": gs["sort_order"],
469
  })
470
 
 
 
 
 
 
 
 
 
 
 
471
  for u in active_universities:
472
  toc_items.append({
473
  "title": u["name"],
 
551
  "rendered_html": Markup(section_html),
552
  })
553
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
554
  # ── Prepare university data for templates (both old + new paths) ──
555
+ # Group by tier for tier heading insertion in the PDF output
556
  university_template_data = []
557
  university_block_data = []
558
+ # Track which tier label was last emitted so we can insert tier divider headings
559
+ _seen_tier_labels: set[str] = set()
560
+
561
  for idx, uni_raw in enumerate(active_universities):
562
  uni_raw["_is_first"] = (idx == 0)
563
 
564
+ # Insert tier group heading when tier changes
565
+ current_tier_label = str(uni_raw.get("tier_label", "")).strip()
566
+ if current_tier_label and current_tier_label not in _seen_tier_labels:
567
+ _seen_tier_labels.add(current_tier_label)
568
+ # Mark this university as starting a new tier group
569
+ uni_raw["_tier_group_start"] = True
570
+ uni_raw["_tier_group_label"] = f"{current_tier_label} Schools"
571
+
572
  uni_hits: list[str] = []
573
  _collect_program_option_inconsistencies(
574
  uni_raw.get("sections", []),
 
583
  uni_data = _prepare_university_data(
584
  uni_raw, allow_remote, include_inactive_programs, debug, stats,
585
  )
586
+ # Carry tier metadata to template data
587
+ uni_data["tier"] = uni_raw.get("tier")
588
+ uni_data["tier_label"] = uni_raw.get("tier_label", "")
589
+ uni_data["tier_group_start"] = uni_raw.get("_tier_group_start", False)
590
+ uni_data["tier_group_label"] = uni_raw.get("_tier_group_label", "")
591
  university_template_data.append(uni_data)
592
  # New block path
593
  uni_block = normalize_university(
 
623
  toc_title=toc_title,
624
  toc_sort_order=toc_sort_order,
625
  general_sections=template_sections,
626
+ summary_block=None,
627
  universities=university_template_data,
628
  university_blocks=university_block_data,
629
  bottom_pages=bottom_pages_urls,
app/services/normalizer.py CHANGED
@@ -69,15 +69,10 @@ def normalize_section(
69
 
70
  layout_norm = str(section_json.get("layout", "")).lower().strip()
71
 
72
- # ── Summary of universities ──
73
- if key_norm == "summary_of_universities":
74
- blocks.extend(_normalize_university_summary(
75
- section_title, section_json, layout_norm, universities or [],
76
- ))
77
- return blocks
78
-
79
  # ── Section heading ──
80
- title = section_title.strip()
 
 
81
  if title and key_norm != "table_of_contents":
82
  blocks.append(RenderBlock(
83
  block_type="heading_1",
@@ -133,7 +128,7 @@ def normalize_section(
133
 
134
  # ── doc_v1 ──
135
  if layout_norm == "doc_v1" and isinstance(section_json.get("blocks"), list):
136
- blocks.extend(_normalize_doc_v1(section_json["blocks"]))
137
  return blocks
138
 
139
  # ── Fallback ──
@@ -518,15 +513,28 @@ def _normalize_table_v2(json_data: dict) -> RenderBlock:
518
  )
519
 
520
 
521
- def _normalize_doc_v1(blocks: list) -> list[RenderBlock]:
522
- """Normalise doc_v1 blocks into typed RenderBlocks."""
 
 
 
 
 
 
523
  from markupsafe import Markup
 
524
  result: list[RenderBlock] = []
525
  for b in blocks:
526
  if not isinstance(b, dict):
527
  continue
528
  btype = str(b.get("type", ""))
529
 
 
 
 
 
 
 
530
  if btype == "paragraph":
531
  t = _normalize_text_content(str(b.get("text", "")))
532
  if t.strip():
@@ -622,6 +630,35 @@ def _normalize_doc_v1(blocks: list) -> list[RenderBlock]:
622
  data={"columns": [str(c) for c in t_cols], "rows": norm_rows, "variant": "standard"},
623
  ))
624
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
625
  elif btype in ("table_v3", "table_v4"):
626
  t_rows = b.get("rows", [])
627
  if not isinstance(t_rows, list):
@@ -652,189 +689,3 @@ def _normalize_doc_v1(blocks: list) -> list[RenderBlock]:
652
  ))
653
 
654
  return result
655
-
656
-
657
- _CLOSING_NOTE_MARKER = "we keep expanding and updating"
658
-
659
-
660
- def _is_closing_note(text: str) -> bool:
661
- """Return True if text is the 'expanding/updating' closing paragraph."""
662
- return _CLOSING_NOTE_MARKER in text.lower()
663
-
664
-
665
- def _normalize_university_summary(
666
- section_title: str,
667
- json_data: dict,
668
- layout_norm: str,
669
- universities: list[dict],
670
- ) -> list[RenderBlock]:
671
- """Normalise the summary_of_universities section.
672
-
673
- Enforced structure:
674
- 1. Section heading
675
- 2. Introductory paragraphs (STEM / OPT etc.) – anything NOT the closing note
676
- 3. Numbered university list
677
- 4. Closing note paragraph ("We keep expanding and updating…")
678
- 5. Optional note field
679
- """
680
- pre_list_blocks: list[RenderBlock] = [] # intro content → before list
681
- closing_blocks: list[RenderBlock] = [] # deferred → after list
682
-
683
- title = section_title.strip()
684
-
685
- # ── Collect intro / closing paragraphs ──
686
- intro = _normalize_text_content(str(json_data.get("intro", "")).strip())
687
- if intro:
688
- target = closing_blocks if _is_closing_note(intro) else pre_list_blocks
689
- target.append(RenderBlock(
690
- block_type="paragraph",
691
- css_class="hb-paragraph",
692
- data={"text": intro},
693
- ))
694
-
695
- if layout_norm == "doc_v1" and isinstance(json_data.get("blocks"), list):
696
- for b in json_data["blocks"]:
697
- if not isinstance(b, dict):
698
- continue
699
- btype = str(b.get("type", ""))
700
- if btype not in ("paragraph", "subheading", "note"):
701
- continue
702
- t = _normalize_text_content(str(b.get("text", "")))
703
- if not t.strip():
704
- continue
705
- if btype == "subheading":
706
- pre_list_blocks.append(RenderBlock(
707
- block_type="heading_2", css_class="hb-heading-2", data={"text": t},
708
- ))
709
- elif btype == "note":
710
- target = closing_blocks if _is_closing_note(t) else pre_list_blocks
711
- target.append(RenderBlock(
712
- block_type="note", css_class="hb-note", data={"text": t},
713
- ))
714
- else:
715
- target = closing_blocks if _is_closing_note(t) else pre_list_blocks
716
- target.append(RenderBlock(
717
- block_type="paragraph", css_class="hb-paragraph", data={"text": t},
718
- ))
719
-
720
- # ── Resolve university list (tier-grouped) ──
721
- resolved: list[str] = []
722
- tier_one_names: list[str] = []
723
- tier_two_names: list[str] = []
724
- if universities:
725
- def uni_sort_key(u):
726
- so = u.get("sort_order") if isinstance(u, dict) else None
727
- if so is not None:
728
- try:
729
- return (0, float(so))
730
- except (ValueError, TypeError):
731
- pass
732
- return (1, 0.0)
733
-
734
- sorted_unis = sorted(universities, key=uni_sort_key)
735
- for u in sorted_unis:
736
- if isinstance(u, dict):
737
- name = str(u.get("university_name", u.get("name", ""))).strip()
738
- if name:
739
- resolved.append(name)
740
- # Group by tier for sub-headings
741
- tier = u.get("tier")
742
- if tier == 1:
743
- tier_one_names.append(name)
744
- elif tier == 2:
745
- tier_two_names.append(name)
746
- else:
747
- tier_one_names.append(name) # default to tier one
748
-
749
- if not resolved and layout_norm == "doc_v1" and isinstance(json_data.get("blocks"), list):
750
- for b in json_data["blocks"]:
751
- if not isinstance(b, dict) or str(b.get("type", "")) != "bullets":
752
- continue
753
- items = b.get("items", [])
754
- if isinstance(items, list):
755
- for it in items:
756
- it_str = str(it).strip()
757
- if it_str:
758
- resolved.append(it_str)
759
-
760
- # Dedupe
761
- seen: set[str] = set()
762
- deduped: list[str] = []
763
- for nm in resolved:
764
- k = nm.lower().strip()
765
- if k and k not in seen:
766
- seen.add(k)
767
- deduped.append(nm)
768
-
769
- # ── Assemble in enforced order ──
770
- blocks: list[RenderBlock] = []
771
-
772
- if title:
773
- blocks.append(RenderBlock(
774
- block_type="heading_1",
775
- css_class="hb-heading-1",
776
- data={"text": title},
777
- ))
778
-
779
- blocks.extend(pre_list_blocks)
780
-
781
- # Render university summary grouped by tier when tier data is available
782
- if tier_one_names or tier_two_names:
783
- # Tier One sub-group
784
- if tier_one_names:
785
- blocks.append(RenderBlock(
786
- block_type="heading_2",
787
- css_class="hb-heading-2",
788
- data={"text": "Tier One Schools"},
789
- ))
790
- seen_t1: set[str] = set()
791
- deduped_t1 = []
792
- for nm in tier_one_names:
793
- k = nm.lower().strip()
794
- if k and k not in seen_t1:
795
- seen_t1.add(k)
796
- deduped_t1.append(nm)
797
- blocks.append(RenderBlock(
798
- block_type="university_summary",
799
- css_class="hb-university-summary",
800
- data={"universities": deduped_t1},
801
- ))
802
- # Tier Two sub-group (no redundant "Summary of Universities" heading)
803
- if tier_two_names:
804
- blocks.append(RenderBlock(
805
- block_type="heading_2",
806
- css_class="hb-heading-2",
807
- data={"text": "Tier Two Schools"},
808
- ))
809
- seen_t2: set[str] = set()
810
- deduped_t2 = []
811
- for nm in tier_two_names:
812
- k = nm.lower().strip()
813
- if k and k not in seen_t2:
814
- seen_t2.add(k)
815
- deduped_t2.append(nm)
816
- blocks.append(RenderBlock(
817
- block_type="university_summary",
818
- css_class="hb-university-summary",
819
- data={"universities": deduped_t2},
820
- ))
821
- elif deduped:
822
- # Fallback: no tier data available, render flat list (backward compat)
823
- blocks.append(RenderBlock(
824
- block_type="university_summary",
825
- css_class="hb-university-summary",
826
- data={"universities": deduped},
827
- ))
828
-
829
- # Closing note always after the list
830
- blocks.extend(closing_blocks)
831
-
832
- note = str(json_data.get("note", "")).strip()
833
- if note:
834
- blocks.append(RenderBlock(
835
- block_type="note",
836
- css_class="hb-note",
837
- data={"text": _normalize_text_content(note)},
838
- ))
839
-
840
- return blocks
 
69
 
70
  layout_norm = str(section_json.get("layout", "")).lower().strip()
71
 
 
 
 
 
 
 
 
72
  # ── Section heading ──
73
+ # Prefer the JSON-level title (display-ready) over the DB section_title
74
+ json_title = str(section_json.get("title", "")).strip() if isinstance(section_json, dict) else ""
75
+ title = json_title or section_title.strip()
76
  if title and key_norm != "table_of_contents":
77
  blocks.append(RenderBlock(
78
  block_type="heading_1",
 
128
 
129
  # ── doc_v1 ──
130
  if layout_norm == "doc_v1" and isinstance(section_json.get("blocks"), list):
131
+ blocks.extend(_normalize_doc_v1(section_json["blocks"], skip_title=title))
132
  return blocks
133
 
134
  # ── Fallback ──
 
513
  )
514
 
515
 
516
+ def _normalize_doc_v1(blocks: list, *, skip_title: str = "") -> list[RenderBlock]:
517
+ """Normalise doc_v1 blocks into typed RenderBlocks.
518
+
519
+ Args:
520
+ skip_title: When set, any leading heading/subheading block whose text
521
+ matches this title (case-insensitive) is dropped to avoid
522
+ duplicating the section heading already emitted by the caller.
523
+ """
524
  from markupsafe import Markup
525
+ _skip_norm = skip_title.strip().lower() if skip_title else ""
526
  result: list[RenderBlock] = []
527
  for b in blocks:
528
  if not isinstance(b, dict):
529
  continue
530
  btype = str(b.get("type", ""))
531
 
532
+ # Skip heading/subheading blocks that duplicate the section title
533
+ if _skip_norm and btype in ("heading", "subheading"):
534
+ block_text = str(b.get("text", "")).strip().lower()
535
+ if block_text == _skip_norm:
536
+ continue
537
+
538
  if btype == "paragraph":
539
  t = _normalize_text_content(str(b.get("text", "")))
540
  if t.strip():
 
630
  data={"columns": [str(c) for c in t_cols], "rows": norm_rows, "variant": "standard"},
631
  ))
632
 
633
+ elif btype == "table":
634
+ # Generic table (columns may be objects or strings, rows may be dicts or lists)
635
+ t_cols = b.get("columns", [])
636
+ t_rows = b.get("rows", [])
637
+ if not isinstance(t_cols, list):
638
+ t_cols = []
639
+ if not isinstance(t_rows, list):
640
+ t_rows = []
641
+ col_labels = []
642
+ col_keys = []
643
+ for c in t_cols:
644
+ if isinstance(c, dict):
645
+ col_labels.append(str(c.get("label", c.get("key", ""))))
646
+ col_keys.append(str(c.get("key", "")))
647
+ else:
648
+ col_labels.append(str(c))
649
+ col_keys.append(re.sub(r"[^a-z0-9]+", "_", str(c).lower()))
650
+ norm_rows = []
651
+ for r in t_rows:
652
+ if isinstance(r, dict):
653
+ norm_rows.append([emphasize_keywords(_normalize_text_content(str(r.get(k, "")))) for k in col_keys])
654
+ elif isinstance(r, list):
655
+ norm_rows.append([emphasize_keywords(_normalize_text_content(str(cell))) for cell in r])
656
+ result.append(RenderBlock(
657
+ block_type="table",
658
+ css_class="hb-table",
659
+ data={"columns": col_labels, "rows": norm_rows, "variant": "standard"},
660
+ ))
661
+
662
  elif btype in ("table_v3", "table_v4"):
663
  t_rows = b.get("rows", [])
664
  if not isinstance(t_rows, list):
 
689
  ))
690
 
691
  return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/services/renderers.py CHANGED
@@ -349,92 +349,13 @@ def render_global_blocks(
349
 
350
  layout_norm = str(json_data.get("layout", "")).lower().strip()
351
 
352
- # ── Summary of universities ──
353
- if key_norm == "summary_of_universities":
354
- unis = universities or []
355
- title = section_title.strip()
356
- if title:
357
- html_out += f'<h2 class="h2">{h(title)}</h2>'
358
-
359
- intro = str(json_data.get("intro", "")).strip()
360
- if intro:
361
- html_out += f'<p class="p">{h(format_money_figures(intro))}</p>'
362
- elif layout_norm == "doc_v1" and isinstance(json_data.get("blocks"), list):
363
- for b in json_data["blocks"]:
364
- if not isinstance(b, dict):
365
- continue
366
- btype = str(b.get("type", ""))
367
- if btype not in ("paragraph", "subheading", "note"):
368
- continue
369
- t = format_money_figures(str(b.get("text", "")))
370
- if not t.strip():
371
- continue
372
- if btype == "subheading":
373
- html_out += f'<h3 class="h3">{h(t)}</h3>'
374
- elif btype == "note":
375
- html_out += f'<div class="note">{h(t)}</div>'
376
- else:
377
- html_out += f'<p class="p">{emphasize_keywords(t)}</p>'
378
-
379
- # Resolve list from universities or doc_v1 bullets
380
- resolved: list[str] = []
381
- if unis:
382
- def uni_sort_key(u):
383
- so = u.get("sort_order") if isinstance(u, dict) else None
384
- if so is not None:
385
- try:
386
- return (0, float(so))
387
- except (ValueError, TypeError):
388
- pass
389
- return (1, 0.0)
390
-
391
- sorted_unis = sorted(unis, key=uni_sort_key)
392
- for u in sorted_unis:
393
- if not isinstance(u, dict):
394
- continue
395
- name = str(u.get("university_name", u.get("name", ""))).strip()
396
- if name:
397
- resolved.append(name)
398
-
399
- if not resolved and layout_norm == "doc_v1" and isinstance(json_data.get("blocks"), list):
400
- for b in json_data["blocks"]:
401
- if not isinstance(b, dict) or str(b.get("type", "")) != "bullets":
402
- continue
403
- items = b.get("items", [])
404
- if not isinstance(items, list):
405
- continue
406
- for it in items:
407
- it_str = str(it).strip()
408
- if it_str:
409
- resolved.append(it_str)
410
-
411
- # Dedupe
412
- seen: set[str] = set()
413
- deduped: list[str] = []
414
- for nm in resolved:
415
- k = nm.lower().strip()
416
- if not k or k in seen:
417
- continue
418
- seen.add(k)
419
- deduped.append(nm)
420
-
421
- if deduped:
422
- html_out += '<ol class="ol">'
423
- for name in deduped:
424
- anchor = "university_" + hb_slug(name)
425
- html_out += f'<li><a href="#{h(anchor)}">{h(name)}</a></li>'
426
- html_out += "</ol>"
427
-
428
- note = str(json_data.get("note", "")).strip()
429
- if note:
430
- html_out += f'<div class="note">{h(format_money_figures(note))}</div>'
431
-
432
- return html_out
433
-
434
  # ── Section title ──
435
- title = section_title.strip()
 
 
436
  if title and key_norm != "table_of_contents":
437
  html_out += f'<h2 class="h2">{h(title)}</h2>'
 
438
 
439
  # ── Steps ──
440
  steps = json_data.get("steps")
@@ -600,6 +521,12 @@ def render_global_blocks(
600
  continue
601
  btype = str(b.get("type", ""))
602
 
 
 
 
 
 
 
603
  if btype == "paragraph":
604
  t = format_money_figures(str(b.get("text", "")))
605
  if t.strip():
@@ -679,6 +606,42 @@ def render_global_blocks(
679
  html_out += "</tr>"
680
  html_out += "</tbody></table>"
681
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
682
  elif btype in ("table_v3", "table_v4"):
683
  t_rows = b.get("rows", [])
684
  h_rows = b.get("header_rows", [])
 
349
 
350
  layout_norm = str(json_data.get("layout", "")).lower().strip()
351
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  # ── Section title ──
353
+ # Prefer the JSON-level title (display-ready) over the DB section_title
354
+ json_title = str(json_data.get("title", "")).strip() if isinstance(json_data, dict) else ""
355
+ title = json_title or section_title.strip()
356
  if title and key_norm != "table_of_contents":
357
  html_out += f'<h2 class="h2">{h(title)}</h2>'
358
+ _title_norm = title.lower()
359
 
360
  # ── Steps ──
361
  steps = json_data.get("steps")
 
521
  continue
522
  btype = str(b.get("type", ""))
523
 
524
+ # Skip heading/subheading blocks that duplicate the section title
525
+ if btype in ("heading", "subheading"):
526
+ block_text = str(b.get("text", "")).strip().lower()
527
+ if block_text == _title_norm:
528
+ continue
529
+
530
  if btype == "paragraph":
531
  t = format_money_figures(str(b.get("text", "")))
532
  if t.strip():
 
606
  html_out += "</tr>"
607
  html_out += "</tbody></table>"
608
 
609
+ elif btype == "table":
610
+ # Generic table (columns may be objects or strings, rows may be dicts or lists)
611
+ t_cols = b.get("columns", [])
612
+ t_rows = b.get("rows", [])
613
+ if not isinstance(t_cols, list):
614
+ t_cols = []
615
+ if not isinstance(t_rows, list):
616
+ t_rows = []
617
+ col_labels = []
618
+ col_keys = []
619
+ for c in t_cols:
620
+ if isinstance(c, dict):
621
+ col_labels.append(str(c.get("label", c.get("key", ""))))
622
+ col_keys.append(str(c.get("key", "")))
623
+ else:
624
+ col_labels.append(str(c))
625
+ col_keys.append(re.sub(r"[^a-z0-9]+", "_", str(c).lower()))
626
+ html_out += '<table class="tbl">'
627
+ if col_labels:
628
+ html_out += "<thead><tr>"
629
+ for lbl in col_labels:
630
+ html_out += f"<th>{h(lbl)}</th>"
631
+ html_out += "</tr></thead>"
632
+ html_out += "<tbody>"
633
+ for r in t_rows:
634
+ html_out += "<tr>"
635
+ if isinstance(r, dict):
636
+ for k in col_keys:
637
+ cell = r.get(k, "")
638
+ html_out += f"<td>{h(format_money_figures(str(cell)))}</td>"
639
+ elif isinstance(r, list):
640
+ for cell in r:
641
+ html_out += f"<td>{h(format_money_figures(str(cell)))}</td>"
642
+ html_out += "</tr>"
643
+ html_out += "</tbody></table>"
644
+
645
  elif btype in ("table_v3", "table_v4"):
646
  t_rows = b.get("rows", [])
647
  h_rows = b.get("header_rows", [])
app/services/utils.py CHANGED
@@ -51,20 +51,52 @@ def format_money_figures(text: str) -> str:
51
  """Normalize all monetary figures to "USD X,XXX" format.
52
 
53
  - Converts existing $X,XXX → USD X,XXX
54
- - Normalizes bare large numbers → USD X,XXX
55
  - Formats with commas
56
  - Currency type is always USD (no $ symbol)
57
  """
58
  if not text:
59
  return text
60
 
61
- # Normalize "$X,XXX" → bare number (strip $ symbol)
62
- text = re.sub(r'\$([\d,]+(?:\.\d+)?)', lambda m: m.group(1), text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
- # Normalize "USD X,XXX" → bare number for uniform re-processing
65
- text = re.sub(r'\bUSD\s+([\d,]+(?:\.\d+)?)', lambda m: m.group(1), text, flags=re.IGNORECASE)
66
 
67
- def _format_match(m: re.Match) -> str:
 
 
68
  num_str = m.group(1).replace(",", "")
69
  dec = m.group(2) if m.group(2) else ""
70
  try:
@@ -77,10 +109,9 @@ def format_money_figures(text: str) -> str:
77
  formatted = f"{num:,.0f}"
78
  return "USD " + formatted
79
 
80
- # Add "USD " to large numbers (4+ digits or already comma-formatted)
81
  text = re.sub(
82
- r"(?<!\d)((?:\d{1,3}(?:,\d{3})+)|(?:\d{4,}))(?:\.(\d+))?(?![%\d/])",
83
- _format_match,
84
  text,
85
  )
86
 
 
51
  """Normalize all monetary figures to "USD X,XXX" format.
52
 
53
  - Converts existing $X,XXX → USD X,XXX
54
+ - Normalizes bare large numbers (1,000+) → USD X,XXX
55
  - Formats with commas
56
  - Currency type is always USD (no $ symbol)
57
  """
58
  if not text:
59
  return text
60
 
61
+ # Step 1: Convert "$X" → "USD X" directly (preserves ALL dollar amounts)
62
+ def _dollar_to_usd(m: re.Match) -> str:
63
+ num_str = m.group(1).replace(",", "")
64
+ try:
65
+ num = float(num_str)
66
+ except ValueError:
67
+ return m.group(0)
68
+ if "." in m.group(1):
69
+ dec_part = m.group(1).split(".")[-1]
70
+ formatted = f"{num:,.{len(dec_part)}f}"
71
+ elif num == int(num):
72
+ formatted = f"{int(num):,}"
73
+ else:
74
+ formatted = f"{num:,.2f}"
75
+ return "USD " + formatted
76
+
77
+ text = re.sub(r'\$([\d,]+(?:\.\d+)?)', _dollar_to_usd, text)
78
+
79
+ # Step 2: Normalize existing "USD X,XXX" for consistent comma formatting
80
+ def _normalize_usd(m: re.Match) -> str:
81
+ num_str = m.group(1).replace(",", "")
82
+ try:
83
+ num = float(num_str)
84
+ except ValueError:
85
+ return m.group(0)
86
+ if "." in m.group(1):
87
+ dec_part = m.group(1).split(".")[-1]
88
+ formatted = f"{num:,.{len(dec_part)}f}"
89
+ elif num == int(num):
90
+ formatted = f"{int(num):,}"
91
+ else:
92
+ formatted = f"{num:,.2f}"
93
+ return "USD " + formatted
94
 
95
+ text = re.sub(r'\bUSD\s+([\d,]+(?:\.\d+)?)', _normalize_usd, text, flags=re.IGNORECASE)
 
96
 
97
+ # Step 3: Add "USD " to bare large numbers (4+ digits or comma-formatted)
98
+ # that aren't already preceded by "USD "
99
+ def _format_bare_large(m: re.Match) -> str:
100
  num_str = m.group(1).replace(",", "")
101
  dec = m.group(2) if m.group(2) else ""
102
  try:
 
109
  formatted = f"{num:,.0f}"
110
  return "USD " + formatted
111
 
 
112
  text = re.sub(
113
+ r"(?<!\d)(?<!USD )((?:\d{1,3}(?:,\d{3})+)|(?:\d{4,}))(?:\.(\d+))?(?![%\d/])",
114
+ _format_bare_large,
115
  text,
116
  )
117
 
app/templates/handbook.html CHANGED
@@ -95,7 +95,8 @@
95
  {% for uni in universities %}
96
  {% if uni.tier_group_start and uni.tier_group_label %}
97
  <div class="section-block page-break tier-group-heading" data-tier="{{ uni.tier_label | default('') | e }}">
98
- <h1 class="h1 hb-heading-1" style="margin-top:0.5em;margin-bottom:0.3em;">{{ uni.tier_group_label | e }}</h1>
 
99
  </div>
100
  {% endif %}
101
  {% include "partials/university.html" %}
 
95
  {% for uni in universities %}
96
  {% if uni.tier_group_start and uni.tier_group_label %}
97
  <div class="section-block page-break tier-group-heading" data-tier="{{ uni.tier_label | default('') | e }}">
98
+ <h1 class="h1 hb-heading-1" style="margin-top:0.5em;margin-bottom:0.3em;">{{ uni.tier_group_label | e }}
99
+ </h1>
100
  </div>
101
  {% endif %}
102
  {% include "partials/university.html" %}
app/templates/partials/university.html CHANGED
@@ -83,9 +83,9 @@
83
  <table class="programs">
84
  <thead>
85
  <tr>
86
- <th style="width:34%">Program</th>
87
- <th style="width:33%">Designation</th>
88
- <th style="width:33%">Entrance Examination</th>
89
  </tr>
90
  </thead>
91
  <tbody>
 
83
  <table class="programs">
84
  <thead>
85
  <tr>
86
+ <th>Program</th>
87
+ <th>Designation</th>
88
+ <th>Entrance Examination</th>
89
  </tr>
90
  </thead>
91
  <tbody>