"""
Convert NZ legislation from HTML (legislation.govt.nz) to structured Markdown.

HTML source has clean semantic markup:
    div.part > h2.part            → ## Part X – Title
    div.subpart > h3.subpart      → ### Subpart X – Title
    div.prov > h5.prov            → #### Section X – Title
    div.subprov > span.label (1)  → body text with (1), (2) etc.
    div.label-para > (a), (b)     → indented list items

Usage:
    uv run python scripts/convert_legislation_html.py
"""
import os
import re
import sys
from bs4 import BeautifulSoup, NavigableString

SOURCES_RAW = os.path.join(os.path.dirname(__file__), "..", "sources", "raw")
CORPUS_DIR = os.path.join(os.path.dirname(__file__), "..", "corpus")
os.makedirs(SOURCES_RAW, exist_ok=True)  # download_html() writes cached HTML here
os.makedirs(CORPUS_DIR, exist_ok=True)

def download_html(url, filename):
    """Download HTML if not already cached."""
    path = os.path.join(SOURCES_RAW, filename)
    # A cached copy under ~100 KB is treated as a failed or partial download
    # and refetched; the whole-document HTML files are much larger than that.
    if os.path.exists(path) and os.path.getsize(path) > 100000:
        print(f" Using cached: {path}")
        return path
    import urllib.request

    req = urllib.request.Request(url, headers={
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
    })
    print(f" Downloading: {url}")
    with urllib.request.urlopen(req) as resp:
        data = resp.read()
    with open(path, "wb") as f:
        f.write(data)
    print(f" Saved: {path} ({len(data) / 1024:.0f} KB)")
    return path

def clean_text(el):
    """Extract clean text from an element, collapsing whitespace."""
    text = el.get_text(separator=" ", strip=True)
    text = re.sub(r"\s+", " ", text)
    return text.strip()
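
# Example of the normalisation (hypothetical fragment): clean_text on an
# element parsed from "<p>licensing\n   criteria</p>" returns
# "licensing criteria"; get_text() breaks at tag boundaries and the regex
# collapses any remaining whitespace runs to single spaces.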

def convert_prov(prov_div, base_url=""):
    """Convert a single provision (section) to Markdown."""
    lines = []

    # Source URL from the provision's HTML id attribute
    prov_id = prov_div.get("id", "")
    if prov_id and base_url:
        lines.append(f"\nSource: {base_url}{prov_id}.html")

    # Section heading
    h5 = prov_div.find("h5", class_="prov", recursive=False)
    if h5:
        label_span = h5.find("span", class_="label")
        section_num = label_span.get_text(strip=True) if label_span else ""
        # Get title text (everything after the label)
        title_parts = []
        for child in h5.children:
            if isinstance(child, NavigableString):
                t = child.strip()
                if t:
                    title_parts.append(t)
            elif child.name != "span" or "label" not in child.get("class", []):
                title_parts.append(child.get_text(strip=True))
        title = " ".join(title_parts).strip()
        lines.append(f"\n#### {section_num} {title}\n")

    # Provision body
    prov_body = prov_div.find("div", class_="prov-body", recursive=False)
    if prov_body:
        lines.append(convert_prov_body(prov_body))

    # Compare notes (cross-references to old legislation)
    for cf in prov_div.find_all("p", class_="cf", recursive=False):
        lines.append(f"\n*{clean_text(cf)}*\n")

    # History/amendment notes
    for hist in prov_div.find_all("div", class_="history", recursive=False):
        for p in hist.find_all("p", class_="history-note"):
            lines.append(f"\n*{clean_text(p)}*\n")

    return "\n".join(lines)

def convert_prov_body(body):
    """Convert the body of a provision to Markdown text."""
    lines = []
    for child in body.children:
        if isinstance(child, NavigableString):
            continue
        classes = child.get("class", [])
        # Subsections (1), (2), etc.
        if "subprov" in classes:
            label_p = child.find("p", class_="subprov", recursive=False)
            label_span = label_p.find("span", class_="label") if label_p else None
            sub_label = label_span.get_text(strip=True) if label_span else ""
            # Get the text from direct child para divs only (not nested label-para content)
            paras = child.find_all("div", class_="para", recursive=False)
            for para in paras:
                para_text = convert_para(para)
                if para_text:
                    if sub_label:
                        lines.append(f"\n{sub_label} {para_text}")
                        sub_label = ""  # Only use on first para
                    else:
                        lines.append(f"\n{para_text}")
        # Direct paragraphs
        elif child.name == "div" and "para" in classes:
            lines.append(f"\n{convert_para(child)}")
        # Tables
        elif child.name == "table":
            lines.append(f"\n{convert_table(child)}")
    return "\n".join(lines)

def convert_para(para_div):
    """Convert a paragraph div, including nested label-paras (a), (b), etc."""
    parts = []
    for child in para_div.children:
        if isinstance(child, NavigableString):
            continue
        classes = child.get("class", [])
        if child.name == "p" and "text" in classes:
            parts.append(clean_text(child))
        elif "label-para" in classes:
            # Nested (a), (b) items
            lp_h5 = child.find("h5", class_="label-para")
            lp_label = ""
            if lp_h5:
                lp_span = lp_h5.find("span", class_="label")
                lp_label = lp_span.get_text(strip=True) if lp_span else ""
            lp_paras = child.find_all("div", class_="para")
            for lp_para in lp_paras:
                lp_text = convert_para(lp_para)  # Recursive for sub-sub items
                if lp_text:
                    parts.append(f"\n {lp_label} {lp_text}")
    return " ".join(parts) if parts else ""

def convert_table(table):
    """Basic table conversion to Markdown."""
    rows = table.find_all("tr")
    if not rows:
        return ""
    md_rows = []
    for row in rows:
        cells = row.find_all(["th", "td"])
        cell_texts = [clean_text(c) for c in cells]
        md_rows.append("| " + " | ".join(cell_texts) + " |")
        if row.find("th"):
            md_rows.append("| " + " | ".join(["---"] * len(cells)) + " |")
    return "\n".join(md_rows)

def extract_part(soup, part_id):
    """Extract a full Part div by its ID."""
    return soup.find("div", class_="part", id=part_id)

def extract_subpart_by_content(soup, text_match):
    """Find a subpart containing specific text in its heading."""
    for sp in soup.find_all("div", class_="subpart"):
        h3 = sp.find("h3", class_="subpart")
        if h3 and text_match.lower() in h3.get_text().lower():
            return sp
    return None
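
# e.g. extract_subpart_by_content(soup, "Education Review Office") returns the
# first subpart whose h3 heading contains that phrase (case-insensitive), or
# None; matching on heading text avoids hard-coding a second LMS id alongside
# the Part 2 id used below.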

def convert_part(part_div, heading_level=2, base_url=""):
    """Convert a full Part div to Markdown."""
    lines = []

    # Part heading
    h2 = part_div.find("h2", class_="part", recursive=False)
    if h2:
        lines.append(f"\n{'#' * heading_level} {clean_text(h2)}\n")

    # Process children: subparts and direct provisions
    for child in part_div.children:
        if isinstance(child, NavigableString):
            continue
        classes = child.get("class", [])
        if "subpart" in classes:
            lines.append(convert_subpart(child, base_url=base_url))
        elif "prov" in classes:
            lines.append(convert_prov(child, base_url=base_url))
        elif "crosshead" in classes:
            # Cross-headings (thematic groupings within subparts)
            lines.append(f"\n##### {clean_text(child)}\n")
    return "\n".join(lines)

def convert_subpart(subpart_div, base_url=""):
    """Convert a subpart div to Markdown."""
    lines = []
    h3 = subpart_div.find("h3", class_="subpart", recursive=False)
    if h3:
        lines.append(f"\n### {clean_text(h3)}\n")
    for child in subpart_div.children:
        if isinstance(child, NavigableString):
            continue
        classes = child.get("class", [])
        if "prov" in classes:
            lines.append(convert_prov(child, base_url=base_url))
        elif "crosshead" in classes:
            lines.append(f"\n##### {clean_text(child)}\n")
    return "\n".join(lines)

def build_legislation_compilation():
    print("Building legislation compilation from HTML sources\n")

    # Download Act
    act_path = download_html(
        "https://www.legislation.govt.nz/act/public/2020/0038/latest/whole.html",
        "education-training-act-2020.html",
    )
    # Download Regulations
    regs_path = download_html(
        "https://www.legislation.govt.nz/regulation/public/2008/0204/latest/whole.html",
        "ece-regulations-2008.html",
    )

    # Parse Act
    print("\nParsing Education and Training Act 2020...")
    with open(act_path, "r", encoding="utf-8") as f:
        act_soup = BeautifulSoup(f, "html.parser")
    act_base_url = "https://www.legislation.govt.nz/act/public/2020/0038/latest/"

    # Extract Part 2 (ECE) – ID from the HTML we inspected
    part2 = extract_part(act_soup, "LMS171362")
    if part2:
        part2_md = convert_part(part2, base_url=act_base_url)
        print(f" Part 2: {len(part2_md)} chars")
    else:
        print(" WARNING: Part 2 not found!")
        part2_md = ""

    # Extract ERO subpart (Part 5, Subpart 3 – Education Review Office)
    ero_subpart = extract_subpart_by_content(act_soup, "Education Review Office")
    if ero_subpart:
        ero_md = convert_subpart(ero_subpart, base_url=act_base_url)
        print(f" ERO subpart: {len(ero_md)} chars")
    else:
        print(" WARNING: ERO subpart not found!")
        ero_md = ""

    # Parse Regulations
    print("\nParsing ECE Regulations 2008...")
    with open(regs_path, "r", encoding="utf-8") as f:
        regs_soup = BeautifulSoup(f, "html.parser")
    regs_base_url = "https://www.legislation.govt.nz/regulation/public/2008/0204/latest/"

    # Convert all parts of the regulations
    regs_parts = regs_soup.find_all("div", class_="part")
    regs_md_parts = []
    for part in regs_parts:
        regs_md_parts.append(convert_part(part, base_url=regs_base_url))
    regs_md = "\n".join(regs_md_parts)
    print(f" Regulations: {len(regs_md)} chars ({len(regs_parts)} parts)")

    # Assemble compilation
    compilation = f"""# Legislation – ECE Regulatory Framework

This compilation contains the primary legislation governing Early Childhood Education in New Zealand, converted from the authoritative HTML versions on legislation.govt.nz.
{part2_md}

## Education and Training Act 2020 – ERO Powers (Part 5, Subpart 3)

The Education Review Office reviews ECE services under these provisions.
{ero_md}

## Education (Early Childhood Services) Regulations 2008

These regulations set out the detailed requirements for ECE service licensing, including adult-to-child ratios, qualifications, and the licensing criteria framework.
{regs_md}
"""

    output_path = os.path.join(CORPUS_DIR, "legislation.md")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(compilation)

    lines = compilation.count("\n") + 1
    size_kb = len(compilation.encode("utf-8")) / 1024
    print(f"\nLegislation compilation: {lines} lines, {size_kb:.1f} KB")
    print(f"Saved to: {output_path}")


if __name__ == "__main__":
    build_legislation_compilation()