"""
Convert NZ legislation from HTML (legislation.govt.nz) to structured Markdown.
HTML source has clean semantic markup:
div.part > h2.part → ## Part X — Title
div.subpart > h3.subpart → ### Subpart X — Title
div.prov > h5.prov → #### Section X — Title
div.subprov > span.label (1) → body text with (1), (2) etc.
div.label-para > (a), (b) → indented list items
Usage:
uv run python scripts/convert_legislation_html.py
"""
import os
import re
import sys
from bs4 import BeautifulSoup, NavigableString
# Repo-relative locations: raw HTML cache and the generated Markdown corpus.
SOURCES_RAW = os.path.join(os.path.dirname(__file__), "..", "sources", "raw")
CORPUS_DIR = os.path.join(os.path.dirname(__file__), "..", "corpus")
# Side effect at import time: ensure the corpus output directory exists.
# NOTE(review): SOURCES_RAW is not created here — download_html writes into it.
os.makedirs(CORPUS_DIR, exist_ok=True)
def download_html(url, filename):
    """Download *url* into sources/raw/*filename*, reusing a cached copy.

    A cached file is reused only when it is larger than ~100 KB, so
    truncated or failed downloads are automatically retried.

    Returns the local file path.
    """
    path = os.path.join(SOURCES_RAW, filename)
    if os.path.exists(path) and os.path.getsize(path) > 100000:
        print(f" Using cached: {path}")
        return path
    import urllib.request
    # legislation.govt.nz serves an error page to clients without a
    # browser-like User-Agent, so send one.
    req = urllib.request.Request(url, headers={
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
    })
    print(f" Downloading: {url}")
    with urllib.request.urlopen(req) as resp:
        data = resp.read()
    # BUG FIX: only CORPUS_DIR is created at import time; on a fresh
    # checkout sources/raw may not exist and open() below would fail.
    os.makedirs(SOURCES_RAW, exist_ok=True)
    with open(path, "wb") as f:
        f.write(data)
    print(f" Saved: {path} ({len(data) / 1024:.0f} KB)")
    return path
def clean_text(el):
    """Return the element's text with every whitespace run collapsed to one space."""
    raw = el.get_text(separator=" ", strip=True)
    # str.split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace — equivalent to the regex collapse + strip.
    return " ".join(raw.split())
def convert_prov(prov_div, base_url=""):
    """Convert a single provision (section) div to Markdown."""
    out = []

    # Link back to the authoritative page, derived from the HTML id attribute.
    prov_id = prov_div.get("id", "")
    if prov_id and base_url:
        out.append(f"\nSource: {base_url}{prov_id}.html")

    # Section heading: "#### <number> <title>".
    heading = prov_div.find("h5", class_="prov", recursive=False)
    if heading:
        label = heading.find("span", class_="label")
        section_num = label.get_text(strip=True) if label else ""
        # Title is every child of the heading except the label span itself.
        pieces = []
        for node in heading.children:
            if isinstance(node, NavigableString):
                stripped = node.strip()
                if stripped:
                    pieces.append(stripped)
            elif node.name != "span" or "label" not in node.get("class", []):
                pieces.append(node.get_text(strip=True))
        title = " ".join(pieces).strip()
        out.append(f"\n#### {section_num} {title}\n")

    # Provision body (subsections, paragraphs, tables).
    body = prov_div.find("div", class_="prov-body", recursive=False)
    if body:
        out.append(convert_prov_body(body))

    # "Compare" notes — cross-references to predecessor legislation.
    for note in prov_div.find_all("p", class_="cf", recursive=False):
        out.append(f"\n*{clean_text(note)}*\n")

    # Amendment-history notes.
    for hist_block in prov_div.find_all("div", class_="history", recursive=False):
        for note in hist_block.find_all("p", class_="history-note"):
            out.append(f"\n*{clean_text(note)}*\n")

    return "\n".join(out)
def convert_prov_body(body):
    """Convert the body of a provision (div.prov-body) to Markdown text."""
    chunks = []
    for node in body.children:
        if isinstance(node, NavigableString):
            continue
        node_classes = node.get("class", [])
        if "subprov" in node_classes:
            # Numbered subsection: prefix "(1)", "(2)", ... onto the first
            # paragraph only; subsequent paragraphs stand alone.
            marker_p = node.find("p", class_="subprov", recursive=False)
            marker_span = marker_p.find("span", class_="label") if marker_p else None
            marker = marker_span.get_text(strip=True) if marker_span else ""
            # Direct child para divs only (not nested label-para content).
            for para in node.find_all("div", class_="para", recursive=False):
                text = convert_para(para)
                if not text:
                    continue
                if marker:
                    chunks.append(f"\n{marker} {text}")
                    marker = ""
                else:
                    chunks.append(f"\n{text}")
        elif node.name == "div" and "para" in node_classes:
            # Paragraph directly under the provision body.
            chunks.append(f"\n{convert_para(node)}")
        elif node.name == "table":
            chunks.append(f"\n{convert_table(node)}")
    return "\n".join(chunks)
def convert_para(para_div):
    """Convert a paragraph div, including nested label-paras (a), (b), etc.

    Returns a single string; each nested list item is placed on its own
    line prefixed with its label.
    """
    parts = []
    for child in para_div.children:
        if isinstance(child, NavigableString):
            continue
        classes = child.get("class", [])
        if child.name == "p" and "text" in classes:
            parts.append(clean_text(child))
        elif "label-para" in classes:
            # Nested (a), (b) item: a label heading plus one or more paras.
            lp_h5 = child.find("h5", class_="label-para")
            lp_label = ""
            if lp_h5:
                lp_span = lp_h5.find("span", class_="label")
                lp_label = lp_span.get_text(strip=True) if lp_span else ""
            # BUG FIX: restrict to direct child paras. The previous
            # recursive find_all also collected paras inside deeper
            # label-paras, duplicating text that the recursive
            # convert_para call below already emits.
            lp_paras = child.find_all("div", class_="para", recursive=False)
            for lp_para in lp_paras:
                lp_text = convert_para(lp_para)  # recurse for sub-sub items
                if lp_text:
                    parts.append(f"\n {lp_label} {lp_text}")
    return " ".join(parts) if parts else ""
def convert_table(table):
    """Basic table conversion to Markdown.

    Fixes over the naive version: literal "|" characters inside cells are
    escaped so they do not split Markdown columns, and the header
    delimiter row is emitted exactly once (after the first header row)
    instead of after every row that happens to contain a <th>.
    """
    rows = table.find_all("tr")
    if not rows:
        return ""
    md_rows = []
    separator_emitted = False
    for row in rows:
        cells = row.find_all(["th", "td"])
        # Escape pipes so cell text cannot break the table layout.
        cell_texts = [clean_text(c).replace("|", "\\|") for c in cells]
        md_rows.append("| " + " | ".join(cell_texts) + " |")
        # Markdown tables take a single delimiter row after the header.
        if not separator_emitted and row.find("th"):
            md_rows.append("| " + " | ".join(["---"] * len(cells)) + " |")
            separator_emitted = True
    return "\n".join(md_rows)
def extract_part(soup, part_id):
    """Return the full Part div with the given HTML id, or None if absent."""
    hits = soup.find_all("div", class_="part", id=part_id, limit=1)
    return hits[0] if hits else None
def extract_subpart_by_content(soup, text_match):
    """Return the first subpart whose heading contains *text_match* (case-insensitive)."""
    needle = text_match.lower()
    for candidate in soup.find_all("div", class_="subpart"):
        heading = candidate.find("h3", class_="subpart")
        if heading is None:
            continue
        if needle in heading.get_text().lower():
            return candidate
    return None
def convert_part(part_div, heading_level=2, base_url=""):
    """Convert a full Part div to Markdown at the given heading level."""
    rendered = []

    # Part heading, e.g. "## Part 2 — ...".
    heading = part_div.find("h2", class_="part", recursive=False)
    if heading:
        hashes = "#" * heading_level
        rendered.append(f"\n{hashes} {clean_text(heading)}\n")

    # Walk direct children: subparts, standalone provisions, cross-headings.
    for node in part_div.children:
        if isinstance(node, NavigableString):
            continue
        node_classes = node.get("class", [])
        if "subpart" in node_classes:
            rendered.append(convert_subpart(node, base_url=base_url))
        elif "prov" in node_classes:
            rendered.append(convert_prov(node, base_url=base_url))
        elif "crosshead" in node_classes:
            # Thematic grouping heading within the part.
            rendered.append(f"\n##### {clean_text(node)}\n")
    return "\n".join(rendered)
def convert_subpart(subpart_div, base_url=""):
    """Convert a subpart div to Markdown (### heading plus its provisions)."""
    rendered = []

    heading = subpart_div.find("h3", class_="subpart", recursive=False)
    if heading:
        rendered.append(f"\n### {clean_text(heading)}\n")

    for node in subpart_div.children:
        if isinstance(node, NavigableString):
            continue
        node_classes = node.get("class", [])
        if "prov" in node_classes:
            rendered.append(convert_prov(node, base_url=base_url))
        elif "crosshead" in node_classes:
            # Thematic grouping heading within the subpart.
            rendered.append(f"\n##### {clean_text(node)}\n")
    return "\n".join(rendered)
def build_legislation_compilation():
    """Download, parse, and convert the ECE legislation into corpus/legislation.md.

    Fetches the Education and Training Act 2020 and the Education (Early
    Childhood Services) Regulations 2008 from legislation.govt.nz (cached
    under sources/raw), extracts the ECE-relevant parts, and writes a
    single Markdown compilation to CORPUS_DIR.
    """
    print("Building legislation compilation from HTML sources\n")
    # Download Act
    act_path = download_html(
        "https://www.legislation.govt.nz/act/public/2020/0038/latest/whole.html",
        "education-training-act-2020.html",
    )
    # Download Regulations
    regs_path = download_html(
        "https://www.legislation.govt.nz/regulation/public/2008/0204/latest/whole.html",
        "ece-regulations-2008.html",
    )
    # Parse Act
    print("\nParsing Education and Training Act 2020...")
    with open(act_path, "r", encoding="utf-8") as f:
        act_soup = BeautifulSoup(f, "html.parser")
    act_base_url = "https://www.legislation.govt.nz/act/public/2020/0038/latest/"
    # Extract Part 2 (ECE). "LMS171362" is the HTML id observed for Part 2
    # in the downloaded document; if the site regenerates ids, the WARNING
    # branch below fires and this constant needs updating.
    part2 = extract_part(act_soup, "LMS171362")
    if part2:
        part2_md = convert_part(part2, base_url=act_base_url)
        print(f" Part 2: {len(part2_md)} chars")
    else:
        print(" WARNING: Part 2 not found!")
        part2_md = ""
    # Extract ERO subpart (Part 5, Subpart 3 — Education Review Office),
    # located by heading text rather than by id.
    ero_subpart = extract_subpart_by_content(act_soup, "Education Review Office")
    if ero_subpart:
        ero_md = convert_subpart(ero_subpart, base_url=act_base_url)
        print(f" ERO subpart: {len(ero_md)} chars")
    else:
        print(" WARNING: ERO subpart not found!")
        ero_md = ""
    # Parse Regulations
    print("\nParsing ECE Regulations 2008...")
    with open(regs_path, "r", encoding="utf-8") as f:
        regs_soup = BeautifulSoup(f, "html.parser")
    regs_base_url = "https://www.legislation.govt.nz/regulation/public/2008/0204/latest/"
    # Convert all parts of the regulations — the whole instrument is in scope.
    regs_parts = regs_soup.find_all("div", class_="part")
    regs_md_parts = []
    for part in regs_parts:
        regs_md_parts.append(convert_part(part, base_url=regs_base_url))
    regs_md = "\n".join(regs_md_parts)
    print(f" Regulations: {len(regs_md)} chars ({len(regs_parts)} parts)")
    # Assemble compilation (template text is part of the output file —
    # keep it byte-exact).
    compilation = f"""# Legislation — ECE Regulatory Framework
This compilation contains the primary legislation governing Early Childhood Education in New Zealand, converted from the authoritative HTML versions on legislation.govt.nz.
{part2_md}
## Education and Training Act 2020 — ERO Powers (Part 5, Subpart 3)
The Education Review Office reviews ECE services under these provisions.
{ero_md}
## Education (Early Childhood Services) Regulations 2008
These regulations set out the detailed requirements for ECE service licensing, including adult-to-child ratios, qualifications, and the licensing criteria framework.
{regs_md}
"""
    output_path = os.path.join(CORPUS_DIR, "legislation.md")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(compilation)
    # Summary stats for the console.
    lines = compilation.count("\n") + 1
    size_kb = len(compilation.encode("utf-8")) / 1024
    print(f"\nLegislation compilation: {lines} lines, {size_kb:.1f} KB")
    print(f"Saved to: {output_path}")
# Script entry point: run the full download-parse-convert pipeline.
if __name__ == "__main__":
    build_legislation_compilation()