"""
Convert NZ legislation from HTML (legislation.govt.nz) to structured Markdown.

HTML source has clean semantic markup:
    div.part > h2.part           → ## Part X — Title
    div.subpart > h3.subpart     → ### Subpart X — Title
    div.prov > h5.prov           → #### Section X — Title
    div.subprov > span.label (1) → body text with (1), (2) etc.
    div.label-para > (a), (b)    → indented list items

Usage:
    uv run python scripts/convert_legislation_html.py
"""

import os
import re
import sys
import urllib.request
from typing import Iterator, List, Optional

from bs4 import BeautifulSoup, NavigableString, Tag

SOURCES_RAW = os.path.join(os.path.dirname(__file__), "..", "sources", "raw")
CORPUS_DIR = os.path.join(os.path.dirname(__file__), "..", "corpus")
os.makedirs(CORPUS_DIR, exist_ok=True)

# A cached file at or below this size is ignored and downloaded again.
MIN_CACHED_BYTES = 100000

# Socket timeout for the download, so a stalled connection fails instead of
# hanging the script forever.
DOWNLOAD_TIMEOUT_SECONDS = 60

ACT_URL = "https://www.legislation.govt.nz/act/public/2020/0038/latest/whole.html"
ACT_BASE_URL = "https://www.legislation.govt.nz/act/public/2020/0038/latest/"
REGS_URL = "https://www.legislation.govt.nz/regulation/public/2008/0204/latest/whole.html"
REGS_BASE_URL = "https://www.legislation.govt.nz/regulation/public/2008/0204/latest/"


def download_html(url: str, filename: str) -> str:
    """Download HTML if not already cached.

    Args:
        url: Address to fetch.
        filename: Name of the cache file inside ``SOURCES_RAW``.

    Returns:
        Path of the cached (or freshly written) file.

    Raises:
        urllib.error.URLError: If the request fails or times out.
        OSError: If the cache file cannot be written.
    """
    path = os.path.join(SOURCES_RAW, filename)
    if os.path.exists(path) and os.path.getsize(path) > MIN_CACHED_BYTES:
        print(f" Using cached: {path}")
        return path

    request = urllib.request.Request(
        url,
        headers={
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
        },
    )
    print(f" Downloading: {url}")
    with urllib.request.urlopen(request, timeout=DOWNLOAD_TIMEOUT_SECONDS) as resp:
        data = resp.read()

    # Only CORPUS_DIR is created at import time; without this the first run
    # fails on open() when the raw-sources directory does not exist yet.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "wb") as f:
        f.write(data)
    print(f" Saved: {path} ({len(data) / 1024:.0f} KB)")
    return path


def clean_text(el: Tag) -> str:
    """Extract clean text from an element, collapsing whitespace."""
    text = el.get_text(separator=" ", strip=True)
    return re.sub(r"\s+", " ", text).strip()


def _child_tags(el: Tag) -> Iterator[Tag]:
    """Yield the direct children of *el* that are not text nodes."""
    for child in el.children:
        if not isinstance(child, NavigableString):
            yield child


def _label_text(el: Optional[Tag]) -> str:
    """Return the text of the first ``span.label`` inside *el*, or ``""``."""
    if el is None:
        return ""
    span = el.find("span", class_="label")
    return span.get_text(strip=True) if span else ""


def convert_prov(prov_div: Tag, base_url: str = "") -> str:
    """Convert a single provision (section) to Markdown.

    Args:
        prov_div: The ``div.prov`` element.
        base_url: Prefix used to build the ``Source:`` link from the
            provision's ``id`` attribute; no link is emitted when empty.

    Returns:
        Markdown for the source link, heading, body and trailing notes.
    """
    lines: List[str] = []

    # Source URL from the provision's HTML id attribute
    prov_id = prov_div.get("id", "")
    if prov_id and base_url:
        lines.append(f"\nSource: {base_url}{prov_id}.html")

    # Section heading
    h5 = prov_div.find("h5", class_="prov", recursive=False)
    if h5:
        section_num = _label_text(h5)

        # Title text is everything in the heading except the label span.
        title_parts = []
        for child in h5.children:
            if isinstance(child, NavigableString):
                piece = child.strip()
            elif child.name != "span" or "label" not in child.get("class", []):
                piece = child.get_text(strip=True)
            else:
                continue
            if piece:
                title_parts.append(piece)
        title = " ".join(title_parts)

        # Drop empty components so a missing label or title does not leave
        # doubled or trailing spaces in the heading.
        heading = " ".join(part for part in (section_num, title) if part)
        lines.append(f"\n#### {heading}\n")

    # Provision body
    prov_body = prov_div.find("div", class_="prov-body", recursive=False)
    if prov_body:
        lines.append(convert_prov_body(prov_body))

    # Compare notes (cross-references to old legislation)
    for cf in prov_div.find_all("p", class_="cf", recursive=False):
        lines.append(f"\n*{clean_text(cf)}*\n")

    # History/amendment notes
    for hist in prov_div.find_all("div", class_="history", recursive=False):
        for note in hist.find_all("p", class_="history-note"):
            lines.append(f"\n*{clean_text(note)}*\n")

    return "\n".join(lines)


def convert_prov_body(body: Tag) -> str:
    """Convert the body of a provision to Markdown text.

    Handles three kinds of direct children: ``subprov`` containers
    (numbered subsections), bare ``div.para`` paragraphs, and tables.
    Anything else is ignored.
    """
    lines: List[str] = []

    for child in _child_tags(body):
        classes = child.get("class", [])

        # Subsections (1), (2), etc.
        if "subprov" in classes:
            label_p = child.find("p", class_="subprov", recursive=False)
            sub_label = _label_text(label_p)

            # Direct child para divs only (not nested label-para content)
            for para in child.find_all("div", class_="para", recursive=False):
                para_text = convert_para(para)
                if not para_text:
                    continue
                if sub_label:
                    lines.append(f"\n{sub_label} {para_text}")
                    sub_label = ""  # Only use on first para
                else:
                    lines.append(f"\n{para_text}")

        # Direct paragraphs
        elif child.name == "div" and "para" in classes:
            para_text = convert_para(child)
            if para_text:  # skip empty paragraphs instead of emitting blanks
                lines.append(f"\n{para_text}")

        # Tables
        elif child.name == "table":
            table_md = convert_table(child)
            if table_md:
                lines.append(f"\n{table_md}")

    return "\n".join(lines)


def convert_para(para_div: Tag) -> str:
    """Convert a paragraph div, including nested label-paras (a), (b), etc.

    Returns:
        The paragraph text followed by one ``"\\n (label) text"`` entry per
        nested item, joined with spaces; ``""`` when nothing was found.
    """
    parts: List[str] = []

    for child in _child_tags(para_div):
        classes = child.get("class", [])

        if child.name == "p" and "text" in classes:
            parts.append(clean_text(child))

        elif "label-para" in classes:
            # Nested (a), (b) items
            lp_label = _label_text(child.find("h5", class_="label-para"))

            # Direct paras only: deeper (i), (ii) items are reached by the
            # recursive call below. A recursive search here would return
            # them a second time and emit them again under this item's label.
            # Assumes a label-para's own div.para elements are its direct
            # children, as convert_prov_body assumes for div.subprov —
            # TODO confirm against the source HTML.
            for lp_para in child.find_all("div", class_="para", recursive=False):
                lp_text = convert_para(lp_para)  # Recursive for sub-sub items
                if lp_text:
                    # NOTE(review): indent width reproduced as a single
                    # space — confirm the intended list indentation.
                    parts.append(f"\n {lp_label} {lp_text}")

    return " ".join(parts)


def convert_table(table: Tag) -> str:
    """Basic table conversion to Markdown.

    The first non-empty row is treated as the header and is followed by
    exactly one separator row, which a Markdown table requires. Rows without
    cells are skipped and literal ``|`` characters in cells are escaped.

    Returns:
        The Markdown table, or ``""`` if the table has no cells.
    """
    md_rows: List[str] = []

    for row in table.find_all("tr"):
        cells = row.find_all(["th", "td"])
        if not cells:
            continue

        cell_texts = [clean_text(cell).replace("|", "\\|") for cell in cells]
        md_rows.append("| " + " | ".join(cell_texts) + " |")

        # One separator, directly after the header row. Emitting one after
        # every row containing a <th> breaks tables with row headers, and
        # emitting none breaks tables that only use <td>.
        if len(md_rows) == 1:
            md_rows.append("| " + " | ".join(["---"] * len(cells)) + " |")

    return "\n".join(md_rows)


def extract_part(soup: BeautifulSoup, part_id: str) -> Optional[Tag]:
    """Extract a full Part div by its ID; returns ``None`` if absent."""
    return soup.find("div", class_="part", id=part_id)


def extract_subpart_by_content(soup: BeautifulSoup, text_match: str) -> Optional[Tag]:
    """Find a subpart containing specific text in its heading.

    The match is a case-insensitive substring test against the subpart's
    ``h3.subpart`` text. Returns the first match, or ``None``.
    """
    needle = text_match.lower()
    for subpart in soup.find_all("div", class_="subpart"):
        h3 = subpart.find("h3", class_="subpart")
        if h3 and needle in h3.get_text().lower():
            return subpart
    return None


def _convert_children(container: Tag, base_url: str, include_subparts: bool) -> List[str]:
    """Convert the direct provision/crosshead (and optionally subpart) children."""
    lines: List[str] = []
    for child in _child_tags(container):
        classes = child.get("class", [])
        if include_subparts and "subpart" in classes:
            lines.append(convert_subpart(child, base_url=base_url))
        elif "prov" in classes:
            lines.append(convert_prov(child, base_url=base_url))
        elif "crosshead" in classes:
            # Cross-headings (thematic groupings within parts/subparts)
            lines.append(f"\n##### {clean_text(child)}\n")
    return lines


def convert_part(part_div: Tag, heading_level: int = 2, base_url: str = "") -> str:
    """Convert a full Part div to Markdown.

    Args:
        part_div: The ``div.part`` element.
        heading_level: Number of ``#`` characters for the Part heading.
            Subpart, section and cross-heading levels are fixed.
        base_url: Passed through to ``convert_prov`` for source links.
    """
    lines: List[str] = []

    # Part heading
    h2 = part_div.find("h2", class_="part", recursive=False)
    if h2:
        lines.append(f"\n{'#' * heading_level} {clean_text(h2)}\n")

    # Subparts and direct provisions
    lines.extend(_convert_children(part_div, base_url, include_subparts=True))
    return "\n".join(lines)


def convert_subpart(subpart_div: Tag, base_url: str = "") -> str:
    """Convert a subpart div to Markdown."""
    lines: List[str] = []

    h3 = subpart_div.find("h3", class_="subpart", recursive=False)
    if h3:
        lines.append(f"\n### {clean_text(h3)}\n")

    lines.extend(_convert_children(subpart_div, base_url, include_subparts=False))
    return "\n".join(lines)


def build_legislation_compilation() -> None:
    """Download both sources, convert them, and write ``legislation.md``.

    Writes the compilation to ``CORPUS_DIR``. A missing Part 2, ERO subpart
    or Regulations part list is reported as a warning and leaves that
    section of the output empty rather than aborting.
    """
    print("Building legislation compilation from HTML sources\n")

    # Download Act
    act_path = download_html(ACT_URL, "education-training-act-2020.html")

    # Download Regulations
    regs_path = download_html(REGS_URL, "ece-regulations-2008.html")

    # Parse Act
    print("\nParsing Education and Training Act 2020...")
    with open(act_path, "r", encoding="utf-8") as f:
        act_soup = BeautifulSoup(f, "html.parser")

    # Extract Part 2 (ECE) — ID from the HTML we inspected
    part2 = extract_part(act_soup, "LMS171362")
    if part2:
        part2_md = convert_part(part2, base_url=ACT_BASE_URL)
        print(f" Part 2: {len(part2_md)} chars")
    else:
        print(" WARNING: Part 2 not found!")
        part2_md = ""

    # Extract ERO subpart (Part 5, Subpart 3 — Education Review Office)
    ero_subpart = extract_subpart_by_content(act_soup, "Education Review Office")
    if ero_subpart:
        ero_md = convert_subpart(ero_subpart, base_url=ACT_BASE_URL)
        print(f" ERO subpart: {len(ero_md)} chars")
    else:
        print(" WARNING: ERO subpart not found!")
        ero_md = ""

    # Parse Regulations
    print("\nParsing ECE Regulations 2008...")
    with open(regs_path, "r", encoding="utf-8") as f:
        regs_soup = BeautifulSoup(f, "html.parser")

    # Convert all parts of the regulations
    regs_parts = regs_soup.find_all("div", class_="part")
    if not regs_parts:
        print(" WARNING: no Parts found in Regulations!")
    regs_md = "\n".join(
        convert_part(part, base_url=REGS_BASE_URL) for part in regs_parts
    )
    print(f" Regulations: {len(regs_md)} chars ({len(regs_parts)} parts)")

    # Assemble compilation
    compilation = f"""# Legislation — ECE Regulatory Framework

This compilation contains the primary legislation governing Early Childhood Education
in New Zealand, converted from the authoritative HTML versions on legislation.govt.nz.

{part2_md}

## Education and Training Act 2020 — ERO Powers (Part 5, Subpart 3)

The Education Review Office reviews ECE services under these provisions.

{ero_md}

## Education (Early Childhood Services) Regulations 2008

These regulations set out the detailed requirements for ECE service licensing,
including adult-to-child ratios, qualifications, and the licensing criteria framework.

{regs_md}
"""

    output_path = os.path.join(CORPUS_DIR, "legislation.md")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(compilation)

    lines = compilation.count("\n") + 1
    size_kb = len(compilation.encode("utf-8")) / 1024
    print(f"\nLegislation compilation: {lines} lines, {size_kb:.1f} KB")
    print(f"Saved to: {output_path}")


if __name__ == "__main__":
    build_legislation_compilation()