Spaces:
Running
Running
| """ | |
| Convert NZ legislation from HTML (legislation.govt.nz) to structured Markdown. | |
| HTML source has clean semantic markup: | |
| div.part > h2.part β ## Part X β Title | |
| div.subpart > h3.subpart β ### Subpart X β Title | |
| div.prov > h5.prov β #### Section X β Title | |
| div.subprov > span.label (1) β body text with (1), (2) etc. | |
| div.label-para > (a), (b) β indented list items | |
| Usage: | |
| uv run python scripts/convert_legislation_html.py | |
| """ | |
| import os | |
| import re | |
| import sys | |
| from bs4 import BeautifulSoup, NavigableString | |
# Cache directory for raw HTML downloads and output directory for the
# generated corpus, both resolved relative to this script's location.
SOURCES_RAW = os.path.join(os.path.dirname(__file__), "..", "sources", "raw")
CORPUS_DIR = os.path.join(os.path.dirname(__file__), "..", "corpus")
# Only the corpus output directory is created here at import time.
os.makedirs(CORPUS_DIR, exist_ok=True)
def download_html(url, filename):
    """Download HTML into SOURCES_RAW, reusing a cached copy when present.

    Args:
        url: Absolute URL of the page to fetch.
        filename: File name to store the payload under inside SOURCES_RAW.

    Returns:
        Path to the local HTML file.
    """
    # Fix: ensure the cache directory exists — only CORPUS_DIR is created
    # at import time, so a fresh checkout would otherwise crash on write.
    os.makedirs(SOURCES_RAW, exist_ok=True)
    path = os.path.join(SOURCES_RAW, filename)
    # Treat very small files as failed/partial downloads and re-fetch;
    # a real "whole.html" document is far larger than 100 KB.
    if os.path.exists(path) and os.path.getsize(path) > 100000:
        print(f" Using cached: {path}")
        return path
    import urllib.request

    # NOTE(review): browser-like User-Agent — presumably the site rejects
    # the default urllib UA; confirm before removing.
    req = urllib.request.Request(url, headers={
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
    })
    print(f" Downloading: {url}")
    with urllib.request.urlopen(req) as resp:
        data = resp.read()
    with open(path, "wb") as f:
        f.write(data)
    print(f" Saved: {path} ({len(data) / 1024:.0f} KB)")
    return path
def clean_text(el):
    """Return the element's text as one line with whitespace normalized.

    Args:
        el: A BeautifulSoup element (anything exposing ``get_text``).

    Returns:
        The element's text with every run of whitespace collapsed to a
        single space and outer whitespace stripped.
    """
    raw = el.get_text(separator=" ", strip=True)
    return re.sub(r"\s+", " ", raw).strip()
def convert_prov(prov_div, base_url=""):
    """Convert a single provision (section) to Markdown.

    Args:
        prov_div: ``div.prov`` element holding one section/regulation.
        base_url: Document base URL; when given and the div carries an
            ``id``, a per-section "Source:" line is emitted.

    Returns:
        Markdown string: optional source link, #### heading, body,
        compare notes, and history notes.
    """
    lines = []
    # Source URL from the provision's HTML id attribute
    prov_id = prov_div.get("id", "")
    if prov_id and base_url:
        # NOTE(review): assumes "<base_url><id>.html" is a valid deep link
        # on legislation.govt.nz — confirm against the site's URL scheme.
        lines.append(f"\nSource: {base_url}{prov_id}.html")
    # Section heading
    h5 = prov_div.find("h5", class_="prov", recursive=False)
    if h5:
        label_span = h5.find("span", class_="label")
        section_num = label_span.get_text(strip=True) if label_span else ""
        # Get title text (everything after the label)
        title_parts = []
        for child in h5.children:
            if isinstance(child, NavigableString):
                t = child.strip()
                if t:
                    title_parts.append(t)
            elif child.name != "span" or "label" not in child.get("class", []):
                # Any nested tag other than the label span contributes
                # to the section title.
                title_parts.append(child.get_text(strip=True))
        title = " ".join(title_parts).strip()
        lines.append(f"\n#### {section_num} {title}\n")
    # Provision body
    prov_body = prov_div.find("div", class_="prov-body", recursive=False)
    if prov_body:
        lines.append(convert_prov_body(prov_body))
    # Compare notes (cross-references to old legislation)
    for cf in prov_div.find_all("p", class_="cf", recursive=False):
        lines.append(f"\n*{clean_text(cf)}*\n")
    # History/amendment notes
    for hist in prov_div.find_all("div", class_="history", recursive=False):
        for p in hist.find_all("p", class_="history-note"):
            lines.append(f"\n*{clean_text(p)}*\n")
    return "\n".join(lines)
def convert_prov_body(body):
    """Convert the body of a provision to Markdown text.

    Args:
        body: ``div.prov-body`` element.

    Returns:
        Markdown text covering subsections ((1), (2), ...), direct
        paragraphs, and tables, in document order.
    """
    lines = []
    for child in body.children:
        if isinstance(child, NavigableString):
            continue
        classes = child.get("class", [])
        # Subsections (1), (2), etc.
        if "subprov" in classes:
            label_p = child.find("p", class_="subprov", recursive=False)
            label_span = label_p.find("span", class_="label") if label_p else None
            sub_label = label_span.get_text(strip=True) if label_span else ""
            # Get the text from direct child para divs only (not nested label-para content)
            paras = child.find_all("div", class_="para", recursive=False)
            for para in paras:
                para_text = convert_para(para)
                if para_text:
                    if sub_label:
                        # Prefix the subsection label, e.g. "(1) text...".
                        lines.append(f"\n{sub_label} {para_text}")
                        sub_label = ""  # Only use on first para
                    else:
                        lines.append(f"\n{para_text}")
        # Direct paragraphs
        elif child.name == "div" and "para" in classes:
            lines.append(f"\n{convert_para(child)}")
        # Tables
        elif child.name == "table":
            lines.append(f"\n{convert_table(child)}")
    return "\n".join(lines)
def convert_para(para_div):
    """Convert a paragraph div, including nested label-paras (a), (b), etc.

    Args:
        para_div: ``div.para`` element.

    Returns:
        Flattened paragraph text; nested (a)/(b) items are appended on
        their own lines with a leading space and their label.
    """
    parts = []
    for child in para_div.children:
        if isinstance(child, NavigableString):
            continue
        classes = child.get("class", [])
        if child.name == "p" and "text" in classes:
            parts.append(clean_text(child))
        elif "label-para" in classes:
            # Nested (a), (b) items
            lp_h5 = child.find("h5", class_="label-para")
            lp_label = ""
            if lp_h5:
                lp_span = lp_h5.find("span", class_="label")
                lp_label = lp_span.get_text(strip=True) if lp_span else ""
            # NOTE(review): find_all is recursive here, so deeply nested
            # label-para > para structures may be visited both in this loop
            # and again via the recursive call below — confirm against the
            # real HTML that this does not duplicate text.
            lp_paras = child.find_all("div", class_="para")
            for lp_para in lp_paras:
                lp_text = convert_para(lp_para)  # Recursive for sub-sub items
                if lp_text:
                    parts.append(f"\n {lp_label} {lp_text}")
    return " ".join(parts) if parts else ""
def convert_table(table):
    """Basic table conversion to Markdown.

    The first row is treated as the header: a ``| --- |`` separator row is
    emitted exactly once, directly after it. (Previously the separator was
    emitted after *every* row containing a ``th`` — producing stray
    separators mid-table — and never for tables whose header uses ``td``
    cells, which yields invalid Markdown.)

    Args:
        table: A BeautifulSoup ``table`` element.

    Returns:
        Markdown table string, or "" for a table with no rows.
    """
    rows = table.find_all("tr")
    if not rows:
        return ""
    md_rows = []
    for i, row in enumerate(rows):
        cells = row.find_all(["th", "td"])
        cell_texts = [clean_text(c) for c in cells]
        md_rows.append("| " + " | ".join(cell_texts) + " |")
        # Markdown requires exactly one separator, after the header row.
        if i == 0:
            md_rows.append("| " + " | ".join(["---"] * len(cells)) + " |")
    return "\n".join(md_rows)
def extract_part(soup, part_id):
    """Look up a Part container by its HTML id attribute.

    Args:
        soup: Parsed BeautifulSoup document.
        part_id: Value of the ``id`` attribute on the target ``div.part``.

    Returns:
        The matching element, or None when no such Part exists.
    """
    part = soup.find("div", class_="part", id=part_id)
    return part
def extract_subpart_by_content(soup, text_match):
    """Find a subpart whose heading contains *text_match* (case-insensitive).

    Args:
        soup: Parsed BeautifulSoup document.
        text_match: Substring to look for in each ``h3.subpart`` heading.

    Returns:
        The first matching ``div.subpart``, or None when none matches.
    """
    needle = text_match.lower()
    for candidate in soup.find_all("div", class_="subpart"):
        heading = candidate.find("h3", class_="subpart")
        if heading is None:
            continue
        if needle in heading.get_text().lower():
            return candidate
    return None
def convert_part(part_div, heading_level=2, base_url=""):
    """Render a whole Part div as Markdown.

    Args:
        part_div: ``div.part`` element.
        heading_level: Number of '#' characters for the Part heading.
        base_url: Forwarded so provisions can emit per-section source links.

    Returns:
        Markdown for the Part heading followed by its subparts, bare
        provisions, and cross-headings in document order.
    """
    chunks = []
    # Part heading
    heading = part_div.find("h2", class_="part", recursive=False)
    if heading:
        chunks.append(f"\n{'#' * heading_level} {clean_text(heading)}\n")
    # Walk direct children: subparts, direct provisions, cross-headings.
    for node in part_div.children:
        if isinstance(node, NavigableString):
            continue
        node_classes = node.get("class", [])
        if "subpart" in node_classes:
            chunks.append(convert_subpart(node, base_url=base_url))
        elif "prov" in node_classes:
            chunks.append(convert_prov(node, base_url=base_url))
        elif "crosshead" in node_classes:
            # Cross-headings (thematic groupings within subparts)
            chunks.append(f"\n##### {clean_text(node)}\n")
    return "\n".join(chunks)
def convert_subpart(subpart_div, base_url=""):
    """Render a subpart div as Markdown.

    Args:
        subpart_div: ``div.subpart`` element.
        base_url: Forwarded so provisions can emit per-section source links.

    Returns:
        Markdown for the ### subpart heading followed by its provisions
        and cross-headings in document order.
    """
    out = []
    heading = subpart_div.find("h3", class_="subpart", recursive=False)
    if heading:
        out.append(f"\n### {clean_text(heading)}\n")
    for node in subpart_div.children:
        if isinstance(node, NavigableString):
            continue
        node_classes = node.get("class", [])
        if "prov" in node_classes:
            out.append(convert_prov(node, base_url=base_url))
        elif "crosshead" in node_classes:
            out.append(f"\n##### {clean_text(node)}\n")
    return "\n".join(out)
def build_legislation_compilation():
    """Build corpus/legislation.md from the legislation.govt.nz HTML.

    Downloads (or reuses cached copies of) the whole-document HTML for the
    Education and Training Act 2020 and the Education (Early Childhood
    Services) Regulations 2008, converts the ECE-relevant parts to
    Markdown, and writes the assembled compilation into CORPUS_DIR.
    """
    print("Building legislation compilation from HTML sources\n")
    # Download Act
    act_path = download_html(
        "https://www.legislation.govt.nz/act/public/2020/0038/latest/whole.html",
        "education-training-act-2020.html",
    )
    # Download Regulations
    regs_path = download_html(
        "https://www.legislation.govt.nz/regulation/public/2008/0204/latest/whole.html",
        "ece-regulations-2008.html",
    )
    # Parse Act
    print("\nParsing Education and Training Act 2020...")
    with open(act_path, "r", encoding="utf-8") as f:
        act_soup = BeautifulSoup(f, "html.parser")
    act_base_url = "https://www.legislation.govt.nz/act/public/2020/0038/latest/"
    # Extract Part 2 (ECE) — ID taken from inspecting the downloaded HTML
    part2 = extract_part(act_soup, "LMS171362")
    if part2:
        part2_md = convert_part(part2, base_url=act_base_url)
        print(f" Part 2: {len(part2_md)} chars")
    else:
        print(" WARNING: Part 2 not found!")
        part2_md = ""
    # Extract ERO subpart (Part 5, Subpart 3 — Education Review Office)
    ero_subpart = extract_subpart_by_content(act_soup, "Education Review Office")
    if ero_subpart:
        ero_md = convert_subpart(ero_subpart, base_url=act_base_url)
        print(f" ERO subpart: {len(ero_md)} chars")
    else:
        print(" WARNING: ERO subpart not found!")
        ero_md = ""
    # Parse Regulations
    print("\nParsing ECE Regulations 2008...")
    with open(regs_path, "r", encoding="utf-8") as f:
        regs_soup = BeautifulSoup(f, "html.parser")
    regs_base_url = "https://www.legislation.govt.nz/regulation/public/2008/0204/latest/"
    # Convert all parts of the regulations
    regs_parts = regs_soup.find_all("div", class_="part")
    regs_md_parts = []
    for part in regs_parts:
        regs_md_parts.append(convert_part(part, base_url=regs_base_url))
    regs_md = "\n".join(regs_md_parts)
    print(f" Regulations: {len(regs_md)} chars ({len(regs_parts)} parts)")
    # Assemble compilation. Fix: the emitted headings previously contained
    # mojibake "β" where an em dash belongs; write proper "—" characters.
    compilation = f"""# Legislation — ECE Regulatory Framework
This compilation contains the primary legislation governing Early Childhood Education in New Zealand, converted from the authoritative HTML versions on legislation.govt.nz.
{part2_md}
## Education and Training Act 2020 — ERO Powers (Part 5, Subpart 3)
The Education Review Office reviews ECE services under these provisions.
{ero_md}
## Education (Early Childhood Services) Regulations 2008
These regulations set out the detailed requirements for ECE service licensing, including adult-to-child ratios, qualifications, and the licensing criteria framework.
{regs_md}
"""
    output_path = os.path.join(CORPUS_DIR, "legislation.md")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(compilation)
    lines = compilation.count("\n") + 1
    size_kb = len(compilation.encode("utf-8")) / 1024
    print(f"\nLegislation compilation: {lines} lines, {size_kb:.1f} KB")
    print(f"Saved to: {output_path}")
| if __name__ == "__main__": | |
| build_legislation_compilation() | |