Spaces:
Running
Running
| """ | |
| Convert NZ legislation from HTML (legislation.govt.nz) to structured Markdown. | |
| HTML source has clean semantic markup: | |
| div.part > h2.part β ## Part X β Title | |
| div.subpart > h3.subpart β ### Subpart X β Title | |
| div.prov > h5.prov β #### Section X β Title | |
| div.subprov > span.label (1) β body text with (1), (2) etc. | |
| div.label-para > (a), (b) β indented list items | |
| Usage: | |
| uv run python scripts/convert_legislation_html.py | |
| """ | |
| import os | |
| import re | |
| import sys | |
| from bs4 import BeautifulSoup, NavigableString | |
# Cache directory for raw HTML downloads and output directory for the
# generated corpus, both resolved relative to this script's location.
SOURCES_RAW = os.path.join(os.path.dirname(__file__), "..", "sources", "raw")
CORPUS_DIR = os.path.join(os.path.dirname(__file__), "..", "corpus")
# Only the corpus output directory is created here at import time.
os.makedirs(CORPUS_DIR, exist_ok=True)
def download_html(url, filename):
    """Download HTML into SOURCES_RAW, reusing a cached copy when present.

    Args:
        url: Absolute URL of the page to fetch.
        filename: File name to store the payload under inside SOURCES_RAW.

    Returns:
        Path to the local HTML file.
    """
    # Fix: ensure the cache directory exists — only CORPUS_DIR is created
    # at import time, so a fresh checkout would otherwise crash on write.
    os.makedirs(SOURCES_RAW, exist_ok=True)
    path = os.path.join(SOURCES_RAW, filename)
    # Treat very small files as failed/partial downloads and re-fetch;
    # a real "whole.html" document is far larger than 100 KB.
    if os.path.exists(path) and os.path.getsize(path) > 100000:
        print(f" Using cached: {path}")
        return path
    import urllib.request

    # NOTE(review): browser-like User-Agent — presumably the site rejects
    # the default urllib UA; confirm before removing.
    req = urllib.request.Request(url, headers={
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
    })
    print(f" Downloading: {url}")
    with urllib.request.urlopen(req) as resp:
        data = resp.read()
    with open(path, "wb") as f:
        f.write(data)
    print(f" Saved: {path} ({len(data) / 1024:.0f} KB)")
    return path
def clean_text(el):
    """Return the element's text as one line with whitespace normalized.

    Args:
        el: A BeautifulSoup element (anything exposing ``get_text``).

    Returns:
        The element's text with every run of whitespace collapsed to a
        single space and outer whitespace stripped.
    """
    raw = el.get_text(separator=" ", strip=True)
    return re.sub(r"\s+", " ", raw).strip()
def convert_prov(prov_div, base_url=""):
    """Convert a single provision (section) to Markdown.

    Args:
        prov_div: ``div.prov`` element holding one section/regulation.
        base_url: Document base URL; when given and the div carries an
            ``id``, a per-section "Source:" line is emitted.

    Returns:
        Markdown string: optional source link, #### heading, body,
        compare notes, and history notes.
    """
    lines = []
    # Source URL from the provision's HTML id attribute
    prov_id = prov_div.get("id", "")
    if prov_id and base_url:
        # NOTE(review): assumes "<base_url><id>.html" is a valid deep link
        # on legislation.govt.nz — confirm against the site's URL scheme.
        lines.append(f"\nSource: {base_url}{prov_id}.html")
    # Section heading
    h5 = prov_div.find("h5", class_="prov", recursive=False)
    if h5:
        label_span = h5.find("span", class_="label")
        section_num = label_span.get_text(strip=True) if label_span else ""
        # Get title text (everything after the label)
        title_parts = []
        for child in h5.children:
            if isinstance(child, NavigableString):
                t = child.strip()
                if t:
                    title_parts.append(t)
            elif child.name != "span" or "label" not in child.get("class", []):
                # Any nested tag other than the label span contributes
                # to the section title.
                title_parts.append(child.get_text(strip=True))
        title = " ".join(title_parts).strip()
        lines.append(f"\n#### {section_num} {title}\n")
    # Provision body
    prov_body = prov_div.find("div", class_="prov-body", recursive=False)
    if prov_body:
        lines.append(convert_prov_body(prov_body))
    # Compare notes (cross-references to old legislation)
    for cf in prov_div.find_all("p", class_="cf", recursive=False):
        lines.append(f"\n*{clean_text(cf)}*\n")
    # History/amendment notes
    for hist in prov_div.find_all("div", class_="history", recursive=False):
        for p in hist.find_all("p", class_="history-note"):
            lines.append(f"\n*{clean_text(p)}*\n")
    return "\n".join(lines)
def convert_prov_body(body):
    """Convert the body of a provision to Markdown text.

    Args:
        body: ``div.prov-body`` element.

    Returns:
        Markdown text covering subsections ((1), (2), ...), direct
        paragraphs, and tables, in document order.
    """
    lines = []
    for child in body.children:
        if isinstance(child, NavigableString):
            continue
        classes = child.get("class", [])
        # Subsections (1), (2), etc.
        if "subprov" in classes:
            label_p = child.find("p", class_="subprov", recursive=False)
            label_span = label_p.find("span", class_="label") if label_p else None
            sub_label = label_span.get_text(strip=True) if label_span else ""
            # Get the text from direct child para divs only (not nested label-para content)
            paras = child.find_all("div", class_="para", recursive=False)
            for para in paras:
                para_text = convert_para(para)
                if para_text:
                    if sub_label:
                        # Prefix the subsection label, e.g. "(1) text...".
                        lines.append(f"\n{sub_label} {para_text}")
                        sub_label = ""  # Only use on first para
                    else:
                        lines.append(f"\n{para_text}")
        # Direct paragraphs
        elif child.name == "div" and "para" in classes:
            lines.append(f"\n{convert_para(child)}")
        # Tables
        elif child.name == "table":
            lines.append(f"\n{convert_table(child)}")
    return "\n".join(lines)
def convert_para(para_div):
    """Convert a paragraph div, including nested label-paras (a), (b), etc.

    Args:
        para_div: ``div.para`` element.

    Returns:
        Flattened paragraph text; nested (a)/(b) items are appended on
        their own lines with a leading space and their label.
    """
    parts = []
    for child in para_div.children:
        if isinstance(child, NavigableString):
            continue
        classes = child.get("class", [])
        if child.name == "p" and "text" in classes:
            parts.append(clean_text(child))
        elif "label-para" in classes:
            # Nested (a), (b) items
            lp_h5 = child.find("h5", class_="label-para")
            lp_label = ""
            if lp_h5:
                lp_span = lp_h5.find("span", class_="label")
                lp_label = lp_span.get_text(strip=True) if lp_span else ""
            # NOTE(review): find_all is recursive here, so deeply nested
            # label-para > para structures may be visited both in this loop
            # and again via the recursive call below — confirm against the
            # real HTML that this does not duplicate text.
            lp_paras = child.find_all("div", class_="para")
            for lp_para in lp_paras:
                lp_text = convert_para(lp_para)  # Recursive for sub-sub items
                if lp_text:
                    parts.append(f"\n {lp_label} {lp_text}")
    return " ".join(parts) if parts else ""
def convert_table(table):
    """Basic table conversion to Markdown.

    The first row is treated as the header: a ``| --- |`` separator row is
    emitted exactly once, directly after it. (Previously the separator was
    emitted after *every* row containing a ``th`` — producing stray
    separators mid-table — and never for tables whose header uses ``td``
    cells, which yields invalid Markdown.)

    Args:
        table: A BeautifulSoup ``table`` element.

    Returns:
        Markdown table string, or "" for a table with no rows.
    """
    rows = table.find_all("tr")
    if not rows:
        return ""
    md_rows = []
    for i, row in enumerate(rows):
        cells = row.find_all(["th", "td"])
        cell_texts = [clean_text(c) for c in cells]
        md_rows.append("| " + " | ".join(cell_texts) + " |")
        # Markdown requires exactly one separator, after the header row.
        if i == 0:
            md_rows.append("| " + " | ".join(["---"] * len(cells)) + " |")
    return "\n".join(md_rows)
def extract_part(soup, part_id):
    """Look up a Part container by its HTML id attribute.

    Args:
        soup: Parsed BeautifulSoup document.
        part_id: Value of the ``id`` attribute on the target ``div.part``.

    Returns:
        The matching element, or None when no such Part exists.
    """
    part = soup.find("div", class_="part", id=part_id)
    return part
def extract_subpart_by_content(soup, text_match):
    """Find a subpart whose heading contains *text_match* (case-insensitive).

    Args:
        soup: Parsed BeautifulSoup document.
        text_match: Substring to look for in each ``h3.subpart`` heading.

    Returns:
        The first matching ``div.subpart``, or None when none matches.
    """
    needle = text_match.lower()
    for candidate in soup.find_all("div", class_="subpart"):
        heading = candidate.find("h3", class_="subpart")
        if heading is None:
            continue
        if needle in heading.get_text().lower():
            return candidate
    return None
def convert_part(part_div, heading_level=2, base_url=""):
    """Render a whole Part div as Markdown.

    Args:
        part_div: ``div.part`` element.
        heading_level: Number of '#' characters for the Part heading.
        base_url: Forwarded so provisions can emit per-section source links.

    Returns:
        Markdown for the Part heading followed by its subparts, bare
        provisions, and cross-headings in document order.
    """
    chunks = []
    # Part heading
    heading = part_div.find("h2", class_="part", recursive=False)
    if heading:
        chunks.append(f"\n{'#' * heading_level} {clean_text(heading)}\n")
    # Walk direct children: subparts, direct provisions, cross-headings.
    for node in part_div.children:
        if isinstance(node, NavigableString):
            continue
        node_classes = node.get("class", [])
        if "subpart" in node_classes:
            chunks.append(convert_subpart(node, base_url=base_url))
        elif "prov" in node_classes:
            chunks.append(convert_prov(node, base_url=base_url))
        elif "crosshead" in node_classes:
            # Cross-headings (thematic groupings within subparts)
            chunks.append(f"\n##### {clean_text(node)}\n")
    return "\n".join(chunks)
def convert_subpart(subpart_div, base_url=""):
    """Render a subpart div as Markdown.

    Args:
        subpart_div: ``div.subpart`` element.
        base_url: Forwarded so provisions can emit per-section source links.

    Returns:
        Markdown for the ### subpart heading followed by its provisions
        and cross-headings in document order.
    """
    out = []
    heading = subpart_div.find("h3", class_="subpart", recursive=False)
    if heading:
        out.append(f"\n### {clean_text(heading)}\n")
    for node in subpart_div.children:
        if isinstance(node, NavigableString):
            continue
        node_classes = node.get("class", [])
        if "prov" in node_classes:
            out.append(convert_prov(node, base_url=base_url))
        elif "crosshead" in node_classes:
            out.append(f"\n##### {clean_text(node)}\n")
    return "\n".join(out)
def build_legislation_compilation():
    """Build corpus/legislation.md from the legislation.govt.nz HTML.

    Downloads (or reuses cached copies of) the whole-document HTML for the
    Education and Training Act 2020 and the Education (Early Childhood
    Services) Regulations 2008, converts the ECE-relevant parts to
    Markdown, and writes the assembled compilation into CORPUS_DIR.
    """
    print("Building legislation compilation from HTML sources\n")
    # Download Act
    act_path = download_html(
        "https://www.legislation.govt.nz/act/public/2020/0038/latest/whole.html",
        "education-training-act-2020.html",
    )
    # Download Regulations
    regs_path = download_html(
        "https://www.legislation.govt.nz/regulation/public/2008/0204/latest/whole.html",
        "ece-regulations-2008.html",
    )
    # Parse Act
    print("\nParsing Education and Training Act 2020...")
    with open(act_path, "r", encoding="utf-8") as f:
        act_soup = BeautifulSoup(f, "html.parser")
    act_base_url = "https://www.legislation.govt.nz/act/public/2020/0038/latest/"
    # Extract Part 2 (ECE) — ID taken from inspecting the downloaded HTML
    part2 = extract_part(act_soup, "LMS171362")
    if part2:
        part2_md = convert_part(part2, base_url=act_base_url)
        print(f" Part 2: {len(part2_md)} chars")
    else:
        print(" WARNING: Part 2 not found!")
        part2_md = ""
    # Extract ERO subpart (Part 5, Subpart 3 — Education Review Office)
    ero_subpart = extract_subpart_by_content(act_soup, "Education Review Office")
    if ero_subpart:
        ero_md = convert_subpart(ero_subpart, base_url=act_base_url)
        print(f" ERO subpart: {len(ero_md)} chars")
    else:
        print(" WARNING: ERO subpart not found!")
        ero_md = ""
    # Parse Regulations
    print("\nParsing ECE Regulations 2008...")
    with open(regs_path, "r", encoding="utf-8") as f:
        regs_soup = BeautifulSoup(f, "html.parser")
    regs_base_url = "https://www.legislation.govt.nz/regulation/public/2008/0204/latest/"
    # Convert all parts of the regulations
    regs_parts = regs_soup.find_all("div", class_="part")
    regs_md_parts = []
    for part in regs_parts:
        regs_md_parts.append(convert_part(part, base_url=regs_base_url))
    regs_md = "\n".join(regs_md_parts)
    print(f" Regulations: {len(regs_md)} chars ({len(regs_parts)} parts)")
    # Assemble compilation. Fix: the emitted headings previously contained
    # mojibake "β" where an em dash belongs; write proper "—" characters.
    compilation = f"""# Legislation — ECE Regulatory Framework
This compilation contains the primary legislation governing Early Childhood Education in New Zealand, converted from the authoritative HTML versions on legislation.govt.nz.
{part2_md}
## Education and Training Act 2020 — ERO Powers (Part 5, Subpart 3)
The Education Review Office reviews ECE services under these provisions.
{ero_md}
## Education (Early Childhood Services) Regulations 2008
These regulations set out the detailed requirements for ECE service licensing, including adult-to-child ratios, qualifications, and the licensing criteria framework.
{regs_md}
"""
    output_path = os.path.join(CORPUS_DIR, "legislation.md")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(compilation)
    lines = compilation.count("\n") + 1
    size_kb = len(compilation.encode("utf-8")) / 1024
    print(f"\nLegislation compilation: {lines} lines, {size_kb:.1f} KB")
    print(f"Saved to: {output_path}")
| if __name__ == "__main__": | |
| build_legislation_compilation() | |