Spaces:

webmuppetnz
/

hmc-rag

Sleeping

hmc-rag / scripts /build_practitioner_regulation_compilation.py

webmuppet

Initial commit — health marketing compliance RAG

bad8b6c 15 days ago

10.4 kB

	"""
	Build the `practitioner_regulation` corpus domain.

	Combines the rules that govern who can call themselves what, what scope of
	practice means, and the consumer-rights framework that overlays advertising
	compliance:

	- Health Practitioners Competence Assurance Act 2003 (HPCA Act) — defines
	health practitioner, scopes of practice, restricted activities, title use.
	Foundation for "can I call myself X?" questions.
	- HDC Code of Health and Disability Services Consumers' Rights — Right 6
	(right to information) and Right 7 (informed consent) routinely cited in
	advertising complaints; HDC has run cases on misleading clinic websites.
	- ACC provider responsibilities — for the chunk of the audience (chiros,
	osteos, physios, acupuncturists) who are commonly ACC-registered, ACC
	contracts add a marketing-conduct layer.

	Pattern adapted from `build_medicines_and_supplements_compilation.py`.

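	Note on ACC: Becki flagged that ACC's provider-facing material is less
	standardised than the legislation/code documents. This build fetches the
	Contract for Services Standard Terms and Conditions (Provider Agreement
	template) plus the Code of ACC Claimants' Rights as the starting pair
	(see GUIDANCE_SOURCES). The Understanding Your Responsibilities hub page
	and the Working Together under the Cost of Treatment Regulations handbook
	named in an earlier plan are not fetched by this script. v2 can add more
	once we know what queries the audience actually asks.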
	"""

	from __future__ import annotations

	import os
	import re
	import sys
	from pathlib import Path

	sys.path.insert(0, str(Path(__file__).resolve().parent))

	from bs4 import BeautifulSoup # noqa: E402
	from convert_legislation_html import ( # noqa: E402
	download_html,
	extract_part,
	extract_subpart_by_content,
	convert_part,
	convert_subpart,
	)
	from clean_artifacts import clean_corpus_artifacts, format_stats # noqa: E402
	from extract_pdf import extract_to_markdown, demote_headings # noqa: E402

	# Filesystem layout, anchored two directory levels above this script file.
	PROJECT_ROOT = Path(__file__).resolve().parents[1]
	CORPUS_DIR = PROJECT_ROOT.joinpath("corpus")
	SOURCES_RAW = PROJECT_ROOT.joinpath("sources", "raw")

	# Markdown file that build() writes the finished compilation to.
	DOMAIN_FILE = CORPUS_DIR.joinpath("practitioner-regulation.md")

	# Create the output and download-cache directories up front so later
	# writes do not have to check for them.
	CORPUS_DIR.mkdir(exist_ok=True)
	SOURCES_RAW.mkdir(parents=True, exist_ok=True)

	# Legislation converted by fetch_legislation(). Keys read there:
	#   name          - label used in progress output and as the H2 heading in build()
	#   url/filename  - passed to download_html()
	#   base_url      - passed to the converters and emitted as the "Source:" line
	#   parts_by_id / parts_by_text - Part selectors; when both are empty every
	#                   `div.part` in the document is converted
	#   section_flags - {"match": compiled regex, "tags": str} pairs; a
	#                   "tags: ..." line is inserted after each regex match
	LEGISLATION_SOURCES = [
	    {
	        "name": "Health Practitioners Competence Assurance Act 2003",
	        "url": "https://www.legislation.govt.nz/act/public/2003/0048/latest/whole.html",
	        "filename": "hpca-act-2003.html",
	        "base_url": "https://www.legislation.govt.nz/act/public/2003/0048/latest/",
	        "parts_by_id": [],
	        "parts_by_text": [],
	        "section_flags": [
	            # Title-protection sections: "no person may claim to be registered..."
	            # The HPCA Act prohibits unauthorised use of regulated titles like
	            # "physiotherapist", "chiropractor", "osteopath", etc.
	            # ss 7-10 are typically the title-use cluster; tag s7 as anchor.
	            {
	                # `[^\n]*` consumes the rest of the heading line so the
	                # match is the WHOLE "#### 7 <title>" line and the tags
	                # land after it. A single `[^\n]` would stop one character
	                # past the section number and split the heading in two.
	                # `\b` keeps sections such as 70 or 7A from matching.
	                "match": re.compile(r"^####\s7\b[^\n]*", re.MULTILINE),
	                "tags": "title-use, registration, scope-of-practice",
	            },
	        ],
	    },
	]

	# Non-legislation guidance documents converted by fetch_guidance().
	# Keys read there:
	#   name          - label used in progress output and cleaning stats
	#   url           - download location; also emitted as the "Source:" line
	#   filename      - cache file name under SOURCES_RAW
	#   format        - passed to extract_to_markdown() as `format_hint`
	#   section_title - text of the H2 heading that wraps the converted body
	GUIDANCE_SOURCES = [
	{
	"name": "HDC Code of Health and Disability Services Consumers' Rights",
	# NOTE: First attempt used the printable PDF
	# (hdc.org.nz/media/550hs5ih/code-of-rights_online_5-sept-2022.pdf) but its
	# multi-column glossy brochure layout caused markitdown to flatten everything
	# into 10,000-char single lines (Right 5 / Right 6 / Right 7 jumbled together).
	# The HDC HTML page is on SilverStripe with semantic markup and parses cleanly.
	"url": "https://www.hdc.org.nz/your-rights/about-the-code/code-of-health-and-disability-services-consumers-rights/",
	"filename": "hdc-code-of-rights.html",
	"format": "html",
	"section_title": "HDC Code of Health and Disability Services Consumers' Rights",
	},
	# ACC sources (Becki v3 spec): Provider Agreement template + Code of ACC
	# Claimants' Rights, as the starting pair. Allied-health-specific provider
	# standards are deferred to v2 (see docs/watchlist.md).
	{
	"name": "ACC — Contract for Services Standard Terms and Conditions (Provider Agreement template)",
	"url": "https://www.acc.co.nz/assets/contracts/health-contract-terms-conditions.pdf",
	"filename": "acc-health-contract-standard-terms-conditions.pdf",
	"format": "pdf",
	"section_title": "ACC — Contract for Services: Standard Terms and Conditions",
	},
	{
	"name": "Code of ACC Claimants' Rights",
	"url": "https://www.acc.co.nz/assets/im-injured/730eea8693/claimant-rights.pdf",
	"filename": "acc-code-of-claimants-rights.pdf",
	"format": "pdf",
	"section_title": "Code of ACC Claimants' Rights",
	},
	]


	def _find_part_by_heading(soup, text_match):
	    """Return the first `div.part` whose `h2.part` heading contains *text_match*.

	    The comparison is case-insensitive. Returns None when no Part matches.
	    """
	    needle = text_match.lower()
	    for candidate in soup.find_all("div", class_="part"):
	        heading = candidate.find("h2", class_="part")
	        if heading and needle in heading.get_text().lower():
	            return candidate
	    return None


	def fetch_legislation(source: dict) -> str:
	    """Download (or use cached) HTML and convert to markdown with per-section URLs.

	    Args:
	        source: A legislation source entry. Reads ``name``, ``url``,
	            ``filename`` and ``base_url``; optionally ``parts_by_id``,
	            ``parts_by_text`` and ``section_flags``. When both selector
	            lists are empty or missing, every ``div.part`` is converted.

	    Returns:
	        The converted Parts/Subparts joined with blank lines, passed through
	        ``clean_corpus_artifacts``, tagged per ``section_flags`` and finally
	        passed through ``demote_headings``.
	    """
	    print(f"\n→ {source['name']}")
	    path = download_html(source["url"], source["filename"])

	    with open(path, "r", encoding="utf-8") as f:
	        soup = BeautifulSoup(f, "html.parser")

	    base_url = source["base_url"]
	    part_ids = source.get("parts_by_id", [])
	    part_texts = source.get("parts_by_text", [])
	    parts_md = []

	    # 1) Parts selected explicitly by element id.
	    for part_id in part_ids:
	        part = extract_part(soup, part_id)
	        if part:
	            md = convert_part(part, base_url=base_url)
	            parts_md.append(md)
	            print(f" ✓ Part by id={part_id} ({len(md)} chars)")
	        else:
	            print(f" ⚠ Part id={part_id} not found")

	    # 2) Parts selected by heading text, falling back to a Subpart lookup.
	    for text_match in part_texts:
	        part = _find_part_by_heading(soup, text_match)
	        if part:
	            md = convert_part(part, base_url=base_url)
	            parts_md.append(md)
	            print(f" ✓ Part by text={text_match!r} ({len(md)} chars)")
	            continue
	        subpart = extract_subpart_by_content(soup, text_match)
	        if subpart:
	            md = convert_subpart(subpart, base_url=base_url)
	            parts_md.append(md)
	            print(f" ✓ Subpart by text={text_match!r} ({len(md)} chars)")
	        else:
	            print(f" ⚠ No Part/Subpart matching text={text_match!r}")

	    # 3) No selectors at all: take every Part in the document.
	    if not part_ids and not part_texts:
	        for part in soup.find_all("div", class_="part"):
	            parts_md.append(convert_part(part, base_url=base_url))
	        print(f" ✓ All Parts ({len(parts_md)} parts, "
	              f"{sum(len(m) for m in parts_md)} chars)")

	    merged = "\n\n".join(parts_md)

	    # Clean PDF/HTML extraction artefacts before applying section flags
	    merged, _clean_stats = clean_corpus_artifacts(merged)
	    print(format_stats(_clean_stats, label=source["name"]))

	    for flag in source.get("section_flags", []):
	        tags = flag["tags"]
	        # A callable replacement keeps `tags` literal (no backslash/group
	        # expansion); subn() reports exactly how many headings were tagged.
	        merged, applied = flag["match"].subn(
	            lambda m, tags=tags: f"{m.group(0)}\n\ntags: {tags}",
	            merged,
	        )
	        if applied:
	            print(f" ✓ Applied flag {tags!r}: {applied} match(es)")
	        else:
	            # A flag that tags nothing usually means the heading pattern no
	            # longer matches the converted markdown, so do not report success.
	            print(f" ⚠ Flag {tags!r} matched no headings")

	    # Demote headings so legislation Parts/Subparts/sections nest under the
	    # source-level H2 wrapper added in build() — avoids cross-Act H2 collisions
	    # like Privacy Act + UEMA both having "Part 1 Preliminary provisions".
	    merged = demote_headings(merged)

	    return merged


	def fetch_guidance(source: dict, *, timeout: float = 60.0) -> str:
	    """Download a guidance document (HTML or PDF) and convert to markdown.

	    The raw download is cached under ``SOURCES_RAW`` using
	    ``source["filename"]``; an existing cache file is reused without any
	    network access.

	    Args:
	        source: A guidance source entry. Reads ``name``, ``url``,
	            ``filename`` and ``section_title``; ``format`` is optional and
	            is passed to ``extract_to_markdown`` as ``format_hint``.
	        timeout: Seconds passed to ``urlopen`` so a stalled server cannot
	            hang the build indefinitely.

	    Returns:
	        A markdown block made of an H2 heading, a ``Source:`` line and the
	        converted body, or ``""`` when the download fails.
	    """
	    print(f"\n→ {source['name']}")

	    cache_path = SOURCES_RAW / source["filename"]
	    if cache_path.exists():
	        print(f" Using cached: {cache_path.name}")
	    else:
	        import urllib.request

	        req = urllib.request.Request(
	            source["url"],
	            headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"},
	        )
	        # Write to a sibling ".part" file and rename once complete, so an
	        # interrupted write never leaves a truncated file that later runs
	        # would mistake for a valid cache entry.
	        partial_path = cache_path.with_name(cache_path.name + ".part")
	        try:
	            with urllib.request.urlopen(req, timeout=timeout) as resp:
	                data = resp.read()
	            partial_path.write_bytes(data)
	            partial_path.replace(cache_path)
	        except Exception as e:
	            # Deliberately best-effort: report the failure and let the
	            # caller skip this source rather than abort the whole build.
	            print(f" ⚠ Download failed: {e}")
	            return ""
	        print(f" Downloaded: {cache_path.name} ({len(data) / 1024:.0f} KB)")

	    body = extract_to_markdown(cache_path, format_hint=source.get("format")).strip()

	    # Clean PDF/HTML extraction artefacts (page numbers, headers, chrome, control chars)
	    body, _clean_stats = clean_corpus_artifacts(body)
	    print(format_stats(_clean_stats, label=source["name"]))

	    # Demote body headings so they nest under the source-level H2 wrapper we
	    # add below (avoids cross-source collisions like multiple "## Introduction").
	    body = demote_headings(body)

	    return (
	        f"\n## {source['section_title']}\n\n"
	        f"Source: {source['url']}\n\n"
	        f"{body}\n"
	    )


	def build():
	    """Assemble every source into the practitioner-regulation corpus file.

	    Converts each entry of ``LEGISLATION_SOURCES`` and ``GUIDANCE_SOURCES``,
	    wraps legislation in a source-level H2 heading, prepends the domain
	    header and writes the result to ``DOMAIN_FILE`` as UTF-8. Sources that
	    produce no content are left out of the file and listed in a warning.
	    """
	    print("Building practitioner_regulation compilation\n")

	    skipped = []

	    legislation_blocks = []
	    for src in LEGISLATION_SOURCES:
	        block = fetch_legislation(src)
	        if block:
	            legislation_blocks.append(
	                f"\n## {src['name']}\n\nSource: {src['base_url']}\n\n{block}"
	            )
	        else:
	            skipped.append(src["name"])

	    guidance_blocks = []
	    for src in GUIDANCE_SOURCES:
	        block = fetch_guidance(src)
	        if block:
	            guidance_blocks.append(block)
	        else:
	            skipped.append(src["name"])

	    body = "\n\n".join(legislation_blocks + guidance_blocks)

	    # Built from explicit lines rather than a triple-quoted literal so the
	    # emitted markdown never depends on how this file is indented.
	    header_lines = (
	        "# Practitioner Regulation — NZ Healthcare Marketing Regulation",
	        "",
	        "Source: https://www.legislation.govt.nz/act/public/2003/0048/latest/",
	        "",
	        'This compilation covers the legal framework that defines who can call themselves a health practitioner, what scopes of practice mean for advertising claims, and the consumer-rights framework that overlays marketing conduct. The HPCA Act is foundational for "can I call myself X?" questions; the HDC Code of Rights (especially Right 6 right to information and Right 7 informed consent) is routinely cited in advertising complaints; ACC provider obligations add a contractual layer for the chunk of the audience that is ACC-registered.',
	        "",
	        "Frequently-cited provisions are flagged with `tags:` metadata for retrieval-time surfacing.",
	        "",
	    )
	    compilation = "\n".join(header_lines) + f"\n{body}\n"

	    DOMAIN_FILE.write_text(compilation, encoding="utf-8")
	    # splitlines() does not count a phantom line after the trailing newline.
	    lines = len(compilation.splitlines())
	    size_kb = len(compilation.encode("utf-8")) / 1024
	    if skipped:
	        # Make an incomplete corpus visible instead of reporting plain success.
	        print(f"\n⚠ {len(skipped)} source(s) produced no content and were "
	              f"omitted: {', '.join(skipped)}")
	    print(f"\n✅ Wrote {DOMAIN_FILE} ({lines} lines, {size_kb:.1f} KB)")


	# Run the build only when executed as a script, not when imported.
	if __name__ == "__main__":
	    build()