# hmc-rag / scripts /build_practitioner_regulation_compilation.py
# webmuppet
# Initial commit — health marketing compliance RAG
# bad8b6c
"""
Build the `practitioner_regulation` corpus domain.
Combines the rules that govern who can call themselves what, what scope of
practice means, and the consumer-rights framework that overlays advertising
compliance:
- Health Practitioners Competence Assurance Act 2003 (HPCA Act) — defines
health practitioner, scopes of practice, restricted activities, title use.
Foundation for "can I call myself X?" questions.
- HDC Code of Health and Disability Services Consumers' Rights — Right 6
(right to information) and Right 7 (informed consent) routinely cited in
advertising complaints; HDC has run cases on misleading clinic websites.
- ACC provider responsibilities — for the chunk of the audience (chiros,
osteos, physios, acupuncturists) who are commonly ACC-registered, ACC
contracts add a marketing-conduct layer.
Pattern adapted from `build_medicines_and_supplements_compilation.py`.
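Note on ACC: Becki flagged that ACC's provider-facing material is less
standardised than the legislation/code documents. v1 includes the
Understanding Your Responsibilities hub page plus the Working Together
under the Cost of Treatment Regulations handbook. (NOTE: the
GUIDANCE_SOURCES list in this file currently points at the Contract for
Services Standard Terms and Conditions and the Code of ACC Claimants'
Rights instead — confirm which pair is intended.) v2 can add more once
we know what queries the audience actually asks.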
"""
from __future__ import annotations
import os
import re
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent))
from bs4 import BeautifulSoup # noqa: E402
from convert_legislation_html import ( # noqa: E402
download_html,
extract_part,
extract_subpart_by_content,
convert_part,
convert_subpart,
)
from clean_artifacts import clean_corpus_artifacts, format_stats # noqa: E402
from extract_pdf import extract_to_markdown, demote_headings # noqa: E402
# Repository root: parents[1] of this script's resolved path, i.e. the
# directory that contains the folder this script lives in.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
# Directory that receives the compiled domain markdown (see DOMAIN_FILE).
CORPUS_DIR = PROJECT_ROOT / "corpus"
# Cache directory for raw downloads; fetch_guidance() reads/writes files here.
SOURCES_RAW = PROJECT_ROOT / "sources" / "raw"
# Both directories are created at import time, before any fetch or write runs.
CORPUS_DIR.mkdir(exist_ok=True)
SOURCES_RAW.mkdir(parents=True, exist_ok=True)
# Output file written by build().
DOMAIN_FILE = CORPUS_DIR / "practitioner-regulation.md"
# Legislation inputs consumed by fetch_legislation(). Keys per entry:
#   name          - printed in the build log and used by build() as the H2 title.
#   url/filename  - passed to download_html() to obtain the HTML file.
#   base_url      - passed to convert_part()/convert_subpart(); also emitted by
#                   build() as the "Source:" line.
#   parts_by_id   - Part ids to extract; parts_by_text - heading text to match.
#                   When both are empty, every Part in the document is converted.
#   section_flags - list of {"match": compiled regex, "tags": str}; a
#                   "tags: ..." line is inserted after each regex match.
LEGISLATION_SOURCES = [
    {
        "name": "Health Practitioners Competence Assurance Act 2003",
        "url": "https://www.legislation.govt.nz/act/public/2003/0048/latest/whole.html",
        "filename": "hpca-act-2003.html",
        "base_url": "https://www.legislation.govt.nz/act/public/2003/0048/latest/",
        "parts_by_id": [],
        "parts_by_text": [],
        "section_flags": [
            # Title-protection sections: "no person may claim to be registered..."
            # The HPCA Act prohibits unauthorised use of regulated titles like
            # "physiotherapist", "chiropractor", "osteopath", etc.
            # ss 7-10 are typically the title-use cluster; tag s7 as anchor.
            {
                # Matches a "#### 7 ..." heading line; the \b keeps "70" and
                # "7A" from matching.
                "match": re.compile(r"^####\s*7\b[^\n]*", re.MULTILINE),
                "tags": "title-use, registration, scope-of-practice",
            },
        ],
    },
]
# Guidance inputs consumed by fetch_guidance(). Keys per entry:
#   name          - printed in the build log and used as the clean-stats label.
#   url           - downloaded when the cached file is missing; also emitted as
#                   the "Source:" line of the section.
#   filename      - cache file name inside SOURCES_RAW.
#   format        - passed to extract_to_markdown() as format_hint.
#   section_title - H2 title of the section in the compiled markdown.
GUIDANCE_SOURCES = [
    {
        "name": "HDC Code of Health and Disability Services Consumers' Rights",
        # NOTE: First attempt used the printable PDF
        # (hdc.org.nz/media/550hs5ih/code-of-rights_online_5-sept-2022.pdf) but its
        # multi-column glossy brochure layout caused markitdown to flatten everything
        # into 10,000-char single lines (Right 5 / Right 6 / Right 7 jumbled together).
        # The HDC HTML page is on SilverStripe with semantic markup and parses cleanly.
        "url": "https://www.hdc.org.nz/your-rights/about-the-code/code-of-health-and-disability-services-consumers-rights/",
        "filename": "hdc-code-of-rights.html",
        "format": "html",
        "section_title": "HDC Code of Health and Disability Services Consumers' Rights",
    },
    # ACC sources (Becki v3 spec): Provider Agreement template + Code of ACC
    # Claimants' Rights, as the starting pair. Allied-health-specific provider
    # standards are deferred to v2 (see docs/watchlist.md).
    {
        "name": "ACC — Contract for Services Standard Terms and Conditions (Provider Agreement template)",
        "url": "https://www.acc.co.nz/assets/contracts/health-contract-terms-conditions.pdf",
        "filename": "acc-health-contract-standard-terms-conditions.pdf",
        "format": "pdf",
        "section_title": "ACC — Contract for Services: Standard Terms and Conditions",
    },
    {
        "name": "Code of ACC Claimants' Rights",
        "url": "https://www.acc.co.nz/assets/im-injured/730eea8693/claimant-rights.pdf",
        "filename": "acc-code-of-claimants-rights.pdf",
        "format": "pdf",
        "section_title": "Code of ACC Claimants' Rights",
    },
]
def _find_part_by_heading(soup, text_match: str):
    """Return the first Part <div> whose <h2 class="part"> contains *text_match*.

    The comparison is case-insensitive. Returns ``None`` when no Part heading
    contains the text.
    """
    needle = text_match.lower()
    for candidate in soup.find_all("div", class_="part"):
        heading = candidate.find("h2", class_="part")
        if heading and needle in heading.get_text().lower():
            return candidate
    return None


def _collect_parts_markdown(soup, source: dict) -> list:
    """Convert the Parts/Subparts selected by *source* into markdown strings.

    Selection order: explicit ``parts_by_id``, then ``parts_by_text`` (a Part
    heading match first, falling back to a Subpart content match). When both
    lists are empty or missing, every Part in the document is converted.
    """
    parts_md = []

    for part_id in source.get("parts_by_id", []):
        part = extract_part(soup, part_id)
        if not part:
            print(f" ⚠ Part id={part_id} not found")
            continue
        md = convert_part(part, base_url=source["base_url"])
        parts_md.append(md)
        print(f" ✓ Part by id={part_id} ({len(md)} chars)")

    for text_match in source.get("parts_by_text", []):
        part = _find_part_by_heading(soup, text_match)
        if part:
            md = convert_part(part, base_url=source["base_url"])
            parts_md.append(md)
            print(f" ✓ Part by text={text_match!r} ({len(md)} chars)")
            continue
        # No Part heading matched: try a Subpart lookup on the same text.
        subpart = extract_subpart_by_content(soup, text_match)
        if subpart:
            md = convert_subpart(subpart, base_url=source["base_url"])
            parts_md.append(md)
            print(f" ✓ Subpart by text={text_match!r} ({len(md)} chars)")
        else:
            print(f" ⚠ No Part/Subpart matching text={text_match!r}")

    # No explicit selection at all means "take the whole Act".
    if not source.get("parts_by_id") and not source.get("parts_by_text"):
        for part in soup.find_all("div", class_="part"):
            parts_md.append(convert_part(part, base_url=source["base_url"]))
        print(f" ✓ All Parts ({len(parts_md)} parts, "
              f"{sum(len(m) for m in parts_md)} chars)")

    return parts_md


def _apply_section_flags(markdown: str, flags: list) -> str:
    """Insert a ``tags: ...`` line after every match of each flag's regex.

    Each flag is a dict with a compiled pattern under ``match`` and the tag
    text under ``tags``. The number of substitutions is logged per flag.
    """
    for flag in flags:
        tags = flag["tags"]
        # A callable replacement avoids backslash/group-reference processing
        # of the tag text; binding ``tags`` as a default keeps the callable
        # independent of the loop variable. ``subn`` reports the exact number
        # of substitutions instead of inferring it from string counts.
        markdown, applied = flag["match"].subn(
            lambda m, tags=tags: f"{m.group(0)}\n\ntags: {tags}",
            markdown,
        )
        print(f" ✓ Applied flag {tags!r}: {applied} match(es)")
    return markdown


def fetch_legislation(source: dict) -> str:
    """Download (or use cached) HTML and convert to markdown with per-section URLs.

    Args:
        source: One ``LEGISLATION_SOURCES`` entry. Reads ``name``, ``url``,
            ``filename``, ``base_url`` and the optional ``parts_by_id``,
            ``parts_by_text`` and ``section_flags`` keys.

    Returns:
        The selected Parts as one markdown string (joined with blank lines),
        cleaned, tagged and passed through ``demote_headings``.
    """
    print(f"\n→ {source['name']}")
    path = download_html(source["url"], source["filename"])
    with open(path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    merged = "\n\n".join(_collect_parts_markdown(soup, source))

    # Clean PDF/HTML extraction artefacts before applying section flags
    merged, clean_stats = clean_corpus_artifacts(merged)
    print(format_stats(clean_stats, label=source["name"]))

    # Flags run before demotion because their patterns are written against
    # the heading levels as converted (e.g. "#### 7 ...").
    merged = _apply_section_flags(merged, source.get("section_flags", []))

    # Demote headings so legislation Parts/Subparts/sections nest under the
    # source-level H2 wrapper added in build() — avoids cross-Act H2 collisions
    # like Privacy Act + UEMA both having "Part 1 Preliminary provisions".
    return demote_headings(merged)
def fetch_guidance(source: dict, *, timeout: float = 60.0) -> str:
    """Download a guidance document (HTML or PDF) and convert to markdown.

    A copy already present in ``SOURCES_RAW`` is reused; otherwise the file is
    fetched from ``source["url"]`` and cached there.

    Args:
        source: One ``GUIDANCE_SOURCES`` entry. Reads ``name``, ``url``,
            ``filename``, ``section_title`` and the optional ``format``.
        timeout: Seconds to wait on the network before the download is
            treated as failed. Without a timeout a stalled server would hang
            the whole build.

    Returns:
        A markdown section (H2 title, ``Source:`` line, converted body), or
        ``""`` when the document could not be downloaded.
    """
    print(f"\n→ {source['name']}")
    cache_path = SOURCES_RAW / source["filename"]

    if cache_path.exists():
        print(f" Using cached: {cache_path.name}")
    else:
        import urllib.request

        # NOTE(review): browser-like User-Agent — presumably because some
        # hosts reject the default urllib agent; confirm before changing.
        req = urllib.request.Request(
            source["url"],
            headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"},
        )
        # Best-effort by design: one unreachable source must not abort the
        # whole build, so any failure is reported and the source is skipped.
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                data = resp.read()
            if not data:
                # Never cache an empty file: it would be treated as a valid
                # cached copy on every later run.
                print(" ⚠ Download failed: empty response")
                return ""
            # Write to a sibling temp file and rename, so an interrupted
            # write cannot leave a truncated file under the cache name.
            partial_path = cache_path.with_name(cache_path.name + ".part")
            partial_path.write_bytes(data)
            partial_path.replace(cache_path)
        except Exception as e:
            print(f" ⚠ Download failed: {e}")
            return ""
        print(f" Downloaded: {cache_path.name} ({len(data) / 1024:.0f} KB)")

    body = extract_to_markdown(cache_path, format_hint=source.get("format")).strip()

    # Clean PDF/HTML extraction artefacts (page numbers, headers, chrome, control chars)
    body, clean_stats = clean_corpus_artifacts(body)
    print(format_stats(clean_stats, label=source["name"]))

    # Demote body headings so they nest under the source-level H2 wrapper we
    # add below (avoids cross-source collisions like multiple "## Introduction").
    body = demote_headings(body)

    return (
        f"\n## {source['section_title']}\n\n"
        f"Source: {source['url']}\n\n"
        f"{body}\n"
    )
def build() -> None:
    """Compile every configured source into the practitioner-regulation corpus file.

    Legislation sources are fetched first, then guidance sources; sources that
    yield an empty block are skipped. The result is written to ``DOMAIN_FILE``
    as UTF-8 and a one-line summary (line count, size) is printed.
    """
    print("Building practitioner_regulation compilation\n")

    legislation_blocks = []
    for src in LEGISLATION_SOURCES:
        block = fetch_legislation(src)
        if block:
            # fetch_legislation returns only the body, so the source-level H2
            # wrapper and "Source:" line are added here.
            legislation_blocks.append(
                f"\n## {src['name']}\n\nSource: {src['base_url']}\n\n{block}"
            )

    # fetch_guidance already returns a fully wrapped section (or "" on failure).
    guidance_blocks = [
        block for block in map(fetch_guidance, GUIDANCE_SOURCES) if block
    ]

    body = "\n\n".join(legislation_blocks + guidance_blocks)

    # Header paragraphs are separated by blank lines, matching the per-source
    # "## title / Source: / body" blocks. Without the blank lines Markdown
    # folds the Source line, the description and the tags note into a single
    # paragraph.
    header_paragraphs = [
        '# Practitioner Regulation — NZ Healthcare Marketing Regulation',
        'Source: https://www.legislation.govt.nz/act/public/2003/0048/latest/',
        'This compilation covers the legal framework that defines who can call '
        'themselves a health practitioner, what scopes of practice mean for '
        'advertising claims, and the consumer-rights framework that overlays '
        'marketing conduct. The HPCA Act is foundational for "can I call myself '
        'X?" questions; the HDC Code of Rights (especially Right 6 right to '
        'information and Right 7 informed consent) is routinely cited in '
        'advertising complaints; ACC provider obligations add a contractual '
        'layer for the chunk of the audience that is ACC-registered.',
        'Frequently-cited provisions are flagged with `tags:` metadata for '
        'retrieval-time surfacing.',
    ]
    compilation = "\n\n".join(header_paragraphs + [body]) + "\n"

    DOMAIN_FILE.write_text(compilation, encoding="utf-8")

    lines = compilation.count("\n") + 1
    size_kb = len(compilation.encode("utf-8")) / 1024
    print(f"\n✅ Wrote {DOMAIN_FILE} ({lines} lines, {size_kb:.1f} KB)")
# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    build()