""" Build the `practitioner_regulation` corpus domain. Combines the rules that govern who can call themselves what, what scope of practice means, and the consumer-rights framework that overlays advertising compliance: - Health Practitioners Competence Assurance Act 2003 (HPCA Act) — defines health practitioner, scopes of practice, restricted activities, title use. Foundation for "can I call myself X?" questions. - HDC Code of Health and Disability Services Consumers' Rights — Right 6 (right to information) and Right 7 (informed consent) routinely cited in advertising complaints; HDC has run cases on misleading clinic websites. - ACC provider responsibilities — for the chunk of the audience (chiros, osteos, physios, acupuncturists) who are commonly ACC-registered, ACC contracts add a marketing-conduct layer. Pattern adapted from `build_medicines_and_supplements_compilation.py`. Note on ACC: Becki flagged that ACC's provider-facing material is less standardised than the legislation/code documents. v1 includes the Understanding Your Responsibilities hub page plus the Working Together under the Cost of Treatment Regulations handbook. v2 can add more once we know what queries the audience actually asks. """ from __future__ import annotations import os import re import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parent)) from bs4 import BeautifulSoup # noqa: E402 from convert_legislation_html import ( # noqa: E402 download_html, extract_part, extract_subpart_by_content, convert_part, convert_subpart, ) from clean_artifacts import clean_corpus_artifacts, format_stats # noqa: E402 from extract_pdf import extract_to_markdown, demote_headings # noqa: E402 PROJECT_ROOT = Path(__file__).resolve().parents[1] CORPUS_DIR = PROJECT_ROOT / "corpus" SOURCES_RAW = PROJECT_ROOT / "sources" / "raw" CORPUS_DIR.mkdir(exist_ok=True) SOURCES_RAW.mkdir(parents=True, exist_ok=True) DOMAIN_FILE = CORPUS_DIR / "practitioner-regulation.md" LEGISLATION_SOURCES = [ { "name": "Health Practitioners Competence Assurance Act 2003", "url": "https://www.legislation.govt.nz/act/public/2003/0048/latest/whole.html", "filename": "hpca-act-2003.html", "base_url": "https://www.legislation.govt.nz/act/public/2003/0048/latest/", "parts_by_id": [], "parts_by_text": [], "section_flags": [ # Title-protection sections: "no person may claim to be registered..." # The HPCA Act prohibits unauthorised use of regulated titles like # "physiotherapist", "chiropractor", "osteopath", etc. # ss 7-10 are typically the title-use cluster; tag s7 as anchor. { "match": re.compile(r"^####\s*7\b[^\n]*", re.MULTILINE), "tags": "title-use, registration, scope-of-practice", }, ], }, ] GUIDANCE_SOURCES = [ { "name": "HDC Code of Health and Disability Services Consumers' Rights", # NOTE: First attempt used the printable PDF # (hdc.org.nz/media/550hs5ih/code-of-rights_online_5-sept-2022.pdf) but its # multi-column glossy brochure layout caused markitdown to flatten everything # into 10,000-char single lines (Right 5 / Right 6 / Right 7 jumbled together). # The HDC HTML page is on SilverStripe with semantic markup and parses cleanly. "url": "https://www.hdc.org.nz/your-rights/about-the-code/code-of-health-and-disability-services-consumers-rights/", "filename": "hdc-code-of-rights.html", "format": "html", "section_title": "HDC Code of Health and Disability Services Consumers' Rights", }, # ACC sources (Becki v3 spec): Provider Agreement template + Code of ACC # Claimants' Rights, as the starting pair. Allied-health-specific provider # standards are deferred to v2 (see docs/watchlist.md). { "name": "ACC — Contract for Services Standard Terms and Conditions (Provider Agreement template)", "url": "https://www.acc.co.nz/assets/contracts/health-contract-terms-conditions.pdf", "filename": "acc-health-contract-standard-terms-conditions.pdf", "format": "pdf", "section_title": "ACC — Contract for Services: Standard Terms and Conditions", }, { "name": "Code of ACC Claimants' Rights", "url": "https://www.acc.co.nz/assets/im-injured/730eea8693/claimant-rights.pdf", "filename": "acc-code-of-claimants-rights.pdf", "format": "pdf", "section_title": "Code of ACC Claimants' Rights", }, ] def fetch_legislation(source: dict) -> str: """Download (or use cached) HTML and convert to markdown with per-section URLs.""" print(f"\n→ {source['name']}") path = download_html(source["url"], source["filename"]) with open(path, "r", encoding="utf-8") as f: soup = BeautifulSoup(f, "html.parser") parts_md = [] for part_id in source.get("parts_by_id", []): part = extract_part(soup, part_id) if part: md = convert_part(part, base_url=source["base_url"]) parts_md.append(md) print(f" ✓ Part by id={part_id} ({len(md)} chars)") else: print(f" ⚠ Part id={part_id} not found") for text_match in source.get("parts_by_text", []): part = next( (p for p in soup.find_all("div", class_="part") if p.find("h2", class_="part") and text_match.lower() in p.find("h2", class_="part").get_text().lower()), None, ) if part: md = convert_part(part, base_url=source["base_url"]) parts_md.append(md) print(f" ✓ Part by text={text_match!r} ({len(md)} chars)") continue subpart = extract_subpart_by_content(soup, text_match) if subpart: md = convert_subpart(subpart, base_url=source["base_url"]) parts_md.append(md) print(f" ✓ Subpart by text={text_match!r} ({len(md)} chars)") else: print(f" ⚠ No Part/Subpart matching text={text_match!r}") if not source.get("parts_by_id") and not source.get("parts_by_text"): for part in soup.find_all("div", class_="part"): md = convert_part(part, base_url=source["base_url"]) parts_md.append(md) print(f" ✓ All Parts ({len(parts_md)} parts, " f"{sum(len(m) for m in parts_md)} chars)") merged = "\n\n".join(parts_md) # Clean PDF/HTML extraction artefacts before applying section flags merged, _clean_stats = clean_corpus_artifacts(merged) print(format_stats(_clean_stats, label=source["name"])) for flag in source.get("section_flags", []): before = merged.count(flag["tags"]) merged = flag["match"].sub( lambda m: f"{m.group(0)}\n\ntags: {flag['tags']}", merged, ) applied = merged.count(flag["tags"]) - before print(f" ✓ Applied flag {flag['tags']!r}: {applied} match(es)") # Demote headings so legislation Parts/Subparts/sections nest under the # source-level H2 wrapper added in build() — avoids cross-Act H2 collisions # like Privacy Act + UEMA both having "Part 1 Preliminary provisions". merged = demote_headings(merged) return merged def fetch_guidance(source: dict) -> str: """Download a guidance document (HTML or PDF) and convert to markdown.""" print(f"\n→ {source['name']}") cache_path = SOURCES_RAW / source["filename"] if not cache_path.exists(): import urllib.request req = urllib.request.Request( source["url"], headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"}, ) try: with urllib.request.urlopen(req) as resp: data = resp.read() cache_path.write_bytes(data) print(f" Downloaded: {cache_path.name} ({len(data) / 1024:.0f} KB)") except Exception as e: print(f" ⚠ Download failed: {e}") return "" else: print(f" Using cached: {cache_path.name}") body = extract_to_markdown(cache_path, format_hint=source.get("format")).strip() # Clean PDF/HTML extraction artefacts (page numbers, headers, chrome, control chars) body, _clean_stats = clean_corpus_artifacts(body) print(format_stats(_clean_stats, label=source["name"])) # Demote body headings so they nest under the source-level H2 wrapper we # add below (avoids cross-source collisions like multiple "## Introduction"). body = demote_headings(body) return ( f"\n## {source['section_title']}\n\n" f"Source: {source['url']}\n\n" f"{body}\n" ) def build(): print("Building practitioner_regulation compilation\n") legislation_blocks = [] for src in LEGISLATION_SOURCES: block = fetch_legislation(src) if block: legislation_blocks.append( f"\n## {src['name']}\n\nSource: {src['base_url']}\n\n{block}" ) guidance_blocks = [] for src in GUIDANCE_SOURCES: block = fetch_guidance(src) if block: guidance_blocks.append(block) body = "\n\n".join(legislation_blocks + guidance_blocks) compilation = f"""# Practitioner Regulation — NZ Healthcare Marketing Regulation Source: https://www.legislation.govt.nz/act/public/2003/0048/latest/ This compilation covers the legal framework that defines who can call themselves a health practitioner, what scopes of practice mean for advertising claims, and the consumer-rights framework that overlays marketing conduct. The HPCA Act is foundational for "can I call myself X?" questions; the HDC Code of Rights (especially Right 6 right to information and Right 7 informed consent) is routinely cited in advertising complaints; ACC provider obligations add a contractual layer for the chunk of the audience that is ACC-registered. Frequently-cited provisions are flagged with `tags:` metadata for retrieval-time surfacing. {body} """ DOMAIN_FILE.write_text(compilation, encoding="utf-8") lines = compilation.count("\n") + 1 size_kb = len(compilation.encode("utf-8")) / 1024 print(f"\n✅ Wrote {DOMAIN_FILE} ({lines} lines, {size_kb:.1f} KB)") if __name__ == "__main__": build()