Spaces:
Sleeping
Sleeping
| """ | |
| Build the `practitioner_regulation` corpus domain. | |
| Combines the rules that govern who can call themselves what, what scope of | |
| practice means, and the consumer-rights framework that overlays advertising | |
| compliance: | |
| - Health Practitioners Competence Assurance Act 2003 (HPCA Act) — defines | |
| health practitioner, scopes of practice, restricted activities, title use. | |
| Foundation for "can I call myself X?" questions. | |
| - HDC Code of Health and Disability Services Consumers' Rights — Right 6 | |
| (right to information) and Right 7 (informed consent) routinely cited in | |
| advertising complaints; HDC has run cases on misleading clinic websites. | |
| - ACC provider responsibilities — for the chunk of the audience (chiros, | |
| osteos, physios, acupuncturists) who are commonly ACC-registered, ACC | |
| contracts add a marketing-conduct layer. | |
| Pattern adapted from `build_medicines_and_supplements_compilation.py`. | |
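| Note on ACC: Becki flagged that ACC's provider-facing material is less | |
| standardised than the legislation/code documents. The source list in this | |
| script (see GUIDANCE_SOURCES) uses the Contract for Services Standard Terms | |
| and Conditions plus the Code of ACC Claimants' Rights as the starting pair. | |
| The Understanding Your Responsibilities hub page and the Working Together | |
| under the Cost of Treatment Regulations handbook, originally listed for v1, | |
| are not fetched here. More ACC material can be added once we know what | |
| queries the audience actually asks. | |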
| """ | |
| from __future__ import annotations | |
| import os | |
| import re | |
| import sys | |
| from pathlib import Path | |
| sys.path.insert(0, str(Path(__file__).resolve().parent)) | |
| from bs4 import BeautifulSoup # noqa: E402 | |
| from convert_legislation_html import ( # noqa: E402 | |
| download_html, | |
| extract_part, | |
| extract_subpart_by_content, | |
| convert_part, | |
| convert_subpart, | |
| ) | |
| from clean_artifacts import clean_corpus_artifacts, format_stats # noqa: E402 | |
| from extract_pdf import extract_to_markdown, demote_headings # noqa: E402 | |
# Filesystem layout, resolved relative to this script: the project root is the
# directory one level above the folder that contains this file.
PROJECT_ROOT = Path(__file__).resolve().parents[1]

# Output location for compiled corpus markdown.
CORPUS_DIR = PROJECT_ROOT.joinpath("corpus")
# Cache location for raw downloaded source documents.
SOURCES_RAW = PROJECT_ROOT.joinpath("sources", "raw")
# The single markdown file this script produces.
DOMAIN_FILE = CORPUS_DIR.joinpath("practitioner-regulation.md")

# Both directories are created at import time so later reads/writes can assume
# they exist.
SOURCES_RAW.mkdir(parents=True, exist_ok=True)
CORPUS_DIR.mkdir(exist_ok=True)
# Heading pattern used to tag the title-protection anchor section of the HPCA
# Act. It matches a level-4 markdown heading whose section number is exactly 7
# (the word boundary keeps "70" or "7A" from matching) and consumes the rest of
# that heading line.
_HPCA_SECTION_7_HEADING = re.compile(r"^####\s*7\b[^\n]*", re.MULTILINE)

# Legislation to pull from legislation.govt.nz. Each entry carries:
#   name           - display name, also used as the source-level heading
#   url            - full-Act HTML page to download
#   filename       - cache filename for the downloaded HTML
#   base_url       - base passed to the converters for per-section links
#   parts_by_id    - Part ids to extract (empty together with parts_by_text
#                    means "take every Part")
#   parts_by_text  - Part/Subpart heading text fragments to extract
#   section_flags  - regex + tags pairs; a "tags:" line is appended after
#                    every heading the regex matches
LEGISLATION_SOURCES = [
    {
        "name": "Health Practitioners Competence Assurance Act 2003",
        "url": "https://www.legislation.govt.nz/act/public/2003/0048/latest/whole.html",
        "filename": "hpca-act-2003.html",
        "base_url": "https://www.legislation.govt.nz/act/public/2003/0048/latest/",
        "parts_by_id": [],
        "parts_by_text": [],
        "section_flags": [
            # Title protection ("no person may claim to be registered..."):
            # the HPCA Act prohibits unauthorised use of regulated titles such
            # as "physiotherapist", "chiropractor", "osteopath". ss 7-10 are
            # typically the title-use cluster, so s7 is tagged as the anchor.
            {
                "match": _HPCA_SECTION_7_HEADING,
                "tags": "title-use, registration, scope-of-practice",
            },
        ],
    },
]
# Non-legislation guidance documents. Each entry carries:
#   name           - display name printed while fetching
#   url            - download location (also emitted as the "Source:" line)
#   filename       - cache filename under SOURCES_RAW
#   format         - hint passed to the extractor ("html" or "pdf")
#   section_title  - heading used for this source in the compiled markdown
GUIDANCE_SOURCES = [
    # HDC Code of Rights.
    # NOTE: the first attempt used the printable PDF
    # (hdc.org.nz/media/550hs5ih/code-of-rights_online_5-sept-2022.pdf), but its
    # multi-column glossy brochure layout made markitdown flatten everything
    # into 10,000-char single lines (Right 5 / Right 6 / Right 7 jumbled
    # together). The HDC HTML page is on SilverStripe with semantic markup and
    # parses cleanly, so the HTML page is used instead.
    dict(
        name="HDC Code of Health and Disability Services Consumers' Rights",
        url="https://www.hdc.org.nz/your-rights/about-the-code/code-of-health-and-disability-services-consumers-rights/",
        filename="hdc-code-of-rights.html",
        format="html",
        section_title="HDC Code of Health and Disability Services Consumers' Rights",
    ),
    # ACC sources (Becki v3 spec): Provider Agreement template + Code of ACC
    # Claimants' Rights as the starting pair. Allied-health-specific provider
    # standards are deferred to v2 (see docs/watchlist.md).
    dict(
        name="ACC — Contract for Services Standard Terms and Conditions (Provider Agreement template)",
        url="https://www.acc.co.nz/assets/contracts/health-contract-terms-conditions.pdf",
        filename="acc-health-contract-standard-terms-conditions.pdf",
        format="pdf",
        section_title="ACC — Contract for Services: Standard Terms and Conditions",
    ),
    dict(
        name="Code of ACC Claimants' Rights",
        url="https://www.acc.co.nz/assets/im-injured/730eea8693/claimant-rights.pdf",
        filename="acc-code-of-claimants-rights.pdf",
        format="pdf",
        section_title="Code of ACC Claimants' Rights",
    ),
]
def _find_part_by_heading(soup, text_match: str):
    """Return the first ``div.part`` whose ``h2.part`` heading contains *text_match*.

    The comparison is case-insensitive. Returns ``None`` when no Part matches.
    """
    needle = text_match.lower()
    for part in soup.find_all("div", class_="part"):
        heading = part.find("h2", class_="part")
        if heading is not None and needle in heading.get_text().lower():
            return part
    return None


def fetch_legislation(source: dict) -> str:
    """Fetch one Act's HTML and convert the selected Parts to markdown.

    Args:
        source: One entry of ``LEGISLATION_SOURCES``. Keys read here:
            ``name``, ``url``, ``filename``, ``base_url`` and, optionally,
            ``parts_by_id``, ``parts_by_text`` and ``section_flags``.

    Returns:
        The converted markdown with extraction artefacts cleaned, ``tags:``
        lines appended after every heading matched by a section flag, and
        headings demoted so they nest under the source-level heading that
        ``build()`` adds. The string is empty when nothing was extracted.

    Selection rules:
        * every id in ``parts_by_id`` is looked up with ``extract_part``;
        * every string in ``parts_by_text`` is matched first against Part
          headings, then handed to ``extract_subpart_by_content``;
        * when both lists are empty, every ``div.part`` is converted.
    """
    print(f"\n→ {source['name']}")
    path = download_html(source["url"], source["filename"])
    with open(path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    base_url = source["base_url"]
    # ``or []`` also tolerates an explicit None in the config.
    part_ids = source.get("parts_by_id") or []
    part_texts = source.get("parts_by_text") or []
    parts_md = []

    for part_id in part_ids:
        part = extract_part(soup, part_id)
        if part:
            md = convert_part(part, base_url=base_url)
            parts_md.append(md)
            print(f" ✓ Part by id={part_id} ({len(md)} chars)")
        else:
            print(f" ⚠ Part id={part_id} not found")

    for text_match in part_texts:
        part = _find_part_by_heading(soup, text_match)
        if part:
            md = convert_part(part, base_url=base_url)
            parts_md.append(md)
            print(f" ✓ Part by text={text_match!r} ({len(md)} chars)")
            continue
        # No Part heading matched; fall back to a Subpart lookup.
        subpart = extract_subpart_by_content(soup, text_match)
        if subpart:
            md = convert_subpart(subpart, base_url=base_url)
            parts_md.append(md)
            print(f" ✓ Subpart by text={text_match!r} ({len(md)} chars)")
        else:
            print(f" ⚠ No Part/Subpart matching text={text_match!r}")

    if not part_ids and not part_texts:
        # No explicit selection: take the whole Act.
        parts_md.extend(
            convert_part(part, base_url=base_url)
            for part in soup.find_all("div", class_="part")
        )
        print(f" ✓ All Parts ({len(parts_md)} parts, "
              f"{sum(len(m) for m in parts_md)} chars)")

    if not parts_md:
        print(f" ⚠ No content extracted for {source['name']}")

    merged = "\n\n".join(parts_md)

    # Clean PDF/HTML extraction artefacts before applying section flags.
    merged, clean_stats = clean_corpus_artifacts(merged)
    print(format_stats(clean_stats, label=source["name"]))

    for flag in source.get("section_flags") or []:
        tags = flag["tags"]
        # A callable replacement keeps backslashes in the matched heading
        # literal; ``subn`` reports how many headings were tagged. ``tags`` is
        # bound as a default so the lambda does not depend on the loop variable.
        merged, applied = flag["match"].subn(
            lambda m, tags=tags: f"{m.group(0)}\n\ntags: {tags}",
            merged,
        )
        # A flag that matched nothing usually means the heading pattern has
        # drifted from the converter output, so make it stand out in the log.
        status = "✓" if applied else "⚠"
        print(f" {status} Applied flag {tags!r}: {applied} match(es)")

    # Demote headings so legislation Parts/Subparts/sections nest under the
    # source-level H2 wrapper added in build() — avoids cross-Act H2 collisions
    # such as two Acts both having "Part 1 Preliminary provisions".
    merged = demote_headings(merged)
    return merged
def fetch_guidance(source: dict, timeout: float = 30.0) -> str:
    """Download a guidance document (HTML or PDF) and convert it to markdown.

    Args:
        source: One entry of ``GUIDANCE_SOURCES``. Keys read here: ``name``,
            ``url``, ``filename``, ``section_title`` and, optionally,
            ``format`` (passed to the extractor as ``format_hint``).
        timeout: Seconds to wait on the network before giving up. Without a
            timeout a stalled server would block the whole build.

    Returns:
        A markdown section made of a level-2 heading, a ``Source:`` line and
        the cleaned, heading-demoted body. Returns ``""`` when the download
        fails or when extraction yields no text, so callers can skip the
        source with a simple truthiness check.
    """
    import urllib.request

    print(f"\n→ {source['name']}")
    cache_path = SOURCES_RAW / source["filename"]

    if cache_path.exists():
        print(f" Using cached: {cache_path.name}")
    else:
        req = urllib.request.Request(
            source["url"],
            headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"},
        )
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                data = resp.read()
            if not data:
                # Never cache an empty payload: it would be reused forever.
                raise ValueError("empty response body")
            # Write to a temp file and rename, so an interrupted write cannot
            # leave a truncated file that later runs mistake for a valid cache.
            tmp_path = cache_path.with_name(cache_path.name + ".part")
            tmp_path.write_bytes(data)
            tmp_path.replace(cache_path)
            print(f" Downloaded: {cache_path.name} ({len(data) / 1024:.0f} KB)")
        except Exception as e:
            # Deliberately best-effort: one unreachable source must not abort
            # the whole compilation.
            print(f" ⚠ Download failed: {e}")
            return ""

    body = extract_to_markdown(cache_path, format_hint=source.get("format")).strip()

    # Clean PDF/HTML extraction artefacts (page numbers, headers, chrome, control chars)
    body, clean_stats = clean_corpus_artifacts(body)
    print(format_stats(clean_stats, label=source["name"]))

    if not body.strip():
        # A heading with nothing under it would still be truthy and slip past
        # the caller's emptiness check, so report the source as empty instead.
        print(f" ⚠ No text extracted from {cache_path.name}")
        return ""

    # Demote body headings so they nest under the source-level H2 wrapper
    # added below (avoids cross-source collisions like multiple "## Introduction").
    body = demote_headings(body)
    return (
        f"\n## {source['section_title']}\n\n"
        f"Source: {source['url']}\n\n"
        f"{body}\n"
    )
def build():
    """Compile every configured source into the practitioner-regulation corpus file.

    Fetches each entry of ``LEGISLATION_SOURCES`` and ``GUIDANCE_SOURCES``,
    skips sources that come back empty, wraps each legislation block in a
    level-2 heading plus a ``Source:`` line, and writes the result (preceded by
    a fixed title/introduction header) to ``DOMAIN_FILE`` as UTF-8.

    Returns:
        None. Progress and a final size summary are printed to stdout.
    """
    print("Building practitioner_regulation compilation\n")

    legislation_blocks = []
    for src in LEGISLATION_SOURCES:
        block = fetch_legislation(src)
        if block:
            legislation_blocks.append(
                f"\n## {src['name']}\n\nSource: {src['base_url']}\n\n{block}"
            )

    guidance_blocks = []
    for src in GUIDANCE_SOURCES:
        block = fetch_guidance(src)
        if block:
            guidance_blocks.append(block)

    if not legislation_blocks and not guidance_blocks:
        print("\n⚠ No source produced any content; writing header only")

    body = "\n\n".join(legislation_blocks + guidance_blocks)

    # Header paragraphs are separated by blank lines, matching the "\n\n"
    # separators used in the per-source wrappers; with single newlines Markdown
    # would fuse the Source line and both paragraphs into one paragraph.
    header = "\n\n".join([
        "# Practitioner Regulation — NZ Healthcare Marketing Regulation",
        "Source: https://www.legislation.govt.nz/act/public/2003/0048/latest/",
        "This compilation covers the legal framework that defines who can call themselves a health practitioner, what scopes of practice mean for advertising claims, and the consumer-rights framework that overlays marketing conduct. The HPCA Act is foundational for \"can I call myself X?\" questions; the HDC Code of Rights (especially Right 6 right to information and Right 7 informed consent) is routinely cited in advertising complaints; ACC provider obligations add a contractual layer for the chunk of the audience that is ACC-registered.",
        "Frequently-cited provisions are flagged with `tags:` metadata for retrieval-time surfacing.",
    ])
    # Every block starts with its own "\n", which supplies the blank line
    # between the header and the first source heading.
    compilation = f"{header}\n{body}\n"

    DOMAIN_FILE.write_text(compilation, encoding="utf-8")

    # The text always ends with a newline, so the newline count is the line
    # count (adding one would over-report by a line).
    lines = compilation.count("\n")
    size_kb = len(compilation.encode("utf-8")) / 1024
    print(f"\n✅ Wrote {DOMAIN_FILE} ({lines} lines, {size_kb:.1f} KB)")


if __name__ == "__main__":
    build()