File size: 10,427 Bytes
bad8b6c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
"""
Build the `practitioner_regulation` corpus domain.

Combines the rules that govern who can call themselves what, what scope of
practice means, and the consumer-rights framework that overlays advertising
compliance:

- Health Practitioners Competence Assurance Act 2003 (HPCA Act) — defines
  health practitioner, scopes of practice, restricted activities, title use.
  Foundation for "can I call myself X?" questions.
- HDC Code of Health and Disability Services Consumers' Rights β€” Right 6
  (right to information) and Right 7 (informed consent) routinely cited in
  advertising complaints; HDC has run cases on misleading clinic websites.
- ACC provider responsibilities β€” for the chunk of the audience (chiros,
  osteos, physios, acupuncturists) who are commonly ACC-registered, ACC
  contracts add a marketing-conduct layer.

Pattern adapted from `build_medicines_and_supplements_compilation.py`.

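Note on ACC: Becki flagged that ACC's provider-facing material is less
standardised than the legislation/code documents. The v1 plan named the
Understanding Your Responsibilities hub page plus the Working Together
under the Cost of Treatment Regulations handbook; the ACC sources actually
configured in GUIDANCE_SOURCES below are the Contract for Services Standard
Terms and Conditions and the Code of ACC Claimants' Rights. v2 can add more
once we know what queries the audience actually asks.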
"""

from __future__ import annotations

import os
import re
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parent))

from bs4 import BeautifulSoup  # noqa: E402
from convert_legislation_html import (  # noqa: E402
    download_html,
    extract_part,
    extract_subpart_by_content,
    convert_part,
    convert_subpart,
)
from clean_artifacts import clean_corpus_artifacts, format_stats  # noqa: E402
from extract_pdf import extract_to_markdown, demote_headings  # noqa: E402

# Project root: the parent of the directory that contains this script.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
# Directory that receives the compiled corpus markdown.
CORPUS_DIR = PROJECT_ROOT / "corpus"
# Directory where fetch_guidance() caches downloaded source documents.
SOURCES_RAW = PROJECT_ROOT / "sources" / "raw"

# NOTE: both directories are created as an import-time side effect, so simply
# importing this module touches the filesystem.
CORPUS_DIR.mkdir(exist_ok=True)
SOURCES_RAW.mkdir(parents=True, exist_ok=True)

# The single markdown file build() writes for this domain.
DOMAIN_FILE = CORPUS_DIR / "practitioner-regulation.md"

# Legislation sources, consumed one entry at a time by fetch_legislation():
#   name          - label used in log output and as the source-level heading.
#   url, filename - passed to download_html() (download URL, cache filename).
#   base_url      - passed to the converters as base_url; also written as the
#                   "Source:" line by build().
#   parts_by_id / parts_by_text
#                 - optional Part selectors. When both are empty, every
#                   <div class="part"> in the page is converted.
#   section_flags - list of {"match": compiled regex, "tags": str}. Every regex
#                   match gets a "tags: ..." line appended beneath it.
LEGISLATION_SOURCES = [
    {
        "name": "Health Practitioners Competence Assurance Act 2003",
        "url": "https://www.legislation.govt.nz/act/public/2003/0048/latest/whole.html",
        "filename": "hpca-act-2003.html",
        "base_url": "https://www.legislation.govt.nz/act/public/2003/0048/latest/",
        "parts_by_id": [],
        "parts_by_text": [],
        "section_flags": [
            # Title-protection sections: "no person may claim to be registered..."
            # The HPCA Act prohibits unauthorised use of regulated titles like
            # "physiotherapist", "chiropractor", "osteopath", etc.
            # ss 7-10 are typically the title-use cluster; tag s7 as anchor.
            #
            # The pattern is applied BEFORE demote_headings() runs, so it must
            # match the heading level the converters emit (presumably "####"
            # for sections - confirm against convert_part output). The \b
            # after "7" keeps it from matching a heading numbered "7A".
            {
                "match": re.compile(r"^####\s*7\b[^\n]*", re.MULTILINE),
                "tags": "title-use, registration, scope-of-practice",
            },
        ],
    },
]

# Guidance documents, consumed one entry at a time by fetch_guidance():
#   name          - label used in log output.
#   url           - download URL; also written as the "Source:" line.
#   filename      - cache filename under SOURCES_RAW.
#   format        - passed to extract_to_markdown() as format_hint.
#   section_title - written verbatim into the corpus as the H2 heading, so it
#                   must contain real Unicode (no mis-decoded characters).
GUIDANCE_SOURCES = [
    {
        "name": "HDC Code of Health and Disability Services Consumers' Rights",
        # NOTE: First attempt used the printable PDF
        # (hdc.org.nz/media/550hs5ih/code-of-rights_online_5-sept-2022.pdf) but its
        # multi-column glossy brochure layout caused markitdown to flatten everything
        # into 10,000-char single lines (Right 5 / Right 6 / Right 7 jumbled together).
        # The HDC HTML page is on SilverStripe with semantic markup and parses cleanly.
        "url": "https://www.hdc.org.nz/your-rights/about-the-code/code-of-health-and-disability-services-consumers-rights/",
        "filename": "hdc-code-of-rights.html",
        "format": "html",
        "section_title": "HDC Code of Health and Disability Services Consumers' Rights",
    },
    # ACC sources (Becki v3 spec): Provider Agreement template + Code of ACC
    # Claimants' Rights, as the starting pair. Allied-health-specific provider
    # standards are deferred to v2 (see docs/watchlist.md).
    {
        "name": "ACC — Contract for Services Standard Terms and Conditions (Provider Agreement template)",
        "url": "https://www.acc.co.nz/assets/contracts/health-contract-terms-conditions.pdf",
        "filename": "acc-health-contract-standard-terms-conditions.pdf",
        "format": "pdf",
        "section_title": "ACC — Contract for Services: Standard Terms and Conditions",
    },
    {
        "name": "Code of ACC Claimants' Rights",
        "url": "https://www.acc.co.nz/assets/im-injured/730eea8693/claimant-rights.pdf",
        "filename": "acc-code-of-claimants-rights.pdf",
        "format": "pdf",
        "section_title": "Code of ACC Claimants' Rights",
    },
]


def _find_part_by_heading(soup, text_match: str):
    """Return the first ``div.part`` whose ``h2.part`` heading contains *text_match*.

    The comparison is case-insensitive. Returns ``None`` when no Part matches.
    """
    needle = text_match.lower()
    for candidate in soup.find_all("div", class_="part"):
        heading = candidate.find("h2", class_="part")  # looked up once per Part
        if heading and needle in heading.get_text().lower():
            return candidate
    return None


def fetch_legislation(source: dict) -> str:
    """Download (or use cached) HTML and convert to markdown with per-section URLs.

    Args:
        source: One ``LEGISLATION_SOURCES`` entry. Reads ``name``, ``url``,
            ``filename`` and ``base_url``; optionally ``parts_by_id``,
            ``parts_by_text`` and ``section_flags``.

    Returns:
        Markdown for the selected Parts (all Parts when no selector is given),
        joined by blank lines, cleaned with ``clean_corpus_artifacts``, tagged
        per ``section_flags`` and finally passed through ``demote_headings``.
    """
    print(f"\n→ {source['name']}")
    path = download_html(source["url"], source["filename"])

    with open(path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    base_url = source["base_url"]
    part_ids = source.get("parts_by_id", [])
    part_texts = source.get("parts_by_text", [])
    parts_md = []

    for part_id in part_ids:
        part = extract_part(soup, part_id)
        if part:
            md = convert_part(part, base_url=base_url)
            parts_md.append(md)
            print(f"  ✓ Part by id={part_id} ({len(md)} chars)")
        else:
            print(f"  ⚠ Part id={part_id} not found")

    for text_match in part_texts:
        # Prefer a whole Part whose heading matches; fall back to a Subpart.
        part = _find_part_by_heading(soup, text_match)
        if part:
            md = convert_part(part, base_url=base_url)
            parts_md.append(md)
            print(f"  ✓ Part by text={text_match!r} ({len(md)} chars)")
            continue
        subpart = extract_subpart_by_content(soup, text_match)
        if subpart:
            md = convert_subpart(subpart, base_url=base_url)
            parts_md.append(md)
            print(f"  ✓ Subpart by text={text_match!r} ({len(md)} chars)")
        else:
            print(f"  ⚠ No Part/Subpart matching text={text_match!r}")

    if not part_ids and not part_texts:
        # No selectors configured: take every Part in the document.
        parts_md.extend(
            convert_part(part, base_url=base_url)
            for part in soup.find_all("div", class_="part")
        )
        if parts_md:
            print(f"  ✓ All Parts ({len(parts_md)} parts, "
                  f"{sum(len(m) for m in parts_md)} chars)")
        else:
            # Zero Parts usually means the page markup changed; do not report
            # it as a success.
            print(f"  ⚠ No Parts found in {source['filename']}")

    merged = "\n\n".join(parts_md)

    # Clean PDF/HTML extraction artefacts before applying section flags
    merged, _clean_stats = clean_corpus_artifacts(merged)
    print(format_stats(_clean_stats, label=source["name"]))

    for flag in source.get("section_flags", []):
        tags = flag["tags"]
        # A callable replacement keeps backslashes in the matched heading or
        # the tag text literal; subn reports the exact substitution count.
        merged, applied = flag["match"].subn(
            lambda m, tags=tags: f"{m.group(0)}\n\ntags: {tags}",
            merged,
        )
        marker = "✓" if applied else "⚠"  # zero matches suggests pattern drift
        print(f"  {marker} Applied flag {tags!r}: {applied} match(es)")

    # Demote headings so legislation Parts/Subparts/sections nest under the
    # source-level H2 wrapper added in build() — avoids cross-Act H2 collisions
    # like Privacy Act + UEMA both having "Part 1 Preliminary provisions".
    merged = demote_headings(merged)

    return merged


def fetch_guidance(source: dict, *, timeout: float = 60.0) -> str:
    """Download a guidance document (HTML or PDF) and convert to markdown.

    Args:
        source: One ``GUIDANCE_SOURCES`` entry. Reads ``name``, ``url``,
            ``filename`` and ``section_title``; optionally ``format``, which
            is forwarded to ``extract_to_markdown`` as ``format_hint``.
        timeout: Seconds allowed for each blocking network operation of the
            download, so a stalled server cannot hang the build.

    Returns:
        A markdown block made of an H2 heading, a ``Source:`` line and the
        converted body. Returns ``""`` when the download fails or when the
        extraction yields no text, so callers can skip the source.
    """
    print(f"\n→ {source['name']}")

    cache_path = SOURCES_RAW / source["filename"]
    if cache_path.exists():
        print(f"  Using cached: {cache_path.name}")
    else:
        import urllib.request

        req = urllib.request.Request(
            source["url"],
            headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"},
        )
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                data = resp.read()
            if not data:
                # Never cache an empty file: it would be reused on every run.
                raise ValueError("empty response body")
            # Write to a sibling temp file and rename, so an interrupted write
            # can never leave a truncated file that later counts as "cached".
            tmp_path = cache_path.with_name(cache_path.name + ".part")
            tmp_path.write_bytes(data)
            tmp_path.replace(cache_path)
            print(f"  Downloaded: {cache_path.name} ({len(data) / 1024:.0f} KB)")
        except Exception as e:
            # Deliberate best-effort: one unreachable source must not abort
            # the whole build.
            print(f"  ⚠ Download failed: {e}")
            return ""

    body = extract_to_markdown(cache_path, format_hint=source.get("format")).strip()

    # Clean PDF/HTML extraction artefacts (page numbers, headers, chrome, control chars)
    body, _clean_stats = clean_corpus_artifacts(body)
    print(format_stats(_clean_stats, label=source["name"]))

    if not body.strip():
        # A header with no content would be written to the corpus as an empty
        # section; report it and let the caller skip this source instead.
        print(f"  ⚠ No text extracted from {cache_path.name}")
        return ""

    # Demote body headings so they nest under the source-level H2 wrapper we
    # add below (avoids cross-source collisions like multiple "## Introduction").
    body = demote_headings(body)

    return (
        f"\n## {source['section_title']}\n\n"
        f"Source: {source['url']}\n\n"
        f"{body}\n"
    )


def build() -> None:
    """Assemble the practitioner-regulation corpus file.

    Converts every entry of ``LEGISLATION_SOURCES`` and ``GUIDANCE_SOURCES``,
    skips sources that produced nothing, joins the remaining blocks beneath a
    fixed preamble and writes the result to ``DOMAIN_FILE`` as UTF-8.
    """
    print("Building practitioner_regulation compilation\n")

    legislation_blocks = []
    for src in LEGISLATION_SOURCES:
        block = fetch_legislation(src)
        if block:
            # fetch_legislation returns only the body; add the source-level
            # H2 wrapper and Source line here.
            legislation_blocks.append(
                f"\n## {src['name']}\n\nSource: {src['base_url']}\n\n{block}"
            )

    guidance_blocks = []
    for src in GUIDANCE_SOURCES:
        block = fetch_guidance(src)  # already carries its own H2 wrapper
        if block:
            guidance_blocks.append(block)

    body = "\n\n".join(legislation_blocks + guidance_blocks)
    if not body:
        # Every source failed or was empty; make that visible in the log.
        print("\n  ⚠ No sources produced content; writing preamble only")

    compilation = f"""# Practitioner Regulation — NZ Healthcare Marketing Regulation

Source: https://www.legislation.govt.nz/act/public/2003/0048/latest/

This compilation covers the legal framework that defines who can call themselves a health practitioner, what scopes of practice mean for advertising claims, and the consumer-rights framework that overlays marketing conduct. The HPCA Act is foundational for "can I call myself X?" questions; the HDC Code of Rights (especially Right 6 right to information and Right 7 informed consent) is routinely cited in advertising complaints; ACC provider obligations add a contractual layer for the chunk of the audience that is ACC-registered.

Frequently-cited provisions are flagged with `tags:` metadata for retrieval-time surfacing.

{body}
"""

    DOMAIN_FILE.write_text(compilation, encoding="utf-8")
    # The template always ends with a newline, so the number of newline
    # characters is the line count (adding 1 would count a phantom last line).
    lines = compilation.count("\n")
    size_kb = len(compilation.encode("utf-8")) / 1024
    print(f"\n✅ Wrote {DOMAIN_FILE} ({lines} lines, {size_kb:.1f} KB)")


# Script entry point: run the build only when this file is executed directly,
# not when it is imported.
if __name__ == "__main__":
    build()