""" Build the `professional_codes` corpus domain. Council/board advertising standards for the audience's regulated professions: - Chiropractic Board — Advertising Policy 2025 - Osteopathic Council — Code of Conduct (Jan 2023, contains advertising provisions) - Physiotherapy Board — Code of Ethics and Professional Conduct (Code, Standards, Thresholds) - Chinese Medicine Council — Advertising Standard Guidance - Medical Council — Statement on Advertising (BENCHMARK ONLY — does NOT bind non-MD practitioners) The critical v2 design feature in this domain is `binds:` scope metadata. Each council document declares which practitioner classes it binds. The Medical Council statement is included as a benchmark but tagged `benchmark-only: true` so the retriever does not cite it as authoritative for chiropractors, naturopaths, etc. Becki's correction in v1 → v2 review: "the Medical Council statement is the strictest in the set, particularly on testimonials. For a chiropractor or naturopath user, citing it could give answers more conservative than their own regulator requires." Pattern adapted from `build_advertising_standards_compilation.py` (no LEGISLATION_SOURCES — all PDFs). """ from __future__ import annotations import os import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parent)) from clean_artifacts import clean_corpus_artifacts, format_stats # noqa: E402 from extract_pdf import extract_to_markdown, demote_headings # noqa: E402 PROJECT_ROOT = Path(__file__).resolve().parents[1] CORPUS_DIR = PROJECT_ROOT / "corpus" SOURCES_RAW = PROJECT_ROOT / "sources" / "raw" CORPUS_DIR.mkdir(exist_ok=True) SOURCES_RAW.mkdir(parents=True, exist_ok=True) DOMAIN_FILE = CORPUS_DIR / "professional-codes.md" GUIDANCE_SOURCES = [ { "name": "Chiropractic Board — Advertising Policy 2025", "url": "https://chiropracticboard.org.nz/assets/ElementalFiles/Policies/Advertising-Policy-2025-FINAL.pdf", "filename": "chiropractic-board-advertising-policy-2025.pdf", "format": "pdf", "section_title": "Chiropractic Board of NZ — Advertising Policy 2025", "metadata": { "binds": "registered chiropractors (NZ)", "issued_by": "Chiropractic Board of New Zealand (Te Poari Kaikorohiti o Aotearoa)", "applies_to": "advertising of chiropractic services and chiropractic care products", }, }, { "name": "Osteopathic Council — Code of Conduct for Osteopaths (Jan 2023)", "url": "https://www.osteopathiccouncil.org.nz/common/Uploaded%20files/Publications/OCNZ%20Code%20of%20Conduct%20Jan23.pdf", "filename": "osteopathic-council-code-of-conduct-jan-2023.pdf", "format": "pdf", "section_title": "Osteopathic Council of NZ — Code of Conduct (Jan 2023)", "metadata": { "binds": "registered osteopaths (NZ)", "issued_by": "Osteopathic Council of New Zealand", "applies_to": "professional conduct including advertising of osteopathic services", "notes": "the council's Code of Conduct includes advertising-specific provisions; for narrower advertising-only guidance see also OCNZ Medical Advertisement Policy 2016 (not currently included in v1)", }, }, { # The PDF is a 361-KB kitchen-sink compendium ("Standards framework") covering # clinical, cultural, and ethical standards — only ~30 KB is actually about # advertising. We slice out just the dedicated "Advertising standard" # subdocument (between "Advertising standard" and the next "Telehealth standard" # heading) to keep the corpus focused. "name": "Physiotherapy Board — Advertising standard (extracted from Standards framework)", "url": "https://www.physioboard.org.nz/sites/default/files/Physiotherapy-Board-Code-Standards-Thresholds.pdf", "filename": "physiotherapy-board-code-standards-thresholds.pdf", "format": "pdf", "section_title": "Physiotherapy Board of NZ — Advertising Standard", # Generic slicing: keep only content between start and end regex markers. # The Advertising standard appears twice in the PDF (once as full text, once # as a TOC entry); we anchor on a unique nearby string ("False, misleading or deceptive advertising # can also give rise to a breach of the Fair Trading Act 1986") to disambiguate, but # the simpler approach: slice from the FIRST occurrence of a heading-on-its-own-line # to the FIRST "Telehealth standard" that follows. # Slice between the H2 *headings* (not the bare-text TOC entries that also # appear earlier in the document). Docling preserves headings as `## Foo`, # so we require the heading prefix to disambiguate from TOC text. "slice_after": r"^## Advertising standard\s*$", "slice_until": r"^## Telehealth standard\s*$", "metadata": { "binds": "registered physiotherapists (NZ)", "issued_by": "Physiotherapy Board of New Zealand", "applies_to": "advertising of physiotherapy services; cross-references Fair Trading Act 1986, Consumer Guarantees Act 1993, ASA Codes, HPCAA s7/s8", "source_note": "extracted from the larger Physiotherapy Standards framework PDF — see source URL for full document", }, }, { "name": "Chinese Medicine Council — Advertising Standard Guidance", "url": "https://www.chinesemedicinecouncil.org.nz/common/Uploaded%20files/RegistrationsDocs/Standards%20Statement%20and%20Policies/Post%20feedback%20Advertising%20Standard%20Guidance%2023June23.pdf", "filename": "chinese-medicine-council-advertising-standard-jun-2023.pdf", "format": "pdf", "section_title": "Chinese Medicine Council of NZ — Advertising Standard Guidance", "metadata": { "binds": "registered Chinese medicine practitioners (NZ) — including acupuncturists, herbal medicine practitioners, tuina practitioners", "issued_by": "Chinese Medicine Council of New Zealand", "applies_to": "advertising of Chinese medicine services and products", "notes": "Becki flagged that AU practitioners cannot rely on traditional-use claims or 2003 WHO statement when advertising — verify whether NZ council takes the same position", }, }, { "name": "Medical Council — Statement on Advertising (BENCHMARK ONLY)", "url": "https://www.mcnz.org.nz/assets/standards/e93109ce92/Statement-on-advertising.pdf", "filename": "medical-council-statement-on-advertising.pdf", "format": "pdf", "section_title": "Medical Council of NZ — Statement on Advertising (benchmark only)", "metadata": { "binds": "registered medical practitioners (MDs) only", "benchmark-only": "true", "issued_by": "Medical Council of New Zealand", "applies_to": "advertising by registered doctors", "WARNING": "Becki: this statement is the STRICTEST in the set, particularly on testimonials. Do NOT cite as authoritative for chiropractors, osteopaths, physiotherapists, Chinese medicine practitioners, naturopaths, or other non-MD professions. Their own councils set the binding rules for them. Use this only as a comparative benchmark when explicitly asked to compare professions.", }, }, ] def fetch_guidance(source: dict) -> str: """Download a guidance PDF and convert to markdown, with binding metadata. Optional slicing: if ``slice_after`` and/or ``slice_until`` regex strings are set on the source, keep only the text BETWEEN those markers (inclusive of the start, exclusive of the end). Useful for extracting a single subdocument from a larger compendium PDF without losing the source-PDF's URL provenance. """ import re as _re print(f"\n→ {source['name']}") cache_path = SOURCES_RAW / source["filename"] if not cache_path.exists(): import urllib.request req = urllib.request.Request( source["url"], headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"}, ) try: with urllib.request.urlopen(req) as resp: data = resp.read() cache_path.write_bytes(data) print(f" Downloaded: {cache_path.name} ({len(data) / 1024:.0f} KB)") except Exception as e: print(f" ⚠ Download failed: {e}") return "" else: print(f" Using cached: {cache_path.name}") body = extract_to_markdown(cache_path, format_hint=source.get("format")).strip() # Optional slicing — keep only the substring between markers. slice_after = source.get("slice_after") slice_until = source.get("slice_until") if slice_after or slice_until: original_size = len(body) if slice_after: m = _re.search(slice_after, body, flags=_re.MULTILINE) if m: body = body[m.start():] else: print(f" ⚠ slice_after marker {slice_after!r} not found; keeping full text") if slice_until: m = _re.search(slice_until, body, flags=_re.MULTILINE) if m: body = body[:m.start()] else: print(f" ⚠ slice_until marker {slice_until!r} not found; keeping to end") body = body.strip() print(f" ✂ Sliced: {original_size:,} → {len(body):,} chars") # Clean PDF/HTML extraction artefacts (page numbers, headers, chrome, control chars) body, _clean_stats = clean_corpus_artifacts(body) print(format_stats(_clean_stats, label=source["name"])) # Demote body headings so they nest under the source-level H2 wrapper we # add below (avoids cross-source collisions like multiple "## Introduction"). body = demote_headings(body) meta = source.get("metadata", {}) meta_lines = [f"{k}: {v}" for k, v in meta.items()] meta_block = "\n".join(meta_lines) return ( f"\n## {source['section_title']}\n\n" f"Source: {source['url']}\n\n" f"{meta_block}\n\n" f"{body}\n" ) def build(): print("Building professional_codes compilation\n") guidance_blocks = [] for src in GUIDANCE_SOURCES: block = fetch_guidance(src) if block: guidance_blocks.append(block) body = "\n\n".join(guidance_blocks) compilation = f"""# Professional Codes — NZ Healthcare Marketing Regulation Source: https://www.health.govt.nz/our-work/regulation-health-and-disability-system/health-practitioners-competence-assurance-act This compilation covers the advertising-specific standards issued by the registration boards/councils for the regulated professions in scope: chiropractors, osteopaths, physiotherapists, and Chinese medicine practitioners. The Medical Council Statement on Advertising is also included **as a benchmark only** — it does NOT bind non-MD practitioners. **Critical retrieval rule:** each council document has a `binds:` metadata line declaring which practitioners it applies to. The Medical Council statement is the strictest in the set on testimonials and should not be cited as authoritative for non-MD professions; their own councils set the binding rules. Use the binding metadata to scope answers to the practitioner type the user is asking about. {body} """ DOMAIN_FILE.write_text(compilation, encoding="utf-8") lines = compilation.count("\n") + 1 size_kb = len(compilation.encode("utf-8")) / 1024 print(f"\n✅ Wrote {DOMAIN_FILE} ({lines} lines, {size_kb:.1f} KB)") # Section-size sanity check — flags imbalanced sections that suggest a # wrong-source choice (one PDF disproportionately larger than the others). # Learning from v1 bug: 4-of-5 council docs were 13–33 KB but Physiotherapy # was 361 KB because the chosen PDF was a kitchen-sink compendium. # # Granularity note: this check counts SOURCE-LEVEL H2 sections only — the # ones we explicitly inject in `fetch_guidance` (which always have a # `Source: ...` line immediately under the heading). Docling-extracted # body content also contains H2 headings, but those are sub-sections of a # single source PDF and shouldn't be measured at this granularity. import re as _re # Match: H2 heading + blank line + Source: URL line (source-level pattern) source_section_pattern = _re.compile( r"^## (?P