Spaces:

webmuppetnz
/

hmc-rag

Running

File size: 14,132 Bytes

bad8b6c

"""
Build the `professional_codes` corpus domain.

Council/board advertising standards for the audience's regulated professions:

- Chiropractic Board — Advertising Policy 2025
- Osteopathic Council — Code of Conduct (Jan 2023, contains advertising provisions)
- Physiotherapy Board — Advertising standard (sliced out of the Code, Standards, Thresholds framework PDF)
- Chinese Medicine Council — Advertising Standard Guidance
- Medical Council — Statement on Advertising (BENCHMARK ONLY — does NOT bind non-MD practitioners)

The critical v2 design feature in this domain is `binds:` scope metadata.
Each council document declares which practitioner classes it binds. The
Medical Council statement is included as a benchmark but tagged
`benchmark-only: true` so the retriever does not cite it as authoritative
for chiropractors, naturopaths, etc. Becki's correction in v1 → v2 review:
"the Medical Council statement is the strictest in the set, particularly
on testimonials. For a chiropractor or naturopath user, citing it could
give answers more conservative than their own regulator requires."

Pattern adapted from `build_advertising_standards_compilation.py` (no
LEGISLATION_SOURCES — all PDFs).
"""

from __future__ import annotations

import os
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parent))

from clean_artifacts import clean_corpus_artifacts, format_stats  # noqa: E402
from extract_pdf import extract_to_markdown, demote_headings  # noqa: E402

# Project root: one directory above the folder containing this script
# (parents[1] of the resolved script path).
PROJECT_ROOT = Path(__file__).resolve().parents[1]
# Directory that receives the compiled corpus markdown.
CORPUS_DIR = PROJECT_ROOT / "corpus"
# Download cache for the raw source PDFs (see fetch_guidance).
SOURCES_RAW = PROJECT_ROOT / "sources" / "raw"

# NOTE: these calls run at import time, so merely importing this module
# creates both directories as a side effect.
CORPUS_DIR.mkdir(exist_ok=True)
SOURCES_RAW.mkdir(parents=True, exist_ok=True)

# Output file for this domain; written by build().
DOMAIN_FILE = CORPUS_DIR / "professional-codes.md"

# One entry per council/board document. Keys read by fetch_guidance:
#   name           human-readable label used in console output
#   url            download location (also written to the corpus as `Source:`)
#   filename       cache file name under SOURCES_RAW
#   format         passed to extract_to_markdown as `format_hint`
#   section_title  text of the H2 heading that wraps the document in the corpus
#   slice_after /  optional regexes (searched with re.MULTILINE) that bound the
#   slice_until    part of the extracted text to keep
#   metadata       key/value pairs emitted verbatim as `key: value` lines under
#                  the heading; `binds` declares which practitioners the
#                  document applies to
GUIDANCE_SOURCES = [
    {
        "name": "Chiropractic Board — Advertising Policy 2025",
        "url": "https://chiropracticboard.org.nz/assets/ElementalFiles/Policies/Advertising-Policy-2025-FINAL.pdf",
        "filename": "chiropractic-board-advertising-policy-2025.pdf",
        "format": "pdf",
        "section_title": "Chiropractic Board of NZ — Advertising Policy 2025",
        "metadata": {
            "binds": "registered chiropractors (NZ)",
            "issued_by": "Chiropractic Board of New Zealand (Te Poari Kaikorohiti o Aotearoa)",
            "applies_to": "advertising of chiropractic services and chiropractic care products",
        },
    },
    {
        "name": "Osteopathic Council — Code of Conduct for Osteopaths (Jan 2023)",
        "url": "https://www.osteopathiccouncil.org.nz/common/Uploaded%20files/Publications/OCNZ%20Code%20of%20Conduct%20Jan23.pdf",
        "filename": "osteopathic-council-code-of-conduct-jan-2023.pdf",
        "format": "pdf",
        "section_title": "Osteopathic Council of NZ — Code of Conduct (Jan 2023)",
        "metadata": {
            "binds": "registered osteopaths (NZ)",
            "issued_by": "Osteopathic Council of New Zealand",
            "applies_to": "professional conduct including advertising of osteopathic services",
            "notes": "the council's Code of Conduct includes advertising-specific provisions; for narrower advertising-only guidance see also OCNZ Medical Advertisement Policy 2016 (not currently included in v1)",
        },
    },
    {
        # The PDF is a 361-KB kitchen-sink compendium ("Standards framework")
        # covering clinical, cultural, and ethical standards — only ~30 KB of
        # it is about advertising. To keep the corpus focused we keep just the
        # dedicated "Advertising standard" subdocument, i.e. the text from that
        # heading up to the "Telehealth standard" heading that follows it.
        "name": "Physiotherapy Board — Advertising standard (extracted from Standards framework)",
        "url": "https://www.physioboard.org.nz/sites/default/files/Physiotherapy-Board-Code-Standards-Thresholds.pdf",
        "filename": "physiotherapy-board-code-standards-thresholds.pdf",
        "format": "pdf",
        "section_title": "Physiotherapy Board of NZ — Advertising Standard",
        # Slice markers: fetch_guidance keeps the text from the first match of
        # `slice_after` up to (but excluding) the next match of `slice_until`.
        # "Advertising standard" also appears earlier in the PDF as bare text
        # in the table of contents, so both patterns require the `## ` H2
        # prefix on a line of its own. Per the original author's note, Docling
        # preserves real headings as `## Foo`, which is what distinguishes
        # them from the TOC entries.
        "slice_after": r"^## Advertising standard\s*$",
        "slice_until": r"^## Telehealth standard\s*$",
        "metadata": {
            "binds": "registered physiotherapists (NZ)",
            "issued_by": "Physiotherapy Board of New Zealand",
            "applies_to": "advertising of physiotherapy services; cross-references Fair Trading Act 1986, Consumer Guarantees Act 1993, ASA Codes, HPCAA s7/s8",
            "source_note": "extracted from the larger Physiotherapy Standards framework PDF — see source URL for full document",
        },
    },
    {
        "name": "Chinese Medicine Council — Advertising Standard Guidance",
        "url": "https://www.chinesemedicinecouncil.org.nz/common/Uploaded%20files/RegistrationsDocs/Standards%20Statement%20and%20Policies/Post%20feedback%20Advertising%20Standard%20Guidance%2023June23.pdf",
        "filename": "chinese-medicine-council-advertising-standard-jun-2023.pdf",
        "format": "pdf",
        "section_title": "Chinese Medicine Council of NZ — Advertising Standard Guidance",
        "metadata": {
            "binds": "registered Chinese medicine practitioners (NZ) — including acupuncturists, herbal medicine practitioners, tuina practitioners",
            "issued_by": "Chinese Medicine Council of New Zealand",
            "applies_to": "advertising of Chinese medicine services and products",
            "notes": "Becki flagged that AU practitioners cannot rely on traditional-use claims or 2003 WHO statement when advertising — verify whether NZ council takes the same position",
        },
    },
    {
        # Included for comparison only: the metadata below marks it as not
        # binding on the non-MD professions this corpus is aimed at.
        "name": "Medical Council — Statement on Advertising (BENCHMARK ONLY)",
        "url": "https://www.mcnz.org.nz/assets/standards/e93109ce92/Statement-on-advertising.pdf",
        "filename": "medical-council-statement-on-advertising.pdf",
        "format": "pdf",
        "section_title": "Medical Council of NZ — Statement on Advertising (benchmark only)",
        "metadata": {
            "binds": "registered medical practitioners (MDs) only",
            "benchmark-only": "true",
            "issued_by": "Medical Council of New Zealand",
            "applies_to": "advertising by registered doctors",
            "WARNING": "Becki: this statement is the STRICTEST in the set, particularly on testimonials. Do NOT cite as authoritative for chiropractors, osteopaths, physiotherapists, Chinese medicine practitioners, naturopaths, or other non-MD professions. Their own councils set the binding rules for them. Use this only as a comparative benchmark when explicitly asked to compare professions.",
        },
    },
]


def _download_to_cache(url: str, cache_path: Path, timeout: float = 60.0) -> int:
    """Download *url* into *cache_path* and return the number of bytes written.

    The payload is written to a sibling ``.part`` file and then renamed over
    *cache_path*, so an interrupted write can never leave a truncated file that
    later runs would mistake for a valid cache entry.

    Raises:
        ValueError: the server answered with an empty body.
        Exception: anything raised by ``urllib`` or the filesystem (network
            errors, HTTP errors, timeouts, write failures) propagates.
    """
    import urllib.request

    req = urllib.request.Request(
        url,
        headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"},
    )
    # Without a timeout a stalled server would hang the whole build.
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        data = resp.read()
    if not data:
        # Never cache an empty file: it would be reused on every later run.
        raise ValueError("server returned an empty response")

    tmp_path = cache_path.with_name(cache_path.name + ".part")
    try:
        tmp_path.write_bytes(data)
        tmp_path.replace(cache_path)
    finally:
        # After a successful rename the temp name is gone; this only removes
        # the leftover from a failed or interrupted write.
        if tmp_path.exists():
            tmp_path.unlink()
    return len(data)


def _slice_between(body: str, slice_after: "str | None", slice_until: "str | None") -> str:
    """Return the part of *body* bounded by the two optional regex markers.

    The result starts at the first ``slice_after`` match (inclusive) and stops
    at the first ``slice_until`` match found *after* that start marker
    (exclusive). Both patterns are searched with ``re.MULTILINE``. A marker
    that is not found prints a warning and leaves that side uncut. The result
    is stripped of surrounding whitespace.
    """
    import re as _re

    search_from = 0
    if slice_after:
        start = _re.compile(slice_after, flags=_re.MULTILINE).search(body)
        if start:
            body = body[start.start():]
            # Look for the end marker only past the start marker, so an end
            # pattern that also matches the start heading cannot produce an
            # empty slice.
            search_from = start.end() - start.start()
        else:
            print(f"  ⚠ slice_after marker {slice_after!r} not found; keeping full text")
    if slice_until:
        # Pattern.search(text, pos) still anchors `^` on real line starts.
        end = _re.compile(slice_until, flags=_re.MULTILINE).search(body, search_from)
        if end:
            body = body[:end.start()]
        else:
            print(f"  ⚠ slice_until marker {slice_until!r} not found; keeping to end")
    return body.strip()


def fetch_guidance(source: dict) -> str:
    """Download a guidance PDF and convert it to a markdown corpus section.

    The PDF is cached under ``SOURCES_RAW`` and only downloaded when no cached
    copy exists. The extracted text is optionally sliced, cleaned, and has its
    headings demoted before being wrapped in a source-level H2 section that
    carries the source URL and the ``metadata`` key/value lines.

    Optional slicing: if ``slice_after`` and/or ``slice_until`` regex strings
    are set on the source, only the text BETWEEN those markers is kept
    (inclusive of the start, exclusive of the end). This extracts a single
    subdocument from a larger compendium PDF while keeping the source PDF's URL
    as provenance.

    Args:
        source: one entry of ``GUIDANCE_SOURCES``. Requires ``name``, ``url``,
            ``filename`` and ``section_title``; ``format``, ``slice_after``,
            ``slice_until`` and ``metadata`` are optional.

    Returns:
        The markdown section, or ``""`` when the download failed (a warning is
        printed and the caller is expected to skip the source).
    """
    print(f"\n→ {source['name']}")

    cache_path = SOURCES_RAW / source["filename"]
    if cache_path.exists():
        print(f"  Using cached: {cache_path.name}")
    else:
        try:
            size = _download_to_cache(source["url"], cache_path)
        except Exception as e:
            # Deliberately broad: the build is best-effort per source, so any
            # download problem skips this document instead of aborting the run.
            print(f"  ⚠ Download failed: {e}")
            return ""
        print(f"  Downloaded: {cache_path.name} ({size / 1024:.0f} KB)")

    body = extract_to_markdown(cache_path, format_hint=source.get("format")).strip()

    # Optional slicing — keep only the substring between markers.
    slice_after = source.get("slice_after")
    slice_until = source.get("slice_until")
    if slice_after or slice_until:
        original_size = len(body)
        body = _slice_between(body, slice_after, slice_until)
        print(f"  ✂ Sliced: {original_size:,} → {len(body):,} chars")

    # Clean PDF/HTML extraction artefacts (page numbers, headers, chrome, control chars)
    body, _clean_stats = clean_corpus_artifacts(body)
    print(format_stats(_clean_stats, label=source["name"]))

    # Demote body headings so they nest under the source-level H2 wrapper we
    # add below (avoids cross-source collisions like multiple "## Introduction").
    body = demote_headings(body)

    # Metadata is emitted as plain `key: value` lines directly under the heading.
    meta = source.get("metadata", {})
    meta_block = "\n".join(f"{k}: {v}" for k, v in meta.items())

    return (
        f"\n## {source['section_title']}\n\n"
        f"Source: {source['url']}\n\n"
        f"{meta_block}\n\n"
        f"{body}\n"
    )


def _report_section_sizes(compilation: str, threshold: int = 8) -> None:
    """Print the size of each source-level section and flag gross imbalance.

    Only SOURCE-LEVEL sections are measured: an H2 heading followed by a blank
    line and a ``Source: `` line, which is the wrapper shape emitted by
    ``fetch_guidance``. Each section runs from its heading to the next such
    heading (or to the end of the text).

    Rationale from the original author: in v1 four of five council documents
    were 13–33 KB but Physiotherapy was 361 KB because the chosen PDF was a
    kitchen-sink compendium. Legitimate variation was observed at around 6–7x
    and the v1 failure at 26x, so the default threshold of 8x leaves margin
    without raising false alarms.

    Args:
        compilation: the full compiled markdown text.
        threshold: biggest/smallest size ratio above which a warning is printed.
    """
    import re as _re

    # Match: H2 heading + blank line + Source: URL line (source-level pattern)
    source_section_pattern = _re.compile(
        r"^## (?P<title>[^\n]+)\n\nSource: ",
        flags=_re.MULTILINE,
    )
    matches = list(source_section_pattern.finditer(compilation))
    if not matches:
        return

    # Each section ends where the next one starts; the last runs to the end.
    starts = [m.start() for m in matches]
    ends = starts[1:] + [len(compilation)]
    section_sizes = sorted(
        ((end - start, m.group("title")) for m, start, end in zip(matches, starts, ends)),
        reverse=True,
    )

    biggest = section_sizes[0][0]
    smallest = section_sizes[-1][0]
    ratio = biggest / max(smallest, 1)  # guard against a zero-length section
    print(f"\nSection size distribution (biggest/smallest ratio: {ratio:.1f}x):")
    for size, title in section_sizes:
        warn = " ⚠ disproportionate" if ratio > threshold and size == biggest else ""
        print(f"  {size:>7,} chars  {title[:70]}{warn}")
    if ratio > threshold:
        print(f"\n  ⚠ One section is >{threshold}x larger than the smallest. May indicate a")
        print("    wrong-source PDF (kitchen-sink compendium vs focused doc). Investigate.")


def build():
    """Compile every entry of ``GUIDANCE_SOURCES`` into ``DOMAIN_FILE``.

    Each source is fetched with ``fetch_guidance``; sources that return an
    empty block (download failure) are left out of the corpus and listed in a
    warning at the end so the gap is not hidden behind the success message.
    After writing the file, a section-size sanity check is printed.
    """
    print("Building professional_codes compilation\n")

    guidance_blocks = []
    skipped = []
    for src in GUIDANCE_SOURCES:
        block = fetch_guidance(src)
        if block:
            guidance_blocks.append(block)
        else:
            skipped.append(src["name"])

    body = "\n\n".join(guidance_blocks)

    compilation = f"""# Professional Codes — NZ Healthcare Marketing Regulation

Source: https://www.health.govt.nz/our-work/regulation-health-and-disability-system/health-practitioners-competence-assurance-act

This compilation covers the advertising-specific standards issued by the registration boards/councils for the regulated professions in scope: chiropractors, osteopaths, physiotherapists, and Chinese medicine practitioners. The Medical Council Statement on Advertising is also included **as a benchmark only** — it does NOT bind non-MD practitioners.

**Critical retrieval rule:** each council document has a `binds:` metadata line declaring which practitioners it applies to. The Medical Council statement is the strictest in the set on testimonials and should not be cited as authoritative for non-MD professions; their own councils set the binding rules. Use the binding metadata to scope answers to the practitioner type the user is asking about.

{body}
"""

    DOMAIN_FILE.write_text(compilation, encoding="utf-8")
    # The text ends with a newline, so the line count equals the number of
    # newlines; only an unterminated last line would add one more.
    lines = compilation.count("\n") + (0 if compilation.endswith("\n") else 1)
    size_kb = len(compilation.encode("utf-8")) / 1024
    print(f"\n✅ Wrote {DOMAIN_FILE} ({lines} lines, {size_kb:.1f} KB)")

    if skipped:
        # Surface dropped sources: the file above was still written, but it is
        # missing these documents.
        print(f"\n  ⚠ {len(skipped)} of {len(GUIDANCE_SOURCES)} sources are missing from the corpus:")
        for name in skipped:
            print(f"    - {name}")

    # Section-size sanity check — flags imbalanced sections that suggest a
    # wrong-source choice (one PDF disproportionately larger than the others).
    _report_section_sizes(compilation, threshold=8)


# Script entry point: rebuild the professional-codes corpus file when this
# module is executed directly (not when it is imported).
if __name__ == "__main__":
    build()