hmc-rag / scripts /build_professional_codes_compilation.py
webmuppet
Initial commit — health marketing compliance RAG
bad8b6c
"""
Build the `professional_codes` corpus domain.
Council/board advertising standards for the audience's regulated professions:
- Chiropractic Board — Advertising Policy 2025
- Osteopathic Council — Code of Conduct (Jan 2023, contains advertising provisions)
- Physiotherapy Board — Code of Ethics and Professional Conduct (Code, Standards, Thresholds)
- Chinese Medicine Council — Advertising Standard Guidance
- Medical Council — Statement on Advertising (BENCHMARK ONLY — does NOT bind non-MD practitioners)
The critical v2 design feature in this domain is `binds:` scope metadata.
Each council document declares which practitioner classes it binds. The
Medical Council statement is included as a benchmark but tagged
`benchmark-only: true` so the retriever does not cite it as authoritative
for chiropractors, naturopaths, etc. Becki's correction in v1 → v2 review:
"the Medical Council statement is the strictest in the set, particularly
on testimonials. For a chiropractor or naturopath user, citing it could
give answers more conservative than their own regulator requires."
Pattern adapted from `build_advertising_standards_compilation.py` (no
LEGISLATION_SOURCES — all PDFs).
"""
from __future__ import annotations
import os
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent))
from clean_artifacts import clean_corpus_artifacts, format_stats # noqa: E402
from extract_pdf import extract_to_markdown, demote_headings # noqa: E402
# Parent of the directory that contains this script; all other paths hang off it.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
# Directory that receives the compiled corpus domain file.
CORPUS_DIR = PROJECT_ROOT / "corpus"
# Download cache for the raw source PDFs; a file already present here is
# reused by `fetch_guidance` instead of being fetched again.
SOURCES_RAW = PROJECT_ROOT / "sources" / "raw"
# Import-time side effect: make sure both directories exist before anything
# tries to write into them.
CORPUS_DIR.mkdir(exist_ok=True)
SOURCES_RAW.mkdir(parents=True, exist_ok=True)
# The single markdown file this script writes.
DOMAIN_FILE = CORPUS_DIR / "professional-codes.md"
# One entry per council/board document compiled into the domain file.
#
# Keys read by `fetch_guidance`:
#   name           label used in console output
#   url            download location, also written as the section's `Source:` line
#   filename       cache file name under SOURCES_RAW
#   format         format hint passed to the extractor
#   section_title  text of the source-level H2 heading in the compilation
#   slice_after    (optional) regex; extracted text before its first match is dropped
#   slice_until    (optional) regex; extracted text from its first match on is dropped
#   metadata       written verbatim as `key: value` lines under the Source line;
#                  `binds` declares which practitioner class the document binds
GUIDANCE_SOURCES = [
    {
        "name": "Chiropractic Board — Advertising Policy 2025",
        "url": "https://chiropracticboard.org.nz/assets/ElementalFiles/Policies/Advertising-Policy-2025-FINAL.pdf",
        "filename": "chiropractic-board-advertising-policy-2025.pdf",
        "format": "pdf",
        "section_title": "Chiropractic Board of NZ — Advertising Policy 2025",
        "metadata": {
            "binds": "registered chiropractors (NZ)",
            "issued_by": "Chiropractic Board of New Zealand (Te Poari Kaikorohiti o Aotearoa)",
            "applies_to": "advertising of chiropractic services and chiropractic care products",
        },
    },
    {
        "name": "Osteopathic Council — Code of Conduct for Osteopaths (Jan 2023)",
        "url": "https://www.osteopathiccouncil.org.nz/common/Uploaded%20files/Publications/OCNZ%20Code%20of%20Conduct%20Jan23.pdf",
        "filename": "osteopathic-council-code-of-conduct-jan-2023.pdf",
        "format": "pdf",
        "section_title": "Osteopathic Council of NZ — Code of Conduct (Jan 2023)",
        "metadata": {
            "binds": "registered osteopaths (NZ)",
            "issued_by": "Osteopathic Council of New Zealand",
            "applies_to": "professional conduct including advertising of osteopathic services",
            "notes": "the council's Code of Conduct includes advertising-specific provisions; for narrower advertising-only guidance see also OCNZ Medical Advertisement Policy 2016 (not currently included in v1)",
        },
    },
    {
        # The source PDF is a 361-KB kitchen-sink compendium ("Standards
        # framework") covering clinical, cultural, and ethical standards — only
        # ~30 KB is actually about advertising. We slice out just the dedicated
        # "Advertising standard" subdocument to keep the corpus focused.
        "name": "Physiotherapy Board — Advertising standard (extracted from Standards framework)",
        "url": "https://www.physioboard.org.nz/sites/default/files/Physiotherapy-Board-Code-Standards-Thresholds.pdf",
        "filename": "physiotherapy-board-code-standards-thresholds.pdf",
        "format": "pdf",
        "section_title": "Physiotherapy Board of NZ — Advertising Standard",
        # Slice between the H2 *headings*: from "Advertising standard" up to the
        # next "Telehealth standard". Both titles also appear earlier in the
        # document as bare-text TOC entries; Docling preserves real headings as
        # `## Foo`, so requiring the heading prefix disambiguates them from the
        # TOC text.
        "slice_after": r"^## Advertising standard\s*$",
        "slice_until": r"^## Telehealth standard\s*$",
        "metadata": {
            "binds": "registered physiotherapists (NZ)",
            "issued_by": "Physiotherapy Board of New Zealand",
            "applies_to": "advertising of physiotherapy services; cross-references Fair Trading Act 1986, Consumer Guarantees Act 1993, ASA Codes, HPCAA s7/s8",
            "source_note": "extracted from the larger Physiotherapy Standards framework PDF — see source URL for full document",
        },
    },
    {
        "name": "Chinese Medicine Council — Advertising Standard Guidance",
        "url": "https://www.chinesemedicinecouncil.org.nz/common/Uploaded%20files/RegistrationsDocs/Standards%20Statement%20and%20Policies/Post%20feedback%20Advertising%20Standard%20Guidance%2023June23.pdf",
        "filename": "chinese-medicine-council-advertising-standard-jun-2023.pdf",
        "format": "pdf",
        "section_title": "Chinese Medicine Council of NZ — Advertising Standard Guidance",
        "metadata": {
            "binds": "registered Chinese medicine practitioners (NZ) — including acupuncturists, herbal medicine practitioners, tuina practitioners",
            "issued_by": "Chinese Medicine Council of New Zealand",
            "applies_to": "advertising of Chinese medicine services and products",
            "notes": "Becki flagged that AU practitioners cannot rely on traditional-use claims or 2003 WHO statement when advertising — verify whether NZ council takes the same position",
        },
    },
    {
        # Included for comparison only: the `benchmark-only` flag and WARNING
        # text exist so the retriever does not cite this document as
        # authoritative for non-MD professions.
        "name": "Medical Council — Statement on Advertising (BENCHMARK ONLY)",
        "url": "https://www.mcnz.org.nz/assets/standards/e93109ce92/Statement-on-advertising.pdf",
        "filename": "medical-council-statement-on-advertising.pdf",
        "format": "pdf",
        "section_title": "Medical Council of NZ — Statement on Advertising (benchmark only)",
        "metadata": {
            "binds": "registered medical practitioners (MDs) only",
            "benchmark-only": "true",
            "issued_by": "Medical Council of New Zealand",
            "applies_to": "advertising by registered doctors",
            "WARNING": "Becki: this statement is the STRICTEST in the set, particularly on testimonials. Do NOT cite as authoritative for chiropractors, osteopaths, physiotherapists, Chinese medicine practitioners, naturopaths, or other non-MD professions. Their own councils set the binding rules for them. Use this only as a comparative benchmark when explicitly asked to compare professions.",
        },
    },
]
def fetch_guidance(source: dict) -> str:
    """Download a guidance PDF and convert it to a markdown section with binding metadata.

    The PDF is cached under ``SOURCES_RAW``; a cached copy is reused without
    touching the network. The extracted text is optionally sliced, cleaned of
    extraction artefacts, has its headings demoted, and is wrapped in a
    source-level H2 section carrying the ``Source:`` URL and the metadata lines.

    Optional slicing: if ``slice_after`` and/or ``slice_until`` regex strings are
    set on the source, keep only the text BETWEEN those markers (inclusive of the
    start marker, exclusive of the end marker). The end marker is searched for
    only *after* the start marker, so a generic end pattern can never match the
    start heading itself. Useful for extracting a single subdocument from a
    larger compendium PDF without losing the source PDF's URL provenance.

    Args:
        source: One entry of ``GUIDANCE_SOURCES``. Reads ``name``, ``url``,
            ``filename`` and ``section_title``, plus the optional ``format``,
            ``slice_after``, ``slice_until`` and ``metadata`` keys.

    Returns:
        The markdown section for this source, or ``""`` if the download failed
        (the failure is reported on stdout and the caller skips the source).
    """
    import http.client
    import re as _re
    import urllib.request

    # Without an explicit timeout urlopen blocks indefinitely on a stalled server.
    download_timeout_s = 60

    print(f"\n→ {source['name']}")

    cache_path = SOURCES_RAW / source["filename"]
    if cache_path.exists():
        print(f" Using cached: {cache_path.name}")
    else:
        req = urllib.request.Request(
            source["url"],
            headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"},
        )
        try:
            with urllib.request.urlopen(req, timeout=download_timeout_s) as resp:
                data = resp.read()
            if not data:
                # Caching a zero-byte file would make every later run "succeed"
                # from the cache with nothing to extract.
                raise ValueError("empty response body")
            # Write to a sibling temp file and rename, so an interrupted write
            # never leaves a truncated PDF under the real cache name.
            tmp_path = cache_path.with_name(cache_path.name + ".part")
            tmp_path.write_bytes(data)
            tmp_path.replace(cache_path)
        except (OSError, http.client.HTTPException, ValueError) as e:
            # OSError covers URLError/HTTPError, socket timeouts and disk errors;
            # HTTPException covers truncated responses; ValueError covers a
            # malformed URL and the empty-body check above. Best-effort: report
            # and let the caller skip this source.
            print(f" ⚠ Download failed: {e}")
            return ""
        else:
            print(f" Downloaded: {cache_path.name} ({len(data) / 1024:.0f} KB)")

    body = extract_to_markdown(cache_path, format_hint=source.get("format")).strip()

    # Optional slicing — keep only the substring between markers.
    slice_after = source.get("slice_after")
    slice_until = source.get("slice_until")
    if slice_after or slice_until:
        original_size = len(body)

        start = 0
        search_from = 0
        if slice_after:
            m = _re.search(slice_after, body, flags=_re.MULTILINE)
            if m:
                start = m.start()
                # Look for the end marker only past the start marker's own text.
                search_from = m.end()
            else:
                print(f" ⚠ slice_after marker {slice_after!r} not found; keeping full text")

        end = len(body)
        if slice_until:
            m = _re.compile(slice_until, flags=_re.MULTILINE).search(body, search_from)
            if m:
                end = m.start()
            else:
                print(f" ⚠ slice_until marker {slice_until!r} not found; keeping to end")

        body = body[start:end].strip()
        print(f" ✂ Sliced: {original_size:,} → {len(body):,} chars")

    # Clean PDF/HTML extraction artefacts (page numbers, headers, chrome, control chars)
    body, _clean_stats = clean_corpus_artifacts(body)
    print(format_stats(_clean_stats, label=source["name"]))

    # Demote body headings so they nest under the source-level H2 wrapper we
    # add below (avoids cross-source collisions like multiple "## Introduction").
    body = demote_headings(body)

    # Metadata is emitted as plain `key: value` lines, in declaration order.
    meta = source.get("metadata", {})
    meta_block = "\n".join(f"{k}: {v}" for k, v in meta.items())

    return (
        f"\n## {source['section_title']}\n\n"
        f"Source: {source['url']}\n\n"
        f"{meta_block}\n\n"
        f"{body}\n"
    )
def _report_section_sizes(compilation: str, threshold: int = 8) -> None:
    """Print per-source section sizes and warn when one dwarfs the others.

    Section-size sanity check — flags imbalanced sections that suggest a
    wrong-source choice (one PDF disproportionately larger than the others).
    Learning from v1 bug: 4-of-5 council docs were 13–33 KB but Physiotherapy
    was 361 KB because the chosen PDF was a kitchen-sink compendium.

    Granularity note: only SOURCE-LEVEL H2 sections are counted — the ones
    injected by `fetch_guidance`, which always have a `Source: ...` line
    directly under the heading. H2 headings inside extracted body content are
    sub-sections of a single source PDF and are not measured separately.

    Args:
        compilation: The full compiled markdown document.
        threshold: Biggest/smallest size ratio above which a warning is printed.
            Tuned from observation: legitimate variation across
            focused-vs-broader council docs sits around 6–7×; the catastrophic
            kitchen-sink case (e.g. v1 Physiotherapy at 26×) is the target. 8×
            leaves a comfortable margin without false alarms on legitimate
            imbalance (e.g. one council publishes a comprehensive code,
            another a short statement).
    """
    import re as _re

    # Match: H2 heading + blank line + Source: URL line (source-level pattern)
    source_section_pattern = _re.compile(
        r"^## (?P<title>[^\n]+)\n\nSource: ",
        flags=_re.MULTILINE,
    )
    matches = list(source_section_pattern.finditer(compilation))
    if not matches:
        return

    # A section runs from its own heading to the next source heading (or EOF).
    starts = [m.start() for m in matches]
    ends = starts[1:] + [len(compilation)]
    section_sizes = sorted(
        ((end - start, m.group("title")) for m, start, end in zip(matches, starts, ends)),
        reverse=True,
    )

    biggest = section_sizes[0][0]
    smallest = section_sizes[-1][0]
    ratio = biggest / max(smallest, 1)

    print(f"\nSection size distribution (biggest/smallest ratio: {ratio:.1f}x):")
    for size, title in section_sizes:
        warn = " ⚠ disproportionate" if ratio > threshold and size == biggest else ""
        print(f" {size:>7,} chars {title[:70]}{warn}")
    if ratio > threshold:
        print(f"\n ⚠ One section is >{threshold}x larger than the smallest. May indicate a")
        print(" wrong-source PDF (kitchen-sink compendium vs focused doc). Investigate.")


def build() -> None:
    """Fetch every guidance source and write the `professional_codes` domain file.

    Each entry of ``GUIDANCE_SOURCES`` is turned into a markdown section by
    ``fetch_guidance``; sources that fail to fetch are skipped and listed in a
    warning. The sections are joined under a fixed preamble and written to
    ``DOMAIN_FILE``, followed by a section-size sanity report on stdout.

    If no source at all could be fetched, nothing is written, so a network
    outage cannot replace an existing corpus file with an empty one.
    """
    print("Building professional_codes compilation\n")

    guidance_blocks = []
    skipped = []
    for src in GUIDANCE_SOURCES:
        block = fetch_guidance(src)
        if block:
            guidance_blocks.append(block)
        else:
            skipped.append(src["name"])

    if skipped:
        # A missing regulator silently changes what the retriever can answer,
        # so make the gap explicit.
        print(f"\n ⚠ {len(skipped)} of {len(GUIDANCE_SOURCES)} sources are missing from this build:")
        for name in skipped:
            print(f"   - {name}")
    if not guidance_blocks:
        print(f"\n ⚠ No sources could be fetched; leaving {DOMAIN_FILE} untouched.")
        return

    body = "\n\n".join(guidance_blocks)

    # The preamble uses the same "heading, blank line, Source:, blank line"
    # layout as the per-source sections, and blank lines keep the two
    # paragraphs from merging when the markdown is rendered or chunked.
    compilation = f"""# Professional Codes — NZ Healthcare Marketing Regulation

Source: https://www.health.govt.nz/our-work/regulation-health-and-disability-system/health-practitioners-competence-assurance-act

This compilation covers the advertising-specific standards issued by the registration boards/councils for the regulated professions in scope: chiropractors, osteopaths, physiotherapists, and Chinese medicine practitioners. The Medical Council Statement on Advertising is also included **as a benchmark only** — it does NOT bind non-MD practitioners.

**Critical retrieval rule:** each council document has a `binds:` metadata line declaring which practitioners it applies to. The Medical Council statement is the strictest in the set on testimonials and should not be cited as authoritative for non-MD professions; their own councils set the binding rules. Use the binding metadata to scope answers to the practitioner type the user is asking about.

{body}
"""
    DOMAIN_FILE.write_text(compilation, encoding="utf-8")

    lines = compilation.count("\n") + 1
    size_kb = len(compilation.encode("utf-8")) / 1024
    print(f"\n✅ Wrote {DOMAIN_FILE} ({lines} lines, {size_kb:.1f} KB)")

    _report_section_sizes(compilation)
# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    build()