Spaces:

webmuppetnz
/

hmc-rag

Running

hmc-rag / scripts /build_professional_codes_compilation.py

webmuppet

Initial commit — health marketing compliance RAG

bad8b6c 5 days ago

14.1 kB

	"""
	Build the `professional_codes` corpus domain.

	Council/board advertising standards for the audience's regulated professions:

	- Chiropractic Board — Advertising Policy 2025
	- Osteopathic Council — Code of Conduct (Jan 2023, contains advertising provisions)
	- Physiotherapy Board — Advertising standard (sliced out of the Code, Standards and Thresholds framework PDF)
	- Chinese Medicine Council — Advertising Standard Guidance
	- Medical Council — Statement on Advertising (BENCHMARK ONLY — does NOT bind non-MD practitioners)

	The critical v2 design feature in this domain is `binds:` scope metadata.
	Each council document declares which practitioner classes it binds. The
	Medical Council statement is included as a benchmark but tagged
	`benchmark-only: true` so the retriever does not cite it as authoritative
	for chiropractors, naturopaths, etc. Becki's correction in v1 → v2 review:
	"the Medical Council statement is the strictest in the set, particularly
	on testimonials. For a chiropractor or naturopath user, citing it could
	give answers more conservative than their own regulator requires."

	Pattern adapted from `build_advertising_standards_compilation.py` (no
	LEGISLATION_SOURCES — all PDFs).
	"""

	from __future__ import annotations

	import os
	import sys
	from pathlib import Path

	sys.path.insert(0, str(Path(__file__).resolve().parent))

	from clean_artifacts import clean_corpus_artifacts, format_stats # noqa: E402
	from extract_pdf import extract_to_markdown, demote_headings # noqa: E402

	# Filesystem layout, resolved from this script's own location: the project
	# root is the parent of the directory that contains this file.
	PROJECT_ROOT = Path(__file__).resolve().parents[1]
	CORPUS_DIR = PROJECT_ROOT.joinpath("corpus")
	SOURCES_RAW = PROJECT_ROOT.joinpath("sources", "raw")

	# The single markdown file this script writes for the domain.
	DOMAIN_FILE = CORPUS_DIR.joinpath("professional-codes.md")

	# Import-time side effect: make sure the output directory and the download
	# cache directory exist before anything tries to read or write them.
	CORPUS_DIR.mkdir(exist_ok=True)
	SOURCES_RAW.mkdir(parents=True, exist_ok=True)

	# Council/board sources compiled into this domain, in output order. Each entry
	# is consumed by `fetch_guidance`, which reads these keys:
	#   name           label printed in progress output and cleaning stats
	#   url            download location; also emitted as the section's `Source:` line
	#   filename       cache file name under SOURCES_RAW
	#   format         passed to `extract_to_markdown` as `format_hint`
	#   section_title  text of the source-level `## ` heading injected above the body
	#   metadata       emitted verbatim as `key: value` lines under the heading
	#   slice_after / slice_until (optional)
	#                  regexes (matched with re.MULTILINE) bounding the excerpt to keep
	GUIDANCE_SOURCES = [
	{
	"name": "Chiropractic Board — Advertising Policy 2025",
	"url": "https://chiropracticboard.org.nz/assets/ElementalFiles/Policies/Advertising-Policy-2025-FINAL.pdf",
	"filename": "chiropractic-board-advertising-policy-2025.pdf",
	"format": "pdf",
	"section_title": "Chiropractic Board of NZ — Advertising Policy 2025",
	"metadata": {
	"binds": "registered chiropractors (NZ)",
	"issued_by": "Chiropractic Board of New Zealand (Te Poari Kaikorohiti o Aotearoa)",
	"applies_to": "advertising of chiropractic services and chiropractic care products",
	},
	},
	{
	"name": "Osteopathic Council — Code of Conduct for Osteopaths (Jan 2023)",
	"url": "https://www.osteopathiccouncil.org.nz/common/Uploaded%20files/Publications/OCNZ%20Code%20of%20Conduct%20Jan23.pdf",
	"filename": "osteopathic-council-code-of-conduct-jan-2023.pdf",
	"format": "pdf",
	"section_title": "Osteopathic Council of NZ — Code of Conduct (Jan 2023)",
	"metadata": {
	"binds": "registered osteopaths (NZ)",
	"issued_by": "Osteopathic Council of New Zealand",
	"applies_to": "professional conduct including advertising of osteopathic services",
	"notes": "the council's Code of Conduct includes advertising-specific provisions; for narrower advertising-only guidance see also OCNZ Medical Advertisement Policy 2016 (not currently included in v1)",
	},
	},
	{
	# The PDF is a 361-KB kitchen-sink compendium ("Standards framework") covering
	# clinical, cultural, and ethical standards — only ~30 KB is actually about
	# advertising. We slice out just the dedicated "Advertising standard"
	# subdocument (from the "Advertising standard" heading up to the next
	# "Telehealth standard" heading) to keep the corpus focused.
	"name": "Physiotherapy Board — Advertising standard (extracted from Standards framework)",
	"url": "https://www.physioboard.org.nz/sites/default/files/Physiotherapy-Board-Code-Standards-Thresholds.pdf",
	"filename": "physiotherapy-board-code-standards-thresholds.pdf",
	"format": "pdf",
	"section_title": "Physiotherapy Board of NZ — Advertising Standard",
	# Slicing: keep only the text from the start marker up to (not including)
	# the end marker. "Advertising standard" appears twice in the PDF — once as
	# a bare-text TOC entry and once as the real section heading. Docling
	# preserves headings as `## Foo`, so anchoring both regexes on the `## `
	# prefix (alone on its line) selects the heading rather than the TOC text.
	"slice_after": r"^## Advertising standard\s*$",
	"slice_until": r"^## Telehealth standard\s*$",
	"metadata": {
	"binds": "registered physiotherapists (NZ)",
	"issued_by": "Physiotherapy Board of New Zealand",
	"applies_to": "advertising of physiotherapy services; cross-references Fair Trading Act 1986, Consumer Guarantees Act 1993, ASA Codes, HPCAA s7/s8",
	"source_note": "extracted from the larger Physiotherapy Standards framework PDF — see source URL for full document",
	},
	},
	{
	"name": "Chinese Medicine Council — Advertising Standard Guidance",
	"url": "https://www.chinesemedicinecouncil.org.nz/common/Uploaded%20files/RegistrationsDocs/Standards%20Statement%20and%20Policies/Post%20feedback%20Advertising%20Standard%20Guidance%2023June23.pdf",
	"filename": "chinese-medicine-council-advertising-standard-jun-2023.pdf",
	"format": "pdf",
	"section_title": "Chinese Medicine Council of NZ — Advertising Standard Guidance",
	"metadata": {
	"binds": "registered Chinese medicine practitioners (NZ) — including acupuncturists, herbal medicine practitioners, tuina practitioners",
	"issued_by": "Chinese Medicine Council of New Zealand",
	"applies_to": "advertising of Chinese medicine services and products",
	"notes": "Becki flagged that AU practitioners cannot rely on traditional-use claims or 2003 WHO statement when advertising — verify whether NZ council takes the same position",
	},
	},
	{
	# Benchmark-only source: included for comparison, not as binding guidance
	# for the non-MD professions above (see the WARNING metadata line).
	"name": "Medical Council — Statement on Advertising (BENCHMARK ONLY)",
	"url": "https://www.mcnz.org.nz/assets/standards/e93109ce92/Statement-on-advertising.pdf",
	"filename": "medical-council-statement-on-advertising.pdf",
	"format": "pdf",
	"section_title": "Medical Council of NZ — Statement on Advertising (benchmark only)",
	"metadata": {
	"binds": "registered medical practitioners (MDs) only",
	# A string, not a bool: metadata values are written out as text, so this
	# becomes the literal line `benchmark-only: true` in the corpus.
	"benchmark-only": "true",
	"issued_by": "Medical Council of New Zealand",
	"applies_to": "advertising by registered doctors",
	"WARNING": "Becki: this statement is the STRICTEST in the set, particularly on testimonials. Do NOT cite as authoritative for chiropractors, osteopaths, physiotherapists, Chinese medicine practitioners, naturopaths, or other non-MD professions. Their own councils set the binding rules for them. Use this only as a comparative benchmark when explicitly asked to compare professions.",
	},
	},
	]


	def _download_to_cache(url: str, cache_path: Path, timeout: float) -> bool:
	    """Download ``url`` into ``cache_path``; return True on success, False otherwise."""
	    import urllib.request

	    req = urllib.request.Request(
	        url,
	        headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"},
	    )
	    # Write to a sibling ".part" file and rename it into place afterwards, so an
	    # interrupted write can never leave a truncated file under the cached name
	    # (which later runs would otherwise reuse as a valid cache hit).
	    tmp_path = cache_path.with_name(cache_path.name + ".part")
	    data = b""
	    try:
	        # The timeout keeps a stalled server from hanging the whole build.
	        with urllib.request.urlopen(req, timeout=timeout) as resp:
	            data = resp.read()
	        if data:
	            tmp_path.write_bytes(data)
	            tmp_path.replace(cache_path)
	    except Exception as e:
	        # Deliberate best-effort: a source that cannot be fetched is reported
	        # and skipped by the caller rather than aborting the build.
	        print(f" ⚠ Download failed: {e}")
	        return False
	    if not data:
	        # Never cache an empty response as if it were the document.
	        print(" ⚠ Download failed: empty response body")
	        return False
	    print(f" Downloaded: {cache_path.name} ({len(data) / 1024:.0f} KB)")
	    return True


	def _slice_between(text: str, start_pattern: "str | None", end_pattern: "str | None") -> str:
	    """Return the part of ``text`` from the start marker up to (excluding) the end marker.

	    Both patterns are regexes matched with ``re.MULTILINE``. A missing or
	    unmatched start marker keeps the text from the beginning; a missing or
	    unmatched end marker keeps it to the end. The result is stripped.
	    """
	    import re

	    start = 0
	    search_from = 0
	    if start_pattern:
	        m = re.search(start_pattern, text, flags=re.MULTILINE)
	        if m:
	            start = m.start()
	            # Look for the end marker only AFTER the start marker, so an end
	            # pattern that can also match the start heading does not produce
	            # an empty slice.
	            search_from = m.end()
	        else:
	            print(f" ⚠ slice_after marker {start_pattern!r} not found; keeping full text")

	    end = len(text)
	    if end_pattern:
	        m = re.compile(end_pattern, flags=re.MULTILINE).search(text, search_from)
	        if m:
	            end = m.start()
	        else:
	            print(f" ⚠ slice_until marker {end_pattern!r} not found; keeping to end")

	    return text[start:end].strip()


	def fetch_guidance(source: dict, *, timeout: float = 60.0) -> str:
	    """Download a guidance PDF and convert to markdown, with binding metadata.

	    The file is fetched once into ``SOURCES_RAW`` and reused on later runs; a
	    zero-byte cache file is treated as missing and downloaded again.

	    Optional slicing: if ``slice_after`` and/or ``slice_until`` regex strings are
	    set on the source, keep only the text BETWEEN those markers (inclusive of the
	    start, exclusive of the end). Useful for extracting a single subdocument from
	    a larger compendium PDF without losing the source-PDF's URL provenance.

	    Args:
	        source: One ``GUIDANCE_SOURCES`` entry. Reads ``name``, ``url``,
	            ``filename`` and ``section_title``, plus the optional ``format``,
	            ``metadata``, ``slice_after`` and ``slice_until`` keys.
	        timeout: Seconds to wait on the network before the download is
	            treated as failed.

	    Returns:
	        A markdown section: a ``## <section_title>`` heading, a ``Source:``
	        line, one ``key: value`` line per metadata item, then the body.
	        Returns ``""`` when the document could not be downloaded.
	    """
	    print(f"\n→ {source['name']}")

	    cache_path = SOURCES_RAW / source["filename"]
	    if cache_path.exists() and cache_path.stat().st_size > 0:
	        print(f" Using cached: {cache_path.name}")
	    elif not _download_to_cache(source["url"], cache_path, timeout):
	        return ""

	    body = extract_to_markdown(cache_path, format_hint=source.get("format")).strip()

	    # Optional slicing — keep only the substring between markers.
	    slice_after = source.get("slice_after")
	    slice_until = source.get("slice_until")
	    if slice_after or slice_until:
	        original_size = len(body)
	        body = _slice_between(body, slice_after, slice_until)
	        print(f" ✂ Sliced: {original_size:,} → {len(body):,} chars")

	    # Clean PDF/HTML extraction artefacts (page numbers, headers, chrome, control chars)
	    body, clean_stats = clean_corpus_artifacts(body)
	    print(format_stats(clean_stats, label=source["name"]))

	    # Demote body headings so they nest under the source-level H2 wrapper we
	    # add below (avoids cross-source collisions like multiple "## Introduction").
	    body = demote_headings(body)

	    # Metadata is emitted as plain `key: value` lines, in declaration order.
	    meta_block = "\n".join(f"{k}: {v}" for k, v in source.get("metadata", {}).items())

	    return (
	        f"\n## {source['section_title']}\n\n"
	        f"Source: {source['url']}\n\n"
	        f"{meta_block}\n\n"
	        f"{body}\n"
	    )


	def _source_section_sizes(compilation: str) -> list:
	    """Return ``(char_count, title)`` for every source-level section, largest first.

	    Only SOURCE-LEVEL H2 sections are counted — the ones injected by
	    `fetch_guidance`, which always have a `Source: ...` line immediately under
	    the heading. H2 headings inside an extracted body are sub-sections of a
	    single source PDF and are not measured at this granularity.
	    """
	    import re

	    # Match: H2 heading + blank line + Source: URL line (source-level pattern)
	    source_section_pattern = re.compile(
	        r"^## (?P<title>[^\n]+)\n\nSource: ",
	        flags=re.MULTILINE,
	    )
	    matches = list(source_section_pattern.finditer(compilation))
	    # Each section runs from its own heading to the next source-level heading
	    # (or to the end of the compilation for the last one).
	    boundaries = [m.start() for m in matches] + [len(compilation)]
	    sizes = [
	        (end - m.start(), m.group("title"))
	        for m, end in zip(matches, boundaries[1:])
	    ]
	    sizes.sort(reverse=True)
	    return sizes


	def _report_section_sizes(compilation: str, threshold: int = 8) -> None:
	    """Print per-source section sizes and warn when one dwarfs the smallest.

	    Section-size sanity check — flags imbalanced sections that suggest a
	    wrong-source choice (one PDF disproportionately larger than the others).
	    Learning from v1 bug: 4-of-5 council docs were 13–33 KB but Physiotherapy
	    was 361 KB because the chosen PDF was a kitchen-sink compendium.

	    Threshold tuned from observation: legitimate content variation across
	    focused-vs-broader council docs sits around 6–7×. The catastrophic
	    kitchen-sink case (e.g. v1 Physiotherapy at 26×) is the target. 8× gives a
	    comfortable margin without false alarms on legitimate imbalance (e.g. one
	    council publishes a comprehensive code, another a short statement).
	    """
	    section_sizes = _source_section_sizes(compilation)
	    if not section_sizes:
	        return

	    biggest = section_sizes[0][0]
	    smallest = section_sizes[-1][0]
	    ratio = biggest / max(smallest, 1)  # max() guards against division by zero
	    print(f"\nSection size distribution (biggest/smallest ratio: {ratio:.1f}x):")
	    for size, title in section_sizes:
	        warn = " ⚠ disproportionate" if ratio > threshold and size == biggest else ""
	        print(f" {size:>7,} chars {title[:70]}{warn}")
	    if ratio > threshold:
	        print(f"\n ⚠ One section is >{threshold}x larger than the smallest. May indicate a")
	        print(" wrong-source PDF (kitchen-sink compendium vs focused doc). Investigate.")


	def build() -> None:
	    """Fetch every guidance source and write the `professional_codes` corpus file.

	    Sources for which `fetch_guidance` returns an empty string are left out of
	    the compilation; they are listed in a warning after the file is written so
	    an incomplete corpus is never reported as a plain success. Finishes with
	    the section-size sanity report.
	    """
	    print("Building professional_codes compilation\n")

	    guidance_blocks = []
	    skipped = []
	    for src in GUIDANCE_SOURCES:
	        block = fetch_guidance(src)
	        if block:
	            guidance_blocks.append(block)
	        else:
	            skipped.append(src["name"])

	    body = "\n\n".join(guidance_blocks)

	    compilation = f"""# Professional Codes — NZ Healthcare Marketing Regulation

	Source: https://www.health.govt.nz/our-work/regulation-health-and-disability-system/health-practitioners-competence-assurance-act

	This compilation covers the advertising-specific standards issued by the registration boards/councils for the regulated professions in scope: chiropractors, osteopaths, physiotherapists, and Chinese medicine practitioners. The Medical Council Statement on Advertising is also included as a benchmark only — it does NOT bind non-MD practitioners.

	Critical retrieval rule: each council document has a `binds:` metadata line declaring which practitioners it applies to. The Medical Council statement is the strictest in the set on testimonials and should not be cited as authoritative for non-MD professions; their own councils set the binding rules. Use the binding metadata to scope answers to the practitioner type the user is asking about.

	{body}
	"""

	    DOMAIN_FILE.write_text(compilation, encoding="utf-8")
	    lines = compilation.count("\n") + 1
	    size_kb = len(compilation.encode("utf-8")) / 1024
	    print(f"\n✅ Wrote {DOMAIN_FILE} ({lines} lines, {size_kb:.1f} KB)")

	    if skipped:
	        # A missing regulator silently changes what the retriever can cite, so
	        # make the gap impossible to overlook in the build output.
	        print(f"\n ⚠ {len(skipped)} of {len(GUIDANCE_SOURCES)} sources were skipped; the corpus is incomplete:")
	        for name in skipped:
	            print(f" - {name}")

	    _report_section_sizes(compilation)


	# Script entry point: run the build only when this file is executed directly,
	# not when it is imported by another module.
	if __name__ == "__main__":
	build()