"""
Build the `professional_codes` corpus domain.
Council/board advertising standards for the audience's regulated professions:
- Chiropractic Board — Advertising Policy 2025
- Osteopathic Council — Code of Conduct (Jan 2023, contains advertising provisions)
- Physiotherapy Board — Code of Ethics and Professional Conduct (Code, Standards, Thresholds)
- Chinese Medicine Council — Advertising Standard Guidance
- Medical Council — Statement on Advertising (BENCHMARK ONLY — does NOT bind non-MD practitioners)
The critical v2 design feature in this domain is `binds:` scope metadata.
Each council document declares which practitioner classes it binds. The
Medical Council statement is included as a benchmark but tagged
`benchmark-only: true` so the retriever does not cite it as authoritative
for chiropractors, naturopaths, etc. Becki's correction in v1 → v2 review:
"the Medical Council statement is the strictest in the set, particularly
on testimonials. For a chiropractor or naturopath user, citing it could
give answers more conservative than their own regulator requires."
Pattern adapted from `build_advertising_standards_compilation.py` (no
LEGISLATION_SOURCES — all PDFs).
"""
from __future__ import annotations
import os
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent))
from clean_artifacts import clean_corpus_artifacts, format_stats # noqa: E402
from extract_pdf import extract_to_markdown, demote_headings # noqa: E402
# Filesystem layout. The project root is two levels above this file; every
# other location is derived from it.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
CORPUS_DIR = PROJECT_ROOT.joinpath("corpus")
SOURCES_RAW = PROJECT_ROOT.joinpath("sources", "raw")

# Create the output and download-cache directories up front so later steps
# can write into them unconditionally.
CORPUS_DIR.mkdir(exist_ok=True)
SOURCES_RAW.mkdir(parents=True, exist_ok=True)

# The single markdown file this script produces.
DOMAIN_FILE = CORPUS_DIR.joinpath("professional-codes.md")
# One entry per council/board document compiled into the domain file.
#
# Keys read by ``fetch_guidance``:
#   name           label used in progress output
#   url            download location (also emitted as the section's Source line)
#   filename       cache file name under SOURCES_RAW
#   format         passed to the extractor as ``format_hint``
#   section_title  text of the H2 heading that wraps the document
#   slice_after /  optional regexes; only the text between the two markers
#   slice_until    is kept
#   metadata       emitted verbatim as ``key: value`` lines under the Source
#                  line; ``binds`` declares which practitioners the document
#                  applies to
GUIDANCE_SOURCES = [
    {
        "name": "Chiropractic Board — Advertising Policy 2025",
        "url": "https://chiropracticboard.org.nz/assets/ElementalFiles/Policies/Advertising-Policy-2025-FINAL.pdf",
        "filename": "chiropractic-board-advertising-policy-2025.pdf",
        "format": "pdf",
        "section_title": "Chiropractic Board of NZ — Advertising Policy 2025",
        "metadata": {
            "binds": "registered chiropractors (NZ)",
            "issued_by": "Chiropractic Board of New Zealand (Te Poari Kaikorohiti o Aotearoa)",
            "applies_to": "advertising of chiropractic services and chiropractic care products",
        },
    },
    {
        "name": "Osteopathic Council — Code of Conduct for Osteopaths (Jan 2023)",
        "url": "https://www.osteopathiccouncil.org.nz/common/Uploaded%20files/Publications/OCNZ%20Code%20of%20Conduct%20Jan23.pdf",
        "filename": "osteopathic-council-code-of-conduct-jan-2023.pdf",
        "format": "pdf",
        "section_title": "Osteopathic Council of NZ — Code of Conduct (Jan 2023)",
        "metadata": {
            "binds": "registered osteopaths (NZ)",
            "issued_by": "Osteopathic Council of New Zealand",
            "applies_to": "professional conduct including advertising of osteopathic services",
            "notes": "the council's Code of Conduct includes advertising-specific provisions; for narrower advertising-only guidance see also OCNZ Medical Advertisement Policy 2016 (not currently included in v1)",
        },
    },
    {
        # The PDF is a 361-KB kitchen-sink compendium ("Standards framework")
        # covering clinical, cultural, and ethical standards — only ~30 KB is
        # actually about advertising. We slice out just the dedicated
        # "Advertising standard" subdocument to keep the corpus focused.
        "name": "Physiotherapy Board — Advertising standard (extracted from Standards framework)",
        "url": "https://www.physioboard.org.nz/sites/default/files/Physiotherapy-Board-Code-Standards-Thresholds.pdf",
        "filename": "physiotherapy-board-code-standards-thresholds.pdf",
        "format": "pdf",
        "section_title": "Physiotherapy Board of NZ — Advertising Standard",
        # Slice between the H2 *headings*. "Advertising standard" also appears
        # earlier in the document as bare table-of-contents text; Docling
        # preserves real headings as `## Foo`, so requiring the heading prefix
        # selects the subdocument itself rather than the TOC entry. The slice
        # runs up to the "Telehealth standard" heading that follows it.
        "slice_after": r"^## Advertising standard\s*$",
        "slice_until": r"^## Telehealth standard\s*$",
        "metadata": {
            "binds": "registered physiotherapists (NZ)",
            "issued_by": "Physiotherapy Board of New Zealand",
            "applies_to": "advertising of physiotherapy services; cross-references Fair Trading Act 1986, Consumer Guarantees Act 1993, ASA Codes, HPCAA s7/s8",
            "source_note": "extracted from the larger Physiotherapy Standards framework PDF — see source URL for full document",
        },
    },
    {
        "name": "Chinese Medicine Council — Advertising Standard Guidance",
        "url": "https://www.chinesemedicinecouncil.org.nz/common/Uploaded%20files/RegistrationsDocs/Standards%20Statement%20and%20Policies/Post%20feedback%20Advertising%20Standard%20Guidance%2023June23.pdf",
        "filename": "chinese-medicine-council-advertising-standard-jun-2023.pdf",
        "format": "pdf",
        "section_title": "Chinese Medicine Council of NZ — Advertising Standard Guidance",
        "metadata": {
            "binds": "registered Chinese medicine practitioners (NZ) — including acupuncturists, herbal medicine practitioners, tuina practitioners",
            "issued_by": "Chinese Medicine Council of New Zealand",
            "applies_to": "advertising of Chinese medicine services and products",
            "notes": "Becki flagged that AU practitioners cannot rely on traditional-use claims or 2003 WHO statement when advertising — verify whether NZ council takes the same position",
        },
    },
    {
        # Included for comparison only; the metadata below marks it so the
        # retriever does not treat it as binding on non-MD professions.
        "name": "Medical Council — Statement on Advertising (BENCHMARK ONLY)",
        "url": "https://www.mcnz.org.nz/assets/standards/e93109ce92/Statement-on-advertising.pdf",
        "filename": "medical-council-statement-on-advertising.pdf",
        "format": "pdf",
        "section_title": "Medical Council of NZ — Statement on Advertising (benchmark only)",
        "metadata": {
            "binds": "registered medical practitioners (MDs) only",
            "benchmark-only": "true",
            "issued_by": "Medical Council of New Zealand",
            "applies_to": "advertising by registered doctors",
            "WARNING": "Becki: this statement is the STRICTEST in the set, particularly on testimonials. Do NOT cite as authoritative for chiropractors, osteopaths, physiotherapists, Chinese medicine practitioners, naturopaths, or other non-MD professions. Their own councils set the binding rules for them. Use this only as a comparative benchmark when explicitly asked to compare professions.",
        },
    },
]
def _download_to_cache(source: dict, cache_path: Path) -> bool:
    """Download ``source["url"]`` into *cache_path*; return True on success.

    Failures are reported on stdout and signalled by returning False so one
    unreachable council site does not abort the whole build.
    """
    import urllib.request

    req = urllib.request.Request(
        source["url"],
        # Browser-like User-Agent; presumably some hosts reject urllib's
        # default one (kept from the original implementation).
        headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"},
    )
    try:
        # Bounded wait: without a timeout a stalled server hangs the build.
        with urllib.request.urlopen(req, timeout=60) as resp:
            data = resp.read()
        # Write to a sibling temp file and rename, so an interrupted write
        # never leaves a truncated PDF that later runs would trust as cached.
        tmp_path = cache_path.with_name(cache_path.name + ".part")
        tmp_path.write_bytes(data)
        tmp_path.replace(cache_path)
    except Exception as e:  # best-effort boundary: report and skip this source
        print(f" ✗ Download failed: {e}")
        return False
    print(f" Downloaded: {cache_path.name} ({len(data) / 1024:.0f} KB)")
    return True


def _slice_between(body: str, slice_after: str | None, slice_until: str | None) -> str:
    """Return the part of *body* between two regex markers.

    The result starts at the first match of *slice_after* (inclusive) and ends
    just before the first match of *slice_until* that follows the start marker
    (exclusive). A marker that is ``None``/empty or not found leaves that side
    of the text untouched; a missing marker is reported on stdout.
    """
    import re as _re

    original_size = len(body)
    search_from = 0
    if slice_after:
        m = _re.search(slice_after, body, flags=_re.MULTILINE)
        if m:
            body = body[m.start():]
            # Look for the end marker only past the start marker, so an end
            # pattern that also matches the start heading cannot empty the slice.
            search_from = m.end() - m.start()
        else:
            print(f" ⚠ slice_after marker {slice_after!r} not found; keeping full text")
    if slice_until:
        m = _re.compile(slice_until, flags=_re.MULTILINE).search(body, search_from)
        if m:
            body = body[:m.start()]
        else:
            print(f" ⚠ slice_until marker {slice_until!r} not found; keeping to end")
    body = body.strip()
    print(f" ✂ Sliced: {original_size:,} → {len(body):,} chars")
    return body


def fetch_guidance(source: dict) -> str:
    """Download a guidance PDF and convert to markdown, with binding metadata.

    The PDF is cached under ``SOURCES_RAW`` by ``source["filename"]`` and only
    downloaded when no cached copy exists.

    Optional slicing: if ``slice_after`` and/or ``slice_until`` regex strings are
    set on the source, keep only the text BETWEEN those markers (inclusive of the
    start, exclusive of the end). Useful for extracting a single subdocument from
    a larger compendium PDF without losing the source-PDF's URL provenance.

    Args:
        source: one ``GUIDANCE_SOURCES`` entry. ``name``, ``url``, ``filename``
            and ``section_title`` are required; ``format``, ``slice_after``,
            ``slice_until`` and ``metadata`` are optional.

    Returns:
        A markdown block made of an H2 heading, a ``Source:`` line, one
        ``key: value`` line per metadata item, and the extracted body; or
        ``""`` when the download fails.
    """
    print(f"\n→ {source['name']}")
    cache_path = SOURCES_RAW / source["filename"]
    if cache_path.exists():
        print(f" Using cached: {cache_path.name}")
    elif not _download_to_cache(source, cache_path):
        return ""

    body = extract_to_markdown(cache_path, format_hint=source.get("format")).strip()

    # Optional slicing — keep only the substring between markers.
    slice_after = source.get("slice_after")
    slice_until = source.get("slice_until")
    if slice_after or slice_until:
        body = _slice_between(body, slice_after, slice_until)

    # Clean PDF/HTML extraction artefacts (page numbers, headers, chrome, control chars)
    body, clean_stats = clean_corpus_artifacts(body)
    print(format_stats(clean_stats, label=source["name"]))

    # Demote body headings so they nest under the source-level H2 wrapper we
    # add below (avoids cross-source collisions like multiple "## Introduction").
    body = demote_headings(body)

    meta_block = "\n".join(f"{k}: {v}" for k, v in source.get("metadata", {}).items())
    # The heading / blank line / "Source: " layout is what build()'s
    # section-size check keys on; keep it intact.
    return (
        f"\n## {source['section_title']}\n\n"
        f"Source: {source['url']}\n\n"
        f"{meta_block}\n\n"
        f"{body}\n"
    )
def _report_section_sizes(compilation: str, threshold: int = 8) -> None:
    """Print per-source section sizes and warn when they are badly imbalanced.

    Section-size sanity check — flags imbalanced sections that suggest a
    wrong-source choice (one PDF disproportionately larger than the others).
    Learning from v1 bug: 4-of-5 council docs were 13–33 KB but Physiotherapy
    was 361 KB because the chosen PDF was a kitchen-sink compendium.

    Granularity note: only SOURCE-LEVEL H2 sections are counted — the ones
    ``fetch_guidance`` injects, which always have a ``Source: ...`` line
    right under the heading. H2 headings inside extracted body content are
    sub-sections of a single source PDF and are not measured separately.

    Args:
        compilation: the full markdown text of the compiled domain file.
        threshold: biggest/smallest size ratio above which a warning is
            printed. Tuned from observation: legitimate variation across
            focused-vs-broader council docs sits around 6–7×; the catastrophic
            kitchen-sink case (e.g. v1 Physiotherapy at 26×) is the target.
            8× leaves a comfortable margin without false alarms.
    """
    import re as _re

    # Match: H2 heading + blank line + Source: URL line (source-level pattern)
    source_section_pattern = _re.compile(
        r"^## (?P<title>[^\n]+)\n\nSource: ",
        flags=_re.MULTILINE,
    )
    matches = list(source_section_pattern.finditer(compilation))
    section_sizes = []
    for i, m in enumerate(matches):
        # A section runs until the next source-level heading (or end of text).
        end = matches[i + 1].start() if i + 1 < len(matches) else len(compilation)
        section_sizes.append((end - m.start(), m.group("title")))
    if not section_sizes:
        return

    section_sizes.sort(reverse=True)
    biggest = section_sizes[0][0]
    smallest = section_sizes[-1][0]
    ratio = biggest / max(smallest, 1)  # guard against division by zero
    print(f"\nSection size distribution (biggest/smallest ratio: {ratio:.1f}x):")
    for size, title in section_sizes:
        warn = " ⚠ disproportionate" if ratio > threshold and size == biggest else ""
        print(f" {size:>7,} chars {title[:70]}{warn}")
    if ratio > threshold:
        print(f"\n ⚠ One section is >{threshold}x larger than the smallest. May indicate a")
        print(" wrong-source PDF (kitchen-sink compendium vs focused doc). Investigate.")


def build() -> None:
    """Fetch every guidance source and write the compiled domain file.

    Each ``GUIDANCE_SOURCES`` entry is converted by ``fetch_guidance``; the
    resulting blocks are joined under a fixed header and written to
    ``DOMAIN_FILE``. Sources that fail to download are skipped and listed at
    the end. If no source could be fetched, nothing is written, so a previous
    good corpus file is not overwritten with a header-only one.
    """
    print("Building professional_codes compilation\n")
    guidance_blocks = []
    failed = []
    for src in GUIDANCE_SOURCES:
        block = fetch_guidance(src)
        if block:
            guidance_blocks.append(block)
        else:
            failed.append(src["name"])

    if not guidance_blocks:
        print("\n ⚠ No sources could be fetched; leaving any existing corpus file untouched.")
        return

    body = "\n\n".join(guidance_blocks)
    # Blank lines separate the heading, Source line and each paragraph so they
    # are distinct markdown blocks (same layout fetch_guidance uses per section).
    compilation = (
        "# Professional Codes — NZ Healthcare Marketing Regulation\n"
        "\n"
        "Source: https://www.health.govt.nz/our-work/regulation-health-and-disability-system/health-practitioners-competence-assurance-act\n"
        "\n"
        "This compilation covers the advertising-specific standards issued by the registration boards/councils for the regulated professions in scope: chiropractors, osteopaths, physiotherapists, and Chinese medicine practitioners. The Medical Council Statement on Advertising is also included **as a benchmark only** — it does NOT bind non-MD practitioners.\n"
        "\n"
        "**Critical retrieval rule:** each council document has a `binds:` metadata line declaring which practitioners it applies to. The Medical Council statement is the strictest in the set on testimonials and should not be cited as authoritative for non-MD professions; their own councils set the binding rules. Use the binding metadata to scope answers to the practitioner type the user is asking about.\n"
        "\n"
        f"{body}\n"
    )
    DOMAIN_FILE.write_text(compilation, encoding="utf-8")
    lines = compilation.count("\n") + 1
    size_kb = len(compilation.encode("utf-8")) / 1024
    print(f"\n✅ Wrote {DOMAIN_FILE} ({lines} lines, {size_kb:.1f} KB)")

    _report_section_sizes(compilation)

    if failed:
        # Surface partial failures: a missing council document means the
        # corpus lacks that profession's binding rules.
        print(f"\n ⚠ {len(failed)} source(s) missing from the compilation:")
        for name in failed:
            print(f" - {name}")
if __name__ == "__main__":
    # Script entry point: build the professional_codes domain file.
    build()