Spaces:
Running
Running
| """ | |
| Build the `professional_codes` corpus domain. | |
| Council/board advertising standards for the audience's regulated professions: | |
| - Chiropractic Board — Advertising Policy 2025 | |
| - Osteopathic Council — Code of Conduct (Jan 2023, contains advertising provisions) | |
| - Physiotherapy Board — Code of Ethics and Professional Conduct (Code, Standards, Thresholds; only the "Advertising standard" subdocument is extracted) | |
| - Chinese Medicine Council — Advertising Standard Guidance | |
| - Medical Council — Statement on Advertising (BENCHMARK ONLY — does NOT bind non-MD practitioners) | |
| The critical v2 design feature in this domain is `binds:` scope metadata. | |
| Each council document declares which practitioner classes it binds. The | |
| Medical Council statement is included as a benchmark but tagged | |
| `benchmark-only: true` so the retriever does not cite it as authoritative | |
| for chiropractors, naturopaths, etc. Becki's correction in the v1 → v2 review: | |
| "the Medical Council statement is the strictest in the set, particularly | |
| on testimonials. For a chiropractor or naturopath user, citing it could | |
| give answers more conservative than their own regulator requires." | |
| Pattern adapted from `build_advertising_standards_compilation.py` (no | |
| LEGISLATION_SOURCES — all PDFs). | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import sys | |
| from pathlib import Path | |
| sys.path.insert(0, str(Path(__file__).resolve().parent)) | |
| from clean_artifacts import clean_corpus_artifacts, format_stats # noqa: E402 | |
| from extract_pdf import extract_to_markdown, demote_headings # noqa: E402 | |
# Filesystem layout. PROJECT_ROOT is the directory one level above the
# directory containing this script.
PROJECT_ROOT = Path(__file__).resolve().parents[1]

# Downloaded source documents are cached here.
SOURCES_RAW = PROJECT_ROOT.joinpath("sources", "raw")

# Generated corpus files live here; this script writes DOMAIN_FILE.
CORPUS_DIR = PROJECT_ROOT.joinpath("corpus")
DOMAIN_FILE = CORPUS_DIR.joinpath("professional-codes.md")

# Create both directories at import time so later reads/writes cannot fail
# on a missing folder.
SOURCES_RAW.mkdir(parents=True, exist_ok=True)
CORPUS_DIR.mkdir(exist_ok=True)
# Source documents compiled into the professional_codes domain, in output order.
#
# Keys read by `fetch_guidance`:
#   name           label printed in the build log
#   url            download location; also written to the corpus as `Source:`
#   filename       cache filename under SOURCES_RAW
#   format         passed to `extract_to_markdown` as `format_hint`
#   section_title  text of the H2 heading that wraps the document in the corpus
#   slice_after /  optional regexes (searched with re.MULTILINE); when present,
#   slice_until    only the text from the first marker up to the second is kept
#   metadata       rendered as `key: value` lines under the `Source:` line
GUIDANCE_SOURCES = [
    {
        "name": "Chiropractic Board — Advertising Policy 2025",
        "url": "https://chiropracticboard.org.nz/assets/ElementalFiles/Policies/Advertising-Policy-2025-FINAL.pdf",
        "filename": "chiropractic-board-advertising-policy-2025.pdf",
        "format": "pdf",
        "section_title": "Chiropractic Board of NZ — Advertising Policy 2025",
        "metadata": {
            "binds": "registered chiropractors (NZ)",
            "issued_by": "Chiropractic Board of New Zealand (Te Poari Kaikorohiti o Aotearoa)",
            "applies_to": "advertising of chiropractic services and chiropractic care products",
        },
    },
    {
        "name": "Osteopathic Council — Code of Conduct for Osteopaths (Jan 2023)",
        "url": "https://www.osteopathiccouncil.org.nz/common/Uploaded%20files/Publications/OCNZ%20Code%20of%20Conduct%20Jan23.pdf",
        "filename": "osteopathic-council-code-of-conduct-jan-2023.pdf",
        "format": "pdf",
        "section_title": "Osteopathic Council of NZ — Code of Conduct (Jan 2023)",
        "metadata": {
            "binds": "registered osteopaths (NZ)",
            "issued_by": "Osteopathic Council of New Zealand",
            "applies_to": "professional conduct including advertising of osteopathic services",
            "notes": "the council's Code of Conduct includes advertising-specific provisions; for narrower advertising-only guidance see also OCNZ Medical Advertisement Policy 2016 (not currently included in v1)",
        },
    },
    {
        # The PDF is a 361-KB kitchen-sink compendium ("Standards framework")
        # covering clinical, cultural, and ethical standards — only ~30 KB is
        # actually about advertising. We slice out just the dedicated
        # "Advertising standard" subdocument (between the "Advertising standard"
        # heading and the next "Telehealth standard" heading) to keep the
        # corpus focused.
        "name": "Physiotherapy Board — Advertising standard (extracted from Standards framework)",
        "url": "https://www.physioboard.org.nz/sites/default/files/Physiotherapy-Board-Code-Standards-Thresholds.pdf",
        "filename": "physiotherapy-board-code-standards-thresholds.pdf",
        "format": "pdf",
        "section_title": "Physiotherapy Board of NZ — Advertising Standard",
        # "Advertising standard" appears twice in the PDF: once as a bare-text
        # TOC entry and once as the real section. Docling preserves headings as
        # `## Foo`, so both markers require the `## ` prefix on a line of its
        # own; that skips the TOC entries and anchors on the H2 headings only.
        "slice_after": r"^## Advertising standard\s*$",
        "slice_until": r"^## Telehealth standard\s*$",
        "metadata": {
            "binds": "registered physiotherapists (NZ)",
            "issued_by": "Physiotherapy Board of New Zealand",
            "applies_to": "advertising of physiotherapy services; cross-references Fair Trading Act 1986, Consumer Guarantees Act 1993, ASA Codes, HPCAA s7/s8",
            "source_note": "extracted from the larger Physiotherapy Standards framework PDF — see source URL for full document",
        },
    },
    {
        "name": "Chinese Medicine Council — Advertising Standard Guidance",
        "url": "https://www.chinesemedicinecouncil.org.nz/common/Uploaded%20files/RegistrationsDocs/Standards%20Statement%20and%20Policies/Post%20feedback%20Advertising%20Standard%20Guidance%2023June23.pdf",
        "filename": "chinese-medicine-council-advertising-standard-jun-2023.pdf",
        "format": "pdf",
        "section_title": "Chinese Medicine Council of NZ — Advertising Standard Guidance",
        "metadata": {
            "binds": "registered Chinese medicine practitioners (NZ) — including acupuncturists, herbal medicine practitioners, tuina practitioners",
            "issued_by": "Chinese Medicine Council of New Zealand",
            "applies_to": "advertising of Chinese medicine services and products",
            "notes": "Becki flagged that AU practitioners cannot rely on traditional-use claims or 2003 WHO statement when advertising — verify whether NZ council takes the same position",
        },
    },
    {
        # Included for comparison only; the `benchmark-only` flag and WARNING
        # text below are emitted into the corpus alongside the document.
        "name": "Medical Council — Statement on Advertising (BENCHMARK ONLY)",
        "url": "https://www.mcnz.org.nz/assets/standards/e93109ce92/Statement-on-advertising.pdf",
        "filename": "medical-council-statement-on-advertising.pdf",
        "format": "pdf",
        "section_title": "Medical Council of NZ — Statement on Advertising (benchmark only)",
        "metadata": {
            "binds": "registered medical practitioners (MDs) only",
            "benchmark-only": "true",
            "issued_by": "Medical Council of New Zealand",
            "applies_to": "advertising by registered doctors",
            "WARNING": "Becki: this statement is the STRICTEST in the set, particularly on testimonials. Do NOT cite as authoritative for chiropractors, osteopaths, physiotherapists, Chinese medicine practitioners, naturopaths, or other non-MD professions. Their own councils set the binding rules for them. Use this only as a comparative benchmark when explicitly asked to compare professions.",
        },
    },
]
def _download_to_cache(source: dict, cache_path: Path, timeout: float = 60.0) -> bool:
    """Download ``source["url"]`` into *cache_path*; return True on success.

    The payload is written to a sibling ``.part`` file and then renamed over
    *cache_path*, so an interrupted or failed write never leaves a truncated
    file that a later run would mistake for a valid cache entry. Failures are
    reported on stdout and signalled by returning False (best-effort: one
    unreachable source must not abort the whole build).
    """
    import contextlib
    import http.client
    import urllib.request

    req = urllib.request.Request(
        source["url"],
        headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"},
    )
    tmp_path = cache_path.with_name(cache_path.name + ".part")
    try:
        # Without a timeout a stalled server would hang the build indefinitely.
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            data = resp.read()
        if not data:
            # Never cache an empty file: it would be reused on every later run.
            raise OSError("server returned an empty response body")
        tmp_path.write_bytes(data)
        tmp_path.replace(cache_path)
    except (OSError, http.client.HTTPException) as e:
        # URLError/HTTPError/timeouts and disk errors are OSError subclasses;
        # HTTPException covers protocol-level failures such as IncompleteRead.
        with contextlib.suppress(OSError):
            tmp_path.unlink()
        print(f" ✗ Download failed: {e}")
        return False
    print(f" Downloaded: {cache_path.name} ({len(data) / 1024:.0f} KB)")
    return True


def _slice_between(body: str, slice_after, slice_until) -> str:
    """Return the part of *body* from the start marker up to the end marker.

    Both markers are regex strings searched with ``re.MULTILINE``; either may
    be falsy to skip that side. The start marker's own text is kept, the end
    marker's is dropped. The end marker is searched only in the text that
    remains after the start slice. A marker that does not match is reported
    and that side is left unsliced.
    """
    import re

    if slice_after:
        start = re.search(slice_after, body, flags=re.MULTILINE)
        if start:
            body = body[start.start():]
        else:
            print(f" ⚠ slice_after marker {slice_after!r} not found; keeping full text")
    if slice_until:
        end = re.search(slice_until, body, flags=re.MULTILINE)
        if end:
            body = body[:end.start()]
        else:
            print(f" ⚠ slice_until marker {slice_until!r} not found; keeping to end")
    return body.strip()


def fetch_guidance(source: dict) -> str:
    """Download a guidance PDF and convert to markdown, with binding metadata.

    The document is fetched once and cached under ``SOURCES_RAW`` using
    ``source["filename"]``; later runs reuse the cached file.

    Optional slicing: if ``slice_after`` and/or ``slice_until`` regex strings are
    set on the source, keep only the text BETWEEN those markers (inclusive of the
    start, exclusive of the end). Useful for extracting a single subdocument from
    a larger compendium PDF without losing the source-PDF's URL provenance.

    Args:
        source: one entry of ``GUIDANCE_SOURCES`` (reads ``name``, ``url``,
            ``filename``, ``section_title`` and the optional ``format``,
            ``slice_after``, ``slice_until`` and ``metadata`` keys).

    Returns:
        A markdown block made of an H2 heading, a ``Source:`` line, one
        ``key: value`` line per metadata item, and the document body; or ``""``
        when the document could not be downloaded.
    """
    print(f"\n→ {source['name']}")
    cache_path = SOURCES_RAW / source["filename"]
    if cache_path.exists():
        print(f" Using cached: {cache_path.name}")
    elif not _download_to_cache(source, cache_path):
        return ""

    body = extract_to_markdown(cache_path, format_hint=source.get("format")).strip()

    # Optional slicing — keep only the substring between markers.
    slice_after = source.get("slice_after")
    slice_until = source.get("slice_until")
    if slice_after or slice_until:
        original_size = len(body)
        body = _slice_between(body, slice_after, slice_until)
        print(f" ✂ Sliced: {original_size:,} → {len(body):,} chars")

    # Clean PDF/HTML extraction artefacts (page numbers, headers, chrome, control chars)
    body, clean_stats = clean_corpus_artifacts(body)
    print(format_stats(clean_stats, label=source["name"]))

    # Demote body headings so they nest under the source-level H2 wrapper we
    # add below (avoids cross-source collisions like multiple "## Introduction").
    body = demote_headings(body)

    meta_block = "\n".join(f"{k}: {v}" for k, v in source.get("metadata", {}).items())
    return (
        f"\n## {source['section_title']}\n\n"
        f"Source: {source['url']}\n\n"
        f"{meta_block}\n\n"
        f"{body}\n"
    )
def _section_sizes(compilation: str) -> list:
    """Return ``(size_in_chars, title)`` per source-level section, largest first.

    Granularity note: only SOURCE-LEVEL H2 sections are counted — the ones
    injected by `fetch_guidance`, which always have a `Source: ...` line
    immediately under the heading. Headings inside an extracted document are
    sub-sections of a single source and are not measured separately.
    """
    import re

    # Match: H2 heading + blank line + Source: URL line (source-level pattern)
    source_section_pattern = re.compile(
        r"^## (?P<title>[^\n]+)\n\nSource: ",
        flags=re.MULTILINE,
    )
    matches = list(source_section_pattern.finditer(compilation))
    # Each section runs to the start of the next one; the last runs to the end.
    boundaries = [m.start() for m in matches[1:]] + [len(compilation)]
    sizes = [(end - m.start(), m.group("title")) for m, end in zip(matches, boundaries)]
    sizes.sort(reverse=True)
    return sizes


def _report_section_balance(section_sizes: list, threshold: float = 8) -> None:
    """Print the section-size table and warn if one section dwarfs the rest.

    Threshold tuned from observation: legitimate content variation across
    focused-vs-broader council docs sits around 6–7×. The catastrophic
    kitchen-sink case (e.g. v1 Physiotherapy at 26×) is the target. 8× gives a
    comfortable margin without false alarms on legitimate imbalance (e.g. one
    council publishes a comprehensive code, another a short statement).
    """
    if not section_sizes:
        return
    biggest = section_sizes[0][0]
    smallest = section_sizes[-1][0]
    ratio = biggest / max(smallest, 1)  # max() guards against division by zero
    print(f"\nSection size distribution (biggest/smallest ratio: {ratio:.1f}x):")
    for size, title in section_sizes:
        warn = " ← disproportionate" if ratio > threshold and size == biggest else ""
        print(f" {size:>7,} chars {title[:70]}{warn}")
    if ratio > threshold:
        print(f"\n ⚠ One section is >{threshold}x larger than the smallest. May indicate a")
        print(" wrong-source PDF (kitchen-sink compendium vs focused doc). Investigate.")


def build() -> None:
    """Fetch every entry in ``GUIDANCE_SOURCES`` and write ``DOMAIN_FILE``.

    Sources whose fetch returns an empty block are left out of the corpus and
    listed in a warning after the file is written, so a missing profession is
    not silently absent. Finishes with a section-size sanity check that flags
    an imbalanced section suggesting a wrong-source choice (learning from the
    v1 bug: 4-of-5 council docs were 13–33 KB but Physiotherapy was 361 KB
    because the chosen PDF was a kitchen-sink compendium).
    """
    print("Building professional_codes compilation\n")
    guidance_blocks = []
    missing = []
    for src in GUIDANCE_SOURCES:
        block = fetch_guidance(src)
        if block:
            guidance_blocks.append(block)
        else:
            missing.append(src["name"])
    body = "\n\n".join(guidance_blocks)

    # Blank lines between the header paragraphs are required: markdown merges
    # adjacent non-blank lines into a single paragraph.
    compilation = f"""# Professional Codes — NZ Healthcare Marketing Regulation

Source: https://www.health.govt.nz/our-work/regulation-health-and-disability-system/health-practitioners-competence-assurance-act

This compilation covers the advertising-specific standards issued by the registration boards/councils for the regulated professions in scope: chiropractors, osteopaths, physiotherapists, and Chinese medicine practitioners. The Medical Council Statement on Advertising is also included **as a benchmark only** — it does NOT bind non-MD practitioners.

**Critical retrieval rule:** each council document has a `binds:` metadata line declaring which practitioners it applies to. The Medical Council statement is the strictest in the set on testimonials and should not be cited as authoritative for non-MD professions; their own councils set the binding rules. Use the binding metadata to scope answers to the practitioner type the user is asking about.

{body}
"""
    DOMAIN_FILE.write_text(compilation, encoding="utf-8")

    # splitlines() does not over-count when the text ends with a newline.
    lines = len(compilation.splitlines())
    size_kb = len(compilation.encode("utf-8")) / 1024
    print(f"\n✓ Wrote {DOMAIN_FILE} ({lines} lines, {size_kb:.1f} KB)")

    if missing:
        print(f"\n ⚠ {len(missing)} of {len(GUIDANCE_SOURCES)} sources are MISSING from the corpus:")
        for name in missing:
            print(f" - {name}")

    _report_section_balance(_section_sizes(compilation))
# Script entry point: rebuild the professional_codes corpus file when this
# module is executed directly (importing it does not trigger a build).
if __name__ == "__main__":
    build()