"""
Build the `consumer_protection` corpus domain.

Combines:
- Fair Trading Act 1986 (Part 1 — Misleading and deceptive conduct, with s12A
  substantiation provision flagged) — administered by the Commerce Commission
- ComCom Making Accurate Claims (health and nutrition) guidance

Note: `parts_by_id` and `parts_by_text` are currently empty for the Fair
Trading Act, so `fetch_legislation` falls back to including every Part of the
Act, not just Part 1.

Pattern adapted from `build_medicines_and_supplements_compilation.py` (the
canonical template). See that script for design notes on per-source section
flagging, fallback strategies, and the open question on retrieval mechanism
for `tags:` metadata.
"""

from __future__ import annotations

import http.client
import os
import re
import sys
import urllib.request
from pathlib import Path

# Make the sibling helper modules importable when this file is run as a script.
sys.path.insert(0, str(Path(__file__).resolve().parent))

from bs4 import BeautifulSoup  # noqa: E402

from convert_legislation_html import (  # noqa: E402
    download_html,
    extract_part,
    extract_subpart_by_content,
    convert_part,
    convert_subpart,
)
from clean_artifacts import clean_corpus_artifacts, format_stats  # noqa: E402
from extract_pdf import extract_to_markdown, demote_headings  # noqa: E402

PROJECT_ROOT = Path(__file__).resolve().parents[1]
CORPUS_DIR = PROJECT_ROOT / "corpus"
SOURCES_RAW = PROJECT_ROOT / "sources" / "raw"
CORPUS_DIR.mkdir(exist_ok=True)
SOURCES_RAW.mkdir(parents=True, exist_ok=True)

DOMAIN_FILE = CORPUS_DIR / "consumer-protection.md"

# Upper bound (seconds) on any single guidance download so that a stalled
# server cannot hang the whole build.
DOWNLOAD_TIMEOUT_SECONDS = 30

# User-Agent header sent with guidance downloads.
_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"

LEGISLATION_SOURCES = [
    {
        "name": "Fair Trading Act 1986",
        "url": "https://www.legislation.govt.nz/act/public/1986/0121/latest/whole.html",
        "filename": "fair-trading-act-1986.html",
        "base_url": "https://www.legislation.govt.nz/act/public/1986/0121/latest/",
        # First run: take all Parts. After inspection we may narrow to just Part 1
        # (Misleading and deceptive conduct) — the part containing s12A substantiation.
        "parts_by_id": [],
        "parts_by_text": [],
        "section_flags": [
            # s12A — substantiation. Becki: "the provision that most often trips up
            # health and wellness advertisers"
            {
                "match": re.compile(r"^####\s*12A\b[^\n]*", re.MULTILINE),
                "tags": "substantiation, claims, frequently-cited",
            },
        ],
    },
]

GUIDANCE_SOURCES = [
    {
        "name": "ComCom — Making accurate claims (health and nutrition)",
        "url": "https://comcom.govt.nz/business/dealing-with-typical-situations/making-accurate-claims",
        "filename": "comcom-making-accurate-claims.html",
        "format": "html",
        "section_title": "Commerce Commission — Making Accurate Claims (Health and Nutrition)",
    },
    # Additional ComCom guidance can be added here once URLs confirmed:
    # - "Trusting origin, environment and health claims" (consumer-facing)
    # - Health sector competition guidelines (older but still cited)
]


def _find_part_by_heading(soup: BeautifulSoup, text_match: str):
    """Return the first `div.part` whose `h2.part` heading contains *text_match*.

    The comparison is case-insensitive. Returns None when no Part matches.
    """
    needle = text_match.lower()
    for part in soup.find_all("div", class_="part"):
        heading = part.find("h2", class_="part")
        if heading is not None and needle in heading.get_text().lower():
            return part
    return None


def fetch_legislation(source: dict) -> str:
    """Download (or use cached) HTML and convert to markdown with per-section URLs.

    Reads these keys from *source*: `name`, `url`, `filename`, `base_url`, and
    the optional `parts_by_id`, `parts_by_text` and `section_flags`.

    Part selection:
    - each id in `parts_by_id` is looked up with `extract_part`;
    - each string in `parts_by_text` is matched against Part headings first,
      then falls back to `extract_subpart_by_content`;
    - when both selectors are empty, every `div.part` in the document is taken.

    Returns the merged markdown after artefact cleaning, section flagging and
    heading demotion. The result is built from zero parts when nothing was
    selected, which `build()` treats as "skip this source" if it is empty.
    """
    print(f"\n→ {source['name']}")
    path = download_html(source["url"], source["filename"])
    with open(path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    base_url = source["base_url"]
    part_ids = source.get("parts_by_id", [])
    part_texts = source.get("parts_by_text", [])
    parts_md = []

    for part_id in part_ids:
        part = extract_part(soup, part_id)
        if part:
            md = convert_part(part, base_url=base_url)
            parts_md.append(md)
            print(f" ✓ Part by id={part_id} ({len(md)} chars)")
        else:
            print(f" ⚠ Part id={part_id} not found")

    for text_match in part_texts:
        part = _find_part_by_heading(soup, text_match)
        if part:
            md = convert_part(part, base_url=base_url)
            parts_md.append(md)
            print(f" ✓ Part by text={text_match!r} ({len(md)} chars)")
            continue
        # No Part heading matched: try a Subpart before giving up.
        subpart = extract_subpart_by_content(soup, text_match)
        if subpart:
            md = convert_subpart(subpart, base_url=base_url)
            parts_md.append(md)
            print(f" ✓ Subpart by text={text_match!r} ({len(md)} chars)")
        else:
            print(f" ⚠ No Part/Subpart matching text={text_match!r}")

    # No selectors configured at all: include every Part of the Act.
    if not part_ids and not part_texts:
        parts_md.extend(
            convert_part(part, base_url=base_url)
            for part in soup.find_all("div", class_="part")
        )
        print(f" ✓ All Parts ({len(parts_md)} parts, "
              f"{sum(len(m) for m in parts_md)} chars)")

    merged = "\n\n".join(parts_md)

    # Clean PDF/HTML extraction artefacts before applying section flags
    merged, _clean_stats = clean_corpus_artifacts(merged)
    print(format_stats(_clean_stats, label=source["name"]))

    for flag in source.get("section_flags", []):
        tags = flag["tags"]
        tag_line = f"\n\ntags: {tags}"
        # A callable replacement keeps backslashes in the matched heading
        # literal; subn reports exactly how many headings were tagged.
        merged, applied = flag["match"].subn(
            lambda m, tag_line=tag_line: m.group(0) + tag_line,
            merged,
        )
        if applied:
            print(f" ✓ Applied flag {tags!r}: {applied} match(es)")
        else:
            # A flag that matches nothing usually means the heading format
            # changed, so make it stand out rather than reporting success.
            print(f" ⚠ Applied flag {tags!r}: 0 match(es)")

    # Demote headings so legislation Parts/Subparts/sections nest under the
    # source-level H2 wrapper added in build() — avoids cross-Act H2 collisions
    # like Privacy Act + UEMA both having "Part 1 Preliminary provisions".
    merged = demote_headings(merged)
    return merged


def _download_to_cache(url: str, cache_path: Path) -> bool:
    """Fetch *url* into *cache_path*; return True on success, False on failure.

    Network and filesystem failures are reported with a warning and swallowed
    so that one unreachable source does not abort the whole build. Any other
    exception is a programming error and is allowed to propagate.
    """
    req = urllib.request.Request(url, headers={"User-Agent": _USER_AGENT})
    try:
        with urllib.request.urlopen(req, timeout=DOWNLOAD_TIMEOUT_SECONDS) as resp:
            data = resp.read()
        cache_path.write_bytes(data)
    except (OSError, http.client.HTTPException) as e:
        # OSError covers URLError/HTTPError, timeouts and disk errors;
        # HTTPException covers truncated or malformed responses.
        print(f" ⚠ Download failed: {e}")
        return False
    print(f" Downloaded: {cache_path.name} ({len(data) / 1024:.0f} KB)")
    return True


def fetch_guidance(source: dict) -> str:
    """Download a guidance document (HTML or PDF) and convert to markdown.

    Reads `name`, `url`, `filename`, `section_title` and the optional `format`
    from *source*. The raw document is cached under `SOURCES_RAW` and reused
    on later runs.

    Returns the document wrapped in a source-level H2 section, or "" when the
    document is not cached and the download fails.
    """
    print(f"\n→ {source['name']}")
    cache_path = SOURCES_RAW / source["filename"]

    if cache_path.exists():
        print(f" Using cached: {cache_path.name}")
    elif not _download_to_cache(source["url"], cache_path):
        return ""

    body = extract_to_markdown(cache_path, format_hint=source.get("format")).strip()

    # Clean PDF/HTML extraction artefacts (page numbers, headers, chrome, control chars)
    body, _clean_stats = clean_corpus_artifacts(body)
    print(format_stats(_clean_stats, label=source["name"]))

    # Demote body headings so they nest under the source-level H2 wrapper we
    # add below (avoids cross-source collisions like multiple "## Introduction").
    body = demote_headings(body)

    return (
        f"\n## {source['section_title']}\n\n"
        f"Source: {source['url']}\n\n"
        f"{body}\n"
    )


def build() -> None:
    """Assemble every configured source into the domain file.

    Legislation blocks come first, then guidance blocks; sources that yield an
    empty block are skipped. The result is written to `DOMAIN_FILE` as UTF-8
    and a one-line size summary is printed.
    """
    print("Building consumer_protection compilation\n")

    legislation_blocks = []
    for src in LEGISLATION_SOURCES:
        block = fetch_legislation(src)
        if block:
            legislation_blocks.append(
                f"\n## {src['name']}\n\nSource: {src['base_url']}\n\n{block}"
            )

    guidance_blocks = []
    for src in GUIDANCE_SOURCES:
        block = fetch_guidance(src)
        if block:
            guidance_blocks.append(block)

    body = "\n\n".join(legislation_blocks + guidance_blocks)

    compilation = f"""# Consumer Protection — NZ Healthcare Marketing Regulation

Source: https://comcom.govt.nz/business/dealing-with-typical-situations/making-accurate-claims

This compilation covers New Zealand's general consumer-protection law as it applies to healthcare marketing. The Fair Trading Act 1986 — administered by the Commerce Commission — prohibits misleading and deceptive conduct, false representations, and unsubstantiated claims (s12A). For health and wellness advertisers, s12A is the most-tripped-over provision; making any health benefit claim without a reasonable basis is a Fair Trading Act breach regardless of whether the claim is technically true.

Frequently-cited provisions are flagged with `tags:` metadata for retrieval-time surfacing.

{body}
"""

    DOMAIN_FILE.write_text(compilation, encoding="utf-8")
    lines = compilation.count("\n") + 1
    size_kb = len(compilation.encode("utf-8")) / 1024
    print(f"\n✅ Wrote {DOMAIN_FILE} ({lines} lines, {size_kb:.1f} KB)")


if __name__ == "__main__":
    build()