hmc-rag / scripts /build_consumer_protection_compilation.py
webmuppet
Initial commit — health marketing compliance RAG
bad8b6c
"""
Build the `consumer_protection` corpus domain.
Combines:
- Fair Trading Act 1986 (Part 1 — Misleading and deceptive conduct, with s12A
substantiation provision flagged) — administered by the Commerce Commission
- ComCom Making Accurate Claims (health and nutrition) guidance
Pattern adapted from `build_medicines_and_supplements_compilation.py` (the
canonical template). See that script for design notes on per-source section
flagging, fallback strategies, and the open question on retrieval mechanism
for `tags:` metadata.
"""
from __future__ import annotations
import os
import re
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent))
from bs4 import BeautifulSoup # noqa: E402
from convert_legislation_html import ( # noqa: E402
download_html,
extract_part,
extract_subpart_by_content,
convert_part,
convert_subpart,
)
from clean_artifacts import clean_corpus_artifacts, format_stats # noqa: E402
from extract_pdf import extract_to_markdown, demote_headings # noqa: E402
# Filesystem layout, resolved relative to this script: the project root is the
# directory one level above the folder that contains this file.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
CORPUS_DIR = PROJECT_ROOT / "corpus"
SOURCES_RAW = PROJECT_ROOT / "sources" / "raw"
# Compiled markdown output for this corpus domain.
DOMAIN_FILE = CORPUS_DIR / "consumer-protection.md"

# Create the output directory and the raw-download cache directory at import
# time so the functions below can read and write without checking for them.
CORPUS_DIR.mkdir(exist_ok=True)
SOURCES_RAW.mkdir(parents=True, exist_ok=True)
# "Latest version" location of the Fair Trading Act 1986 on legislation.govt.nz.
# Handed to the converters as ``base_url`` and cited as the source in build().
_FTA_BASE_URL = "https://www.legislation.govt.nz/act/public/1986/0121/latest/"

# Heading line of s12A (substantiation). Becki: "the provision that most often
# trips up health and wellness advertisers".
_S12A_HEADING = re.compile(r"^####\s*12A\b[^\n]*", re.MULTILINE)

# One entry per Act. Keys are consumed by fetch_legislation() and build().
LEGISLATION_SOURCES = [
    {
        "name": "Fair Trading Act 1986",
        "url": _FTA_BASE_URL + "whole.html",
        "filename": "fair-trading-act-1986.html",
        "base_url": _FTA_BASE_URL,
        # First run: both selectors are left empty, which makes
        # fetch_legislation() convert every Part. After inspection this may be
        # narrowed to just Part 1 (Misleading and deceptive conduct) — the Part
        # containing the s12A substantiation provision.
        "parts_by_id": [],
        "parts_by_text": [],
        # Each flag appends a ``tags:`` line after every heading its pattern
        # matches.
        "section_flags": [
            {
                "match": _S12A_HEADING,
                "tags": "substantiation, claims, frequently-cited",
            },
        ],
    },
]
# One entry per guidance document. Keys are consumed by fetch_guidance():
#   name          - label printed in the progress output
#   url           - downloaded on first run and cited in the "Source:" line
#   filename      - cache file name under SOURCES_RAW
#   format        - passed to extract_to_markdown() as ``format_hint``
#   section_title - text of the H2 heading that wraps the document
GUIDANCE_SOURCES = [
    {
        "name": "ComCom — Making accurate claims (health and nutrition)",
        "url": "https://comcom.govt.nz/business/dealing-with-typical-situations/making-accurate-claims",
        "filename": "comcom-making-accurate-claims.html",
        "format": "html",
        "section_title": "Commerce Commission — Making Accurate Claims (Health and Nutrition)",
    },
    # Additional ComCom guidance can be added here once URLs confirmed:
    # - "Trusting origin, environment and health claims" (consumer-facing)
    # - Health sector competition guidelines (older but still cited)
]
def _find_part_by_heading(soup, heading_text: str):
    """Return the first ``div.part`` whose ``h2.part`` heading contains *heading_text*.

    The comparison is case-insensitive. Returns ``None`` when no Part matches.
    """
    needle = heading_text.lower()
    for part in soup.find_all("div", class_="part"):
        heading = part.find("h2", class_="part")
        if heading is not None and needle in heading.get_text().lower():
            return part
    return None


def _apply_section_flags(markdown: str, flags) -> str:
    """Append a ``tags:`` line after every heading matched by each flag.

    Each flag is a dict with a compiled ``match`` pattern and a ``tags`` string.
    """
    for flag in flags:
        tags = flag["tags"]
        # A callable replacement inserts the heading and tags literally (no
        # backslash/group-reference processing); subn() reports exactly how
        # many headings were tagged.
        markdown, applied = flag["match"].subn(
            lambda m, tags=tags: f"{m.group(0)}\n\ntags: {tags}",
            markdown,
        )
        if applied:
            print(f" ✓ Applied flag {tags!r}: {applied} match(es)")
        else:
            # A flag that tags nothing usually means the heading format
            # changed; surface it instead of reporting success.
            print(f" ⚠ Flag {tags!r} matched no section headings")
    return markdown


def fetch_legislation(source: dict) -> str:
    """Download (or use cached) HTML and convert to markdown with per-section URLs.

    Part selection, in order:
      1. every id in ``source["parts_by_id"]``, looked up with ``extract_part``;
      2. every entry in ``source["parts_by_text"]``, matched first against Part
         headings and, failing that, via ``extract_subpart_by_content``;
      3. if both selectors are empty, every ``div.part`` in the document.

    The converted pieces are joined, cleaned with ``clean_corpus_artifacts``,
    tagged according to ``source["section_flags"]`` and finally passed through
    ``demote_headings``.

    Args:
        source: One entry of ``LEGISLATION_SOURCES``.

    Returns:
        Markdown for the selected Parts, or ``""`` when nothing was extracted.
    """
    print(f"\n→ {source['name']}")
    path = download_html(source["url"], source["filename"])
    with open(path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    base_url = source["base_url"]
    # ``or []`` also tolerates a selector explicitly set to None.
    part_ids = source.get("parts_by_id") or []
    part_texts = source.get("parts_by_text") or []
    parts_md = []

    for part_id in part_ids:
        part = extract_part(soup, part_id)
        if not part:
            print(f" ⚠ Part id={part_id} not found")
            continue
        md = convert_part(part, base_url=base_url)
        parts_md.append(md)
        print(f" ✓ Part by id={part_id} ({len(md)} chars)")

    for text_match in part_texts:
        part = _find_part_by_heading(soup, text_match)
        if part is not None:
            md = convert_part(part, base_url=base_url)
            parts_md.append(md)
            print(f" ✓ Part by text={text_match!r} ({len(md)} chars)")
            continue
        # No Part heading matched: fall back to a Subpart lookup.
        subpart = extract_subpart_by_content(soup, text_match)
        if subpart:
            md = convert_subpart(subpart, base_url=base_url)
            parts_md.append(md)
            print(f" ✓ Subpart by text={text_match!r} ({len(md)} chars)")
        else:
            print(f" ⚠ No Part/Subpart matching text={text_match!r}")

    if not part_ids and not part_texts:
        # No selectors configured: take the whole Act.
        parts_md.extend(
            convert_part(part, base_url=base_url)
            for part in soup.find_all("div", class_="part")
        )
        print(f" ✓ All Parts ({len(parts_md)} parts, "
              f"{sum(len(m) for m in parts_md)} chars)")

    if not parts_md:
        # Nothing to clean or tag; build() skips sources that return "".
        print(" ⚠ No Parts extracted; skipping source")
        return ""

    merged = "\n\n".join(parts_md)

    # Clean PDF/HTML extraction artefacts before applying section flags
    merged, _clean_stats = clean_corpus_artifacts(merged)
    print(format_stats(_clean_stats, label=source["name"]))

    merged = _apply_section_flags(merged, source.get("section_flags") or [])

    # Demote headings so legislation Parts/Subparts/sections nest under the
    # source-level H2 wrapper added in build() — avoids cross-Act H2 collisions
    # like Privacy Act + UEMA both having "Part 1 Preliminary provisions".
    return demote_headings(merged)
def fetch_guidance(source: dict, *, timeout: float = 60.0) -> str:
    """Download a guidance document (HTML or PDF) and convert to markdown.

    The raw file is cached under ``SOURCES_RAW`` and reused on later runs. The
    download is best-effort: any failure is reported and ``""`` is returned so
    the caller can skip this source.

    Args:
        source: One entry of ``GUIDANCE_SOURCES``.
        timeout: Seconds to wait on the network before giving up. Without a
            timeout ``urlopen`` can block indefinitely on a stalled server.

    Returns:
        A markdown section (H2 heading, ``Source:`` line, demoted body), or
        ``""`` when the document could not be downloaded.
    """
    print(f"\n→ {source['name']}")
    cache_path = SOURCES_RAW / source["filename"]

    if cache_path.exists():
        print(f" Using cached: {cache_path.name}")
    else:
        import urllib.request

        # Browser-like User-Agent — presumably the site rejects urllib's
        # default agent; TODO confirm.
        req = urllib.request.Request(
            source["url"],
            headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"},
        )
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                data = resp.read()
            if data:
                # Write to a sibling temp file and rename, so an interrupted
                # write never leaves a truncated file that later runs would
                # accept as a valid cache entry.
                tmp_path = cache_path.with_name(cache_path.name + ".part")
                tmp_path.write_bytes(data)
                tmp_path.replace(cache_path)
        except Exception as e:  # deliberate best-effort: report and skip
            print(f" ⚠ Download failed: {e}")
            return ""
        if not data:
            # Do not cache an empty body; it would mask the failure forever.
            print(" ⚠ Download failed: empty response body")
            return ""
        print(f" Downloaded: {cache_path.name} ({len(data) / 1024:.0f} KB)")

    body = extract_to_markdown(cache_path, format_hint=source.get("format")).strip()

    # Clean PDF/HTML extraction artefacts (page numbers, headers, chrome, control chars)
    body, _clean_stats = clean_corpus_artifacts(body)
    print(format_stats(_clean_stats, label=source["name"]))

    # Demote body headings so they nest under the source-level H2 wrapper we
    # add below (avoids cross-source collisions like multiple "## Introduction").
    body = demote_headings(body)

    return (
        f"\n## {source['section_title']}\n\n"
        f"Source: {source['url']}\n\n"
        f"{body}\n"
    )
def build():
    """Assemble the consumer_protection compilation and write it to ``DOMAIN_FILE``.

    Every entry of ``LEGISLATION_SOURCES`` and ``GUIDANCE_SOURCES`` is fetched;
    sources that return an empty block are skipped. Legislation blocks are
    wrapped in an H2 heading plus ``Source:`` line here, while guidance blocks
    arrive already wrapped by ``fetch_guidance``. The result is prefixed with a
    fixed introduction and written as UTF-8.
    """
    print("Building consumer_protection compilation\n")

    legislation_blocks = []
    for src in LEGISLATION_SOURCES:
        block = fetch_legislation(src)
        if block:
            legislation_blocks.append(
                f"\n## {src['name']}\n\nSource: {src['base_url']}\n\n{block}"
            )

    guidance_blocks = []
    for src in GUIDANCE_SOURCES:
        block = fetch_guidance(src)
        if block:
            guidance_blocks.append(block)

    body = "\n\n".join(legislation_blocks + guidance_blocks)
    if not body:
        print(" ⚠ No source content was fetched; writing the introduction only")

    # Title, source line and paragraphs are separated by blank lines so that
    # markdown treats them as distinct blocks, matching the "\n\n" separation
    # used for every source section.
    header_parts = [
        "# Consumer Protection — NZ Healthcare Marketing Regulation",
        "Source: https://comcom.govt.nz/business/dealing-with-typical-situations/making-accurate-claims",
        (
            "This compilation covers New Zealand's general consumer-protection law "
            "as it applies to healthcare marketing. The Fair Trading Act 1986 — "
            "administered by the Commerce Commission — prohibits misleading and "
            "deceptive conduct, false representations, and unsubstantiated claims "
            "(s12A). For health and wellness advertisers, s12A is the "
            "most-tripped-over provision; making any health benefit claim without "
            "a reasonable basis is a Fair Trading Act breach regardless of whether "
            "the claim is technically true."
        ),
        "Frequently-cited provisions are flagged with `tags:` metadata for retrieval-time surfacing.",
    ]
    compilation = "\n\n".join(header_parts + [body]) + "\n"

    DOMAIN_FILE.write_text(compilation, encoding="utf-8")

    # splitlines() does not count a phantom line after the trailing newline.
    lines = len(compilation.splitlines())
    size_kb = len(compilation.encode("utf-8")) / 1024
    print(f"\n✅ Wrote {DOMAIN_FILE} ({lines} lines, {size_kb:.1f} KB)")
# Run the build only when this file is executed as a script, so that importing
# the module does not start the build.
if __name__ == "__main__":
    build()