Spaces:

webmuppetnz
/

hmc-rag

Sleeping

hmc-rag / scripts /build_consumer_protection_compilation.py

webmuppet

Initial commit — health marketing compliance RAG

bad8b6c 22 days ago

8.59 kB

	"""
	Build the `consumer_protection` corpus domain.

	Combines:
	- Fair Trading Act 1986 (Part 1 — Misleading and deceptive conduct, with s12A
	substantiation provision flagged) — administered by the Commerce Commission
	- ComCom Making Accurate Claims (health and nutrition) guidance

	Pattern adapted from `build_medicines_and_supplements_compilation.py` (the
	canonical template). See that script for design notes on per-source section
	flagging, fallback strategies, and the open question on retrieval mechanism
	for `tags:` metadata.
	"""

	from __future__ import annotations

	import os
	import re
	import sys
	from pathlib import Path

	sys.path.insert(0, str(Path(__file__).resolve().parent))

	from bs4 import BeautifulSoup # noqa: E402
	from convert_legislation_html import ( # noqa: E402
	download_html,
	extract_part,
	extract_subpart_by_content,
	convert_part,
	convert_subpart,
	)
	from clean_artifacts import clean_corpus_artifacts, format_stats # noqa: E402
	from extract_pdf import extract_to_markdown, demote_headings # noqa: E402

	# Repository layout, resolved two directory levels up from this script.
	PROJECT_ROOT = Path(__file__).resolve().parent.parent
	CORPUS_DIR = PROJECT_ROOT.joinpath("corpus")
	SOURCES_RAW = PROJECT_ROOT.joinpath("sources", "raw")

	# Make sure both working directories exist before anything is fetched or
	# written (this happens at import time).
	SOURCES_RAW.mkdir(parents=True, exist_ok=True)
	CORPUS_DIR.mkdir(exist_ok=True)

	# The single markdown file this script writes.
	DOMAIN_FILE = CORPUS_DIR.joinpath("consumer-protection.md")

	# Legislation to compile. Each entry is consumed by fetch_legislation():
	#   name / url / filename / base_url - label, download URL, cache filename and
	#       the base URL handed to the Part/Subpart converters;
	#   parts_by_id / parts_by_text      - optional Part selectors; when both are
	#       empty every Part in the document is converted;
	#   section_flags                    - heading patterns that get a `tags:`
	#       line appended directly after the matched text.
	LEGISLATION_SOURCES = [
	    {
	        "name": "Fair Trading Act 1986",
	        "url": "https://www.legislation.govt.nz/act/public/1986/0121/latest/whole.html",
	        "filename": "fair-trading-act-1986.html",
	        "base_url": "https://www.legislation.govt.nz/act/public/1986/0121/latest/",
	        # First run: take all Parts. After inspection we may narrow to just Part 1
	        # (Misleading and deceptive conduct) — the part containing s12A substantiation.
	        "parts_by_id": [],
	        "parts_by_text": [],
	        "section_flags": [
	            # s12A — substantiation. Becki: "the provision that most often trips up
	            # health and wellness advertisers"
	            {
	                # Match the WHOLE heading line (`[^\n]*`, not a single
	                # character) so the tags line is inserted after the heading
	                # instead of being spliced into the middle of it, and so a
	                # heading with no trailing text still matches.
	                "match": re.compile(r"^####\s12A\b[^\n]*", re.MULTILINE),
	                "tags": "substantiation, claims, frequently-cited",
	            },
	        ],
	    },
	]

	# Non-legislative guidance documents; build() passes each entry to
	# fetch_guidance(), which reads these keys:
	#   name          - label printed while processing
	#   url           - download location, also written as the block's "Source:" line
	#   filename      - cache filename under SOURCES_RAW
	#   format        - forwarded to extract_to_markdown() as its format hint
	#   section_title - text of the H2 heading that wraps the converted body
	GUIDANCE_SOURCES = [
	{
	"name": "ComCom — Making accurate claims (health and nutrition)",
	"url": "https://comcom.govt.nz/business/dealing-with-typical-situations/making-accurate-claims",
	"filename": "comcom-making-accurate-claims.html",
	"format": "html",
	"section_title": "Commerce Commission — Making Accurate Claims (Health and Nutrition)",
	},
	# Additional ComCom guidance can be added here once URLs confirmed:
	# - "Trusting origin, environment and health claims" (consumer-facing)
	# - Health sector competition guidelines (older but still cited)
	]


	def fetch_legislation(source: dict) -> str:
	    """Convert one legislation source to markdown and apply its section tags.

	    The HTML file is obtained through ``download_html`` and parsed with
	    BeautifulSoup. Parts are then selected in this order:

	    1. every id listed in ``parts_by_id`` (via ``extract_part``);
	    2. every entry of ``parts_by_text``, matched case-insensitively against a
	       Part's ``<h2 class="part">`` heading, falling back to
	       ``extract_subpart_by_content`` when no Part heading matches;
	    3. if both lists are empty, every ``<div class="part">`` in the document.

	    The converted parts are joined with blank lines, cleaned with
	    ``clean_corpus_artifacts``, tagged according to ``section_flags`` and
	    finally passed through ``demote_headings``.

	    Args:
	        source: Mapping with ``name``, ``url``, ``filename`` and ``base_url``
	            keys, plus optional ``parts_by_id``, ``parts_by_text`` and
	            ``section_flags`` lists. Each flag is a mapping holding a compiled
	            ``match`` pattern and a ``tags`` string.

	    Returns:
	        The result of ``demote_headings`` applied to the merged, cleaned and
	        tagged markdown.
	    """
	    print(f"\n→ {source['name']}")
	    path = download_html(source["url"], source["filename"])

	    with open(path, "r", encoding="utf-8") as f:
	        soup = BeautifulSoup(f, "html.parser")

	    part_ids = source.get("parts_by_id", [])
	    part_texts = source.get("parts_by_text", [])
	    parts_md = []

	    # 1. Parts selected explicitly by element id.
	    for part_id in part_ids:
	        part = extract_part(soup, part_id)
	        if part:
	            md = convert_part(part, base_url=source["base_url"])
	            parts_md.append(md)
	            print(f" ✓ Part by id={part_id} ({len(md)} chars)")
	        else:
	            print(f" ⚠ Part id={part_id} not found")

	    # 2. Parts (or, failing that, Subparts) selected by heading text.
	    for text_match in part_texts:
	        needle = text_match.lower()
	        part = None
	        for candidate in soup.find_all("div", class_="part"):
	            # Look the heading up once per candidate and reuse it.
	            heading = candidate.find("h2", class_="part")
	            if heading and needle in heading.get_text().lower():
	                part = candidate
	                break
	        if part:
	            md = convert_part(part, base_url=source["base_url"])
	            parts_md.append(md)
	            print(f" ✓ Part by text={text_match!r} ({len(md)} chars)")
	            continue
	        subpart = extract_subpart_by_content(soup, text_match)
	        if subpart:
	            md = convert_subpart(subpart, base_url=source["base_url"])
	            parts_md.append(md)
	            print(f" ✓ Subpart by text={text_match!r} ({len(md)} chars)")
	        else:
	            print(f" ⚠ No Part/Subpart matching text={text_match!r}")

	    # 3. No selectors configured: take every Part in the document.
	    if not part_ids and not part_texts:
	        for part in soup.find_all("div", class_="part"):
	            md = convert_part(part, base_url=source["base_url"])
	            parts_md.append(md)
	        print(f" ✓ All Parts ({len(parts_md)} parts, "
	              f"{sum(len(m) for m in parts_md)} chars)")

	    merged = "\n\n".join(parts_md)

	    # Clean PDF/HTML extraction artefacts before applying section flags
	    merged, _clean_stats = clean_corpus_artifacts(merged)
	    print(format_stats(_clean_stats, label=source["name"]))

	    for flag in source.get("section_flags", []):
	        tag_line = f"\n\ntags: {flag['tags']}"
	        # A callable replacement keeps any backslashes in the tags text
	        # literal; subn() reports the exact number of substitutions, so the
	        # count no longer has to be inferred from str.count() before/after.
	        merged, applied = flag["match"].subn(
	            lambda m, tag_line=tag_line: m.group(0) + tag_line,
	            merged,
	        )
	        print(f" ✓ Applied flag {flag['tags']!r}: {applied} match(es)")

	    # Demote headings so legislation Parts/Subparts/sections nest under the
	    # source-level H2 wrapper added in build() — avoids cross-Act H2 collisions
	    # like Privacy Act + UEMA both having "Part 1 Preliminary provisions".
	    merged = demote_headings(merged)

	    return merged


	def fetch_guidance(source: dict, *, timeout: float = 60.0) -> str:
	    """Download a guidance document (HTML or PDF) and convert it to markdown.

	    The raw file is cached at ``SOURCES_RAW / source["filename"]``; the
	    network is only touched when that file does not exist yet. The cached
	    file is converted with ``extract_to_markdown``, cleaned with
	    ``clean_corpus_artifacts`` and its headings are demoted so they nest
	    under the H2 wrapper added here.

	    Args:
	        source: Mapping with ``name``, ``url``, ``filename`` and
	            ``section_title`` keys and an optional ``format`` key that is
	            forwarded to the extractor as its format hint.
	        timeout: Seconds to wait on the HTTP request before giving up, so a
	            stalled server cannot hang the build indefinitely.

	    Returns:
	        A markdown block consisting of an H2 heading, a ``Source:`` line and
	        the converted body, or ``""`` when the download fails.
	    """
	    print(f"\n→ {source['name']}")

	    cache_path = SOURCES_RAW / source["filename"]
	    if cache_path.exists():
	        print(f" Using cached: {cache_path.name}")
	    else:
	        import urllib.request

	        req = urllib.request.Request(
	            source["url"],
	            headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"},
	        )
	        try:
	            # Without a timeout urlopen() can block forever on a dead
	            # connection; a timeout surfaces as an exception handled below.
	            with urllib.request.urlopen(req, timeout=timeout) as resp:
	                data = resp.read()
	            cache_path.write_bytes(data)
	            print(f" Downloaded: {cache_path.name} ({len(data) / 1024:.0f} KB)")
	        except Exception as e:
	            # Deliberately best-effort: one unreachable source is reported
	            # and skipped rather than aborting the whole compilation.
	            print(f" ⚠ Download failed: {e}")
	            return ""

	    body = extract_to_markdown(cache_path, format_hint=source.get("format")).strip()

	    # Clean PDF/HTML extraction artefacts (page numbers, headers, chrome, control chars)
	    body, _clean_stats = clean_corpus_artifacts(body)
	    print(format_stats(_clean_stats, label=source["name"]))

	    # Demote body headings so they nest under the source-level H2 wrapper we
	    # add below (avoids cross-source collisions like multiple "## Introduction").
	    body = demote_headings(body)

	    return (
	        f"\n## {source['section_title']}\n\n"
	        f"Source: {source['url']}\n\n"
	        f"{body}\n"
	    )


	def build():
	    """Assemble the consumer-protection compilation and write it to DOMAIN_FILE.

	    Legislation sources are converted first, each wrapped in its own H2
	    heading with a ``Source:`` line; guidance sources follow (they arrive
	    already wrapped). Sources that produce an empty block are left out. The
	    blocks are joined under a fixed introduction, written as UTF-8, and a
	    one-line summary with line count and size is printed.
	    """
	    print("Building consumer_protection compilation\n")

	    legislation_blocks = []
	    for src in LEGISLATION_SOURCES:
	        converted = fetch_legislation(src)
	        if not converted:
	            continue
	        wrapped = f"\n## {src['name']}\n\nSource: {src['base_url']}\n\n{converted}"
	        legislation_blocks.append(wrapped)

	    # fetch_guidance() already returns a fully wrapped block (or "").
	    guidance_blocks = [blk for blk in map(fetch_guidance, GUIDANCE_SOURCES) if blk]

	    sections = legislation_blocks + guidance_blocks
	    body = "\n\n".join(sections)

	    compilation = f"""# Consumer Protection — NZ Healthcare Marketing Regulation

	Source: https://comcom.govt.nz/business/dealing-with-typical-situations/making-accurate-claims

	This compilation covers New Zealand's general consumer-protection law as it applies to healthcare marketing. The Fair Trading Act 1986 — administered by the Commerce Commission — prohibits misleading and deceptive conduct, false representations, and unsubstantiated claims (s12A). For health and wellness advertisers, s12A is the most-tripped-over provision; making any health benefit claim without a reasonable basis is a Fair Trading Act breach regardless of whether the claim is technically true.

	Frequently-cited provisions are flagged with `tags:` metadata for retrieval-time surfacing.

	{body}
	"""

	    DOMAIN_FILE.write_text(compilation, encoding="utf-8")

	    # Summary figures for the console report.
	    lines = len(compilation.split("\n"))
	    size_kb = len(compilation.encode("utf-8")) / 1024
	    print(f"\n✅ Wrote {DOMAIN_FILE} ({lines} lines, {size_kb:.1f} KB)")


	# Script entry point: run the build only when this file is executed directly,
	# not when it is imported as a module.
	if __name__ == "__main__":
	build()