# hmc-rag / scripts / build_marketing_comms_compilation.py
# webmuppet
# Initial commit — health marketing compliance RAG
# bad8b6c
"""
Build the `marketing_comms` corpus domain.
Combines the "can I email this list?" cluster:
- Privacy Act 2020 (Information Privacy Principles, particularly IPP 10
limits on use and IPP 11 limits on disclosure)
- Health Information Privacy Code 2020 (incorporating Amendment No 1, 2022) —
applies the IPPs to health information specifically; the relevant code of
practice issued by the Privacy Commissioner under s32 of the Privacy Act
- Unsolicited Electronic Messages Act 2007 (NZ's anti-spam law — applies to
email and SMS marketing)
Pattern adapted from `build_medicines_and_supplements_compilation.py`.
"""
from __future__ import annotations
import os
import re
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent))
from bs4 import BeautifulSoup # noqa: E402
from convert_legislation_html import ( # noqa: E402
download_html,
extract_part,
extract_subpart_by_content,
convert_part,
convert_subpart,
)
from clean_artifacts import clean_corpus_artifacts, format_stats # noqa: E402
from extract_pdf import extract_to_markdown, demote_headings # noqa: E402
# Directory one level above the directory containing this script
# (parents[0] is the script's own directory, parents[1] is its parent).
PROJECT_ROOT = Path(__file__).resolve().parents[1]
# Directory that holds the compiled markdown output (see DOMAIN_FILE below).
CORPUS_DIR = PROJECT_ROOT / "corpus"
# Download cache: fetch_guidance() stores fetched guidance files here.
SOURCES_RAW = PROJECT_ROOT / "sources" / "raw"
# NOTE: both directories are created as a side effect of importing this
# module, not only when build() runs.
CORPUS_DIR.mkdir(exist_ok=True)
SOURCES_RAW.mkdir(parents=True, exist_ok=True)
# The single markdown file that build() writes for this corpus domain.
DOMAIN_FILE = CORPUS_DIR / "marketing-comms.md"
# Acts compiled into this domain. fetch_legislation() reads these keys:
#   name          - label used in log output (and as the H2 title in build())
#   url           - "whole Act" HTML page to download
#   filename      - file name handed to download_html() for the saved copy
#   base_url      - prefix passed to the converters for per-section links
#   parts_by_id   - Part ids to extract; empty together with parts_by_text
#                   means "convert every Part"
#   parts_by_text - Part/Subpart heading substrings to extract
#   section_flags - {"match": compiled regex, "tags": str} pairs; a
#                   "tags: ..." line is inserted after every heading matched
#
# Heading patterns use [^\S\n]* (any whitespace except newline) between the
# "####" marker and the section number. A plain \s* also matches newlines,
# which would let an empty "####" line pair up with a following line that
# merely starts with the number and tag the wrong text.
LEGISLATION_SOURCES = [
    {
        "name": "Privacy Act 2020",
        "url": "https://www.legislation.govt.nz/act/public/2020/0031/latest/whole.html",
        "filename": "privacy-act-2020.html",
        "base_url": "https://www.legislation.govt.nz/act/public/2020/0031/latest/",
        # Take all Parts on first run; narrow after inspection if needed.
        # IPPs are in Part 3 (Information privacy principles) — the most relevant.
        "parts_by_id": [],
        "parts_by_text": [],
        "section_flags": [
            # IPPs 10 and 11 are the marketing-relevant principles (limits on use
            # and disclosure). They live in s22 in the Act (the IPPs are
            # enumerated within section 22 as Principles 1–13).
            # Marker for retrieval: tag the section that lists them.
            {
                "match": re.compile(r"^####[^\S\n]*22\b[^\n]*", re.MULTILINE),
                "tags": "ipp10, ipp11, use-disclosure, marketing-comms",
            },
        ],
    },
    {
        "name": "Unsolicited Electronic Messages Act 2007",
        "url": "https://www.legislation.govt.nz/act/public/2007/0007/latest/whole.html",
        "filename": "uema-2007.html",
        "base_url": "https://www.legislation.govt.nz/act/public/2007/0007/latest/",
        "parts_by_id": [],
        "parts_by_text": [],
        "section_flags": [
            # s9 — prohibition on sending unsolicited commercial electronic messages
            {
                "match": re.compile(r"^####[^\S\n]*9\b[^\n]*", re.MULTILINE),
                "tags": "spam-prohibition, consent, marketing-comms",
            },
            # s11 — unsubscribe facility requirement
            {
                "match": re.compile(r"^####[^\S\n]*11\b[^\n]*", re.MULTILINE),
                "tags": "unsubscribe, marketing-comms",
            },
        ],
    },
]
# Non-legislation guidance documents. fetch_guidance() reads these keys:
#   name          - label used in log output
#   url           - download location; also echoed as the "Source:" line
#   filename      - cache file name under SOURCES_RAW
#   format        - passed to extract_to_markdown() as format_hint
#   section_title - text of the H2 heading that wraps the document
GUIDANCE_SOURCES = [
    {
        "name": "Health Information Privacy Code 2020 (incorporating Amendment No 1)",
        # The "website-version" PDF is the canonical hub-of-truth published by OPC.
        "url": "https://www.privacy.org.nz/assets/Codes-of-Practice-2020/Health-Information-Privacy-Code-2020-website-version.pdf",
        "filename": "health-information-privacy-code-2020.pdf",
        "format": "pdf",
        "section_title": "Health Information Privacy Code 2020 (HIPC)",
    },
]
def _find_part_by_heading(soup, text: str):
    """Return the first ``div.part`` whose ``h2.part`` heading contains *text*.

    The comparison is case-insensitive. Returns ``None`` when no Part matches.
    """
    needle = text.lower()
    for candidate in soup.find_all("div", class_="part"):
        # Look the heading up once per candidate instead of once for the
        # existence check and again for the text comparison.
        heading = candidate.find("h2", class_="part")
        if heading is not None and needle in heading.get_text().lower():
            return candidate
    return None


def fetch_legislation(source: dict) -> str:
    """Download (or use cached) HTML and convert to markdown with per-section URLs.

    Args:
        source: One entry of ``LEGISLATION_SOURCES``. ``name``, ``url``,
            ``filename`` and ``base_url`` are required; ``parts_by_id``,
            ``parts_by_text`` and ``section_flags`` are optional lists.

    Returns:
        The selected Parts/Subparts as one markdown string, cleaned with
        ``clean_corpus_artifacts``, tagged per ``section_flags`` and passed
        through ``demote_headings``.

    Selection rules:
        * every id in ``parts_by_id`` is looked up with ``extract_part``;
        * every string in ``parts_by_text`` is matched against Part headings
          first and, failing that, against ``extract_subpart_by_content``;
        * when both lists are empty or absent, every ``div.part`` is converted.
    """
    print(f"\n→ {source['name']}")
    path = download_html(source["url"], source["filename"])
    with open(path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    base_url = source["base_url"]
    part_ids = source.get("parts_by_id", [])
    part_texts = source.get("parts_by_text", [])
    parts_md = []

    for part_id in part_ids:
        part = extract_part(soup, part_id)
        if part:
            md = convert_part(part, base_url=base_url)
            parts_md.append(md)
            print(f" ✓ Part by id={part_id} ({len(md)} chars)")
        else:
            print(f" ⚠ Part id={part_id} not found")

    for text_match in part_texts:
        part = _find_part_by_heading(soup, text_match)
        if part:
            md = convert_part(part, base_url=base_url)
            parts_md.append(md)
            print(f" ✓ Part by text={text_match!r} ({len(md)} chars)")
            continue
        # No Part heading matched: fall back to a Subpart lookup.
        subpart = extract_subpart_by_content(soup, text_match)
        if subpart:
            md = convert_subpart(subpart, base_url=base_url)
            parts_md.append(md)
            print(f" ✓ Subpart by text={text_match!r} ({len(md)} chars)")
        else:
            print(f" ⚠ No Part/Subpart matching text={text_match!r}")

    if not part_ids and not part_texts:
        # No explicit selection configured: take the whole Act.
        for part in soup.find_all("div", class_="part"):
            parts_md.append(convert_part(part, base_url=base_url))
        print(f" ✓ All Parts ({len(parts_md)} parts, "
              f"{sum(len(m) for m in parts_md)} chars)")

    merged = "\n\n".join(parts_md)

    # Clean PDF/HTML extraction artefacts before applying section flags
    merged, _clean_stats = clean_corpus_artifacts(merged)
    print(format_stats(_clean_stats, label=source["name"]))

    for flag in source.get("section_flags", []):
        tags = flag["tags"]
        # A callable replacement inserts the tags literally (a template string
        # would interpret backslashes and group references). subn() reports
        # the exact number of headings tagged.
        merged, applied = flag["match"].subn(
            lambda m, tags=tags: f"{m.group(0)}\n\ntags: {tags}",
            merged,
        )
        if applied:
            print(f" ✓ Applied flag {tags!r}: {applied} match(es)")
        else:
            # Zero hits means the heading pattern no longer lines up with the
            # converted markdown; report it as a warning, not a success.
            print(f" ⚠ Flag {tags!r} matched no section heading")

    # Demote headings so legislation Parts/Subparts/sections nest under the
    # source-level H2 wrapper added in build() — avoids cross-Act H2 collisions
    # like Privacy Act + UEMA both having "Part 1 Preliminary provisions".
    merged = demote_headings(merged)
    return merged
def fetch_guidance(source: dict, *, timeout: float = 60.0) -> str:
    """Download a guidance document (HTML or PDF) and convert to markdown.

    Args:
        source: One entry of ``GUIDANCE_SOURCES``. ``name``, ``url``,
            ``filename`` and ``section_title`` are required; ``format`` is
            optional and forwarded to ``extract_to_markdown`` as
            ``format_hint``.
        timeout: Seconds to wait on the network before giving up. Without a
            timeout ``urlopen`` can block indefinitely on a stalled server.

    Returns:
        A markdown block consisting of an H2 title, a ``Source:`` line and
        the cleaned, heading-demoted document body. Returns ``""`` when the
        document is not cached and cannot be downloaded.
    """
    print(f"\n→ {source['name']}")
    cache_path = SOURCES_RAW / source["filename"]

    # A zero-byte cache file can only be left over from a failed download;
    # treat it as missing so the document is fetched again.
    if cache_path.exists() and cache_path.stat().st_size > 0:
        print(f" Using cached: {cache_path.name}")
    else:
        import urllib.request

        req = urllib.request.Request(
            source["url"],
            headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"},
        )
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                data = resp.read()
            if data:
                cache_path.write_bytes(data)
        except Exception as e:
            # Deliberately broad: a failed fetch skips this source rather
            # than aborting the whole build.
            print(f" ⚠ Download failed: {e}")
            return ""
        if not data:
            # Never cache an empty payload; it would be reused on every
            # later run.
            print(" ⚠ Download failed: empty response body")
            return ""
        print(f" Downloaded: {cache_path.name} ({len(data) / 1024:.0f} KB)")

    body = extract_to_markdown(cache_path, format_hint=source.get("format")).strip()

    # Clean PDF/HTML extraction artefacts (page numbers, headers, chrome, control chars)
    body, _clean_stats = clean_corpus_artifacts(body)
    print(format_stats(_clean_stats, label=source["name"]))

    # Demote body headings so they nest under the source-level H2 wrapper we
    # add below (avoids cross-source collisions like multiple "## Introduction").
    body = demote_headings(body)
    return (
        f"\n## {source['section_title']}\n\n"
        f"Source: {source['url']}\n\n"
        f"{body}\n"
    )
def build():
    """Compile every configured source into one markdown file at DOMAIN_FILE.

    Legislation sources are fetched first, each wrapped in its own H2 heading
    and "Source:" line; guidance sources follow, already wrapped by
    fetch_guidance(). Sources that yield an empty string are left out.
    """
    print("Building marketing_comms compilation\n")

    sections = []
    for act in LEGISLATION_SOURCES:
        act_md = fetch_legislation(act)
        if not act_md:
            continue
        sections.append(
            f"\n## {act['name']}\n\nSource: {act['base_url']}\n\n{act_md}"
        )
    # filter(None, ...) drops guidance documents that came back empty.
    sections.extend(filter(None, map(fetch_guidance, GUIDANCE_SOURCES)))

    body = "\n\n".join(sections)
    compilation = f"""# Marketing Communications — NZ Healthcare Marketing Regulation
Source: https://www.privacy.org.nz/privacy-act-2020/codes-of-practice/health-information-privacy-code-2020/
This compilation covers the legal cluster that governs how healthcare and health-product marketers can communicate with audiences and customers: the Privacy Act 2020 (information privacy principles), the Health Information Privacy Code 2020 (which applies the Act to health information), and the Unsolicited Electronic Messages Act 2007 (NZ's spam law for email and SMS marketing).
The practical question this compilation answers is: "can I email this list?" Most marketing-comms compliance failures cut across all three instruments — privacy law restricting how you can use a contact list, HIPC adding stricter rules where the contact data includes health information, and UEMA imposing consent-and-unsubscribe requirements on the messages themselves.
Frequently-cited provisions are flagged with `tags:` metadata for retrieval-time surfacing.
{body}
"""
    DOMAIN_FILE.write_text(compilation, encoding="utf-8")

    # Summary line for the console: line count and UTF-8 size of the output.
    line_count = compilation.count("\n") + 1
    kilobytes = len(compilation.encode("utf-8")) / 1024
    print(f"\n✅ Wrote {DOMAIN_FILE} ({line_count} lines, {kilobytes:.1f} KB)")
# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    build()