"""
Build the `marketing_comms` corpus domain.

Combines the "can I email this list?" cluster:
- Privacy Act 2020 (Information Privacy Principles, particularly IPP 10
  limits on use and IPP 11 limits on disclosure)
- Health Information Privacy Code 2020 (incorporating Amendment No 1, 2022) —
  applies the IPPs to health information specifically; the relevant code of
  practice issued by the Privacy Commissioner under s32 of the Privacy Act
- Unsolicited Electronic Messages Act 2007 (NZ's anti-spam law — applies to
  email and SMS marketing)

Pattern adapted from `build_medicines_and_supplements_compilation.py`.
"""
| from __future__ import annotations | |
| import os | |
| import re | |
| import sys | |
| from pathlib import Path | |
| sys.path.insert(0, str(Path(__file__).resolve().parent)) | |
| from bs4 import BeautifulSoup # noqa: E402 | |
| from convert_legislation_html import ( # noqa: E402 | |
| download_html, | |
| extract_part, | |
| extract_subpart_by_content, | |
| convert_part, | |
| convert_subpart, | |
| ) | |
| from clean_artifacts import clean_corpus_artifacts, format_stats # noqa: E402 | |
| from extract_pdf import extract_to_markdown, demote_headings # noqa: E402 | |
# Filesystem layout, resolved relative to this script: the project root is
# the directory one level above the folder containing this file.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
CORPUS_DIR = PROJECT_ROOT.joinpath("corpus")
SOURCES_RAW = PROJECT_ROOT.joinpath("sources", "raw")

# Output file for this corpus domain.
DOMAIN_FILE = CORPUS_DIR.joinpath("marketing-comms.md")

# Both directories are created at import time so later writes cannot fail on
# a missing folder. Only SOURCES_RAW creates intermediate directories.
CORPUS_DIR.mkdir(exist_ok=True)
SOURCES_RAW.mkdir(parents=True, exist_ok=True)
def _section_heading(section):
    """Compile a multiline regex matching the `####` heading line of *section*."""
    return re.compile(rf"^####\s*{section}\b[^\n]*", re.MULTILINE)


# Acts pulled from legislation.govt.nz. Each entry carries the download URL,
# the cache filename, the base URL used for per-section links, optional Part
# selectors, and the section headings that receive a `tags:` line.
LEGISLATION_SOURCES = [
    {
        "name": "Privacy Act 2020",
        "url": "https://www.legislation.govt.nz/act/public/2020/0031/latest/whole.html",
        "filename": "privacy-act-2020.html",
        "base_url": "https://www.legislation.govt.nz/act/public/2020/0031/latest/",
        # Both selector lists are empty, so every Part is taken on the first
        # run; narrow after inspection if needed. The IPPs sit in Part 3
        # (Information privacy principles), the most relevant Part.
        "parts_by_id": [],
        "parts_by_text": [],
        "section_flags": [
            # Section 22 enumerates the IPPs as Principles 1–13. IPP 10
            # (limits on use) and IPP 11 (limits on disclosure) are the
            # marketing-relevant ones, so the section that lists them is
            # tagged for retrieval.
            {
                "match": _section_heading(22),
                "tags": "ipp10, ipp11, use-disclosure, marketing-comms",
            },
        ],
    },
    {
        "name": "Unsolicited Electronic Messages Act 2007",
        "url": "https://www.legislation.govt.nz/act/public/2007/0007/latest/whole.html",
        "filename": "uema-2007.html",
        "base_url": "https://www.legislation.govt.nz/act/public/2007/0007/latest/",
        "parts_by_id": [],
        "parts_by_text": [],
        "section_flags": [
            # s9: prohibition on sending unsolicited commercial electronic
            # messages.
            {
                "match": _section_heading(9),
                "tags": "spam-prohibition, consent, marketing-comms",
            },
            # s11: unsubscribe facility requirement.
            {
                "match": _section_heading(11),
                "tags": "unsubscribe, marketing-comms",
            },
        ],
    },
]
# Non-legislation documents. Each entry is downloaded to the raw-sources
# cache under `filename`, converted using `format` as the extraction hint,
# and wrapped under an H2 heading taken from `section_title`.
GUIDANCE_SOURCES = [
    dict(
        name="Health Information Privacy Code 2020 (incorporating Amendment No 1)",
        # OPC's "website-version" PDF is treated as the canonical text.
        url="https://www.privacy.org.nz/assets/Codes-of-Practice-2020/Health-Information-Privacy-Code-2020-website-version.pdf",
        filename="health-information-privacy-code-2020.pdf",
        format="pdf",
        section_title="Health Information Privacy Code 2020 (HIPC)",
    ),
]
def fetch_legislation(source: dict) -> str:
    """Download (or use cached) HTML and convert to markdown with per-section URLs.

    Part selection has three modes:

    * ``parts_by_id``: each id is looked up with ``extract_part``.
    * ``parts_by_text``: each string is matched case-insensitively against
      the ``h2.part`` heading of every ``div.part``; when no Part matches it
      is retried as a Subpart via ``extract_subpart_by_content``.
    * neither list populated: every ``div.part`` in the document is taken.

    The converted pieces are joined, cleaned of extraction artefacts, tagged
    according to ``section_flags`` and finally heading-demoted.

    Args:
        source: A legislation source entry. Reads ``name``, ``url``,
            ``filename`` and ``base_url``, plus the optional
            ``parts_by_id``, ``parts_by_text`` and ``section_flags`` lists.

    Returns:
        The markdown for the selected Parts, without a source-level heading.
    """
    print(f"\n→ {source['name']}")
    path = download_html(source["url"], source["filename"])
    with open(path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    base_url = source["base_url"]
    parts_md = []

    for part_id in source.get("parts_by_id", []):
        part = extract_part(soup, part_id)
        if part:
            md = convert_part(part, base_url=base_url)
            parts_md.append(md)
            print(f" ✓ Part by id={part_id} ({len(md)} chars)")
        else:
            print(f" ⚠ Part id={part_id} not found")

    for text_match in source.get("parts_by_text", []):
        needle = text_match.lower()
        part = None
        for candidate in soup.find_all("div", class_="part"):
            # Look the heading up once per candidate and reuse it.
            heading = candidate.find("h2", class_="part")
            if heading and needle in heading.get_text().lower():
                part = candidate
                break
        if part:
            md = convert_part(part, base_url=base_url)
            parts_md.append(md)
            print(f" ✓ Part by text={text_match!r} ({len(md)} chars)")
            continue
        # No Part heading matched: fall back to a Subpart lookup.
        subpart = extract_subpart_by_content(soup, text_match)
        if subpart:
            md = convert_subpart(subpart, base_url=base_url)
            parts_md.append(md)
            print(f" ✓ Subpart by text={text_match!r} ({len(md)} chars)")
        else:
            print(f" ⚠ No Part/Subpart matching text={text_match!r}")

    if not source.get("parts_by_id") and not source.get("parts_by_text"):
        # No selectors configured: take the whole Act, Part by Part.
        for part in soup.find_all("div", class_="part"):
            parts_md.append(convert_part(part, base_url=base_url))
        print(f" ✓ All Parts ({len(parts_md)} parts, "
              f"{sum(len(m) for m in parts_md)} chars)")

    merged = "\n\n".join(parts_md)

    # Clean PDF/HTML extraction artefacts before applying section flags.
    merged, _clean_stats = clean_corpus_artifacts(merged)
    print(format_stats(_clean_stats, label=source["name"]))

    for flag in source.get("section_flags", []):
        tags = flag["tags"]
        # subn returns the substitution count directly. A callable
        # replacement keeps the tag text literal (no backslash/group
        # expansion), and the default argument binds this iteration's tags.
        merged, applied = flag["match"].subn(
            lambda m, tags=tags: f"{m.group(0)}\n\ntags: {tags}",
            merged,
        )
        # A flag that matched nothing means the heading pattern no longer
        # fits the converted markdown, so mark it as a warning.
        mark = "✓" if applied else "⚠"
        print(f" {mark} Applied flag {tags!r}: {applied} match(es)")

    # Demote headings so legislation Parts/Subparts/sections nest under the
    # source-level H2 wrapper added in build() — avoids cross-Act H2 collisions
    # like Privacy Act + UEMA both having "Part 1 Preliminary provisions".
    merged = demote_headings(merged)
    return merged
def fetch_guidance(source: dict) -> str:
    """Download a guidance document (HTML or PDF) and convert to markdown.

    The raw file is cached under ``SOURCES_RAW`` by ``filename`` and only
    downloaded when that cache file does not exist. A failed download is
    reported and treated as "no content" rather than raised.

    Args:
        source: A guidance source entry. Reads ``name``, ``url``,
            ``filename`` and ``section_title``, plus the optional ``format``
            hint passed to ``extract_to_markdown``.

    Returns:
        A markdown section consisting of an H2 title, a ``Source:`` line and
        the cleaned, heading-demoted body, or ``""`` when the download fails.
    """
    print(f"\n→ {source['name']}")
    cache_path = SOURCES_RAW / source["filename"]

    if cache_path.exists():
        print(f" Using cached: {cache_path.name}")
    else:
        import urllib.request

        timeout_s = 60  # without a timeout a stalled server hangs the build
        req = urllib.request.Request(
            source["url"],
            headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"},
        )
        try:
            with urllib.request.urlopen(req, timeout=timeout_s) as resp:
                data = resp.read()
            if not data:
                # Never cache an empty body: it would be reused as a valid
                # cached copy on every later run.
                print(" ⚠ Download failed: empty response body")
                return ""
            # Write to a sibling file and rename, so an interrupted write
            # cannot leave a truncated file at the cache path.
            partial_path = cache_path.with_name(cache_path.name + ".part")
            partial_path.write_bytes(data)
            partial_path.replace(cache_path)
            print(f" Downloaded: {cache_path.name} ({len(data) / 1024:.0f} KB)")
        except Exception as e:
            # Deliberately broad: a failed fetch skips this source instead of
            # aborting the whole build.
            print(f" ⚠ Download failed: {e}")
            return ""

    body = extract_to_markdown(cache_path, format_hint=source.get("format")).strip()

    # Clean PDF/HTML extraction artefacts (page numbers, headers, chrome, control chars)
    body, _clean_stats = clean_corpus_artifacts(body)
    print(format_stats(_clean_stats, label=source["name"]))

    # Demote body headings so they nest under the source-level H2 wrapper we
    # add below (avoids cross-source collisions like multiple "## Introduction").
    body = demote_headings(body)

    return (
        f"\n## {source['section_title']}\n\n"
        f"Source: {source['url']}\n\n"
        f"{body}\n"
    )
def build():
    """Assemble the marketing-comms corpus file from all configured sources.

    Each legislation source is wrapped under its own H2 heading with a
    ``Source:`` line; guidance blocks arrive already wrapped. Sources that
    yield no content are skipped. The result is written to ``DOMAIN_FILE``
    and a one-line summary is printed.
    """
    print("Building marketing_comms compilation\n")

    legislation_blocks = []
    for src in LEGISLATION_SOURCES:
        block = fetch_legislation(src)
        if block:
            legislation_blocks.append(
                f"\n## {src['name']}\n\nSource: {src['base_url']}\n\n{block}"
            )
        else:
            # Say so explicitly: otherwise an Act silently goes missing.
            print(f" ⚠ Skipping {src['name']}: no content produced")

    guidance_blocks = []
    for src in GUIDANCE_SOURCES:
        block = fetch_guidance(src)
        if block:
            guidance_blocks.append(block)

    body = "\n\n".join(legislation_blocks + guidance_blocks)

    # Header paragraphs are separated by blank lines so Markdown renders them
    # as distinct paragraphs, matching the "\n\n" spacing of the sections.
    header = "\n\n".join([
        "# Marketing Communications — NZ Healthcare Marketing Regulation",
        "Source: https://www.privacy.org.nz/privacy-act-2020/codes-of-practice/health-information-privacy-code-2020/",
        "This compilation covers the legal cluster that governs how healthcare and health-product marketers can communicate with audiences and customers: the Privacy Act 2020 (information privacy principles), the Health Information Privacy Code 2020 (which applies the Act to health information), and the Unsolicited Electronic Messages Act 2007 (NZ's spam law for email and SMS marketing).",
        "The practical question this compilation answers is: \"can I email this list?\" Most marketing-comms compliance failures cut across all three instruments — privacy law restricting how you can use a contact list, HIPC adding stricter rules where the contact data includes health information, and UEMA imposing consent-and-unsubscribe requirements on the messages themselves.",
        "Frequently-cited provisions are flagged with `tags:` metadata for retrieval-time surfacing.",
    ])
    # Every section block starts with "\n", which supplies the blank line
    # between the header and the first H2.
    compilation = f"{header}\n{body}\n"

    DOMAIN_FILE.write_text(compilation, encoding="utf-8")

    # The text always ends with a newline, so the newline count is the line
    # count; adding one would over-report.
    lines = compilation.count("\n")
    size_kb = len(compilation.encode("utf-8")) / 1024
    print(f"\n✅ Wrote {DOMAIN_FILE} ({lines} lines, {size_kb:.1f} KB)")


if __name__ == "__main__":
    build()