Spaces:
Sleeping
Sleeping
| """ | |
| Build the `consumer_protection` corpus domain. | |
| Combines: | |
- Fair Trading Act 1986 — administered by the Commerce Commission. All Parts
  are currently ingested (this may later be narrowed to Part 1 — Misleading
  and deceptive conduct); the s12A substantiation provision is flagged.
| - ComCom Making Accurate Claims (health and nutrition) guidance | |
| Pattern adapted from `build_medicines_and_supplements_compilation.py` (the | |
| canonical template). See that script for design notes on per-source section | |
| flagging, fallback strategies, and the open question on retrieval mechanism | |
| for `tags:` metadata. | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import re | |
| import sys | |
| from pathlib import Path | |
| sys.path.insert(0, str(Path(__file__).resolve().parent)) | |
| from bs4 import BeautifulSoup # noqa: E402 | |
| from convert_legislation_html import ( # noqa: E402 | |
| download_html, | |
| extract_part, | |
| extract_subpart_by_content, | |
| convert_part, | |
| convert_subpart, | |
| ) | |
| from clean_artifacts import clean_corpus_artifacts, format_stats # noqa: E402 | |
| from extract_pdf import extract_to_markdown, demote_headings # noqa: E402 | |
# Filesystem layout, anchored two levels above this script's resolved location.
_THIS_FILE = Path(__file__).resolve()
PROJECT_ROOT = _THIS_FILE.parents[1]
CORPUS_DIR = PROJECT_ROOT.joinpath("corpus")
SOURCES_RAW = PROJECT_ROOT.joinpath("sources", "raw")
DOMAIN_FILE = CORPUS_DIR.joinpath("consumer-protection.md")

# Import-time side effect: make sure the output and download-cache directories
# exist before anything tries to write into them.
CORPUS_DIR.mkdir(exist_ok=True)
SOURCES_RAW.mkdir(parents=True, exist_ok=True)
# Common stem of the Fair Trading Act 1986 URLs on legislation.govt.nz.
_FTA_BASE_URL = "https://www.legislation.govt.nz/act/public/1986/0121/latest/"

# Matches a whole line that starts with "####", optional whitespace, then "12A"
# as a complete token (so "12" or "12AB" do not match).
_S12A_HEADING = re.compile(r"^####\s*12A\b[^\n]*", re.MULTILINE)

# One entry per Act. Keys read by fetch_legislation(): name, url, filename,
# base_url, parts_by_id, parts_by_text, section_flags.
LEGISLATION_SOURCES = [
    {
        "name": "Fair Trading Act 1986",
        "url": _FTA_BASE_URL + "whole.html",
        "filename": "fair-trading-act-1986.html",
        "base_url": _FTA_BASE_URL,
        # Both selector lists are empty on purpose for the first run, which
        # makes fetch_legislation() take every Part. After inspection this may
        # be narrowed to just Part 1 (Misleading and deceptive conduct) — the
        # part containing s12A substantiation.
        "parts_by_id": [],
        "parts_by_text": [],
        "section_flags": [
            # s12A — substantiation. Becki: "the provision that most often
            # trips up health and wellness advertisers"
            {
                "match": _S12A_HEADING,
                "tags": "substantiation, claims, frequently-cited",
            },
        ],
    },
]
# Commerce Commission guidance page that fetch_guidance() downloads and caches.
_COMCOM_ACCURATE_CLAIMS_URL = (
    "https://comcom.govt.nz/business/dealing-with-typical-situations/making-accurate-claims"
)

# One entry per guidance document. Keys read by fetch_guidance(): name, url,
# filename, format (optional extraction hint), section_title.
GUIDANCE_SOURCES = [
    dict(
        name="ComCom — Making accurate claims (health and nutrition)",
        url=_COMCOM_ACCURATE_CLAIMS_URL,
        filename="comcom-making-accurate-claims.html",
        format="html",
        section_title="Commerce Commission — Making Accurate Claims (Health and Nutrition)",
    ),
    # Further ComCom guidance to add once the URLs are confirmed:
    # - "Trusting origin, environment and health claims" (consumer-facing)
    # - Health sector competition guidelines (older but still cited)
]
def fetch_legislation(source: dict) -> str:
    """Download (or use cached) HTML and convert to markdown with per-section URLs.

    Args:
        source: One entry of ``LEGISLATION_SOURCES``. Reads ``name``, ``url``,
            ``filename`` and ``base_url``, plus the optional ``parts_by_id``,
            ``parts_by_text`` and ``section_flags`` lists.

    Returns:
        The markdown of the selected Parts/Subparts joined by blank lines (every
        ``div.part`` when both selector lists are empty), passed through
        ``clean_corpus_artifacts``, with a ``tags:`` line inserted after each
        text span matched by a section flag, and finally passed through
        ``demote_headings``.
    """
    print(f"\n→ {source['name']}")
    path = download_html(source["url"], source["filename"])
    with open(path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    base_url = source["base_url"]
    # `or []` also tolerates an explicit None in the config.
    part_ids = source.get("parts_by_id") or []
    text_matches = source.get("parts_by_text") or []
    parts_md = []

    # 1) Parts selected by element id.
    for part_id in part_ids:
        part = extract_part(soup, part_id)
        if part:
            md = convert_part(part, base_url=base_url)
            parts_md.append(md)
            print(f" ✓ Part by id={part_id} ({len(md)} chars)")
        else:
            print(f" ⚠ Part id={part_id} not found")

    # 2) Parts selected by heading text (case-insensitive substring), falling
    #    back to a Subpart lookup when no Part heading matches.
    for text_match in text_matches:
        needle = text_match.lower()
        part = None
        for candidate in soup.find_all("div", class_="part"):
            heading = candidate.find("h2", class_="part")
            if heading is not None and needle in heading.get_text().lower():
                part = candidate
                break
        if part is not None:
            md = convert_part(part, base_url=base_url)
            parts_md.append(md)
            print(f" ✓ Part by text={text_match!r} ({len(md)} chars)")
            continue
        subpart = extract_subpart_by_content(soup, text_match)
        if subpart:
            md = convert_subpart(subpart, base_url=base_url)
            parts_md.append(md)
            print(f" ✓ Subpart by text={text_match!r} ({len(md)} chars)")
        else:
            print(f" ⚠ No Part/Subpart matching text={text_match!r}")

    # 3) No selectors configured: take every Part in the document.
    if not part_ids and not text_matches:
        for part in soup.find_all("div", class_="part"):
            parts_md.append(convert_part(part, base_url=base_url))
        print(f" ✓ All Parts ({len(parts_md)} parts, "
              f"{sum(len(m) for m in parts_md)} chars)")

    merged = "\n\n".join(parts_md)

    # Clean PDF/HTML extraction artefacts before applying section flags
    merged, _clean_stats = clean_corpus_artifacts(merged)
    print(format_stats(_clean_stats, label=source["name"]))

    for flag in source.get("section_flags") or []:
        tags = flag["tags"]
        # subn() reports the number of substitutions directly, so there is no
        # need to count occurrences of the tag string before and after.
        merged, applied = flag["match"].subn(
            lambda m, tags=tags: f"{m.group(0)}\n\ntags: {tags}",
            merged,
        )
        # A flag that matched nothing means the tagged provision is missing
        # from the output (or the heading format changed) — make that visible.
        marker = "✓" if applied else "⚠"
        print(f" {marker} Applied flag {tags!r}: {applied} match(es)")

    # Demote headings so legislation Parts/Subparts/sections nest under the
    # source-level H2 wrapper added in build() — avoids cross-Act H2 collisions
    # like Privacy Act + UEMA both having "Part 1 Preliminary provisions".
    merged = demote_headings(merged)
    return merged
def fetch_guidance(source: dict) -> str:
    """Download a guidance document (HTML or PDF) and convert to markdown.

    The raw download is cached under ``SOURCES_RAW`` by ``source["filename"]``;
    an existing cache file is reused without touching the network.

    Args:
        source: One entry of ``GUIDANCE_SOURCES``. Reads ``name``, ``url``,
            ``filename`` and ``section_title``, plus the optional ``format``
            hint that is forwarded to ``extract_to_markdown``.

    Returns:
        A markdown section consisting of an H2 title, a ``Source:`` line and the
        cleaned, heading-demoted body. Returns ``""`` when the download fails
        or yields no data, so callers can skip the source.
    """
    print(f"\n→ {source['name']}")
    cache_path = SOURCES_RAW / source["filename"]

    if not cache_path.exists():
        import urllib.request

        timeout_s = 60  # never let a stalled server hang the whole build
        req = urllib.request.Request(
            source["url"],
            headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"},
        )
        try:
            with urllib.request.urlopen(req, timeout=timeout_s) as resp:
                data = resp.read()
            if not data:
                # Do not cache an empty body: later runs would reuse it forever.
                print(" ⚠ Download failed: empty response")
                return ""
            # Write to a sibling temp file and rename into place, so an
            # interrupted write cannot leave a truncated file that later runs
            # would treat as a valid cache entry.
            tmp_path = cache_path.with_name(cache_path.name + ".part")
            tmp_path.write_bytes(data)
            tmp_path.replace(cache_path)
            print(f" Downloaded: {cache_path.name} ({len(data) / 1024:.0f} KB)")
        except Exception as e:
            # Deliberately broad: a failed guidance download is best-effort and
            # must not abort the rest of the build.
            print(f" ⚠ Download failed: {e}")
            return ""
    else:
        print(f" Using cached: {cache_path.name}")

    body = extract_to_markdown(cache_path, format_hint=source.get("format")).strip()

    # Clean PDF/HTML extraction artefacts (page numbers, headers, chrome, control chars)
    body, _clean_stats = clean_corpus_artifacts(body)
    print(format_stats(_clean_stats, label=source["name"]))

    # Demote body headings so they nest under the source-level H2 wrapper we
    # add below (avoids cross-source collisions like multiple "## Introduction").
    body = demote_headings(body)

    return (
        f"\n## {source['section_title']}\n\n"
        f"Source: {source['url']}\n\n"
        f"{body}\n"
    )
def build():
    """Assemble the consumer-protection compilation and write it to DOMAIN_FILE.

    Fetches every entry in ``LEGISLATION_SOURCES`` and ``GUIDANCE_SOURCES``,
    wraps each non-empty legislation result in an H2 section with a ``Source:``
    line (guidance results arrive already wrapped), joins all sections under a
    fixed header and writes the document as UTF-8. Sources that return an empty
    string are skipped. Prints progress and a final size summary; returns None.
    """
    print("Building consumer_protection compilation\n")

    legislation_blocks = []
    for src in LEGISLATION_SOURCES:
        block = fetch_legislation(src)
        if block:
            legislation_blocks.append(
                f"\n## {src['name']}\n\nSource: {src['base_url']}\n\n{block}"
            )

    guidance_blocks = []
    for src in GUIDANCE_SOURCES:
        block = fetch_guidance(src)
        if block:
            guidance_blocks.append(block)

    body = "\n\n".join(legislation_blocks + guidance_blocks)
    if not body:
        # Still write the file (unchanged behaviour), but make it obvious that
        # the output holds only the header and no source content.
        print("\n ⚠ No source produced any content; writing header only")

    # Blank lines separate the title, the Source line and each paragraph so
    # they render as distinct Markdown blocks, matching the
    # "## title / blank / Source: / blank / body" layout of per-source sections.
    compilation = f"""# Consumer Protection — NZ Healthcare Marketing Regulation

Source: https://comcom.govt.nz/business/dealing-with-typical-situations/making-accurate-claims

This compilation covers New Zealand's general consumer-protection law as it applies to healthcare marketing. The Fair Trading Act 1986 — administered by the Commerce Commission — prohibits misleading and deceptive conduct, false representations, and unsubstantiated claims (s12A). For health and wellness advertisers, s12A is the most-tripped-over provision; making any health benefit claim without a reasonable basis is a Fair Trading Act breach regardless of whether the claim is technically true.

Frequently-cited provisions are flagged with `tags:` metadata for retrieval-time surfacing.

{body}
"""
    DOMAIN_FILE.write_text(compilation, encoding="utf-8")

    # The document ends with a newline, so the number of "\n" characters is the
    # number of lines; only add one for an unterminated final line.
    lines = compilation.count("\n") + (0 if compilation.endswith("\n") else 1)
    size_kb = len(compilation.encode("utf-8")) / 1024
    print(f"\n✅ Wrote {DOMAIN_FILE} ({lines} lines, {size_kb:.1f} KB)")
# Script entry point: run the full build only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    build()