Spaces:
Sleeping
Sleeping
File size: 8,590 Bytes
bad8b6c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 | """
Build the `consumer_protection` corpus domain.
Combines:
- Fair Trading Act 1986 (Part 1 — Misleading and deceptive conduct, with s12A
  substantiation provision flagged) — administered by the Commerce Commission
- ComCom Making Accurate Claims (health and nutrition) guidance
Pattern adapted from `build_medicines_and_supplements_compilation.py` (the
canonical template). See that script for design notes on per-source section
flagging, fallback strategies, and the open question on retrieval mechanism
for `tags:` metadata.
"""
from __future__ import annotations
import os
import re
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent))
from bs4 import BeautifulSoup # noqa: E402
from convert_legislation_html import ( # noqa: E402
download_html,
extract_part,
extract_subpart_by_content,
convert_part,
convert_subpart,
)
from clean_artifacts import clean_corpus_artifacts, format_stats # noqa: E402
from extract_pdf import extract_to_markdown, demote_headings # noqa: E402
# Repository layout, resolved relative to this script: the project root is the
# directory one level above the folder that contains this file.
_THIS_FILE = Path(__file__).resolve()
PROJECT_ROOT = _THIS_FILE.parent.parent

# Output directory for compiled corpus files, and the directory used for raw
# downloaded source documents.
CORPUS_DIR = PROJECT_ROOT.joinpath("corpus")
SOURCES_RAW = PROJECT_ROOT.joinpath("sources", "raw")

# Both directories are created at import time; existing directories are left
# untouched. Only the raw-sources path creates missing parent directories.
CORPUS_DIR.mkdir(exist_ok=True)
SOURCES_RAW.mkdir(exist_ok=True, parents=True)

# The single markdown file this script writes.
DOMAIN_FILE = CORPUS_DIR.joinpath("consumer-protection.md")
# Heading line of section 12A in the converted markdown: an H4 ("####") whose
# text starts with "12A", matched through to the end of that line.
_S12A_HEADING = re.compile(r"^####\s*12A\b[^\n]*", re.MULTILINE)

# s12A — substantiation. Becki: "the provision that most often trips up
# health and wellness advertisers".
_S12A_FLAG = {
    "match": _S12A_HEADING,
    "tags": "substantiation, claims, frequently-cited",
}

# Legislation to compile. Each entry names the Act, where to download it, the
# local filename for the HTML, the base URL handed to the converters, which
# Parts to select, and which section headings receive a `tags:` line.
LEGISLATION_SOURCES = [
    {
        "name": "Fair Trading Act 1986",
        "url": "https://www.legislation.govt.nz/act/public/1986/0121/latest/whole.html",
        "filename": "fair-trading-act-1986.html",
        "base_url": "https://www.legislation.govt.nz/act/public/1986/0121/latest/",
        # First run: both selectors are left empty so that all Parts are taken.
        # After inspection we may narrow to just Part 1 (Misleading and
        # deceptive conduct) — the Part containing s12A substantiation.
        "parts_by_id": [],
        "parts_by_text": [],
        "section_flags": [_S12A_FLAG],
    },
]
# Guidance documents to compile. Each entry gives a display name (used in log
# output), the download URL, the local filename for the raw download, a format
# hint passed to the extractor, and the H2 title written into the corpus.
#
# NOTE(review): the dashes in "name" and "section_title" were mis-encoded
# (UTF-8 read as ISO-8859-7, leaving a stray Greek beta). They are restored
# here as em dashes — confirm against the authoritative copy of this file.
GUIDANCE_SOURCES = [
    {
        "name": "ComCom — Making accurate claims (health and nutrition)",
        "url": "https://comcom.govt.nz/business/dealing-with-typical-situations/making-accurate-claims",
        "filename": "comcom-making-accurate-claims.html",
        "format": "html",
        "section_title": "Commerce Commission — Making Accurate Claims (Health and Nutrition)",
    },
    # Additional ComCom guidance can be added here once URLs are confirmed:
    # - "Trusting origin, environment and health claims" (consumer-facing)
    # - Health sector competition guidelines (older but still cited)
]
def _find_part_by_heading(soup, text_match: str):
    """Return the first ``div.part`` whose ``h2.part`` heading contains *text_match*.

    The comparison is case-insensitive. Returns ``None`` when no Part matches.
    """
    needle = text_match.lower()
    for candidate in soup.find_all("div", class_="part"):
        heading = candidate.find("h2", class_="part")
        if heading is not None and needle in heading.get_text().lower():
            return candidate
    return None


def _apply_section_flags(markdown: str, flags) -> str:
    """Append a ``tags:`` line after every heading matched by each flag.

    Each flag is a dict with a compiled pattern under ``"match"`` and the tag
    text under ``"tags"``. The number of substitutions is logged per flag.
    """
    for flag in flags:
        tag_block = f"\n\ntags: {flag['tags']}"
        # A callable replacement keeps the tag text literal (no backslash or
        # group-reference processing); subn also reports how many headings
        # were actually tagged.
        markdown, applied = flag["match"].subn(
            lambda m: m.group(0) + tag_block,
            markdown,
        )
        print(f" ✓ Applied flag {flag['tags']!r}: {applied} match(es)")
    return markdown


def fetch_legislation(source: dict) -> str:
    """Convert one Act's HTML into markdown and tag its flagged sections.

    The HTML is obtained through ``download_html`` (described by the original
    author as "download or use cached"; the caching itself lives in that
    helper). Parts are then selected in this order:

    1. every id listed in ``source["parts_by_id"]``;
    2. every text in ``source["parts_by_text"]`` — first tried as a Part
       heading, then as a Subpart via ``extract_subpart_by_content``;
    3. if both selectors are empty, every ``div.part`` in the document.

    Args:
        source: Entry from ``LEGISLATION_SOURCES``. Reads ``name``, ``url``,
            ``filename``, ``base_url`` (only when something is converted), and
            the optional ``parts_by_id``, ``parts_by_text``, ``section_flags``.

    Returns:
        The selected Parts joined by blank lines, after artefact cleaning,
        section flagging and heading demotion.
    """
    print(f"\n→ {source['name']}")
    path = download_html(source["url"], source["filename"])
    with open(path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # A selector explicitly set to None is treated the same as an empty list.
    part_ids = source.get("parts_by_id") or []
    text_matches = source.get("parts_by_text") or []
    parts_md = []

    for part_id in part_ids:
        part = extract_part(soup, part_id)
        if part:
            md = convert_part(part, base_url=source["base_url"])
            parts_md.append(md)
            print(f" ✓ Part by id={part_id} ({len(md)} chars)")
        else:
            print(f" ✗ Part id={part_id} not found")

    for text_match in text_matches:
        part = _find_part_by_heading(soup, text_match)
        if part:
            md = convert_part(part, base_url=source["base_url"])
            parts_md.append(md)
            print(f" ✓ Part by text={text_match!r} ({len(md)} chars)")
            continue
        # No Part heading matched: fall back to a Subpart lookup.
        subpart = extract_subpart_by_content(soup, text_match)
        if subpart:
            md = convert_subpart(subpart, base_url=source["base_url"])
            parts_md.append(md)
            print(f" ✓ Subpart by text={text_match!r} ({len(md)} chars)")
        else:
            print(f" ✗ No Part/Subpart matching text={text_match!r}")

    if not part_ids and not text_matches:
        # No selectors configured: take the whole Act, Part by Part.
        for part in soup.find_all("div", class_="part"):
            parts_md.append(convert_part(part, base_url=source["base_url"]))
        print(f" ✓ All Parts ({len(parts_md)} parts, "
              f"{sum(len(m) for m in parts_md)} chars)")

    merged = "\n\n".join(parts_md)

    # Clean PDF/HTML extraction artefacts before applying section flags.
    merged, clean_stats = clean_corpus_artifacts(merged)
    print(format_stats(clean_stats, label=source["name"]))

    merged = _apply_section_flags(merged, source.get("section_flags") or [])

    # Demote headings so the Act's Parts/Subparts/sections nest under the
    # source-level H2 wrapper added in build(); per the original author this
    # avoids H2 collisions between Acts that share Part titles.
    return demote_headings(merged)
def fetch_guidance(source: dict, *, timeout: float = 60.0) -> str:
    """Download a guidance document (HTML or PDF) and convert it to markdown.

    The raw download is stored at ``SOURCES_RAW / source["filename"]`` and
    reused on later runs if that file already exists.

    Args:
        source: Entry from ``GUIDANCE_SOURCES``. Reads ``name``, ``url``,
            ``filename``, ``section_title`` and the optional ``format`` hint.
        timeout: Seconds to wait on the network before giving up. Without a
            bound, a stalled server would hang the whole build.

    Returns:
        A markdown block made of an H2 title, a ``Source:`` line and the
        cleaned, heading-demoted body; or ``""`` if the download failed.
    """
    print(f"\n→ {source['name']}")
    cache_path = SOURCES_RAW / source["filename"]

    if cache_path.exists():
        print(f" Using cached: {cache_path.name}")
    else:
        import urllib.request  # only needed on a cache miss

        req = urllib.request.Request(
            source["url"],
            headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"},
        )
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                data = resp.read()
            # Written only after the full body has been read, so a failed
            # transfer never leaves a partial file to be mistaken for a cache.
            cache_path.write_bytes(data)
            print(f" Downloaded: {cache_path.name} ({len(data) / 1024:.0f} KB)")
        except Exception as e:
            # Deliberate best-effort boundary: one unreachable source must not
            # abort the build. The failure is reported and the source skipped.
            print(f" ✗ Download failed: {e}")
            return ""

    body = extract_to_markdown(cache_path, format_hint=source.get("format")).strip()

    # Clean PDF/HTML extraction artefacts (page numbers, headers, chrome, control chars).
    body, clean_stats = clean_corpus_artifacts(body)
    print(format_stats(clean_stats, label=source["name"]))

    # Demote body headings so they nest under the source-level H2 wrapper
    # added below (avoids cross-source collisions such as several
    # "## Introduction" headings).
    body = demote_headings(body)

    return (
        f"\n## {source['section_title']}\n\n"
        f"Source: {source['url']}\n\n"
        f"{body}\n"
    )
def build() -> None:
    """Assemble the consumer-protection compilation and write it to DOMAIN_FILE.

    Every legislation source is converted and wrapped in an H2 section with a
    ``Source:`` line; every guidance source contributes the block returned by
    ``fetch_guidance``. Sources that yield an empty block are skipped. The
    result is written as UTF-8 and a one-line summary is printed.
    """
    print("Building consumer_protection compilation\n")

    legislation_blocks = []
    for src in LEGISLATION_SOURCES:
        block = fetch_legislation(src)
        if block:
            legislation_blocks.append(
                f"\n## {src['name']}\n\nSource: {src['base_url']}\n\n{block}"
            )

    guidance_blocks = []
    for src in GUIDANCE_SOURCES:
        block = fetch_guidance(src)
        if block:
            guidance_blocks.append(block)

    body = "\n\n".join(legislation_blocks + guidance_blocks)

    # Header paragraphs are joined with blank lines, matching the "\n\n"
    # separators used in the per-source wrappers, so that markdown treats the
    # title, the Source line and each paragraph as separate blocks.
    header = "\n\n".join([
        "# Consumer Protection — NZ Healthcare Marketing Regulation",
        "Source: https://comcom.govt.nz/business/dealing-with-typical-situations/making-accurate-claims",
        "This compilation covers New Zealand's general consumer-protection law "
        "as it applies to healthcare marketing. The Fair Trading Act 1986 — "
        "administered by the Commerce Commission — prohibits misleading and "
        "deceptive conduct, false representations, and unsubstantiated claims "
        "(s12A). For health and wellness advertisers, s12A is the "
        "most-tripped-over provision; making any health benefit claim without "
        "a reasonable basis is a Fair Trading Act breach regardless of whether "
        "the claim is technically true.",
        "Frequently-cited provisions are flagged with `tags:` metadata for "
        "retrieval-time surfacing.",
    ])
    # Each source block already begins with a newline, so a single "\n" here
    # leaves exactly one blank line between the header and the first section.
    compilation = f"{header}\n{body}\n"

    DOMAIN_FILE.write_text(compilation, encoding="utf-8")

    lines = compilation.count("\n") + 1
    size_kb = len(compilation.encode("utf-8")) / 1024
    print(f"\n✅ Wrote {DOMAIN_FILE} ({lines} lines, {size_kb:.1f} KB)")


if __name__ == "__main__":
    build()
|