File size: 8,590 Bytes
bad8b6c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
"""
Build the `consumer_protection` corpus domain.

Combines:
- Fair Trading Act 1986 (Part 1 — Misleading and deceptive conduct, with s12A
  substantiation provision flagged) — administered by the Commerce Commission
- ComCom Making Accurate Claims (health and nutrition) guidance

Pattern adapted from `build_medicines_and_supplements_compilation.py` (the
canonical template). See that script for design notes on per-source section
flagging, fallback strategies, and the open question on retrieval mechanism
for `tags:` metadata.
"""

from __future__ import annotations

import os
import re
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parent))

from bs4 import BeautifulSoup  # noqa: E402
from convert_legislation_html import (  # noqa: E402
    download_html,
    extract_part,
    extract_subpart_by_content,
    convert_part,
    convert_subpart,
)
from clean_artifacts import clean_corpus_artifacts, format_stats  # noqa: E402
from extract_pdf import extract_to_markdown, demote_headings  # noqa: E402

# Repository layout, resolved relative to this script: the project root is the
# directory one level above the folder containing this file.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
CORPUS_DIR = PROJECT_ROOT.joinpath("corpus")
SOURCES_RAW = PROJECT_ROOT.joinpath("sources", "raw")

# Markdown file this script writes for the consumer-protection domain.
DOMAIN_FILE = CORPUS_DIR.joinpath("consumer-protection.md")

# Ensure the output and raw-source cache directories exist at import time.
SOURCES_RAW.mkdir(parents=True, exist_ok=True)
CORPUS_DIR.mkdir(exist_ok=True)

# s12A — substantiation. Becki: "the provision that most often trips up
# health and wellness advertisers"
_S12A_FLAG = {
    # Matches the whole level-4 heading line for section 12A.
    "match": re.compile(r"^####\s*12A\b[^\n]*", flags=re.MULTILINE),
    "tags": "substantiation, claims, frequently-cited",
}

_FAIR_TRADING_ACT = {
    "name": "Fair Trading Act 1986",
    "url": "https://www.legislation.govt.nz/act/public/1986/0121/latest/whole.html",
    "filename": "fair-trading-act-1986.html",
    "base_url": "https://www.legislation.govt.nz/act/public/1986/0121/latest/",
    # First run: take all Parts. After inspection we may narrow to just Part 1
    # (Misleading and deceptive conduct) — the part containing s12A substantiation.
    "parts_by_id": [],
    "parts_by_text": [],
    "section_flags": [_S12A_FLAG],
}

# Legislation to include in this domain, one dict per Act.
LEGISLATION_SOURCES = [_FAIR_TRADING_ACT]

# Regulator guidance documents appended after the legislation. Keys read by
# fetch_guidance(): "name" (log label), "url" (download address, also emitted
# as the "Source:" line), "filename" (cache file name under SOURCES_RAW),
# "format" (hint passed to extract_to_markdown) and "section_title" (text of
# the H2 heading that wraps the document in the corpus).
GUIDANCE_SOURCES = [
    {
        # The em dashes below were previously stored as mis-decoded UTF-8
        # ("β€”"), which leaked into the generated corpus heading.
        "name": "ComCom — Making accurate claims (health and nutrition)",
        "url": "https://comcom.govt.nz/business/dealing-with-typical-situations/making-accurate-claims",
        "filename": "comcom-making-accurate-claims.html",
        "format": "html",
        "section_title": "Commerce Commission — Making Accurate Claims (Health and Nutrition)",
    },
    # Additional ComCom guidance can be added here once URLs confirmed:
    # - "Trusting origin, environment and health claims" (consumer-facing)
    # - Health sector competition guidelines (older but still cited)
]


def _find_part_by_heading(soup, text_match: str):
    """Return the first ``div.part`` whose ``h2.part`` heading contains *text_match*.

    The comparison is case-insensitive. Returns ``None`` when no Part heading
    contains the text.
    """
    needle = text_match.lower()
    for part in soup.find_all("div", class_="part"):
        heading = part.find("h2", class_="part")
        if heading is not None and needle in heading.get_text().lower():
            return part
    return None


def fetch_legislation(source: dict) -> str:
    """Download (or use cached) HTML and convert to markdown with per-section URLs.

    Args:
        source: A legislation source dict. Reads ``name``, ``url``,
            ``filename`` and ``base_url``; optionally ``parts_by_id``,
            ``parts_by_text`` and ``section_flags`` (each flag holding a
            compiled ``match`` pattern and a ``tags`` string).

    Returns:
        The converted Parts joined by blank lines, cleaned, with a
        ``tags:`` line inserted after every heading matched by a section
        flag, and passed through ``demote_headings``. When neither
        ``parts_by_id`` nor ``parts_by_text`` is given, every ``div.part``
        in the document is converted.
    """
    print(f"\n→ {source['name']}")
    path = download_html(source["url"], source["filename"])

    with open(path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    base_url = source["base_url"]
    part_ids = source.get("parts_by_id", [])
    part_texts = source.get("parts_by_text", [])
    parts_md = []

    for part_id in part_ids:
        part = extract_part(soup, part_id)
        if part:
            md = convert_part(part, base_url=base_url)
            parts_md.append(md)
            print(f"  ✓ Part by id={part_id} ({len(md)} chars)")
        else:
            print(f"  ⚠ Part id={part_id} not found")

    for text_match in part_texts:
        # Prefer a whole Part whose heading contains the text; otherwise fall
        # back to a Subpart located by its content.
        part = _find_part_by_heading(soup, text_match)
        if part:
            md = convert_part(part, base_url=base_url)
            parts_md.append(md)
            print(f"  ✓ Part by text={text_match!r} ({len(md)} chars)")
            continue
        subpart = extract_subpart_by_content(soup, text_match)
        if subpart:
            md = convert_subpart(subpart, base_url=base_url)
            parts_md.append(md)
            print(f"  ✓ Subpart by text={text_match!r} ({len(md)} chars)")
        else:
            print(f"  ⚠ No Part/Subpart matching text={text_match!r}")

    # No selectors configured: take every Part in the Act.
    if not part_ids and not part_texts:
        parts_md.extend(
            convert_part(part, base_url=base_url)
            for part in soup.find_all("div", class_="part")
        )
        print(f"  ✓ All Parts ({len(parts_md)} parts, "
              f"{sum(len(m) for m in parts_md)} chars)")

    merged = "\n\n".join(parts_md)

    # Clean PDF/HTML extraction artefacts before applying section flags
    merged, _clean_stats = clean_corpus_artifacts(merged)
    print(format_stats(_clean_stats, label=source["name"]))

    for flag in source.get("section_flags", []):
        tags = flag["tags"]
        # subn() reports the exact number of headings tagged, rather than
        # inferring it from how often the tags string occurs in the text.
        merged, applied = flag["match"].subn(
            lambda m, tags=tags: f"{m.group(0)}\n\ntags: {tags}",
            merged,
        )
        if applied:
            print(f"  ✓ Applied flag {tags!r}: {applied} match(es)")
        else:
            # A flag that matches nothing usually means the heading format
            # changed; surface it instead of reporting success.
            print(f"  ⚠ Flag {tags!r} matched nothing: 0 match(es)")

    # Demote headings so legislation Parts/Subparts/sections nest under the
    # source-level H2 wrapper added in build() — avoids cross-Act H2 collisions
    # like Privacy Act + UEMA both having "Part 1 Preliminary provisions".
    merged = demote_headings(merged)

    return merged


def fetch_guidance(source: dict, *, timeout: float = 60.0) -> str:
    """Download a guidance document (HTML or PDF) and convert to markdown.

    The raw download is cached at ``SOURCES_RAW / source["filename"]`` and
    reused on later runs.

    Args:
        source: A guidance source dict. Reads ``name``, ``url``, ``filename``
            and ``section_title``; optionally ``format`` (passed to
            ``extract_to_markdown`` as ``format_hint``).
        timeout: Seconds allowed for each blocking network operation while
            downloading; keeps a stalled server from hanging the build.

    Returns:
        A markdown section made of an H2 heading, a ``Source:`` line and the
        cleaned, heading-demoted body. Returns ``""`` when the download
        fails or the extracted body is empty, so callers can skip the source.
    """
    print(f"\n→ {source['name']}")

    cache_path = SOURCES_RAW / source["filename"]
    if cache_path.exists():
        print(f"  Using cached: {cache_path.name}")
    else:
        import urllib.request
        req = urllib.request.Request(
            source["url"],
            headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"},
        )
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                data = resp.read()
            cache_path.write_bytes(data)
            print(f"  Downloaded: {cache_path.name} ({len(data) / 1024:.0f} KB)")
        except Exception as e:
            # Deliberately best-effort: one unreachable source should not
            # abort the whole compilation.
            print(f"  ⚠ Download failed: {e}")
            return ""

    body = extract_to_markdown(cache_path, format_hint=source.get("format")).strip()

    # Clean PDF/HTML extraction artefacts (page numbers, headers, chrome, control chars)
    body, _clean_stats = clean_corpus_artifacts(body)
    print(format_stats(_clean_stats, label=source["name"]))

    # Demote body headings so they nest under the source-level H2 wrapper we
    # add below (avoids cross-source collisions like multiple "## Introduction").
    body = demote_headings(body)

    # Without this guard an empty extraction would still yield a heading-only
    # section, which is truthy and so would be written into the corpus.
    if not body.strip():
        print(f"  ⚠ No content extracted from {cache_path.name}; skipping")
        return ""

    return (
        f"\n## {source['section_title']}\n\n"
        f"Source: {source['url']}\n\n"
        f"{body}\n"
    )


def build() -> None:
    """Assemble all configured sources and write the domain markdown file.

    Converts every entry of ``LEGISLATION_SOURCES`` (each wrapped in its own
    H2 heading plus ``Source:`` line) and ``GUIDANCE_SOURCES``, skips any
    source that produced no content, prepends the domain header, and writes
    the result to ``DOMAIN_FILE`` as UTF-8. Progress is printed to stdout.
    """
    print("Building consumer_protection compilation\n")

    legislation_blocks = []
    for src in LEGISLATION_SOURCES:
        block = fetch_legislation(src)
        if block:
            legislation_blocks.append(
                f"\n## {src['name']}\n\nSource: {src['base_url']}\n\n{block}"
            )

    guidance_blocks = []
    for src in GUIDANCE_SOURCES:
        block = fetch_guidance(src)
        if block:
            guidance_blocks.append(block)

    body = "\n\n".join(legislation_blocks + guidance_blocks)

    # The em dashes in this header were previously mis-decoded UTF-8 ("β€”")
    # and were written verbatim into the corpus file.
    compilation = f"""# Consumer Protection — NZ Healthcare Marketing Regulation

Source: https://comcom.govt.nz/business/dealing-with-typical-situations/making-accurate-claims

This compilation covers New Zealand's general consumer-protection law as it applies to healthcare marketing. The Fair Trading Act 1986 — administered by the Commerce Commission — prohibits misleading and deceptive conduct, false representations, and unsubstantiated claims (s12A). For health and wellness advertisers, s12A is the most-tripped-over provision; making any health benefit claim without a reasonable basis is a Fair Trading Act breach regardless of whether the claim is technically true.

Frequently-cited provisions are flagged with `tags:` metadata for retrieval-time surfacing.

{body}
"""

    DOMAIN_FILE.write_text(compilation, encoding="utf-8")
    # The text always ends with a newline, so the newline count is the line
    # count; adding one would report a phantom trailing line.
    lines = compilation.count("\n")
    size_kb = len(compilation.encode("utf-8")) / 1024
    print(f"\n✅ Wrote {DOMAIN_FILE} ({lines} lines, {size_kb:.1f} KB)")


if __name__ == "__main__":
    build()