Spaces:

webmuppetnz
/

hmc-rag

Sleeping

File size: 8,590 Bytes

bad8b6c

"""
Build the `consumer_protection` corpus domain.

Combines:
- Fair Trading Act 1986 — administered by the Commerce Commission. All Parts
  are currently included (Part 1 — Misleading and deceptive conduct — is the
  core material), with the s12A substantiation provision flagged
- ComCom Making Accurate Claims (health and nutrition) guidance

Pattern adapted from `build_medicines_and_supplements_compilation.py` (the
canonical template). See that script for design notes on per-source section
flagging, fallback strategies, and the open question on retrieval mechanism
for `tags:` metadata.
"""

from __future__ import annotations

import os
import re
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parent))

from bs4 import BeautifulSoup  # noqa: E402
from convert_legislation_html import (  # noqa: E402
    download_html,
    extract_part,
    extract_subpart_by_content,
    convert_part,
    convert_subpart,
)
from clean_artifacts import clean_corpus_artifacts, format_stats  # noqa: E402
from extract_pdf import extract_to_markdown, demote_headings  # noqa: E402

# Repository layout, resolved from this script's own location: the project
# root is the directory one level above the folder that contains this file.
_THIS_FILE = Path(__file__).resolve()
PROJECT_ROOT = _THIS_FILE.parents[1]

# Output directory for compiled corpus files, and the cache for raw downloads.
CORPUS_DIR = PROJECT_ROOT.joinpath("corpus")
SOURCES_RAW = PROJECT_ROOT.joinpath("sources", "raw")

# The single markdown file this script produces.
DOMAIN_FILE = CORPUS_DIR.joinpath("consumer-protection.md")

# Import-time side effect: make sure both directories exist before any
# function tries to read from or write to them.
CORPUS_DIR.mkdir(exist_ok=True)
SOURCES_RAW.mkdir(parents=True, exist_ok=True)

# Shared prefix for the Fair Trading Act 1986 on legislation.govt.nz; used both
# to build the download URL and as the base for per-section links.
_FTA_BASE_URL = "https://www.legislation.govt.nz/act/public/1986/0121/latest/"

# s12A — substantiation. Becki: "the provision that most often trips up
# health and wellness advertisers". The pattern targets a level-4 markdown
# heading that starts with "12A" and captures the rest of that heading line.
_S12A_SUBSTANTIATION_FLAG = dict(
    match=re.compile(r"^####\s*12A\b[^\n]*", re.MULTILINE),
    tags="substantiation, claims, frequently-cited",
)

# Legislation to pull into the corpus. Each entry drives one
# fetch_legislation() call.
LEGISLATION_SOURCES = [
    dict(
        name="Fair Trading Act 1986",
        url=_FTA_BASE_URL + "whole.html",
        filename="fair-trading-act-1986.html",
        base_url=_FTA_BASE_URL,
        # First run: take all Parts (both selectors left empty). After
        # inspection we may narrow to just Part 1 (Misleading and deceptive
        # conduct) — the part containing s12A substantiation.
        parts_by_id=[],
        parts_by_text=[],
        section_flags=[_S12A_SUBSTANTIATION_FLAG],
    ),
]

# Regulator guidance documents to append after the legislation. Each entry
# drives one fetch_guidance() call; "format" is passed through as the
# extractor's format hint and "section_title" becomes the section heading.
GUIDANCE_SOURCES = [
    dict(
        name="ComCom — Making accurate claims (health and nutrition)",
        url="https://comcom.govt.nz/business/dealing-with-typical-situations/making-accurate-claims",
        filename="comcom-making-accurate-claims.html",
        format="html",
        section_title="Commerce Commission — Making Accurate Claims (Health and Nutrition)",
    ),
    # Additional ComCom guidance can be added here once URLs confirmed:
    # - "Trusting origin, environment and health claims" (consumer-facing)
    # - Health sector competition guidelines (older but still cited)
]


def _find_part_by_heading(soup, text_match):
    """Return the first ``div.part`` whose ``h2.part`` heading contains *text_match*.

    The comparison is case-insensitive. Returns ``None`` when no Part heading
    contains the text.
    """
    needle = text_match.lower()
    for part in soup.find_all("div", class_="part"):
        heading = part.find("h2", class_="part")
        if heading is not None and needle in heading.get_text().lower():
            return part
    return None


def fetch_legislation(source: dict) -> str:
    """Download (or use cached) HTML and convert to markdown with per-section URLs.

    Part selection is driven by ``source``:

    - ``parts_by_id``: each id is looked up with ``extract_part``.
    - ``parts_by_text``: each string is matched case-insensitively against
      ``h2.part`` headings; if no Part matches, ``extract_subpart_by_content``
      is tried as a fallback.
    - If both lists are empty or absent, every ``div.part`` in the document
      is converted.

    The converted pieces are joined, cleaned with ``clean_corpus_artifacts``,
    tagged according to ``section_flags`` (a ``tags:`` line is appended after
    every heading a flag's ``match`` pattern hits), and finally passed through
    ``demote_headings``.

    Args:
        source: A legislation entry with ``name``, ``url``, ``filename`` and
            ``base_url`` keys, plus optional ``parts_by_id``,
            ``parts_by_text`` and ``section_flags``.

    Returns:
        The merged, cleaned, tagged and heading-demoted markdown.
    """
    print(f"\n→ {source['name']}")
    path = download_html(source["url"], source["filename"])

    with open(path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    part_ids = source.get("parts_by_id") or []
    part_texts = source.get("parts_by_text") or []
    parts_md = []

    for part_id in part_ids:
        part = extract_part(soup, part_id)
        if part:
            md = convert_part(part, base_url=source["base_url"])
            parts_md.append(md)
            print(f"  ✓ Part by id={part_id} ({len(md)} chars)")
        else:
            print(f"  ⚠ Part id={part_id} not found")

    for text_match in part_texts:
        part = _find_part_by_heading(soup, text_match)
        if part:
            md = convert_part(part, base_url=source["base_url"])
            parts_md.append(md)
            print(f"  ✓ Part by text={text_match!r} ({len(md)} chars)")
            continue
        # No Part heading matched — fall back to a Subpart lookup.
        subpart = extract_subpart_by_content(soup, text_match)
        if subpart:
            md = convert_subpart(subpart, base_url=source["base_url"])
            parts_md.append(md)
            print(f"  ✓ Subpart by text={text_match!r} ({len(md)} chars)")
        else:
            print(f"  ⚠ No Part/Subpart matching text={text_match!r}")

    # No explicit selection configured: take the whole Act.
    if not part_ids and not part_texts:
        for part in soup.find_all("div", class_="part"):
            md = convert_part(part, base_url=source["base_url"])
            parts_md.append(md)
        print(f"  ✓ All Parts ({len(parts_md)} parts, "
              f"{sum(len(m) for m in parts_md)} chars)")

    merged = "\n\n".join(parts_md)

    # Clean PDF/HTML extraction artefacts before applying section flags
    merged, _clean_stats = clean_corpus_artifacts(merged)
    print(format_stats(_clean_stats, label=source["name"]))

    for flag in source.get("section_flags", []):
        tags = flag["tags"]
        # A callable replacement keeps any backslashes in the tag text literal,
        # and subn() reports exactly how many headings were tagged.
        merged, applied = flag["match"].subn(
            lambda m, tags=tags: f"{m.group(0)}\n\ntags: {tags}",
            merged,
        )
        if applied:
            print(f"  ✓ Applied flag {tags!r}: {applied} match(es)")
        else:
            # A flag that hits nothing usually means the heading format has
            # changed; surface it instead of reporting success.
            print(f"  ⚠ Flag {tags!r} matched no sections")

    # Demote headings so legislation Parts/Subparts/sections nest under the
    # source-level H2 wrapper added in build() — avoids cross-Act H2 collisions
    # like Privacy Act + UEMA both having "Part 1 Preliminary provisions".
    merged = demote_headings(merged)

    return merged


def fetch_guidance(source: dict, *, timeout: float = 30.0) -> str:
    """Download a guidance document (HTML or PDF) and convert to markdown.

    The raw file is cached under ``SOURCES_RAW`` using ``source["filename"]``;
    an existing cache file is reused without touching the network. The
    extracted markdown is cleaned, its headings are demoted, and the result is
    wrapped in an H2 section with a ``Source:`` line.

    Args:
        source: A guidance entry with ``name``, ``url``, ``filename`` and
            ``section_title`` keys, plus an optional ``format`` hint that is
            forwarded to ``extract_to_markdown``.
        timeout: Seconds to wait on the network before the download is
            treated as failed.

    Returns:
        The markdown section for this source, or ``""`` when the document
        could not be downloaded.
    """
    print(f"\n→ {source['name']}")

    cache_path = SOURCES_RAW / source["filename"]
    if cache_path.exists():
        print(f"  Using cached: {cache_path.name}")
    else:
        import urllib.request

        req = urllib.request.Request(
            source["url"],
            headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"},
        )
        # Stage the download next to the cache file and rename it into place,
        # so an interrupted write never leaves a truncated file that later
        # runs would mistake for a valid cache entry.
        tmp_path = cache_path.with_name(cache_path.name + ".part")
        try:
            # Without a timeout a stalled server would hang the build forever.
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                data = resp.read()
            if not data:
                # Never cache an empty response: it would be reused forever.
                print("  ⚠ Download failed: empty response")
                return ""
            tmp_path.write_bytes(data)
            tmp_path.replace(cache_path)
        except Exception as e:
            # Deliberately best-effort: one unreachable source must not abort
            # the whole compilation; the caller skips empty results.
            print(f"  ⚠ Download failed: {e}")
            return ""
        print(f"  Downloaded: {cache_path.name} ({len(data) / 1024:.0f} KB)")

    body = extract_to_markdown(cache_path, format_hint=source.get("format")).strip()

    # Clean PDF/HTML extraction artefacts (page numbers, headers, chrome, control chars)
    body, _clean_stats = clean_corpus_artifacts(body)
    print(format_stats(_clean_stats, label=source["name"]))

    # Demote body headings so they nest under the source-level H2 wrapper we
    # add below (avoids cross-source collisions like multiple "## Introduction").
    body = demote_headings(body)

    return (
        f"\n## {source['section_title']}\n\n"
        f"Source: {source['url']}\n\n"
        f"{body}\n"
    )


def build():
    """Assemble every configured source into the consumer-protection corpus file.

    Legislation sections are emitted first, followed by guidance sections;
    any source that yields no content is left out. The compiled markdown is
    written to ``DOMAIN_FILE`` and a one-line summary is printed.
    """
    print("Building consumer_protection compilation\n")

    sections = []

    # Legislation markdown comes back without a wrapper, so give each Act its
    # own H2 heading and source line here.
    for act in LEGISLATION_SOURCES:
        act_md = fetch_legislation(act)
        if not act_md:
            continue
        sections.append(
            f"\n## {act['name']}\n\nSource: {act['base_url']}\n\n{act_md}"
        )

    # Guidance blocks already include their own H2 wrapper; only drop the
    # empty ones (e.g. failed downloads).
    sections.extend(filter(None, map(fetch_guidance, GUIDANCE_SOURCES)))

    body = "\n\n".join(sections)

    # Fixed introduction, one entry per paragraph.
    preamble = "\n\n".join((
        "# Consumer Protection — NZ Healthcare Marketing Regulation",
        "Source: https://comcom.govt.nz/business/dealing-with-typical-situations/making-accurate-claims",
        "This compilation covers New Zealand's general consumer-protection law as it applies to healthcare marketing. "
        "The Fair Trading Act 1986 — administered by the Commerce Commission — prohibits misleading and deceptive conduct, false representations, and unsubstantiated claims (s12A). "
        "For health and wellness advertisers, s12A is the most-tripped-over provision; making any health benefit claim without a reasonable basis is a Fair Trading Act breach regardless of whether the claim is technically true.",
        "Frequently-cited provisions are flagged with `tags:` metadata for retrieval-time surfacing.",
    ))
    compilation = f"{preamble}\n\n{body}\n"

    DOMAIN_FILE.write_text(compilation, encoding="utf-8")

    # Summary figures for the console report.
    lines = len(compilation.split("\n"))
    size_kb = len(compilation.encode("utf-8")) / 1024
    print(f"\n✅ Wrote {DOMAIN_FILE} ({lines} lines, {size_kb:.1f} KB)")


# Script entry point: build the corpus file only when this module is run
# directly, not when it is imported.
if __name__ == "__main__":
    build()