"""Verify every labeled URL in train.csv is present in data/products.jsonl.

URLs in the labels appear in two forms:
  https://www.shl.com/products/product-catalog/view/<slug>/
  https://www.shl.com/solutions/products/product-catalog/view/<slug>/
Both forms are normalized to the lowercased <slug> before comparison.
The same check is also run against test.csv when it contains labeled rows.
"""
from __future__ import annotations

import csv
import json
import re
from pathlib import Path

# Repository root: the parent of the directory that contains this script.
ROOT = Path(__file__).resolve().parent.parent
# Product catalog, parsed by main() as one JSON document per line.
PRODUCTS = ROOT.joinpath("data", "products.jsonl")
# Label files whose URL column is checked against the catalog.
TRAIN = ROOT.joinpath("train.csv")
TEST = ROOT.joinpath("test.csv")

# Captures the path segment that follows "/product-catalog/view/". Matching is
# case-insensitive and the slug ends at the next "/", "?" or "#".
SLUG_RE = re.compile(r"/product-catalog/view/([^/?#]+)/?", re.I)


def slug(url: str) -> str | None:
    """Return the lowercased catalog slug found in *url*, or ``None``.

    URLs with and without the ``/solutions`` prefix reduce to the same slug,
    so the result can be used as a comparison key.

    Args:
        url: A product URL; may be empty or ``None``.

    Returns:
        The slug in lowercase, or ``None`` when *url* is empty, blank, or has
        no ``/product-catalog/view/<slug>`` segment.
    """
    if not url:
        return None
    # Cells may carry stray surrounding whitespace. Without a trailing "/" it
    # would be captured as part of the slug (the character class only excludes
    # "/", "?" and "#") and produce a false mismatch.
    m = SLUG_RE.search(url.strip())
    return m.group(1).lower() if m else None


def main() -> None:
    """Print how many labeled URLs are missing from the product catalog.

    Loads the catalog, collects the slug of every product URL, then reports
    for train.csv and test.csv the number of labeled rows and up to ten rows
    whose URL slug is not in the catalog.
    """
    # Iterate the file rather than ``read_text().splitlines()``:
    # str.splitlines also breaks on characters such as U+2028 or U+0085,
    # which may appear unescaped inside a JSON string and would cut a record
    # in half.
    with PRODUCTS.open(encoding="utf-8") as f:
        products = [json.loads(line) for line in f if line.strip()]
    # Normalize each product URL once; products whose URL has no slug are
    # left out of the lookup set.
    slugs = (slug(p["url"]) for p in products)
    have = {s for s in slugs if s}
    print(f"products: {len(products)} | unique slugs: {len(have)}")

    def coverage(csv_path: Path, name: str) -> None:
        """Report labeled rows of *csv_path* whose URL slug is not in ``have``.

        Args:
            csv_path: CSV file with a header row; column 0 is the query and
                column 1 the labeled URL.
            name: Label used in the printed report.
        """
        # newline="" is what the csv module asks for, so that line breaks
        # inside quoted fields reach the parser untouched.
        with csv_path.open(encoding="utf-8", newline="") as f:
            reader = csv.reader(f)
            next(reader, None)  # skip the header row
            # Rows with fewer than two columns or an empty URL are unlabeled.
            labels: list[tuple[str, str]] = [
                (row[0], row[1]) for row in reader if len(row) >= 2 and row[1]
            ]
        if not labels:
            print(f"\n{name}: no labels (empty test set is normal).")
            return
        # A URL with no recognizable slug yields None, which is never in
        # ``have``, so it is reported as missing too. Queries are cut to 60
        # characters for display.
        missing = [(q[:60], u) for q, u in labels if slug(u) not in have]
        print(f"\n{name}: {len(labels)} labeled rows | missing in catalog: {len(missing)}")
        for q, u in missing[:10]:
            print(f"  MISS  q={q!r}  url={u}")

    coverage(TRAIN, "train")
    coverage(TEST, "test")


# Run the coverage report only when executed as a script, not on import.
if __name__ == "__main__":
    main()