"""Verify every labeled URL in train.csv is present in data/products.jsonl. URLs in the labels appear in two forms: https://www.shl.com/products/product-catalog/view// https://www.shl.com/solutions/products/product-catalog/view// We normalize both to the slug for comparison. """ from __future__ import annotations import csv import json import re from pathlib import Path ROOT = Path(__file__).resolve().parent.parent PRODUCTS = ROOT / "data" / "products.jsonl" TRAIN = ROOT / "train.csv" TEST = ROOT / "test.csv" SLUG_RE = re.compile(r"/product-catalog/view/([^/?#]+)/?", re.I) def slug(url: str) -> str | None: if not url: return None m = SLUG_RE.search(url) return m.group(1).lower() if m else None def main() -> None: products = [json.loads(l) for l in PRODUCTS.read_text(encoding="utf-8").splitlines() if l.strip()] have = {slug(p["url"]) for p in products if slug(p["url"])} print(f"products: {len(products)} | unique slugs: {len(have)}") def coverage(csv_path: Path, name: str) -> None: labels: list[tuple[str, str]] = [] with csv_path.open(encoding="utf-8") as f: reader = csv.reader(f) next(reader, None) for row in reader: if len(row) >= 2 and row[1]: labels.append((row[0], row[1])) if not labels: print(f"\n{name}: no labels (empty test set is normal).") return missing = [(q[:60], u) for q, u in labels if slug(u) not in have] print(f"\n{name}: {len(labels)} labeled rows | missing in catalog: {len(missing)}") for q, u in missing[:10]: print(f" MISS q={q!r} url={u}") coverage(TRAIN, "train") coverage(TEST, "test") if __name__ == "__main__": main()