# shl-recommender-api/scripts/validate_coverage.py
# pankaj
# SHL recommender — initial deploy
# 870800f
"""Verify every labeled URL in train.csv and test.csv is present in data/products.jsonl.

URLs in the labels appear in two forms:
    https://www.shl.com/products/product-catalog/view/<slug>/
    https://www.shl.com/solutions/products/product-catalog/view/<slug>/
We normalize both to the lowercased slug for comparison.
"""
from __future__ import annotations
import csv
import json
import re
from pathlib import Path
# ROOT is two directory levels above this file, i.e. the parent of the
# directory that contains this script.
ROOT = Path(__file__).resolve().parent.parent

# Input files, all located relative to ROOT.
PRODUCTS = ROOT.joinpath("data", "products.jsonl")  # catalog, one JSON object per line
TRAIN = ROOT.joinpath("train.csv")
TEST = ROOT.joinpath("test.csv")
# Captures the path segment that follows "/product-catalog/view/", stopping at
# the next "/", "?" or "#". Matching is case-insensitive.
SLUG_RE = re.compile(r"/product-catalog/view/([^/?#]+)/?", re.I)


def slug(url: str) -> str | None:
    """Return the lowercased catalog slug found in *url*.

    Args:
        url: A product URL containing ``/product-catalog/view/<slug>``.
            Leading and trailing whitespace is ignored.

    Returns:
        The slug in lower case, or ``None`` when *url* is empty/``None`` or
        does not contain the catalog path.
    """
    if not url:
        return None
    # Strip first: the capture group accepts whitespace, so a URL without a
    # trailing slash but with a trailing space/newline would otherwise yield
    # a slug like "foo " that never matches the catalog.
    m = SLUG_RE.search(url.strip())
    return m.group(1).lower() if m else None
def main() -> None:
    """Print a catalog-coverage report for the labeled URLs in train.csv and test.csv.

    Loads the catalog from ``PRODUCTS`` (one JSON object per line, each with a
    ``"url"`` key), reduces every product URL to its slug, and then reports,
    for each CSV, how many labeled rows point at a slug that is not in the
    catalog. Up to ten missing rows are listed per file.

    Raises:
        OSError: If the catalog or one of the CSV files cannot be opened.
        json.JSONDecodeError: If a non-blank catalog line is not valid JSON.
        KeyError: If a catalog record has no ``"url"`` key.
    """
    # Iterate the file object instead of ``read_text().splitlines()``:
    # str.splitlines() also breaks on U+0085/U+2028/U+2029, which may appear
    # unescaped inside JSON strings and would cut a record in half.
    with PRODUCTS.open(encoding="utf-8") as f:
        products = [json.loads(line) for line in f if line.strip()]

    # Compute each slug once; URLs that yield no slug are left out of the set.
    product_slugs = (slug(p["url"]) for p in products)
    have = {s for s in product_slugs if s}
    print(f"products: {len(products)} | unique slugs: {len(have)}")

    def coverage(csv_path: Path, name: str) -> None:
        """Report rows of *csv_path* whose URL slug is missing from ``have``.

        The first row is skipped (assumed to be a header). Column 0 is
        treated as the query text and column 1 as the labeled URL; rows with
        fewer than two columns or a blank URL are ignored.
        """
        labels: list[tuple[str, str]] = []
        # newline="" lets the csv module do its own line-ending handling, as
        # its documentation requires for fields with embedded newlines.
        with csv_path.open(encoding="utf-8", newline="") as f:
            reader = csv.reader(f)
            next(reader, None)  # skip the header row
            for row in reader:
                if len(row) < 2:
                    continue
                url = row[1].strip()
                # A whitespace-only cell is not a label.
                if url:
                    labels.append((row[0], url))
        if not labels:
            print(f"\n{name}: no labels (empty test set is normal).")
            return
        # Queries are truncated to 60 characters to keep the report readable.
        missing = [(q[:60], u) for q, u in labels if slug(u) not in have]
        print(f"\n{name}: {len(labels)} labeled rows | missing in catalog: {len(missing)}")
        for q, u in missing[:10]:
            print(f" MISS q={q!r} url={u}")

    coverage(TRAIN, "train")
    coverage(TEST, "test")


# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()