Spaces:

kshitij8076
/

shl-recommender-api

Sleeping

shl-recommender-api / scripts /validate_coverage.py

pankaj

SHL recommender — initial deploy

870800f 29 days ago

1.8 kB

	"""Verify every labeled URL in train.csv is present in data/products.jsonl.

	URLs in the labels appear in two forms:
	https://www.shl.com/products/product-catalog/view/<slug>/
	https://www.shl.com/solutions/products/product-catalog/view/<slug>/
	We normalize both to the slug for comparison.
	"""
	from __future__ import annotations

	import csv
	import json
	import re
	from pathlib import Path

	# Directory two levels above this file (the parent of the folder holding the script).
	ROOT = Path(__file__).resolve().parent.parent
	# JSONL catalog: one JSON object per line, each carrying a "url" key.
	PRODUCTS = ROOT.joinpath("data", "products.jsonl")
	# CSV files whose second column holds the labeled product URL.
	TRAIN = ROOT.joinpath("train.csv")
	TEST = ROOT.joinpath("test.csv")

	# Captures the path segment right after ".../product-catalog/view/", stopping at
	# the next "/", "?" or "#". Matching is case-insensitive.
	SLUG_RE = re.compile(r"/product-catalog/view/([^/?#]+)/?", re.I)


	def slug(url: str | None) -> str | None:
	    """Return the lowercased catalog slug embedded in *url*.

	    Both the ``/products/product-catalog/view/<slug>/`` and the
	    ``/solutions/products/product-catalog/view/<slug>/`` URL forms reduce to
	    the same slug, so the result can be used as a comparison key.

	    Args:
	        url: A product URL. ``None`` and the empty string are accepted.

	    Returns:
	        The slug in lowercase, or ``None`` when *url* is empty or does not
	        contain a ``/product-catalog/view/<slug>`` segment.
	    """
	    if not url:
	        return None
	    # Strip surrounding whitespace first: without a trailing slash, a stray
	    # space or newline from a CSV cell would otherwise end up inside the slug.
	    match = SLUG_RE.search(url.strip())
	    return match.group(1).lower() if match else None


	def main() -> None:
	    """Print how many labeled URLs in the train/test CSVs are missing from the catalog.

	    Reads one JSON object per non-blank line of ``PRODUCTS``, collects the slug
	    of every product's ``"url"``, then reports coverage for ``TRAIN`` and
	    ``TEST`` on stdout.

	    Raises:
	        OSError: If the catalog or one of the CSV files cannot be opened.
	        json.JSONDecodeError: If a non-blank catalog line is not valid JSON.
	        KeyError: If a catalog record has no ``"url"`` key.
	    """
	    # Iterate the file handle rather than read_text().splitlines(): splitlines()
	    # also breaks on U+2028/U+2029/U+0085, which may appear unescaped inside
	    # JSON strings and would cut a record in half.
	    with PRODUCTS.open(encoding="utf-8") as catalog:
	        products = [json.loads(line) for line in catalog if line.strip()]
	    # Compute each slug once; products whose URL yields no slug are left out.
	    have = {s for s in (slug(p["url"]) for p in products) if s}
	    print(f"products: {len(products)} | unique slugs: {len(have)}")

	    def coverage(csv_path: Path, name: str) -> None:
	        """Print label coverage for one CSV, using its first two columns as (query, url)."""
	        # newline="" leaves line-ending handling to the csv module, so quoted
	        # fields containing embedded newlines are parsed intact.
	        with csv_path.open(encoding="utf-8", newline="") as f:
	            reader = csv.reader(f)
	            next(reader, None)  # skip the first row (treated as the header)
	            labels: list[tuple[str, str]] = [
	                (row[0], row[1]) for row in reader if len(row) >= 2 and row[1]
	            ]
	        if not labels:
	            print(f"\n{name}: no labels (empty test set is normal).")
	            return
	        # A URL whose slug cannot be extracted (slug() returns None) counts as missing.
	        missing = [(q[:60], u) for q, u in labels if slug(u) not in have]
	        print(f"\n{name}: {len(labels)} labeled rows | missing in catalog: {len(missing)}")
	        # Only the first ten misses are listed to keep the report short.
	        for q, u in missing[:10]:
	            print(f" MISS q={q!r} url={u}")

	    coverage(TRAIN, "train")
	    coverage(TEST, "test")


	# Run the coverage report only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()