Spaces:
Sleeping
Sleeping
"""Verify every labeled URL in train.csv (and test.csv, when it has labels) is present in data/products.jsonl.

URLs in the labels appear in two forms:
    https://www.shl.com/products/product-catalog/view/<slug>/
    https://www.shl.com/solutions/products/product-catalog/view/<slug>/
We normalize both to the lowercased slug for comparison.
"""
| from __future__ import annotations | |
| import csv | |
| import json | |
| import re | |
| from pathlib import Path | |
# Project root: the directory one level above the folder holding this script.
ROOT = Path(__file__).resolve().parent.parent

# Input files, all resolved relative to the project root.
PRODUCTS = ROOT.joinpath("data", "products.jsonl")
TRAIN = ROOT.joinpath("train.csv")
TEST = ROOT.joinpath("test.csv")

# Captures the path segment that follows ".../product-catalog/view/",
# stopping at the next "/", "?" or "#". Matching is case-insensitive.
SLUG_RE = re.compile(r"/product-catalog/view/([^/?#]+)/?", re.IGNORECASE)
def slug(url: str) -> str | None:
    """Return the lowercased catalog slug embedded in *url*.

    The slug is the path segment captured by ``SLUG_RE``. ``None`` is
    returned when *url* is empty/falsy or does not match the pattern.
    """
    # Only run the regex on a non-empty value; a falsy url has no slug.
    match = SLUG_RE.search(url) if url else None
    if match is None:
        return None
    return match.group(1).lower()
def main() -> None:
    """Print a catalog-coverage report for the labeled URLs in train.csv and test.csv.

    Loads every JSON record from ``PRODUCTS``, reduces each record's ``url``
    to its slug, and then, for each CSV, prints how many labeled rows point
    at a slug that is absent from the catalog, followed by up to ten examples.
    """
    # Iterate the file instead of ``read_text().splitlines()``: ``splitlines``
    # also breaks on characters such as U+2028, which may appear unescaped
    # inside a JSON string and would cut a JSONL record in half.
    with PRODUCTS.open(encoding="utf-8") as fh:
        products = [json.loads(line) for line in fh if line.strip()]

    # Compute each slug once; records without a usable "url" are skipped
    # rather than raising KeyError.
    slugs = (slug(p.get("url", "")) for p in products)
    have = {s for s in slugs if s}
    print(f"products: {len(products)} | unique slugs: {len(have)}")

    def coverage(csv_path: Path, name: str) -> None:
        """Report labeled rows of *csv_path* whose URL slug is not in the catalog.

        The first row is treated as a header. A row counts as labeled when it
        has at least two columns and a non-empty second column (the URL); the
        first column is the query text. *name* is only used in the output.
        """
        # newline="" is required by the csv module so that line breaks inside
        # quoted fields (multi-line queries) are parsed as part of the field.
        with csv_path.open(encoding="utf-8", newline="") as f:
            reader = csv.reader(f)
            next(reader, None)  # skip the header row, if any
            labels: list[tuple[str, str]] = [
                (row[0], row[1]) for row in reader if len(row) >= 2 and row[1]
            ]
        if not labels:
            print(f"\n{name}: no labels (empty test set is normal).")
            return
        # A URL whose slug cannot be parsed yields None, which is never in
        # ``have``, so it is reported as missing too. Queries are truncated
        # to 60 characters for display.
        missing = [(q[:60], u) for q, u in labels if slug(u) not in have]
        print(f"\n{name}: {len(labels)} labeled rows | missing in catalog: {len(missing)}")
        for q, u in missing[:10]:
            print(f" MISS q={q!r} url={u}")

    coverage(TRAIN, "train")
    coverage(TEST, "test")


if __name__ == "__main__":
    main()