| """Chronological leave-one-out split. |
| |
| For each user, sort interactions by timestamp (ties broken by item index for |
| determinism). The newest becomes the test positive, second-newest the val |
| positive, the rest go to train. The `min_user_interactions` filter in |
| preprocessing guarantees every user has at least 3 interactions. |
| |
| Random LOO leaks future info and is the #1 methodology error in rec-sys |
| tutorials — see Rendle et al. 2020, "Neural Collaborative Filtering vs. |
| Matrix Factorization Revisited." |
| """ |
|
|
| from __future__ import annotations |
|
|
| from dataclasses import dataclass |
|
|
| import numpy as np |
|
|
| from ..logging_utils import get_logger |
|
|
| _logger = get_logger(__name__) |
|
|
|
|
@dataclass(frozen=True, eq=False)
class Split:
    """A train/val/test split of user-item positives.

    `train_pairs` is an [N, 2] array of (user_idx, item_idx).
    `val_pairs` / `test_pairs` are [M, 2] arrays — exactly one row per user
    that has both a val and a test interaction (they should all, by the
    min-interactions filter).

    Equality is value-based: two splits are equal when all three arrays match
    element-wise (same shape, same contents). The dataclass-generated
    `__eq__` is disabled because it compares the fields as a tuple, which
    raises `ValueError` on multi-element arrays. Instances are unhashable,
    since the underlying arrays are mutable.
    """

    train_pairs: np.ndarray
    val_pairs: np.ndarray
    test_pairs: np.ndarray

    def __eq__(self, other: object) -> bool:
        """Return True when `other` is a `Split` holding equal arrays."""
        if other.__class__ is not self.__class__:
            return NotImplemented
        field_pairs = (
            (self.train_pairs, other.train_pairs),
            (self.val_pairs, other.val_pairs),
            (self.test_pairs, other.test_pairs),
        )
        # The identity shortcut keeps `split == split` true even for arrays
        # containing NaN, matching tuple-comparison semantics.
        return all(
            mine is theirs or np.array_equal(mine, theirs)
            for mine, theirs in field_pairs
        )

    # Value-based equality over mutable arrays: hashing is intentionally
    # unsupported (the generated tuple hash could never succeed anyway).
    __hash__ = None
|
|
|
|
def leave_one_out_split(interactions: np.ndarray) -> Split:
    """Split the [N, 3] (user_idx, item_idx, timestamp) array chronologically.

    Rows are ordered by user, then timestamp, then item index (the item index
    only breaks timestamp ties so the result is deterministic). For every
    user with at least 3 interactions, the newest row becomes the test
    positive, the second-newest the val positive, and all earlier rows go to
    train. Users with fewer than 3 interactions are skipped with a warning.

    Args:
        interactions: 2-D array with exactly 3 columns
            (user_idx, item_idx, timestamp).

    Returns:
        A `Split` whose three members are int64 arrays of shape [K, 2]
        holding (user_idx, item_idx). Val/test rows are ordered by ascending
        user; train rows by user, then chronologically.

    Raises:
        ValueError: if `interactions` is not of shape [N, 3].
    """
    if interactions.ndim != 2 or interactions.shape[1] != 3:
        raise ValueError(
            f"expected interactions of shape [N, 3], got {interactions.shape}"
        )

    n_rows = interactions.shape[0]
    if n_rows == 0:
        # Nothing to split. Return well-formed empty [0, 2] arrays rather
        # than failing on the first-user lookup below.
        _logger.info("LOO split: train=%d, val=%d, test=%d pairs", 0, 0, 0)
        return Split(
            train_pairs=np.empty((0, 2), dtype=np.int64),
            val_pairs=np.empty((0, 2), dtype=np.int64),
            test_pairs=np.empty((0, 2), dtype=np.int64),
        )

    # np.lexsort treats the LAST key as primary: user, then timestamp, then
    # item index as the deterministic tie-breaker.
    order = np.lexsort(
        (interactions[:, 1], interactions[:, 2], interactions[:, 0])
    )
    sorted_ia = interactions[order]
    users = sorted_ia[:, 0]

    # Row offsets where a new user's run begins, bracketed by 0 and n_rows.
    # Comparing neighbours avoids arithmetic on the ids themselves, which
    # could wrap around at the extremes of the dtype.
    change_points = np.flatnonzero(users[1:] != users[:-1]) + 1
    boundaries = np.concatenate(([0], change_points, [n_rows]))
    starts, ends = boundaries[:-1], boundaries[1:]
    run_lens = ends - starts

    too_short = run_lens < 3
    for start, run_len in zip(starts[too_short], run_lens[too_short]):
        _logger.warning(
            "Skipping user_idx=%d with only %d interactions",
            int(users[start]),
            int(run_len),
        )

    kept_ends = ends[~too_short]
    test_rows = (kept_ends - 1).astype(np.int64)
    val_rows = (kept_ends - 2).astype(np.int64)

    # A row is a train row when it sits at least two positions before the end
    # of its user's run. Runs shorter than 3 have no such row, so skipped
    # users contribute nothing here either.
    row_ends = np.repeat(ends, run_lens)
    train_rows = np.flatnonzero(np.arange(n_rows) < row_ends - 2)

    train_pairs = sorted_ia[train_rows, :2].astype(np.int64)
    val_pairs = sorted_ia[val_rows, :2].astype(np.int64)
    test_pairs = sorted_ia[test_rows, :2].astype(np.int64)

    _logger.info(
        "LOO split: train=%d, val=%d, test=%d pairs",
        len(train_pairs),
        len(val_pairs),
        len(test_pairs),
    )

    return Split(train_pairs=train_pairs, val_pairs=val_pairs, test_pairs=test_pairs)
|
|
|
|
def build_user_positives(
    train_pairs: np.ndarray,
    val_pairs: np.ndarray,
    test_pairs: np.ndarray,
    num_users: int,
) -> list[set[int]]:
    """Return a list indexed by user_idx -> set of item indices the user has
    interacted with across ALL splits.

    Used by the negative sampler: a sampled "negative" must not collide with
    any positive the user has, including held-out val/test positives.
    Otherwise you'd be training against the very items you're evaluating on.

    Args:
        train_pairs: rows of (user_idx, item_idx) from the training split.
        val_pairs: rows of (user_idx, item_idx) from the validation split.
        test_pairs: rows of (user_idx, item_idx) from the test split.
        num_users: length of the returned list; users with no rows get an
            empty set.

    Returns:
        A list of `num_users` sets of Python ints.

    Raises:
        IndexError: if any user_idx falls outside `[0, num_users)`. Negative
            indices are rejected explicitly so they cannot silently alias to
            a user at the end of the list.
    """
    seen: list[set[int]] = [set() for _ in range(num_users)]
    for pairs in (train_pairs, val_pairs, test_pairs):
        # .tolist() yields plain Python scalars in one pass instead of
        # materialising a small array per row.
        for user, item in np.asarray(pairs).tolist():
            user_idx = int(user)
            if not 0 <= user_idx < num_users:
                raise IndexError(
                    f"user_idx {user_idx} out of range for num_users={num_users}"
                )
            seen[user_idx].add(int(item))
    return seen
|
|
|
|
def build_user_train_positives(
    train_pairs: np.ndarray, num_users: int
) -> list[set[int]]:
    """Subset of `build_user_positives` that includes only training positives.

    Used by the evaluator to mask items the user has already seen in training
    (so they don't get recommended as if they were novel).

    Args:
        train_pairs: rows of (user_idx, item_idx) from the training split.
        num_users: length of the returned list; users with no training rows
            get an empty set.

    Returns:
        A list of `num_users` sets of Python ints.

    Raises:
        IndexError: if any user_idx falls outside `[0, num_users)`. Negative
            indices are rejected explicitly so they cannot silently alias to
            a user at the end of the list.
    """
    seen: list[set[int]] = [set() for _ in range(num_users)]
    # .tolist() yields plain Python scalars in one pass instead of
    # materialising a small array per row.
    for user, item in np.asarray(train_pairs).tolist():
        user_idx = int(user)
        if not 0 <= user_idx < num_users:
            raise IndexError(
                f"user_idx {user_idx} out of range for num_users={num_users}"
            )
        seen[user_idx].add(int(item))
    return seen
|
|