#!/usr/bin/env python3
"""
Build User Sequences for Sequential Models (SASRec, YoutubeDNN)

Converts user interaction history into padded sequences for training.

TIME-SPLIT (strict): Uses train.csv ONLY for sequences and item_map.
This prevents leakage when Ranking uses SASRec embeddings as features:
- Val/test samples must not appear in user history when computing sasrec_score.
- Recall models (SASRec, YoutubeDNN) overwrite these with their own train-only output.

Usage:
    python scripts/data/build_sequences.py

Input:
    - data/rec/train.csv (val.csv, test.csv exist but are NOT used for sequences)

Output:
    - data/rec/user_sequences.pkl (Dict[user_id, List[item_id]]) — train-only
    - data/rec/item_map.pkl (Dict[isbn, item_id]) — train-only
"""
import pandas as pd
import pickle
import logging
from pathlib import Path
from tqdm import tqdm

# Module-level logger; basicConfig makes INFO-level progress visible when run as a script.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def build_sequences(data_dir="data/rec", max_len=50):
    """
    Build per-user interaction sequences from train.csv only (strict time-split).

    Val/test are deliberately excluded so downstream ranking features
    (e.g. sasrec_score) never see held-out interactions in a user's history.

    Args:
        data_dir: Directory containing train.csv; output pickles are written here too.
        max_len: Keep only the most recent ``max_len`` items per user.

    Writes:
        {data_dir}/item_map.pkl        Dict[isbn, item_id] (ids start at 1; 0 reserved for padding)
        {data_dir}/user_sequences.pkl  Dict[str(user_id), List[item_id]] — train-only
    """
    logger.info("Building user sequences (train-only, time-split)...")
    out_dir = Path(data_dir)
    train_df = pd.read_csv(out_dir / "train.csv")

    # 1. Item map from train only (matches SASRec/YoutubeDNN). Ids start at 1
    #    so 0 can serve as the padding token for the sequence models.
    item_map = {isbn: i + 1 for i, isbn in enumerate(train_df["isbn"].unique())}
    logger.info(" Items (train): %d", len(item_map))

    # 2. Order interactions chronologically per user. kind="stable" preserves
    #    original file order on ties — the default quicksort is NOT stable and
    #    made same-timestamp sequences non-deterministic.
    if "timestamp" in train_df.columns:
        train_df = train_df.sort_values(["user_id", "timestamp"], kind="stable")

    # 3. Vectorized history build (replaces a slow per-row iterrows loop).
    #    Every isbn is a key of item_map by construction, so .map never yields NaN.
    item_ids = train_df["isbn"].map(item_map)
    final_seqs = {
        str(user): ids.tolist()[-max_len:]
        for user, ids in item_ids.groupby(train_df["user_id"], sort=False)
    }
    logger.info(" Users: %d", len(final_seqs))

    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / "item_map.pkl", "wb") as f:
        pickle.dump(item_map, f)
    with open(out_dir / "user_sequences.pkl", "wb") as f:
        pickle.dump(final_seqs, f)
    logger.info("Sequence data saved (train-only).")
if __name__ == "__main__":
    # CLI entry point: build train-only sequences with the default paths.
    build_sequences()