# NOTE(review): extraction artifact removed here — Hugging Face Spaces page
# chrome (status, file size, commit hashes, line-number gutter), not source.
#!/usr/bin/env python3
"""
Build User Sequences for Sequential Models (SASRec, YoutubeDNN)
Converts user interaction history into padded sequences for training.
TIME-SPLIT (strict): Uses train.csv ONLY for sequences and item_map.
This prevents leakage when Ranking uses SASRec embeddings as features:
- Val/test samples must not appear in user history when computing sasrec_score.
- Recall models (SASRec, YoutubeDNN) overwrite these with their own train-only output.
Usage:
python scripts/data/build_sequences.py
Input:
- data/rec/train.csv (val.csv, test.csv exist but are NOT used for sequences)
Output:
- data/rec/user_sequences.pkl (Dict[user_id, List[item_id]]) — train-only
- data/rec/item_map.pkl (Dict[isbn, item_id]) — train-only
"""
# Third-party: pandas for CSV handling, tqdm for progress display.
import pandas as pd
import pickle
import logging
from pathlib import Path
from tqdm import tqdm

# Configure logging once at import time; module-level logger per convention.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def build_sequences(data_dir="data/rec", max_len=50):
    """
    Build user sequences from train.csv only (strict time-split).

    Val/test are excluded to avoid leakage in ranking features (sasrec_score).

    Args:
        data_dir: Directory containing train.csv; output pickles are written here.
        max_len: Maximum items kept per user (the most recent ones win).

    Writes:
        {data_dir}/item_map.pkl       -- Dict[isbn, item_id], ids start at 1
        {data_dir}/user_sequences.pkl -- Dict[str(user_id), List[item_id]]
    """
    # Same logger object as the module-level one (identical __name__).
    logger = logging.getLogger(__name__)
    logger.info("Building user sequences (train-only, time-split)...")
    train_df = pd.read_csv(f"{data_dir}/train.csv")

    # 1. Item map from train only (matches SASRec/YoutubeDNN).
    #    IDs start at 1 so 0 remains free as the padding token.
    items = train_df["isbn"].unique()
    item_map = {isbn: i + 1 for i, isbn in enumerate(items)}
    logger.info(" Items (train): %d", len(item_map))

    # 2. User history from train only (no val/test).
    #    kind="stable" keeps original CSV order for equal timestamps,
    #    making the output deterministic (default quicksort is not stable).
    if "timestamp" in train_df.columns:
        train_df = train_df.sort_values(["user_id", "timestamp"], kind="stable")

    # Vectorized map + groupby replaces the per-row iterrows() loop.
    # Every isbn is in item_map by construction, so no missing-key check
    # is needed; within-group order is preserved by pandas groupby.
    item_ids = train_df["isbn"].map(item_map)
    final_seqs = {
        str(user_id): seq.tolist()[-max_len:]
        for user_id, seq in item_ids.groupby(train_df["user_id"], sort=False)
    }
    logger.info(" Users: %d", len(final_seqs))

    out_dir = Path(data_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / "item_map.pkl", "wb") as f:
        pickle.dump(item_map, f)
    with open(out_dir / "user_sequences.pkl", "wb") as f:
        pickle.dump(final_seqs, f)
    logger.info("Sequence data saved (train-only).")
if __name__ == "__main__":
    # Run with defaults: reads data/rec/train.csv, writes pickles to data/rec/.
    build_sequences()