#!/usr/bin/env python3
"""
数据划分脚本 - 为推荐系统准备训练/验证/测试集

划分策略: 时序划分 (Leave-Last-Out)
- 每个用户的最后一次评分 → test
- 每个用户的倒数第二次评分 → val
- 其余评分 → train

只保留评分 >= 3 次的用户 (有足够历史)
"""

import pandas as pd
from pathlib import Path
from tqdm import tqdm
import time
import logging

logger = logging.getLogger(__name__)

DATA_PATH = Path("data/raw/Books_rating.csv")
OUTPUT_DIR = Path("data/rec")


def run(
    data_path: Path = DATA_PATH,
    output_dir: Path = OUTPUT_DIR,
) -> None:
    """Split train/val/test with Leave-Last-Out. Callable from Pipeline."""
    output_dir.mkdir(parents=True, exist_ok=True)
    start_time = time.time()

    logger.info("Loading raw ratings...")
    df = pd.read_csv(data_path, usecols=['Id', 'User_id', 'review/score', 'review/time', 'review/text'])
    # Rename explicitly rather than assigning positionally: usecols does not
    # guarantee column order, so df.columns = [...] could silently mislabel.
    df = df.rename(columns={'Id': 'isbn', 'User_id': 'user_id', 'review/score': 'rating',
                            'review/time': 'timestamp', 'review/text': 'review'})
    logger.info(f"  Records: {len(df):,}, Users: {df['user_id'].nunique():,}, Items: {df['isbn'].nunique():,}")

    logger.info("Cleaning data...")
    df = df.dropna(subset=['rating', 'timestamp'])
    # Keep the most recent rating per (user, item): sort by time first so that
    # keep='last' really means "latest", not "last in file order".
    df = df.sort_values('timestamp', kind='stable')
    df = df.drop_duplicates(subset=['user_id', 'isbn'], keep='last')
    df = df[df['rating'] > 0]

    logger.info("Filtering active users (>=3 interactions)...")
    user_counts = df.groupby('user_id').size()
    active_users = user_counts[user_counts >= 3].index
    df = df[df['user_id'].isin(active_users)]
    logger.info(f"  Active users: {len(active_users):,}, Records: {len(df):,}")

    logger.info("Splitting train/val/test (Leave-Last-Out)...")
    # Stable sort keeps a deterministic order for ratings with tied timestamps.
    df = df.sort_values(['user_id', 'timestamp'], kind='stable')
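
    # A vectorized alternative to the per-user loop below (a sketch, not wired
    # in; assumes df is already sorted by ['user_id', 'timestamp'] as above):
    #   rank_from_end = df.groupby('user_id').cumcount(ascending=False)
    #   test_df  = df[rank_from_end == 0]   # most recent rating per user
    #   val_df   = df[rank_from_end == 1]   # second most recent
    #   train_df = df[rank_from_end >= 2]   # everything earlier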

    train_list = []
    val_list = []
    test_list = []

    for user_id, group in tqdm(df.groupby('user_id'), desc="  Splitting"):
        # groupby preserves row order within each group, so each group is
        # already time-sorted: last row → test, second-to-last → val.
        test_list.append(group.iloc[-1])
        val_list.append(group.iloc[-2])
        train_list.extend(group.iloc[:-2].to_dict('records'))

    train_df = pd.DataFrame(train_list)
    val_df = pd.DataFrame(val_list)
    test_df = pd.DataFrame(test_list)

    logger.info(f"  Train: {len(train_df):,}, Val: {len(val_df):,}, Test: {len(test_df):,}")

    train_df.to_csv(output_dir / 'train.csv', index=False)
    val_df.to_csv(output_dir / 'val.csv', index=False)
    test_df.to_csv(output_dir / 'test.csv', index=False)
    pd.DataFrame({'user_id': active_users}).to_csv(output_dir / 'active_users.csv', index=False)

    with open(output_dir / 'stats.txt', 'w') as f:
        for k, v in [('total_records', len(df)), ('train_records', len(train_df)),
                     ('val_records', len(val_df)), ('test_records', len(test_df)),
                     ('active_users', len(active_users)), ('books', df['isbn'].nunique())]:
            f.write(f'{k}: {v:,}\n')

    logger.info("Split complete in %.1fs", time.time() - start_time)


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
    run()
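
# Hypothetical pipeline usage (the module name `split_data` is an assumption,
# not taken from the source):
#   from split_data import run
#   run(data_path=Path("data/raw/Books_rating.csv"), output_dir=Path("data/rec"))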