import pickle
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')


# source data: https://tianchi.aliyun.com/dataset/dataDetail?dataId=649

def check_dominate_event_type(event_type_seq, threshold=0.7):
    """Return True if a single event type accounts for more than `threshold` of the sequence."""
    event_types = np.unique(event_type_seq)
    total_len = len(event_type_seq)
    type_ratio = [np.sum(event_type_seq == event_type_i) / total_len for event_type_i in event_types]

    return max(type_ratio) > threshold
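
# Illustrative usage of check_dominate_event_type (this helper is not called
# elsewhere in the pipeline; the inputs below are made-up examples):
#   check_dominate_event_type(np.array([3, 3, 3, 3, 7]))  -> True  (type 3 covers 0.8 > 0.7)
#   check_dominate_event_type(np.array([3, 3, 7, 7]))     -> False (max ratio 0.5)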


def cate_map(cate_id, cate_event_map_df):
    """Look up the event id assigned to a raw category id in the cate -> event_id map."""
    return cate_event_map_df.loc[cate_event_map_df['cate'] == cate_id, 'event_id'].iloc[0]


def read_data_step_3(source_dir, cate_dir, target_dir):
    """Map raw category ids to event ids and clip overly large inter-event intervals."""
    train_df = pd.read_csv(source_dir, header=0)

    cate_event_map_df = pd.read_csv(cate_dir, header=0)

    train_df['event_type'] = train_df['cate_id'].apply(lambda x: cate_map(x, cate_event_map_df))
    print(train_df['event_type'].value_counts(normalize=True))
    unique_user_id = np.unique(train_df['user_id'])

    for idx, user_id in enumerate(unique_user_id):
        user_mask = train_df['user_id'] == user_id
        user_df = train_df[user_mask]
        prev_time = user_df['event_time'].iloc[0]
        event_dtime = user_df['event_dtime'].values.copy()
        event_time = user_df['event_time'].values.copy()
        event_dtime[0] = 0.0

        for i in range(1, len(event_time)):
            if event_dtime[i] > 50.0:  # interval too large: replace with a small random gap
                rand_dt = np.random.random() + 0.1
                event_time[i] = prev_time + rand_dt
                event_dtime[i] = rand_dt
            else:
                event_time[i] = event_time[i - 1] + event_dtime[i]
            prev_time = event_time[i]

        # write back through .loc on the full frame; assigning to the
        # boolean-indexed copy `user_df` would never reach train_df
        train_df.loc[user_mask, 'event_dtime'] = event_dtime
        train_df.loc[user_mask, 'event_time'] = event_time

        print(min(event_dtime[1:]), max(event_dtime))

        # sanity check: reconstructed timestamps must be consistent with the intervals
        assert abs(np.mean(np.diff(event_time)) - np.mean(event_dtime[1:])) < 0.0001

    train_df.to_csv(target_dir, index=False)
    return


def read_data_step_2(source_dir):
    """Dump normalized category frequencies; the cate -> event_id map is derived from this file."""
    train_df = pd.read_csv(source_dir, header=None)
    train_df.columns = ['user_id', 'item_id', 'cate_id', 'event_type_raw', 'event_time']
    count = train_df['cate_id'].value_counts(normalize=True)
    pd.DataFrame(count).to_csv('taobao_map.csv', header=True)

    return
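

# How taobao_map.csv is turned into the cate -> event_id map consumed by
# read_data_step_3 is not shown in this file (presumably it was done by hand).
# Below is a sketch of one plausible construction, consistent with
# dim_process = 17 in save_data: keep the 16 most frequent categories as event
# types 0..15 and bucket all remaining categories into type 16. The function
# name, file paths, and column handling here are assumptions, not part of the
# original pipeline.
def build_cate_event_map_sketch(map_dir='taobao_map.csv',
                                out_dir='taobao_cate_event_map.csv',
                                n_keep=16):
    count_df = pd.read_csv(map_dir, header=0)
    count_df.columns = ['cate', 'ratio']  # value_counts output: category id, normalized frequency
    # value_counts sorts descending, so rows are already in frequency order
    count_df['event_id'] = np.minimum(np.arange(len(count_df)), n_keep)
    count_df[['cate', 'event_id']].to_csv(out_dir, header=True, index=False)
    return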


def read_data_step_1(source_dir, target_dir):
    """Filter the raw Taobao behavior log to page views and build per-user event sequences."""
    train_df = pd.read_csv(source_dir, header=None)
    train_df.columns = ['user_id', 'item_id', 'cate_id', 'event_type_raw', 'event_time']
    train_df['event_time'] /= 10000  # rescale unix timestamps to a smaller range

    # keep page-view events only
    train_df = train_df[train_df['event_type_raw'] == 'pv']
    unique_user_id = np.unique(train_df['user_id'])

    res = pd.DataFrame()
    total_seq = 0

    for idx, user_id in enumerate(unique_user_id):
        print(f'user {idx}')
        user_df = train_df[train_df['user_id'] == user_id].copy()

        # sort by time first so "consecutive" is well defined
        user_df.sort_values(by=['event_time'], inplace=True)

        # drop consecutive duplicate page views of the same category
        user_df = user_df.loc[user_df['cate_id'].shift() != user_df['cate_id']]

        user_df['event_dtime'] = user_df['event_time'].diff()
        user_df.fillna(0.0, inplace=True)  # the first row has no previous event

        # drop events with dtime <= 0.1 (this also drops each user's first row,
        # whose filled dtime is 0.0; step 3 resets the first interval to 0 anyway)
        user_df = user_df[user_df['event_dtime'] > 0.1]

        if len(user_df) < 40:
            print('user seq is too short, skip it')
            continue

        total_seq += 1
        print(f'{total_seq} users have been recorded')
        res = pd.concat([res, user_df])
        if total_seq > 2000:
            break

    res.to_csv(target_dir, header=True, index=False)

    return


def save_data(source_dir):
    """Convert the processed CSV into pickled event sequences split into train/dev/test."""
    df = pd.read_csv(source_dir, header=0)
    unique_user_id = np.unique(df['user_id'])
    res = []
    print(np.unique(df['event_type']))
    for idx, user_id in enumerate(unique_user_id):
        print(f'user {idx}')
        user_seq = []
        user_df = df[df['user_id'] == user_id]
        length = 0
        for idx_row, row in user_df.iterrows():
            # the first event of a sequence has no preceding event
            event_dtime = 0.0 if length == 0 else row['event_dtime']
            user_seq.append({"time_since_last_event": event_dtime,
                             "time_since_start": row['event_time'],
                             "type_event": row['event_type']
                             })
            length += 1

        res.append(user_seq)

    # dim_process is the number of distinct event types (17)
    with open('../data/taobao/train.pkl', "wb") as f_out:
        pickle.dump(
            {
                "dim_process": 17,
                'train': res[:1300]
            }, f_out
        )

    with open('../data/taobao/dev.pkl', "wb") as f_out:
        pickle.dump(
            {
                "dim_process": 17,
                'dev': res[1300:1500]
            }, f_out
        )

    with open('../data/taobao/test.pkl', "wb") as f_out:
        pickle.dump(
            {
                "dim_process": 17,
                'test': res[1500:]
            }, f_out
        )

    return
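

# A minimal driver sketch showing the intended order of the pipeline steps.
# The original script has no __main__ block, and all file paths below are
# assumptions for illustration only; read_data_step_2's taobao_map.csv is
# turned into the cate -> event_id map (cate_map_csv) outside this pipeline
# (see build_cate_event_map_sketch above for one plausible construction).
if __name__ == '__main__':
    raw_csv = 'UserBehavior.csv'                # assumed raw download location
    step1_csv = 'taobao_step_1.csv'             # assumed intermediate path
    step3_csv = 'taobao_step_3.csv'             # assumed intermediate path
    cate_map_csv = 'taobao_cate_event_map.csv'  # assumed cate -> event_id map

    read_data_step_1(raw_csv, step1_csv)    # filter pv events, build per-user sequences
    read_data_step_2(raw_csv)               # dump category frequencies to taobao_map.csv
    read_data_step_3(step1_csv, cate_map_csv, step3_csv)  # map categories, fix intervals
    save_data(step3_csv)                    # pickle train/dev/test splits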