import pickle
import warnings
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')
# source data: https://tianchi.aliyun.com/dataset/dataDetail?dataId=649
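# Raw UserBehavior.csv columns (no header): user_id, item_id, cate_id,
# behavior_type ('pv' / 'buy' / 'cart' / 'fav'), unix timestamp in seconds.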
def check_dominate_event_type(event_type_seq, threshold=0.7):
    """Return True if a single event type accounts for more than `threshold` of the sequence."""
    event_type_seq = np.asarray(event_type_seq)
    event_types = np.unique(event_type_seq)
    total_len = len(event_type_seq)
    type_ratio = [np.sum(event_type_seq == event_type_i) / total_len for event_type_i in event_types]
    return max(type_ratio) > threshold
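# Example usage (illustrative values, not from the dataset):
#   check_dominate_event_type(np.array([0, 0, 0, 1]))  # True: type 0 covers 75% > 0.7
#   check_dominate_event_type(np.array([0, 1, 0, 1]))  # False: each type covers only 50%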
def cate_map(cate_id, cate_event_map_df):
    # Look up the coarse event id assigned to this category;
    # assumes every cate_id in the data appears exactly once in the map.
    return cate_event_map_df[cate_event_map_df['cate'] == cate_id]['event_id'].to_list()[0]
def read_data_step_3(source_dir, cate_dir, target_dir):
    train_df = pd.read_csv(source_dir, header=0)
    cate_event_map_df = pd.read_csv(cate_dir, header=0)
    # Map each raw category to one of the coarse event types.
    train_df['event_type'] = train_df['cate_id'].apply(lambda x: cate_map(x, cate_event_map_df))
    print(train_df['event_type'].value_counts(normalize=True))
    unique_user_id = np.unique(train_df['user_id'])
    for idx, user_id in enumerate(unique_user_id):
        user_mask = train_df['user_id'] == user_id
        user_df = train_df[user_mask]
        prev_time = user_df['event_time'].iloc[0]
        event_dtime = user_df['event_dtime'].values.copy()
        event_time = user_df['event_time'].values.copy()
        event_dtime[0] = 0.0  # the first event of a sequence has no predecessor
        for i in range(1, len(event_time)):
            if event_dtime[i] > 50.0:  # interval too large: replace with a small random gap
                rand_dt = np.random.random() + 0.1
                event_time[i] = prev_time + rand_dt
                event_dtime[i] = rand_dt
            else:
                event_time[i] = event_time[i - 1] + event_dtime[i]
            prev_time = event_time[i]
        # Write the rebuilt timestamps back into train_df; assigning to the
        # boolean-indexed copy `user_df` would be lost when train_df is saved below.
        train_df.loc[user_mask, 'event_dtime'] = event_dtime
        train_df.loc[user_mask, 'event_time'] = event_time
        print(min(event_dtime[1:]), max(event_dtime))
        # Sanity check: mean inter-event gap must match the stored dtimes.
        assert abs(np.mean(np.diff(event_time)) - np.mean(event_dtime[1:])) < 1e-4
    train_df.to_csv(target_dir, index=False)
    return
def read_data_step_2(source_dir):
    train_df = pd.read_csv(source_dir, header=None)
    train_df.columns = ['user_id', 'item_id', 'cate_id', 'event_type_raw', 'event_time']
    count = train_df['cate_id'].value_counts(normalize=True)
    pd.DataFrame(count).to_csv('taobao_map.csv', header=True)
    return
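# NOTE (an assumption, not stated in this script): cate_map() expects the map file
# to have columns ['cate', 'event_id'], while step 2 only writes cate_id frequencies.
# The 17 coarse event ids (see dim_process in save_data) are presumably derived
# offline from taobao_map.csv; a minimal sketch of one such derivation:
#
#   map_df = pd.read_csv('taobao_map.csv', header=0)
#   map_df.columns = ['cate', 'proportion']          # value_counts is sorted descending
#   # keep the 16 most frequent categories as events 0..15, bucket the rest into 16
#   map_df['event_id'] = np.minimum(np.arange(len(map_df)), 16)
#   map_df[['cate', 'event_id']].to_csv('taobao_cate_map.csv', index=False)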
def read_data_step_1(source_dir, target_dir):
    train_df = pd.read_csv(source_dir, header=None)
    train_df.columns = ['user_id', 'item_id', 'cate_id', 'event_type_raw', 'event_time']
    train_df['event_time'] /= 10000  # rescale unix timestamps to a smaller numeric range
    train_df = train_df[train_df['event_type_raw'] == 'pv']  # keep page-view events only
    unique_user_id = np.unique(train_df['user_id'])
    res = pd.DataFrame()
    total_seq = 0
    for idx, user_id in enumerate(unique_user_id):
        print(f'user {idx}')
        user_df = train_df[train_df['user_id'] == user_id].copy()
        # drop consecutive duplicates (repeated pv on the same category)
        user_df = user_df.loc[user_df['cate_id'].shift() != user_df['cate_id']]
        user_df.sort_values(by=['event_time'], inplace=True)
        user_df['event_dtime'] = user_df['event_time'].diff()
        user_df.fillna(0.0, inplace=True)
        # drop events that follow their predecessor too closely (dtime <= 0.1);
        # this also drops the first event of each user, whose dtime was filled with 0
        user_df = user_df[user_df['event_dtime'] > 0.1]
        if len(user_df) < 40:
            print('user seq is too short, skip it')
            continue
        total_seq += 1
        print(f'{total_seq} users have been recorded')
        res = pd.concat([res, user_df])
        if total_seq > 2000:
            break
    res.to_csv(target_dir, header=True, index=False)
    return
def save_data(source_dir):
    df = pd.read_csv(source_dir, header=0)
    unique_user_id = np.unique(df['user_id'])
    res = []
    print(np.unique(df['event_type']))
    for idx, user_id in enumerate(unique_user_id):
        print(f'user {idx}')
        user_seq = []
        user_df = df[df['user_id'] == user_id]
        length = 0
        for idx_row, row in user_df.iterrows():
            # force the first event's dtime to 0 regardless of what the CSV stores
            event_dtime = 0 if length == 0 else row['event_dtime']
            user_seq.append({
                "time_since_last_event": event_dtime,
                "time_since_start": row['event_time'],
                "type_event": row['event_type'],
            })
            length += 1
        res.append(user_seq)
    # dim_process = 17 coarse event types; fixed train/dev/test split by user index
    with open('../data/taobao/train.pkl', "wb") as f_out:
        pickle.dump({"dim_process": 17, 'train': res[:1300]}, f_out)
    with open('../data/taobao/dev.pkl', "wb") as f_out:
        pickle.dump({"dim_process": 17, 'dev': res[1300:1500]}, f_out)
    with open('../data/taobao/test.pkl', "wb") as f_out:
        pickle.dump({"dim_process": 17, 'test': res[1500:]}, f_out)
    return
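if __name__ == '__main__':
    # A minimal sketch of the intended pipeline order. The file names below are
    # assumptions (the Tianchi download is usually named UserBehavior.csv), and
    # taobao_cate_map.csv is the hand-derived map described above read_data_step_1.
    raw_csv = 'UserBehavior.csv'
    read_data_step_2(raw_csv)                        # 1) dump category frequencies
    read_data_step_1(raw_csv, 'taobao_step1.csv')    # 2) filter pv events, clean sequences
    read_data_step_3('taobao_step1.csv', 'taobao_cate_map.csv',
                     'taobao_step3.csv')             # 3) map event types, rebuild timestamps
    save_data('taobao_step3.csv')                    # 4) pickle train/dev/test splits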