MukeshKapoor25's picture
changs
6f2ff70
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
def generate_pairs(line, window_size):
line = np.array(line)
line = line[:, 0]
seqs = []
for i in range(0, len(line), window_size):
seq = line[i:i + window_size]
seqs.append(seq)
seqs += []
seq_pairs = []
for i in range(1, len(seqs)):
seq_pairs.append([seqs[i - 1], seqs[i]])
return seqs
def fixed_window(line, window_size, adaptive_window, seq_len=None, min_len=0):
line = [ln.split(",") for ln in line.split()]
# filter the line/session shorter than 10
if len(line) < min_len:
return [], []
# max seq len
if seq_len is not None:
line = line[:seq_len]
if adaptive_window:
window_size = len(line)
line = np.array(line)
# if time duration exists in data
if line.shape[1] == 2:
tim = line[:,1].astype(float)
line = line[:, 0]
# the first time duration of a session should be 0, so max is window_size(mins) * 60
tim[0] = 0
else:
line = line.squeeze()
# if time duration doesn't exist, then create a zero array for time
tim = np.zeros(line.shape)
logkey_seqs = []
time_seq = []
for i in range(0, len(line), window_size):
logkey_seqs.append(line[i:i + window_size])
time_seq.append(tim[i:i + window_size])
return logkey_seqs, time_seq
def generate_train_valid(data_path, window_size=20, adaptive_window=True,
sample_ratio=1, valid_size=0.1, output_path=None,
scale=None, scale_path=None, seq_len=None, min_len=0):
with open(data_path, 'r') as f:
data_iter = f.readlines()
num_session = int(len(data_iter) * sample_ratio)
# only even number of samples, or drop_last=True in DataLoader API
# coz in parallel computing in CUDA, odd number of samples reports issue when merging the result
# num_session += num_session % 2
test_size = int(min(num_session, len(data_iter)) * valid_size)
# only even number of samples
# test_size += test_size % 2
print("before filtering short session")
print("train size ", int(num_session - test_size))
print("valid size ", int(test_size))
print("="*40)
logkey_seq_pairs = []
time_seq_pairs = []
session = 0
for line in tqdm(data_iter):
if session >= num_session:
break
session += 1
logkeys, times = fixed_window(line, window_size, adaptive_window, seq_len, min_len)
logkey_seq_pairs += logkeys
time_seq_pairs += times
logkey_seq_pairs = np.array(logkey_seq_pairs, dtype=object)
time_seq_pairs = np.array(time_seq_pairs, dtype=object)
logkey_trainset, logkey_validset, time_trainset, time_validset = train_test_split(logkey_seq_pairs,
time_seq_pairs,
test_size=test_size,
random_state=1234)
# sort seq_pairs by seq len
train_len = list(map(len, logkey_trainset))
valid_len = list(map(len, logkey_validset))
train_sort_index = np.argsort(-1 * np.array(train_len))
valid_sort_index = np.argsort(-1 * np.array(valid_len))
logkey_trainset = logkey_trainset[train_sort_index]
logkey_validset = logkey_validset[valid_sort_index]
time_trainset = time_trainset[train_sort_index]
time_validset = time_validset[valid_sort_index]
print("="*40)
print("Num of train seqs", len(logkey_trainset))
print("Num of valid seqs", len(logkey_validset))
print("="*40)
return logkey_trainset, logkey_validset, time_trainset, time_validset