Spaces:
Runtime error
Runtime error
| from tqdm import tqdm | |
| import numpy as np | |
| from sklearn.model_selection import train_test_split | |
def generate_pairs(line, window_size):
    """Split the first column of a session into consecutive windows.

    Despite its name, this function returns the list of windows themselves,
    not pairs of adjacent windows. That return shape is preserved so existing
    callers keep working.

    Args:
        line: 2-D array-like of events (rows) by fields (columns); only
            column 0 is used.
        window_size: Number of events per window. The final window may be
            shorter when the session length is not a multiple of it.

    Returns:
        A list of 1-D numpy arrays, one per window, in session order. An
        empty input yields an empty list.
    """
    events = np.array(line)
    if events.size == 0:
        # An empty session has no column 0 to select; there is nothing to window.
        return []
    keys = events[:, 0]
    return [keys[start:start + window_size]
            for start in range(0, len(keys), window_size)]
def fixed_window(line, window_size, adaptive_window, seq_len=None, min_len=0):
    """Parse one session line and cut it into fixed-size windows.

    ``line`` is split on whitespace into tokens, and each token is split on
    ``","``. When every token has exactly two fields, the second field is
    read as a float time duration; otherwise durations are all zero. The
    first field of each token is always taken as the log key.

    Args:
        line: Raw text of one session.
        window_size: Number of events per window. Ignored when
            ``adaptive_window`` is true.
        adaptive_window: When true, the whole (possibly truncated) session
            becomes a single window.
        seq_len: Optional cap on the number of events kept from the start of
            the session.
        min_len: Sessions with fewer events than this are dropped.

    Returns:
        A tuple ``(logkey_seqs, time_seq)`` of two parallel lists of 1-D
        numpy arrays, one entry per window. Both lists are empty when the
        session is filtered out or contains no events.
    """
    events = [token.split(",") for token in line.split()]

    # Drop sessions shorter than min_len.
    if len(events) < min_len:
        return [], []

    # Keep at most seq_len events from the start of the session.
    if seq_len is not None:
        events = events[:seq_len]

    # A blank line (or seq_len=0) leaves nothing to window; returning early
    # avoids indexing into an empty array below.
    if not events:
        return [], []

    if adaptive_window:
        window_size = len(events)

    events = np.array(events)
    # Select column 0 explicitly rather than squeeze(): squeezing a
    # single-event session of shape (1, 1) yields a 0-d array with no len().
    keys = events[:, 0]

    if events.shape[1] == 2:
        # Time durations are present in the data.
        durations = events[:, 1].astype(float)
        # The first time duration of a session should be 0.
        durations[0] = 0
    else:
        # No time durations in the data: use zeros of matching length.
        durations = np.zeros(keys.shape)

    logkey_seqs = []
    time_seq = []
    for start in range(0, len(keys), window_size):
        stop = start + window_size
        logkey_seqs.append(keys[start:stop])
        time_seq.append(durations[start:stop])

    return logkey_seqs, time_seq
def generate_train_valid(data_path, window_size=20, adaptive_window=True,
                         sample_ratio=1, valid_size=0.1, output_path=None,
                         scale=None, scale_path=None, seq_len=None, min_len=0):
    """Load sessions from a file, window them, and split into train/valid sets.

    Each line of ``data_path`` is one session. The first
    ``int(len(lines) * sample_ratio)`` sessions are windowed with
    ``fixed_window``; the resulting windows are split with
    ``train_test_split`` (fixed ``random_state=1234``), and each split is
    ordered by descending sequence length.

    Args:
        data_path: Path of the text file to read.
        window_size, adaptive_window, seq_len, min_len: Forwarded to
            ``fixed_window``.
        sample_ratio: Fraction of sessions (from the top of the file) to use.
        valid_size: Fraction of the sampled session count used as the
            absolute number of validation sequences.
        output_path, scale, scale_path: Accepted but not used by this function.

    Returns:
        ``(logkey_trainset, logkey_validset, time_trainset, time_validset)``
        as numpy object arrays.
    """
    with open(data_path, 'r') as handle:
        sessions = handle.readlines()

    total = len(sessions)
    num_session = int(total * sample_ratio)
    # The validation count is derived from the session count before any
    # short sessions are filtered out; neither count is forced to be even.
    test_size = int(min(num_session, total) * valid_size)

    divider = "=" * 40
    print("before filtering short session")
    print("train size ", num_session - test_size)
    print("valid size ", test_size)
    print(divider)

    all_keys = []
    all_times = []
    for consumed, raw in enumerate(tqdm(sessions)):
        # Stop once the sampled number of sessions has been processed.
        if consumed >= num_session:
            break
        keys, times = fixed_window(raw, window_size, adaptive_window, seq_len, min_len)
        all_keys.extend(keys)
        all_times.extend(times)

    key_array = np.array(all_keys, dtype=object)
    time_array = np.array(all_times, dtype=object)

    logkey_trainset, logkey_validset, time_trainset, time_validset = train_test_split(
        key_array,
        time_array,
        test_size=test_size,
        random_state=1234,
    )

    # Order each split from longest to shortest sequence, applying the same
    # permutation to the log keys and their durations.
    train_order = np.argsort(-1 * np.array([len(seq) for seq in logkey_trainset]))
    valid_order = np.argsort(-1 * np.array([len(seq) for seq in logkey_validset]))

    logkey_trainset = logkey_trainset[train_order]
    time_trainset = time_trainset[train_order]
    logkey_validset = logkey_validset[valid_order]
    time_validset = time_validset[valid_order]

    print(divider)
    print("Num of train seqs", len(logkey_trainset))
    print("Num of valid seqs", len(logkey_validset))
    print(divider)

    return logkey_trainset, logkey_validset, time_trainset, time_validset