# DTECT/backend/datasets/dynamic_dataset.py
# Last commit 11c72a2 (AdhyaSuman): "Initial commit with Git LFS for large files"
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import scipy.sparse
import scipy.io
from backend.datasets.data import file_utils
class _SequentialDataset(Dataset):
def __init__(self, bow, times, time_wordfreq):
super().__init__()
self.bow = bow
self.times = times
self.time_wordfreq = time_wordfreq
def __len__(self):
return len(self.bow)
def __getitem__(self, index):
return_dict = {
'bow': self.bow[index],
'times': self.times[index],
'time_wordfreq': self.time_wordfreq[self.times[index]],
}
return return_dict
class DynamicDataset:
    """Load a time-stamped bag-of-words corpus and prepare it for training.

    The constructor reads the dataset files (see :meth:`load_data`), derives
    basic corpus statistics, prints a short summary, optionally converts the
    arrays to torch tensors, and builds the training dataset and dataloader.

    Args:
        dataset_dir: Directory containing the dataset files.
        batch_size: Batch size used for ``train_dataloader``.
        read_labels: If true, also load ``train_labels.txt`` (and
            ``test_labels.txt`` when ``use_partition`` is true).
        use_partition: If true, also load the ``test_*`` files and build
            ``test_dataset``.
        device: Device the tensors are moved to when ``as_tensor`` is true.
        as_tensor: If true, convert the bag-of-words, time indices and
            per-slice word frequencies to torch tensors on ``device``.

    Attributes:
        vocab_size: Number of entries in ``vocab``.
        train_size: Number of training documents.
        num_times: Number of time slices (largest training time index + 1).
        train_time_wordfreq: Per-slice average word counts, see
            :meth:`get_time_wordfreq`.
        train_dataset / test_dataset: ``_SequentialDataset`` instances; the
            test dataset reuses the *training* per-slice word frequencies.
        train_dataloader: Shuffled ``DataLoader`` over ``train_dataset``.
    """

    def __init__(self, dataset_dir, batch_size=200, read_labels=False, use_partition=False, device='cuda', as_tensor=True):
        self.load_data(dataset_dir, read_labels, use_partition)

        self.vocab_size = len(self.vocab)
        self.train_size = len(self.train_bow)
        # Time indices double as row indices, so the slice count is max + 1.
        self.num_times = int(self.train_times.max()) + 1  # assuming train_times is a numpy array
        self.train_time_wordfreq = self.get_time_wordfreq(self.train_bow, self.train_times)

        print('train size: ', len(self.train_bow))
        if use_partition:
            print('test size: ', len(self.test_bow))
        print('vocab size: ', len(self.vocab))
        print('average length: {:.3f}'.format(self.train_bow.sum(1).mean().item()))
        print('num of each time slice: ', self.num_times, np.bincount(self.train_times))

        if as_tensor:
            self.train_bow = torch.from_numpy(self.train_bow).float().to(device)
            self.train_times = torch.from_numpy(self.train_times).long().to(device)
            self.train_time_wordfreq = torch.from_numpy(self.train_time_wordfreq).float().to(device)

            if use_partition:
                self.test_bow = torch.from_numpy(self.test_bow).float().to(device)
                self.test_times = torch.from_numpy(self.test_times).long().to(device)

        # NOTE(review): the datasets and dataloader are built whether or not
        # as_tensor is set, so these attributes always exist; confirm this
        # matches the intended nesting under `if as_tensor:`.
        self.train_dataset = _SequentialDataset(self.train_bow, self.train_times, self.train_time_wordfreq)

        if use_partition:
            # The test split is paired with the training word frequencies.
            self.test_dataset = _SequentialDataset(self.test_bow, self.test_times, self.train_time_wordfreq)

        self.train_dataloader = DataLoader(self.train_dataset, batch_size=batch_size, shuffle=True)

    def load_data(self, path, read_labels, use_partition=False):
        """Read the dataset files under ``path`` into attributes of ``self``.

        Always loads ``train_bow.npz``, ``train_texts.txt``,
        ``train_times.txt``, ``vocab.txt`` and ``word_embeddings.npz``.
        Labels are loaded when ``read_labels`` is true, and the ``test_*``
        counterparts when ``use_partition`` is true.

        Args:
            path: Directory containing the dataset files.
            read_labels: Whether to load the ``*_labels.txt`` files.
            use_partition: Whether to load the ``test_*`` files.
        """
        self.train_bow = scipy.sparse.load_npz(f'{path}/train_bow.npz').toarray().astype('float32')
        self.train_texts = file_utils.read_text(f'{path}/train_texts.txt')
        # ndmin=1 keeps a one-line file a 1-D array instead of a 0-d scalar,
        # which np.bincount and len() cannot handle.
        self.train_times = np.loadtxt(f'{path}/train_times.txt', ndmin=1).astype('int32')
        self.vocab = file_utils.read_text(f'{path}/vocab.txt')
        self.word_embeddings = scipy.sparse.load_npz(f'{path}/word_embeddings.npz').toarray().astype('float32')
        self.pretrained_WE = self.word_embeddings  # preserve compatibility

        if read_labels:
            self.train_labels = np.loadtxt(f'{path}/train_labels.txt', ndmin=1).astype('int32')

        if use_partition:
            self.test_bow = scipy.sparse.load_npz(f'{path}/test_bow.npz').toarray().astype('float32')
            self.test_texts = file_utils.read_text(f'{path}/test_texts.txt')
            self.test_times = np.loadtxt(f'{path}/test_times.txt', ndmin=1).astype('int32')
            if read_labels:
                self.test_labels = np.loadtxt(f'{path}/test_labels.txt', ndmin=1).astype('int32')

    # word frequency at each time slice.
    def get_time_wordfreq(self, bow, times):
        """Return the average word counts per document for each time slice.

        Reads ``self.num_times`` and ``self.vocab_size``.

        Args:
            bow: Array with one row per document and ``vocab_size`` columns.
            times: Non-negative integer array giving each document's
                time-slice index.

        Returns:
            A ``(num_times, vocab_size)`` float array whose row ``t`` is the
            sum of the ``bow`` rows at time ``t`` divided by the number of
            such rows. Time slices without any document yield an all-zero
            row rather than NaN.
        """
        time_wordfreq = np.zeros((self.num_times, self.vocab_size))
        for time in range(self.num_times):
            idx = np.where(times == time)[0]
            time_wordfreq[time] += bow[idx].sum(0)

        # minlength pads trailing empty slices and the slice drops indices
        # beyond num_times, so the counts always line up with the rows.
        cnt_times = np.bincount(times, minlength=self.num_times)[:self.num_times]
        # An empty slice has a zero sum; dividing by 1 instead of 0 keeps the
        # row at zero and avoids NaN from 0/0.
        return time_wordfreq / np.maximum(cnt_times, 1)[:, np.newaxis]