Spaces:
Running
Running
| import torch | |
| from torch.utils.data import Dataset, DataLoader | |
| import numpy as np | |
| import scipy.sparse | |
| import scipy.io | |
| from backend.datasets.data import file_utils | |
class _SequentialDataset(Dataset):
    """Map-style dataset pairing each document with its time slice.

    Every item is a dict holding the document's bag-of-words row, its time
    index, and the word-frequency row looked up with that time index.
    """

    def __init__(self, bow, times, time_wordfreq):
        """Store the three indexable collections without copying them."""
        super().__init__()
        self.bow = bow
        self.times = times
        self.time_wordfreq = time_wordfreq

    def __len__(self):
        """Return the number of entries in ``bow``."""
        return len(self.bow)

    def __getitem__(self, index):
        """Return the ``bow`` / ``times`` / ``time_wordfreq`` entries for ``index``."""
        # The document's time index also selects the matching frequency row.
        time_index = self.times[index]
        return {
            'bow': self.bow[index],
            'times': time_index,
            'time_wordfreq': self.time_wordfreq[time_index],
        }
class DynamicDataset:
    """Bag-of-words corpus in which every document carries a time-slice index.

    Construction loads the training split (and, on request, the test split)
    from ``dataset_dir``, computes the average word frequencies of the
    training documents per time slice, prints a short summary, and wraps the
    data in ``_SequentialDataset`` objects plus a shuffled training
    ``DataLoader``.

    Args:
        dataset_dir: Directory containing ``train_bow.npz``,
            ``train_texts.txt``, ``train_times.txt``, ``vocab.txt`` and
            ``word_embeddings.npz`` (plus the ``test_*`` and ``*_labels.txt``
            files when the corresponding flags are set).
        batch_size: Batch size of ``train_dataloader``.
        read_labels: Also load ``train_labels.txt`` (and ``test_labels.txt``
            when ``use_partition`` is set).
        use_partition: Also load the test split.
        device: Device the tensors are moved to when ``as_tensor`` is true.
        as_tensor: Convert the loaded numpy arrays to torch tensors.
    """

    def __init__(self, dataset_dir, batch_size=200, read_labels=False, use_partition=False, device='cuda', as_tensor=True):
        self.load_data(dataset_dir, read_labels, use_partition)

        self.vocab_size = len(self.vocab)
        self.train_size = len(self.train_bow)
        self.num_times = int(self.train_times.max()) + 1  # assuming train_times is a numpy array
        # Must run after vocab_size / num_times are set: the helper reads both.
        self.train_time_wordfreq = self.get_time_wordfreq(self.train_bow, self.train_times)

        print('train size: ', len(self.train_bow))
        if use_partition:
            print('test size: ', len(self.test_bow))
        print('vocab size: ', len(self.vocab))
        print('average length: {:.3f}'.format(self.train_bow.sum(1).mean().item()))
        print('num of each time slice: ', self.num_times, np.bincount(self.train_times))

        if as_tensor:
            self.train_bow = torch.from_numpy(self.train_bow).float().to(device)
            self.train_times = torch.from_numpy(self.train_times).long().to(device)
            self.train_time_wordfreq = torch.from_numpy(self.train_time_wordfreq).float().to(device)

            if use_partition:
                self.test_bow = torch.from_numpy(self.test_bow).float().to(device)
                self.test_times = torch.from_numpy(self.test_times).long().to(device)

        # NOTE(review): the indentation of this file was lost, so it is not
        # certain whether the datasets/dataloader were originally created only
        # when as_tensor is true. They are built unconditionally here, which
        # keeps every attribute available under either reading -- confirm.
        self.train_dataset = _SequentialDataset(self.train_bow, self.train_times, self.train_time_wordfreq)

        if use_partition:
            # The test split reuses the word frequencies computed on training data.
            self.test_dataset = _SequentialDataset(self.test_bow, self.test_times, self.train_time_wordfreq)

        self.train_dataloader = DataLoader(self.train_dataset, batch_size=batch_size, shuffle=True)

    def load_data(self, path, read_labels, use_partition=False):
        """Read the dataset files under ``path`` into attributes of ``self``.

        Args:
            path: Dataset directory.
            read_labels: Also read the ``*_labels.txt`` files.
            use_partition: Also read the ``test_*`` files.
        """
        self.train_bow = scipy.sparse.load_npz(f'{path}/train_bow.npz').toarray().astype('float32')
        self.train_texts = file_utils.read_text(f'{path}/train_texts.txt')
        self.train_times = np.loadtxt(f'{path}/train_times.txt').astype('int32')
        self.vocab = file_utils.read_text(f'{path}/vocab.txt')
        self.word_embeddings = scipy.sparse.load_npz(f'{path}/word_embeddings.npz').toarray().astype('float32')

        self.pretrained_WE = self.word_embeddings  # preserve compatibility

        if read_labels:
            self.train_labels = np.loadtxt(f'{path}/train_labels.txt').astype('int32')

        if use_partition:
            self.test_bow = scipy.sparse.load_npz(f'{path}/test_bow.npz').toarray().astype('float32')
            self.test_texts = file_utils.read_text(f'{path}/test_texts.txt')
            self.test_times = np.loadtxt(f'{path}/test_times.txt').astype('int32')
            if read_labels:
                self.test_labels = np.loadtxt(f'{path}/test_labels.txt').astype('int32')

    def get_time_wordfreq(self, bow, times):
        """Return the mean bag-of-words vector of the documents in each time slice.

        Args:
            bow: 2-D numpy array with one row per document.
            times: 1-D integer numpy array giving each document's time slice.

        Returns:
            A ``(self.num_times, self.vocab_size)`` float array. Row ``t`` is
            the sum of the ``bow`` rows whose time is ``t`` divided by the
            number of such rows; a slice with no documents yields zeros.
        """
        time_wordfreq = np.zeros((self.num_times, self.vocab_size))
        for time in range(self.num_times):
            time_wordfreq[time] += bow[times == time].sum(0)

        # minlength keeps one count per slice even when the last slices are
        # missing from `times`; without it the division below mis-broadcasts.
        doc_counts = np.bincount(times, minlength=self.num_times)
        # Clamp to 1 so an empty slice gives a zero row instead of 0/0 = NaN.
        return time_wordfreq / np.maximum(doc_counts, 1)[:, np.newaxis]