File size: 3,932 Bytes
11c72a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import scipy.sparse
import scipy.io
from backend.datasets.data import file_utils


class _SequentialDataset(Dataset):
    def __init__(self, bow, times, time_wordfreq):
        super().__init__()
        self.bow = bow
        self.times = times
        self.time_wordfreq = time_wordfreq

    def __len__(self):
        return len(self.bow)

    def __getitem__(self, index):
        return_dict = {
            'bow': self.bow[index],
            'times': self.times[index],
            'time_wordfreq': self.time_wordfreq[self.times[index]],
        }

        return return_dict


class DynamicDataset:
    """Dynamic (time-sliced) bag-of-words dataset loaded from a directory.

    Expects the preprocessed layout: ``train_bow.npz``, ``train_texts.txt``,
    ``train_times.txt``, ``vocab.txt``, ``word_embeddings.npz`` and, when
    requested, the ``test_*`` / ``*_labels.txt`` counterparts.
    """

    def __init__(self, dataset_dir, batch_size=200, read_labels=False, use_partition=False, device='cuda', as_tensor=True):
        """Load the dataset and optionally convert it to torch tensors.

        Args:
            dataset_dir: directory containing the preprocessed files.
            batch_size: mini-batch size for the training DataLoader.
            read_labels: also load ``train_labels.txt`` (and test labels).
            use_partition: also load the held-out test split.
            device: torch device tensors are moved to when ``as_tensor``.
            as_tensor: convert numpy arrays to torch tensors and build the
                Dataset/DataLoader wrappers.
        """
        self.load_data(dataset_dir, read_labels, use_partition)

        self.vocab_size = len(self.vocab)
        self.train_size = len(self.train_bow)
        # Time slices are assumed to be labeled 0..max — TODO confirm they
        # are contiguous in the preprocessing pipeline.
        self.num_times = int(self.train_times.max()) + 1
        self.train_time_wordfreq = self.get_time_wordfreq(self.train_bow, self.train_times)

        print('train size: ', len(self.train_bow))
        if use_partition:
            print('test size: ', len(self.test_bow))
        print('vocab size: ', len(self.vocab))
        print('average length: {:.3f}'.format(self.train_bow.sum(1).mean().item()))
        print('num of each time slice: ', self.num_times, np.bincount(self.train_times))

        if as_tensor:
            self.train_bow = torch.from_numpy(self.train_bow).float().to(device)
            self.train_times = torch.from_numpy(self.train_times).long().to(device)
            self.train_time_wordfreq = torch.from_numpy(self.train_time_wordfreq).float().to(device)

            if use_partition:
                self.test_bow = torch.from_numpy(self.test_bow).float().to(device)
                self.test_times = torch.from_numpy(self.test_times).long().to(device)

            self.train_dataset = _SequentialDataset(self.train_bow, self.train_times, self.train_time_wordfreq)

            if use_partition:
                # The test split deliberately reuses the *training* word
                # frequencies so no test statistics leak into the model input.
                self.test_dataset = _SequentialDataset(self.test_bow, self.test_times, self.train_time_wordfreq)

            self.train_dataloader = DataLoader(self.train_dataset, batch_size=batch_size, shuffle=True)

    def load_data(self, path, read_labels, use_partition=False):
        """Read the preprocessed arrays and texts from ``path`` into attributes.

        Args:
            path: dataset directory.
            read_labels: load ``*_labels.txt`` files as int arrays.
            use_partition: load the ``test_*`` files as well.
        """
        self.train_bow = scipy.sparse.load_npz(f'{path}/train_bow.npz').toarray().astype('float32')
        self.train_texts = file_utils.read_text(f'{path}/train_texts.txt')
        self.train_times = np.loadtxt(f'{path}/train_times.txt').astype('int32')
        self.vocab = file_utils.read_text(f'{path}/vocab.txt')
        self.word_embeddings = scipy.sparse.load_npz(f'{path}/word_embeddings.npz').toarray().astype('float32')

        self.pretrained_WE = self.word_embeddings  # preserve compatibility

        if read_labels:
            self.train_labels = np.loadtxt(f'{path}/train_labels.txt').astype('int32')

        if use_partition:
            self.test_bow = scipy.sparse.load_npz(f'{path}/test_bow.npz').toarray().astype('float32')
            self.test_texts = file_utils.read_text(f'{path}/test_texts.txt')
            self.test_times = np.loadtxt(f'{path}/test_times.txt').astype('int32')
            if read_labels:
                self.test_labels = np.loadtxt(f'{path}/test_labels.txt').astype('int32')

    def get_time_wordfreq(self, bow, times):
        """Average word-count vector for each time slice.

        Args:
            bow: (num_docs, vocab_size) array of word counts.
            times: (num_docs,) integer array of time-slice indices.

        Returns:
            (num_times, vocab_size) array; row ``t`` is the mean word-count
            vector over documents in slice ``t`` (all zeros for empty slices).
        """
        time_wordfreq = np.zeros((self.num_times, self.vocab_size))
        for time in range(self.num_times):
            idx = np.where(times == time)[0]
            time_wordfreq[time] += bow[idx].sum(0)
        # minlength guards against slices with indices beyond the observed
        # max; np.maximum(..., 1) prevents division by zero for slices that
        # contain no documents (the original produced NaN rows there).
        cnt_times = np.bincount(times, minlength=self.num_times)
        time_wordfreq = time_wordfreq / np.maximum(cnt_times, 1)[:, np.newaxis]
        return time_wordfreq