# File size: 11,871 Bytes
# f60c555
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
import math
from utils.helper import *
from logging import getLogger
from collections import defaultdict
import torch.sparse as tsp


class BaseDataset(object):
    """Load the interaction splits and item modality features of one dataset.

    Construction reads ``train.npy``, ``valid.npy``, ``test.npy``,
    ``image_feat.npy`` and ``text_feat.npy`` from ``./data/<config.dataset>/``.
    Each interaction array is transposed so that column 0 is read as the user
    column and column 1 as the item column, and every item id is then shifted
    by ``n_users`` so that user ids and item ids do not overlap.
    """

    def __init__(self, config):
        """Load the data, count users/items, offset item ids and build lookups.

        Args:
            config: object exposing at least the ``device`` and ``dataset``
                attributes read below.
        """
        self.config = config
        self.logger = getLogger("normal")
        self.device = config.device
        self.dataset_name = config.dataset
        self.load_all_data()
        self.processed_eval_data()

        splits = (self.train_data, self.valid_data, self.test_data)
        # Entity counts are the number of distinct ids seen in any of the three splits.
        self.n_users = len(set().union(*(set(split[:, 0]) for split in splits)))
        self.n_items = len(set().union(*(set(split[:, 1]) for split in splits)))
        # Shift item ids in place so they can never collide with a user id.
        for split in splits:
            split[:, 1] += self.n_users
        self.dict_user_items()

    def load_all_data(self):
        """Read the raw ``.npy`` files and move the item features to ``self.device``."""
        dataset_path = './data/' + self.dataset_name
        # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
        # only point this at trusted dataset directories.
        # Interaction files are laid out as two rows, e.g. [[1,2,3],[2,3,0]].
        self.train_dataset = np.load(dataset_path + '/train.npy', allow_pickle=True)
        v_feat = np.load(dataset_path + '/image_feat.npy', allow_pickle=True)
        # Original note: "4096" - presumably the image feature dimension.
        self.i_v_feat = torch.from_numpy(v_feat).type(torch.FloatTensor).to(self.device)
        t_feat = np.load(dataset_path + '/text_feat.npy', allow_pickle=True)
        # Original note: "384" - presumably the text feature dimension.
        self.i_t_feat = torch.from_numpy(t_feat).type(torch.FloatTensor).to(self.device)
        self.valid_dataset = np.load(dataset_path + '/valid.npy', allow_pickle=True)
        self.test_dataset = np.load(dataset_path + '/test.npy', allow_pickle=True)

    def processed_eval_data(self):
        """Store transposed, independent copies of the three raw interaction arrays."""
        # .copy() detaches the working arrays from the raw ones, so the in-place
        # item-id offset applied in __init__ does not modify *_dataset.
        self.train_data = self.train_dataset.transpose(1, 0).copy()
        self.valid_data = self.valid_dataset.transpose(1, 0).copy()
        self.test_data = self.test_dataset.transpose(1, 0).copy()

    def load_eval_data(self):
        """Return the ``(valid_data, test_data)`` pair."""
        return self.valid_data, self.test_data

    def dict_user_items(self):
        """Build user/item lookup dicts and popularity-ordered id lists.

        Sets ``dict_train_u_i``, ``dict_train_i_u``, ``user_items_dict``,
        ``topK_users``/``topK_users_counts`` and
        ``topK_items``/``topK_items_counts``.
        """
        self.dict_train_u_i = update_dict("user", self.train_data, defaultdict(set))
        self.dict_train_i_u = update_dict("item", self.train_data, defaultdict(set))
        # NOTE(review): update_dict is handed self.dict_train_u_i itself. If it
        # updates that dict in place, dict_train_u_i (and the user ranking
        # below) also contains valid/test interactions - confirm against
        # utils.helper.update_dict.
        tmp_dict_u_i = update_dict("user", self.valid_data, self.dict_train_u_i)
        self.user_items_dict = update_dict("user", self.test_data, tmp_dict_u_i)

        # Users ordered by the size of their entry in dict_train_u_i, largest first.
        users_by_count = sorted(self.dict_train_u_i.items(), key=lambda item: len(item[1]), reverse=True)
        self.topK_users = [entry[0] for entry in users_by_count]
        self.topK_users_counts = [len(entry[1]) for entry in users_by_count]
        # Items ordered by the size of their entry in dict_train_i_u, largest first.
        items_by_count = sorted(self.dict_train_i_u.items(), key=lambda item: len(item[1]), reverse=True)
        # Undo the n_users offset so the reported item ids start from 0 again.
        self.topK_items = [entry[0] - self.n_users for entry in items_by_count]
        self.topK_items_counts = [len(entry[1]) for entry in items_by_count]

    def sparse_inter_matrix(self, form):
        """Delegate to ``cal_sparse_inter_matrix(self, form)`` and return its result."""
        return cal_sparse_inter_matrix(self, form)

    def log_info(self, name, interactions, list_u, list_i):
        """Log basic statistics of one interaction split.

        Args:
            name: label printed in the ``====name====`` banner.
            interactions: sized collection of interactions; only its length is used.
            list_u: iterable of user ids, one per interaction.
            list_i: iterable of item ids, one per interaction.

        An empty split (no users or no items) is reported with zero averages
        and 100% sparsity instead of raising ``ZeroDivisionError``.
        """
        inter_num = len(interactions)
        num_u = len(set(list_u))
        num_i = len(set(list_i))

        # Guard every ratio: a split can legitimately be empty.
        avg_per_user = inter_num / num_u if num_u else 0.0
        avg_per_item = inter_num / num_i if num_i else 0.0
        density = inter_num / num_u / num_i if num_u and num_i else 0.0
        sparsity = 1 - density

        info = [
            self.dataset_name,
            'The number of users: {}'.format(num_u),
            'Average actions of users: {}'.format(avg_per_user),
            'The number of items: {}'.format(num_i),
            'Average actions of items: {}'.format(avg_per_item),
            'The number of inters: {}'.format(inter_num),
            'The sparsity of the dataset: {}%'.format(sparsity * 100),
        ]
        self.logger.info('\n====' + name + '====\n' + str('\n'.join(info)))


class Load_dataset(BaseDataset):
    """Training dataset yielding (user, positive item, sampled negative item) triples.

    Besides the data loaded by ``BaseDataset``, construction prepares the
    co-occurrence matrices/graphs for users and items and the kNN adjacency
    matrices (``i_mm_adj``, ``u_mm_adj``) derived from the item image and text
    features.
    """

    def __init__(self, config):
        """Build all graphs needed for training.

        Args:
            config: configuration object; in addition to what ``BaseDataset``
                reads, ``item_knn_k``, ``user_knn_k``, ``i_mm_image_weight``
                and ``u_mm_image_weight`` are read here.
        """
        super().__init__(config)
        self.item_knn_k = config.item_knn_k
        self.user_knn_k = config.user_knn_k
        self.i_mm_image_weight = config.i_mm_image_weight
        self.u_mm_image_weight = config.u_mm_image_weight
        # All (offset) item ids; negative samples are drawn from this pool.
        self.all_set = set(range(self.n_users, self.n_users + self.n_items))
        # Print statistical information
        self.log_info("Training", self.train_data, self.train_data[:, 0], self.train_data[:, 1])

        # ***************************************************************************************
        # Prepare the graphs that will be needed later
        # (user co-occurrence graph, user interest graph, item co-occurrence graph, item semantic graph)

        # User co-occurrence matrix (original note: counts items that two users interacted with in common)
        self.user_co_occ_matrix = load_or_create_matrix(self.logger, "User", " co-occurrence matrix",
                                                        self.dataset_name, "user_co_occ_matrix", creat_co_occur_matrix,
                                                        "user", self.train_data, 0, self.n_users)
        # Item co-occurrence matrix (original note: counts users that two items share)
        self.item_co_occ_matrix = load_or_create_matrix(self.logger, "Item", " co-occurrence matrix",
                                                        self.dataset_name, "item_co_occ_matrix", creat_co_occur_matrix,
                                                        "item", self.train_data, self.n_users, self.n_items)

        # Dictionary form of the user graph (original note: "taking the first 200" -
        # presumably a neighbour cap applied inside creat_dict_graph; confirm there)
        self.dict_user_co_occ_graph = load_or_create_matrix(self.logger, "User", " co-occurrence dict graph",
                                                            self.dataset_name, "dict_user_co_occ_graph",
                                                            creat_dict_graph,
                                                            self.user_co_occ_matrix, self.n_users)
        # Dictionary form of the item graph (same caveat as above)
        self.dict_item_co_occ_graph = load_or_create_matrix(self.logger, "Item", " co-occurrence dict graph",
                                                            self.dataset_name, "dict_item_co_occ_graph",
                                                            creat_dict_graph,
                                                            self.item_co_occ_matrix, self.n_items)
        # ***************************************************************************************

        # Sparse interaction matrix of the training set, as a torch sparse tensor on the device
        sp_inter_m = sparse_mx_to_torch_sparse_tensor(self.sparse_inter_matrix(form='coo')).to(self.device)
        # Row sums of the interaction matrix, used to average item features per user.
        # Computed once because both modalities divide by the same values.
        # NOTE(review): a row that sums to 0 makes the divisions below 0/0 -
        # confirm every row has at least one training interaction.
        user_degree = tsp.sum(sp_inter_m, [1]).unsqueeze(dim=1).to_dense()

        if self.i_v_feat is not None:
            # User visual interest = interaction-weighted mean of item image features
            self.u_v_interest = tsp.mm(sp_inter_m, self.i_v_feat) / user_degree
            u_v_adj = get_knn_adj_mat(self.u_v_interest, self.user_knn_k, self.device)
            i_v_adj = get_knn_adj_mat(self.i_v_feat, self.item_knn_k, self.device)
            self.i_mm_adj = i_v_adj
            self.u_mm_adj = u_v_adj
        if self.i_t_feat is not None:
            # User textual interest = interaction-weighted mean of item text features
            self.u_t_interest = tsp.mm(sp_inter_m, self.i_t_feat) / user_degree
            u_t_adj = get_knn_adj_mat(self.u_t_interest, self.user_knn_k, self.device)
            i_t_adj = get_knn_adj_mat(self.i_t_feat, self.item_knn_k, self.device)
            self.i_mm_adj = i_t_adj
            self.u_mm_adj = u_t_adj
        if self.i_v_feat is not None and self.i_t_feat is not None:
            # Both modalities available: blend the two adjacency matrices
            self.i_mm_adj = self.i_mm_image_weight * i_v_adj + (1.0 - self.i_mm_image_weight) * i_t_adj
            self.u_mm_adj = self.u_mm_image_weight * u_v_adj + (1.0 - self.u_mm_image_weight) * u_t_adj
            # Drop the per-modality matrices to free device memory
            del i_t_adj, i_v_adj, u_t_adj, u_v_adj
            torch.cuda.empty_cache()

    # ***************************************************************************************
    def __len__(self):
        """Return the number of training interactions."""
        return len(self.train_data)

    def __getitem__(self, index):
        """Return one training sample with a freshly drawn negative item.

        Args:
            index: row index into ``train_data``.

        Returns:
            Two ``LongTensor`` objects: ``[user, user]`` and ``[pos_item, neg_item]``.

        Raises:
            ValueError: if every item is already in ``user_items_dict[user]``,
                so no negative item can be drawn.
        """
        user, pos_item = self.train_data[index]
        candidates = self.all_set - set(self.user_items_dict[user])
        # random.sample() no longer accepts a set (TypeError on Python >= 3.11);
        # converting to a tuple is what older versions did internally.
        neg_item = random.sample(tuple(candidates), 1)[0]
        return torch.LongTensor([user, user]), torch.LongTensor([pos_item, neg_item])


class Load_eval_dataset(BaseDataset):
    """Iterator over evaluation users in batches of ``config.eval_batch_size``.

    Each batch is ``[batch_users, batch_mask_matrix]``: the user ids of the
    batch and a 2 x K ``LongTensor`` whose first row is the position of a user
    inside the batch and whose second row is an item id that user has in the
    training data.
    """

    def __init__(self, v_or_t, config, eval_dataset):
        """Prepare per-user training and evaluation item lists.

        Args:
            v_or_t: label used as the section name in the logged statistics.
            config: configuration object; ``eval_batch_size`` is read here in
                addition to what ``BaseDataset`` reads.
            eval_dataset: 2-D array indexed as ``[:, 0]`` (user) and ``[:, 1]``
                (item). ``n_users`` is subtracted from the item column, so the
                ids are presumably already offset - confirm at the caller.
        """
        super().__init__(config)
        self.eval_dataset = eval_dataset
        self.step = config.eval_batch_size
        self.inter_pr = 0  # Column cursor into pos_items_per_u (training interactions consumed so far)
        self.eval_items_per_u = []
        self.eval_len_list = []
        self.train_pos_len_list = []
        self.eval_u = list(set(eval_dataset[:, 0]))  # Distinct users to evaluate
        self.t_data = self.train_data
        self.pos_items_per_u = self.train_items_per_u(self.eval_u)
        self.evalute_items_per_u(self.eval_u)

        self.s_idx = 0  # Index into eval_u of the first user of the next batch

        self.eval_users = len(set(eval_dataset[:, 0]))
        self.eval_items = len(set(eval_dataset[:, 1]))

        self.n_inters = eval_dataset.shape[0]  # Number of evaluation interactions
        # Print statistical information
        self.log_info(v_or_t, self.eval_dataset, eval_dataset[:, 0], eval_dataset[:, 1])

    def __len__(self):
        """Return the number of batches, i.e. ceil(number of eval users / step)."""
        # Batches are slices of eval_u, so the count depends on the number of
        # users, not on the number of interactions.
        return math.ceil(len(self.eval_u) / self.step)

    def __iter__(self):
        """Return the iterator itself; the cursors are reset when it is exhausted."""
        return self

    def __next__(self):
        """Return the next batch, or reset both cursors and raise ``StopIteration``."""
        # Stop once every user has been served. Bounding by the interaction
        # count instead would emit empty batches after the last user.
        if self.s_idx >= len(self.eval_u):
            self.s_idx = 0
            self.inter_pr = 0
            raise StopIteration()
        return self._next_batch_data()

    def _next_batch_data(self):
        """Slice out the users and training-item mask of the current batch and advance."""
        batch_end = self.s_idx + self.step
        # Number of training interactions owned by the users of this batch
        inter_cnt = sum(self.train_pos_len_list[self.s_idx: batch_end])
        batch_users = self.eval_u[self.s_idx: batch_end]
        batch_mask_matrix = self.pos_items_per_u[:, self.inter_pr: self.inter_pr + inter_cnt].clone()
        # Row 0 holds positions in eval_u; shift them so they start at 0 within the batch
        batch_mask_matrix[0] -= self.s_idx
        self.inter_pr += inter_cnt  # Advance the interaction cursor
        self.s_idx = batch_end  # Advance the user cursor

        return [batch_users, batch_mask_matrix]

    def train_items_per_u(self, eval_users):
        """Collect the training items of every evaluation user.

        Also appends each user's training-item count to ``train_pos_len_list``.

        Args:
            eval_users: sequence of user ids.

        Returns:
            2 x K ``LongTensor``; row 0 is the position of the user in
            ``eval_users``, row 1 the item id taken from ``t_data``.
        """
        u_ids, i_ids = [], []
        for pos, user in enumerate(eval_users):
            # Items this user has in the training data, in row order
            train_items = self.t_data[self.t_data[:, 0] == user, 1]
            n_items_of_user = len(train_items)
            self.train_pos_len_list.append(n_items_of_user)
            u_ids.extend([pos] * n_items_of_user)
            i_ids.extend(train_items.tolist())
        return torch.tensor([u_ids, i_ids]).type(torch.LongTensor)

    def evalute_items_per_u(self, eval_users):
        """Fill ``eval_items_per_u`` and ``eval_len_list`` for every evaluation user.

        Args:
            eval_users: sequence of user ids, in the order batches will use.
        """
        for user in eval_users:
            eval_items = self.eval_dataset[self.eval_dataset[:, 0] == user, 1]
            self.eval_len_list.append(len(eval_items))
            # Remove the n_users offset from the stored item ids
            self.eval_items_per_u.append(eval_items - self.n_users)
        self.eval_len_list = np.asarray(self.eval_len_list)