Spaces:

AdhyaSuman
/

DTECT

Running

App Files Files Community

DTECT / backend /datasets /dynamic_dataset.py

AdhyaSuman

Initial commit with Git LFS for large files

11c72a2 9 months ago

raw

history blame contribute delete

3.93 kB

	import torch
	from torch.utils.data import Dataset, DataLoader
	import numpy as np
	import scipy.sparse
	import scipy.io
	from backend.datasets.data import file_utils


	class _SequentialDataset(Dataset):
	    """Map-style dataset that pairs every document with its time slice.

	    Each item is a dict holding the document's bag-of-words row, its time
	    index, and the row of ``time_wordfreq`` selected by that time index.
	    """

	    def __init__(self, bow, times, time_wordfreq):
	        """Keep references to the three indexable containers (no copies)."""
	        super().__init__()
	        self.bow, self.times, self.time_wordfreq = bow, times, time_wordfreq

	    def __len__(self):
	        """Number of documents, taken from the length of ``bow``."""
	        return len(self.bow)

	    def __getitem__(self, index):
	        """Return the ``bow`` / ``times`` / ``time_wordfreq`` entry for one document."""
	        document = self.bow[index]
	        time_slice = self.times[index]
	        return {
	            'bow': document,
	            'times': time_slice,
	            # Word frequencies are stored per time slice, not per document.
	            'time_wordfreq': self.time_wordfreq[time_slice],
	        }


	class DynamicDataset:
	    """Bag-of-words corpus whose documents carry a discrete time-slice index.

	    On construction the training split (and, with ``use_partition``, the test
	    split) is read from ``dataset_dir``, per-time-slice mean word frequencies
	    are computed from the training split, summary statistics are printed, and
	    a shuffling ``DataLoader`` over the training documents is created.

	    Args:
	        dataset_dir: Directory containing ``train_bow.npz``,
	            ``train_texts.txt``, ``train_times.txt``, ``vocab.txt`` and
	            ``word_embeddings.npz`` (plus the ``test_*`` / ``*_labels.txt``
	            files when the corresponding flags are set).
	        batch_size: Batch size of ``train_dataloader``.
	        read_labels: Also load ``train_labels`` (and ``test_labels`` when
	            ``use_partition`` is set).
	        use_partition: Also load the test split and build ``test_dataset``.
	        device: Torch device the tensors are moved to when ``as_tensor``.
	        as_tensor: Convert the bow / times / word-frequency arrays to torch
	            tensors on ``device``; otherwise they stay NumPy arrays.
	    """

	    def __init__(self, dataset_dir, batch_size=200, read_labels=False, use_partition=False, device='cuda', as_tensor=True):
	        self.load_data(dataset_dir, read_labels, use_partition)

	        self.vocab_size = len(self.vocab)
	        self.train_size = len(self.train_bow)
	        # Time indices are 0-based, so the number of slices is max + 1.
	        self.num_times = int(self.train_times.max()) + 1
	        self.train_time_wordfreq = self.get_time_wordfreq(self.train_bow, self.train_times)

	        print('train size: ', len(self.train_bow))
	        if use_partition:
	            print('test size: ', len(self.test_bow))
	        print('vocab size: ', len(self.vocab))
	        print('average length: {:.3f}'.format(self.train_bow.sum(1).mean().item()))
	        print('num of each time slice: ', self.num_times, np.bincount(self.train_times))

	        if as_tensor:
	            self.train_bow = torch.from_numpy(self.train_bow).float().to(device)
	            self.train_times = torch.from_numpy(self.train_times).long().to(device)
	            self.train_time_wordfreq = torch.from_numpy(self.train_time_wordfreq).float().to(device)

	            if use_partition:
	                self.test_bow = torch.from_numpy(self.test_bow).float().to(device)
	                self.test_times = torch.from_numpy(self.test_times).long().to(device)

	        # Datasets and the loader are created in both modes, so these
	        # attributes always exist; with as_tensor=False they simply wrap the
	        # NumPy arrays.
	        self.train_dataset = _SequentialDataset(self.train_bow, self.train_times, self.train_time_wordfreq)

	        if use_partition:
	            # Test documents are paired with the *training* word frequencies.
	            self.test_dataset = _SequentialDataset(self.test_bow, self.test_times, self.train_time_wordfreq)

	        self.train_dataloader = DataLoader(self.train_dataset, batch_size=batch_size, shuffle=True)

	    @staticmethod
	    def _load_int_array(path):
	        """Read a whitespace-separated numeric text file as a 1-D int32 array."""
	        # ndmin=1 keeps a one-line file a 1-D array instead of a 0-d scalar,
	        # which np.bincount and len() could not handle.
	        return np.loadtxt(path, ndmin=1).astype('int32')

	    def load_data(self, path, read_labels, use_partition=False):
	        """Load the corpus files under ``path`` into attributes on ``self``.

	        Always sets ``train_bow``, ``train_texts``, ``train_times``, ``vocab``,
	        ``word_embeddings`` and its alias ``pretrained_WE``. Sets
	        ``train_labels`` when ``read_labels``; sets ``test_bow``,
	        ``test_texts``, ``test_times`` (and ``test_labels`` when
	        ``read_labels``) when ``use_partition``.
	        """
	        self.train_bow = scipy.sparse.load_npz(f'{path}/train_bow.npz').toarray().astype('float32')
	        self.train_texts = file_utils.read_text(f'{path}/train_texts.txt')
	        self.train_times = self._load_int_array(f'{path}/train_times.txt')
	        self.vocab = file_utils.read_text(f'{path}/vocab.txt')
	        self.word_embeddings = scipy.sparse.load_npz(f'{path}/word_embeddings.npz').toarray().astype('float32')

	        self.pretrained_WE = self.word_embeddings  # preserve compatibility

	        if read_labels:
	            self.train_labels = self._load_int_array(f'{path}/train_labels.txt')

	        if use_partition:
	            self.test_bow = scipy.sparse.load_npz(f'{path}/test_bow.npz').toarray().astype('float32')
	            self.test_texts = file_utils.read_text(f'{path}/test_texts.txt')
	            self.test_times = self._load_int_array(f'{path}/test_times.txt')
	            if read_labels:
	                self.test_labels = self._load_int_array(f'{path}/test_labels.txt')

	    def get_time_wordfreq(self, bow, times):
	        """Return the mean bag-of-words vector of the documents in each time slice.

	        Reads ``self.num_times`` and ``self.vocab_size``.

	        Args:
	            bow: 2-D NumPy array with one row per document and
	                ``vocab_size`` columns.
	            times: 1-D integer NumPy array with each document's slice index.

	        Returns:
	            Array of shape ``(num_times, vocab_size)``. The row of a slice
	            that contains no documents is all zeros.
	        """
	        time_wordfreq = np.zeros((self.num_times, self.vocab_size))
	        for time in range(self.num_times):
	            time_wordfreq[time] += bow[times == time].sum(0)

	        # minlength guarantees one count per slice even when the last slices
	        # have no documents (plain bincount would return a shorter vector).
	        cnt_times = np.bincount(times, minlength=self.num_times)
	        # An empty slice has a zero numerator; clamp its count to 1 so the
	        # result is 0 rather than 0/0 = NaN.
	        cnt_times = np.maximum(cnt_times, 1)

	        return time_wordfreq / cnt_times[:, np.newaxis]