Abigail99216
/

EasyTemporalPointProcess-main

Model card Files Files and versions

EasyTemporalPointProcess-main / examples / script_data_processing / make_hf_dataset.py

Abigail99216's picture

Upload folder using huggingface_hub

f43af3c verified 1 day ago

history blame contribute delete

2.36 kB

	import json

	import numpy as np

	from easy_tpp.utils import load_pickle


	def make_json_serializable(input_dict):
	    """Replace NumPy scalar values of a dict with built-in Python numbers.

	    The ``json`` module cannot encode NumPy scalars such as ``np.float32``
	    or ``np.int64``.  Every top-level value that is a NumPy boolean,
	    integer or floating-point scalar (of any width) is converted to the
	    matching built-in ``bool`` / ``int`` / ``float``.  All other values,
	    including nested containers, are left untouched.

	    Args:
	        input_dict: Mapping to convert.  It is modified in place.

	    Returns:
	        The same ``input_dict`` object, for call-chaining convenience.
	    """
	    # Re-assigning existing keys does not change the dict's size, so it is
	    # safe to do this while iterating over ``items()``.
	    for key, value in input_dict.items():
	        if isinstance(value, np.bool_):
	            input_dict[key] = bool(value)
	        elif isinstance(value, np.integer):
	            input_dict[key] = int(value)
	        elif isinstance(value, np.floating):
	            input_dict[key] = float(value)

	    return input_dict


	def make_hf_dataset(source_dir, target_dir, split='test'):
	    """Convert one split of a pickled event-sequence dataset to a JSON file.

	    The loaded pickle is read as a mapping holding ``'dim_process'`` and,
	    under the key ``split``, an iterable of sequences.  Each sequence is an
	    iterable of event dicts from which ``'time_since_last_event'`` and
	    ``'type_event'`` are read.  One JSON record is written per sequence with
	    the keys ``dim_process``, ``seq_idx``, ``seq_len``, ``time_since_start``,
	    ``time_since_last_event`` and ``type_event``.

	    Args:
	        source_dir: Path handed to ``load_pickle``.  Despite the name, the
	            script passes a pickle *file* path here.
	        target_dir: Path of the JSON file to write.  It is opened in ``"w"``
	            mode, so an existing file is overwritten.
	        split: Key of the split to export from the pickle.

	    Returns:
	        None.  The result is written to ``target_dir``.
	    """
	    # Imported once per call rather than once per sequence.
	    from itertools import accumulate

	    data_pkl = load_pickle(source_dir)
	    dim_process = int(data_pkl['dim_process'])

	    data_json = []
	    for idx, seq in enumerate(data_pkl[split]):
	        time_since_last_event, type_event = [], []
	        for idx_event, event in enumerate(seq):
	            # A positive gap on the first event of a sequence is reset to
	            # zero so that every exported sequence starts at time 0.
	            if idx_event == 0 and event['time_since_last_event'] > 0:
	                event['time_since_last_event'] = 0

	            # NOTE: this (like the reset above) mutates the loaded event
	            # dict in place.
	            event = make_json_serializable(event)
	            time_since_last_event.append(event['time_since_last_event'])
	            type_event.append(event['type_event'])

	        # The absolute times stored in the pickle are not used: the
	        # time since start is rebuilt as the running sum of the gaps.
	        time_since_start = list(accumulate(time_since_last_event))

	        data_json.append({
	            'dim_process': dim_process,
	            'seq_idx': idx,
	            'seq_len': len(seq),
	            'time_since_start': time_since_start,
	            'time_since_last_event': time_since_last_event,
	            'type_event': type_event,
	        })

	    with open(target_dir, "w") as outfile:
	        json.dump(data_json, outfile)


	if __name__ == '__main__':
	    # One row per export: (input pickle, output JSON, split key).  The
	    # splits are processed in the order test, dev, train.
	    conversions = (
	        ('amazon/test.pkl', 'amazon/test.json', 'test'),
	        ('amazon/dev.pkl', 'amazon/dev.json', 'dev'),
	        ('amazon/train.pkl', 'amazon/train.json', 'train'),
	    )
	    for pkl_path, json_path, split_name in conversions:
	        make_hf_dataset(source_dir=pkl_path,
	                        target_dir=json_path,
	                        split=split_name)