|
|
import numpy as np |
|
|
from easy_tpp.utils.misc import save_json |
|
|
|
|
|
def generate_synthetic_data(n_nodes=3, end_time=1000, baseline=0.1, adjacency=0.5, decay=1.0):
    """
    Generates synthetic multivariate event data with exponentially decaying excitation.

    The intensity of node ``i`` at time ``t`` is::

        baseline + sum_j adjacency * exp(-decay * (t - last_j))

    where ``last_j`` is the time of the most recent event of node ``j``; nodes
    that have not fired yet contribute nothing. Only the latest event of each
    node excites the intensity, so this is a bounded-memory approximation of a
    multivariate Hawkes process with exponential kernels, not the full sum over
    the event history.

    At every step the waiting time is drawn from an exponential distribution
    whose rate is the total intensity evaluated at the current time, and the
    firing node is drawn proportionally to the per-node intensities at that
    same time. Randomness comes from the global ``np.random`` state, so seed it
    with ``np.random.seed`` for reproducible output.

    Args:
        n_nodes (int): Number of nodes (or dimensions) of the process.
        end_time (float): The time until which the process is simulated.
        baseline (float): Baseline intensity shared by every node.
        adjacency (float): Excitation weight shared by every pair of nodes.
        decay (float): Decay parameter for the exponential kernel.

    Returns:
        list: A list of ``n_nodes`` lists. Sublist ``k`` holds the events of
            node ``k`` in chronological order. Each event is a dict with keys
            ``'time_since_start'`` (float), ``'time_since_last_event'`` (float,
            time elapsed since the previous event of the *same node*, measured
            from 0 for the node's first event) and ``'type_event'`` (int, the
            node index). All values are native Python numbers.
    """
    # Force a float dtype: np.full infers the dtype from the fill value, so an
    # integer baseline would produce an integer array and truncate intensities.
    baseline_vector = np.full(n_nodes, baseline, dtype=float)
    adjacency_matrix = np.full((n_nodes, n_nodes), adjacency, dtype=float)

    events = [[] for _ in range(n_nodes)]
    # Time of the latest event per node; the zero entries double as the
    # reference point for the first event of a node.
    last_event_times = np.zeros(n_nodes)
    has_event = np.zeros(n_nodes, dtype=bool)
    current_time = 0.0

    while current_time < end_time:
        # The decayed excitation of a source node is the same for every target
        # node, so compute it once and spread it with a matrix-vector product.
        excitation = np.zeros(n_nodes)
        excitation[has_event] = np.exp(-decay * (current_time - last_event_times[has_event]))
        intensities = baseline_vector + adjacency_matrix @ excitation

        total_intensity = intensities.sum()
        if total_intensity == 0:
            # Nothing can ever fire again (also covers n_nodes == 0).
            break

        current_time += float(np.random.exponential(1 / total_intensity))
        if current_time >= end_time:
            break

        # np.random.choice returns a NumPy integer scalar; convert it so the
        # event dict only holds plain Python values (e.g. for JSON encoders).
        node = int(np.random.choice(n_nodes, p=intensities / total_intensity))

        events[node].append({
            'time_since_start': current_time,
            'time_since_last_event': float(current_time - last_event_times[node]),
            'type_event': node,
        })
        last_event_times[node] = current_time
        has_event[node] = True

    return events
|
|
|
|
|
def format_tick_data_to_hf(events, dim_process, max_seq_len):
    """
    Formats per-node event lists into a list of Hugging Face style sequence records.

    All events are merged into one chronological stream (ties keep their input
    order) and cut into consecutive chunks of at most ``max_seq_len`` events.
    Within each chunk, times are rebased so the first event sits at 0.

    Args:
        events (list): A list of lists, where each sublist contains dictionaries
            representing events for a node. Each dictionary must provide
            ``'time_since_start'`` and ``'type_event'``; any
            ``'time_since_last_event'`` entry is ignored because per-node gaps
            are not meaningful once the nodes are merged.
        dim_process (int): Number of nodes (or dimensions) in the process;
            copied unchanged into every record.
        max_seq_len (int): Maximum sequence length. Must be positive.

    Returns:
        list: A list of dictionaries, one per sequence, with keys
            ``'dim_process'``, ``'seq_idx'`` (0-based chunk index),
            ``'seq_len'``, ``'time_since_start'`` (floats, relative to the first
            event of the sequence), ``'time_since_last_event'`` (floats, gap to
            the previous event of the same sequence, 0.0 for the first event)
            and ``'type_event'`` (ints). An empty list is returned when there
            are no events.

    Raises:
        ValueError: If ``max_seq_len`` is not positive.
    """
    if max_seq_len <= 0:
        raise ValueError('max_seq_len must be a positive integer, got {}'.format(max_seq_len))

    # Merge the per-node streams into a single chronological stream.
    all_events = sorted(
        (event for node_events in events for event in node_events),
        key=lambda event: event['time_since_start'],
    )

    formatted_data = []
    for seq_idx, offset in enumerate(range(0, len(all_events), max_seq_len)):
        seq_events = all_events[offset:offset + max_seq_len]

        # Coerce to native Python numbers so NumPy scalars in the input do not
        # leak into the records (NumPy integers are rejected by json.dumps).
        timestamps = [float(event['time_since_start']) for event in seq_events]
        start_time = timestamps[0]

        # Inter-event gaps are taken over the merged sequence so that they stay
        # consistent with the rebased 'time_since_start' values.
        time_since_last_event = [0.0] + [
            later - earlier for earlier, later in zip(timestamps, timestamps[1:])
        ]

        formatted_data.append({
            'dim_process': dim_process,
            'seq_idx': seq_idx,
            'seq_len': len(seq_events),
            'time_since_start': [timestamp - start_time for timestamp in timestamps],
            'time_since_last_event': time_since_last_event,
            'type_event': [int(event['type_event']) for event in seq_events],
        })

    return formatted_data
|
|
|
|
|
def generate_and_save_json(n_nodes, end_time, baseline, adjacency, decay, max_seq_len, target_file):
    """
    Runs the full pipeline: simulate events, format them, and write them to disk.

    The simulation parameters are forwarded to ``generate_synthetic_data``, the
    resulting per-node events are turned into sequence records by
    ``format_tick_data_to_hf`` (with ``dim_process`` set to ``n_nodes``), and
    the records are handed to ``save_json`` together with ``target_file``.

    Args:
        n_nodes (int): Number of nodes (or dimensions) in the process.
        end_time (float): The time until which the process is simulated.
        baseline (float): Baseline intensity for each node.
        adjacency (float): Adjacency matrix value for the influence between nodes.
        decay (float): Decay parameter for the exponential kernel.
        max_seq_len (int): Maximum sequence length.
        target_file (str): Path to the file where the formatted data will be saved.

    Raises:
        IOError: Presumably raised by ``save_json`` if the file cannot be
            opened or written to -- its implementation is not visible here,
            so confirm against ``easy_tpp.utils.misc``.
    """
    raw_events = generate_synthetic_data(
        n_nodes=n_nodes,
        end_time=end_time,
        baseline=baseline,
        adjacency=adjacency,
        decay=decay,
    )
    sequences = format_tick_data_to_hf(raw_events, n_nodes, max_seq_len)
    save_json(sequences, target_file)