File size: 4,950 Bytes
f43af3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import numpy as np
from easy_tpp.utils.misc import save_json

def generate_synthetic_data(n_nodes=3, end_time=1000, baseline=0.1, adjacency=0.5, decay=1.0):
    """Simulate a multivariate Hawkes-style point process with exponential kernels.

    Args:
        n_nodes (int): Number of nodes (dimensions) in the process.
        end_time (float): Simulation horizon; no event is recorded at or past it.
        baseline (float): Baseline intensity shared by every node.
        adjacency (float): Uniform cross-excitation weight between every node pair.
        decay (float): Decay rate of the exponential kernel.

    Returns:
        list: One list per node; each entry is a dict with keys
            'time_since_start', 'time_since_last_event', 'type_event'.

    NOTE(review): the intensity below uses only each node's *most recent*
    event (not the full history) and candidate times are not thinned, so
    this is an approximation of an exact Hawkes simulation — confirm this
    is acceptable for the downstream use.
    """
    mu = np.full(n_nodes, baseline)
    alpha = np.full((n_nodes, n_nodes), adjacency)
    per_node_events = [[] for _ in range(n_nodes)]
    t = 0

    while t < end_time:
        # Conditional intensity per node: baseline plus the exponentially
        # decayed influence of each source node's latest event.
        lam = mu.copy()
        for src in range(n_nodes):
            if per_node_events[src]:
                last_t = per_node_events[src][-1]['time_since_start']
                lam += alpha[:, src] * np.exp(-decay * (t - last_t))

        lam_total = np.sum(lam)
        if lam_total == 0:
            break

        # Advance the clock by an exponential waiting time at the current rate.
        t += np.random.exponential(1 / lam_total)
        if t >= end_time:
            break

        # Attribute the event to a node proportionally to its intensity.
        node = np.random.choice(n_nodes, p=lam / lam_total)

        prev_t = per_node_events[node][-1]['time_since_start'] if per_node_events[node] else 0
        per_node_events[node].append({
            'time_since_start': t,
            'time_since_last_event': t - prev_t,
            'type_event': node,
        })

    return per_node_events

def format_tick_data_to_hf(events, dim_process, max_seq_len):
    """Convert per-node event lists into Hugging Face style sequence records.

    All events are merged across nodes, ordered chronologically, and cut
    into consecutive chunks of at most ``max_seq_len`` events. Each chunk's
    'time_since_start' clock is re-zeroed at its own first event.

    Args:
        events (list): One list of event dicts per node, as produced by
            the synthetic-data generator.
        dim_process (int): Number of nodes (dimensions) in the process.
        max_seq_len (int): Maximum number of events per output sequence.

    Returns:
        list: Dicts with keys 'dim_process', 'seq_idx', 'seq_len',
            'time_since_start', 'time_since_last_event', 'type_event'.
    """
    # Merge every node's events into one chronologically ordered stream.
    merged = sorted(
        (evt for node_events in events for evt in node_events),
        key=lambda evt: evt['time_since_start'],
    )

    sequences = []
    for chunk_no, start in enumerate(range(0, len(merged), max_seq_len)):
        chunk = merged[start:start + max_seq_len]
        # Shift timestamps so every sequence starts at time zero.
        offset = chunk[0]['time_since_start']
        sequences.append({
            'dim_process': dim_process,
            'seq_idx': chunk_no,
            'seq_len': len(chunk),
            'time_since_start': [evt['time_since_start'] - offset for evt in chunk],
            'time_since_last_event': [evt['time_since_last_event'] for evt in chunk],
            'type_event': [evt['type_event'] for evt in chunk],
        })

    return sequences

def generate_and_save_json(n_nodes, end_time, baseline, adjacency, decay, max_seq_len, target_file):
    """Run the full pipeline: simulate events, format them, and persist as JSON.

    Args:
        n_nodes (int): Number of nodes (dimensions) in the Hawkes process.
        end_time (float): Simulation horizon.
        baseline (float): Baseline intensity for each node.
        adjacency (float): Uniform cross-excitation weight between nodes.
        decay (float): Decay rate of the exponential kernel.
        max_seq_len (int): Maximum number of events per output sequence.
        target_file (str): Destination path for the formatted JSON data.

    Raises:
        IOError: If the target file cannot be opened or written to.
    """
    raw_events = generate_synthetic_data(n_nodes, end_time, baseline, adjacency, decay)
    hf_records = format_tick_data_to_hf(raw_events, dim_process=n_nodes, max_seq_len=max_seq_len)
    save_json(hf_records, target_file)