saracandu commited on
Commit
21c7f66
·
verified ·
1 Parent(s): 8d3eaaf

Update modeling.py

Browse files
Files changed (1) hide show
  1. modeling.py +675 -124
modeling.py CHANGED
@@ -32,12 +32,597 @@ from transformers.modeling_outputs import (
32
  Seq2SeqModelOutput,
33
  )
34
 
35
- from configuration import STLConfig
36
  from nltk.translate.bleu_score import sentence_bleu
37
- from stl import *
38
  import networkx as nx
39
  from datasets import load_dataset
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  # from anchor_set_generation import anchorGeneration
43
 
@@ -152,10 +737,9 @@ def from_string_to_formula(st):
152
  def load_json(path: str) -> Union[Dict, List]:
153
  """
154
  Load a JSON file from the given path.
155
-
156
  Args:
157
  path (str): The path to the JSON file to be loaded.
158
-
159
  Returns:
160
  Union[Dict, List]: The parsed content of the JSON file, which could be a dictionary or a list.
161
  """
@@ -216,35 +800,28 @@ class StlGenerator:
216
  def sample(self, nvars):
217
  """
218
  Samples a random formula with distribution defined in class instance parameters
219
-
220
  Parameters
221
  ----------
222
  nvars : number of variables of input signals
223
  how many variables the formula is expected to consider.
224
-
225
  Returns
226
  -------
227
  TYPE
228
  A random formula.
229
-
230
  """
231
  return self._sample_internal_node(nvars)
232
-
233
  def bag_sample(self, bag_size, nvars):
234
  """
235
  Samples a bag of bag_size formulae
236
-
237
  Parameters
238
  ----------
239
  bag_size : INT
240
  number of formulae.
241
  nvars : INT
242
  number of vars in formulae.
243
-
244
  Returns
245
  -------
246
  a list of formulae.
247
-
248
  """
249
  formulae = []
250
  for _ in range(bag_size):
@@ -261,32 +838,32 @@ class StlGenerator:
261
  while True:
262
  if nodetype == "not":
263
  n = self._sample_node(nvars)
264
- node = stl.Not(n)
265
  elif nodetype == "and":
266
  n1 = self._sample_node(nvars)
267
  n2 = self._sample_node(nvars)
268
- node = stl.And(n1, n2)
269
  elif nodetype == "or":
270
  n1 = self._sample_node(nvars)
271
  n2 = self._sample_node(nvars)
272
- node = stl.Or(n1, n2)
273
  elif nodetype == "always":
274
  n = self._sample_node(nvars)
275
  unbound, right_unbound, left_time_bound, right_time_bound = self._get_temporal_parameters()
276
- node = stl.Globally(
277
  n, unbound, right_unbound, left_time_bound, right_time_bound, self.adaptive_unbound_temporal_ops
278
  )
279
  elif nodetype == "eventually":
280
  n = self._sample_node(nvars)
281
  unbound, right_unbound, left_time_bound, right_time_bound = self._get_temporal_parameters()
282
- node = stl.Eventually(
283
  n, unbound, right_unbound, left_time_bound, right_time_bound, self.adaptive_unbound_temporal_ops
284
  )
285
  elif nodetype == "until":
286
  n1 = self._sample_node(nvars)
287
  n2 = self._sample_node(nvars)
288
  unbound, right_unbound, left_time_bound, right_time_bound = self._get_temporal_parameters()
289
- node = stl.Until(
290
  n1, n2, unbound, right_unbound, left_time_bound, right_time_bound
291
  )
292
 
@@ -297,7 +874,7 @@ class StlGenerator:
297
  if rnd.rand() < self.leaf_prob:
298
  # sample a leaf
299
  var, thr, lte = self._get_atom(nvars)
300
- return stl.Atom(var, thr, lte)
301
  else:
302
  return self._sample_internal_node(nvars)
303
 
@@ -328,7 +905,6 @@ class BaseMeasure(Measure):
328
  self, mu0=0.0, sigma0=1.0, mu1=0.0, sigma1=1.0, q=0.1, q0=0.5, device="cpu"
329
  ):
330
  """
331
-
332
  Parameters
333
  ----------
334
  mu0 : mean of normal distribution of initial state, optional
@@ -345,11 +921,9 @@ class BaseMeasure(Measure):
345
  probability of initial sign of derivative. The default is 0.5.
346
  device : 'cpu' or 'cuda', optional
347
  device on which to run the algorithm. The default is 'cpu'.
348
-
349
  Returns
350
  -------
351
  None.
352
-
353
  """
354
  self.mu0 = mu0
355
  self.sigma0 = sigma0
@@ -363,7 +937,6 @@ class BaseMeasure(Measure):
363
  """
364
  Samples a set of trajectories from the basic measure space, with parameters
365
  passed to the sampler
366
-
367
  Parameters
368
  ----------
369
  points : INT, optional
@@ -372,13 +945,10 @@ class BaseMeasure(Measure):
372
  number of trajectories. The default is 100000.
373
  varn : INT, optional
374
  number of variables per trajectory. The default is 2.
375
-
376
-
377
  Returns
378
  -------
379
  signal : samples x varn x points double pytorch tensor
380
  The sampled signals.
381
-
382
  """
383
  if self.device == "cuda" and not torch.cuda.is_available():
384
  raise RuntimeError("GPU card or CUDA library not available!")
@@ -513,8 +1083,6 @@ class StlKernel:
513
  return kernel_matrix.cpu(), rhos1, selfk1, len1
514
  else:
515
  return kernel_matrix.cpu()
516
-
517
- def _compute_robustness_time(self, phis):
518
  n = self.samples
519
  p = self.points
520
  k = len(phis)
@@ -576,6 +1144,12 @@ class StlKernel:
576
  kernel_matrix = kernel_matrix / normalize
577
  return kernel_matrix
578
 
 
 
 
 
 
 
579
  def _exponentiate(self, kernel_matrix, selfk1, selfk2, sigma2=None):
580
  if sigma2 is None:
581
  sigma2 = self.sigma2
@@ -706,13 +1280,13 @@ def anchorGeneration(diff_init = False, # to control whether we want formulae to
706
  leaf_prob: float = 0.4, # complexity of the generated formula
707
  cosine_similarity_threshold: float = 0.8 # if two formulae cosine similarity exceeds 0.9, then discard one of the two
708
  ) -> str:
709
-
710
  # initialize STL formula generator
711
  sampler = StlGenerator(leaf_prob)
712
-
713
  # effective anchor set generation
714
  if diff_init:
715
-
716
  # initialize the anchor set with a randomly sampled formula
717
  diff_anchor_set = [sampler.sample(nvars=n_vars)]
718
 
@@ -728,35 +1302,35 @@ def anchorGeneration(diff_init = False, # to control whether we want formulae to
728
  while len(diff_anchor_set) < embed_dim:
729
  # sample the 'remaining' formulae to reach the desired number of `embed_dim` formulae:
730
  candidate_anchors = sampler.bag_sample(embed_dim - len(diff_anchor_set), nvars = n_vars)
731
-
732
  # compute robustness of candidate anchor formulae on the same signals as previous anchor set
733
  candidate_robs = torch.cat([phi.quantitative(signals, normalize=True).unsqueeze(0) for phi in candidate_anchors], 0)
734
-
735
  # compute cosine similarity between current anchor set and candidate new formulae
736
  cos_simil = torch.tril(normalize(candidate_robs) @ normalize(anchor_rob_vectors).t(), diagonal=-1)
737
 
738
  # check which formulae are similar (i.e. greater cosine similarity then threshold) w.r.t. current anchors
739
  # NOTA: chiedere a gaia se cosine similarities negative vanno ammazzate con un valore assoluto o meno!
740
  similar_idx = [torch.where(cos_simil[r, :] > cosine_similarity_threshold)[0].tolist() for r in range(cos_simil.shape[0])]
741
-
742
  # keep only those who are semantically distant
743
  keep_idx = list(set(np.arange(len(candidate_anchors)).tolist()).difference(set([i for sublist in similar_idx for i in sublist])))
744
-
745
  diff_anchor_set += [copy.deepcopy(candidate_anchors[i]) for i in keep_idx]
746
-
747
  # Convert keep_idx to a tensor on the same device as candidate_robs
748
  keep_idx_tensor = torch.tensor(keep_idx, device=candidate_robs.device)
749
-
750
  # Use index_select to pick the relevant rows
751
  selected_robs = torch.index_select(candidate_robs, 0, keep_idx_tensor)
752
-
753
  # Concatenate on the same device
754
  anchor_rob_vectors = torch.cat([anchor_rob_vectors, copy.deepcopy(selected_robs)], dim=0)
755
 
756
  anchor_set = diff_anchor_set[:embed_dim]
757
-
758
  else:
759
- anchor_set = sampler.bag_sample(bag_size=embed_dim, nvars=n_vars)
760
 
761
  filename = f'anchor_set_no_diff_{embed_dim}_dim'
762
  dump_pickle(filename, anchor_set)
@@ -764,19 +1338,16 @@ def anchorGeneration(diff_init = False, # to control whether we want formulae to
764
 
765
  ####
766
 
767
- class STLTokenizer(PreTrainedTokenizer):
768
  """
769
  A custom tokenizer class that extends `PreTrainedTokenizer` to handle a specific vocabulary and tokenization process.
770
-
771
- This tokenizer can load a vocabulary from a JSON file, tokenize text, convert tokens to IDs,
772
  and handle padding and special tokens.
773
  """
774
 
775
- def __init__(self, vocab_path: str, unk_token: str = "unk", pad_token: str = "pad",
776
  bos_token: str = "/s", eos_token: str = "s", model_max_length = 512, *args, **kwargs):
777
  """
778
  Initializes the STLTokenizer with a given vocabulary and special tokens.
779
-
780
  Args:
781
  vocab_path (str): The path to the JSON file containing the vocabulary.
782
  unk_token (str, optional): The token used for unknown words. Defaults to "unk".
@@ -791,14 +1362,13 @@ class STLTokenizer(PreTrainedTokenizer):
791
  self.eos_token = eos_token
792
  self.model_max_length = model_max_length
793
  self.id_to_token = {v: k for k, v in self.vocab.items()} # Reverse mapping
794
- super().__init__(unk_token=unk_token, pad_token=pad_token, bos_token=bos_token, eos_token=eos_token,
795
  model_max_length=model_max_length, *args, **kwargs)
796
 
797
  @property
798
  def vocab_size(self) -> int:
799
  """
800
  Returns the size of the vocabulary.
801
-
802
  Returns:
803
  int: The number of tokens in the vocabulary.
804
  """
@@ -807,11 +1377,9 @@ class STLTokenizer(PreTrainedTokenizer):
807
  def prepad_sequence(self, sequence, space_token = ' ', new_space_token = '@', undo = False):
808
  """
809
  Replaces spaces in the input sequence with a specified token.
810
-
811
  Args:
812
  sequence (str): The input sequence.
813
  undo (bool): If True, replace the padding token with spaces. Defaults to False, which pads the spaces.
814
-
815
  Returns:
816
  str: The preprocessed sequence with spaces or padding tokens replaced.
817
  """
@@ -823,10 +1391,8 @@ class STLTokenizer(PreTrainedTokenizer):
823
  def add_bos_eos(self, sequence: str) -> str:
824
  """
825
  Aggiunge i token BOS all'inizio e EOS alla fine della sequenza.
826
-
827
  Args:
828
  sequence (str): La sequenza di input.
829
-
830
  Returns:
831
  str: La sequenza con i token BOS ed EOS.
832
  """
@@ -835,19 +1401,15 @@ class STLTokenizer(PreTrainedTokenizer):
835
  def tokenize(self, text: str) -> List[str]:
836
  """
837
  Tokenizes the input text into a list of tokens.
838
-
839
- The method preprocesses the input text by replacing spaces with padding tokens and then tries to
840
  find the longest possible match for each substring in the vocabulary.
841
-
842
  Args:
843
  text (str): The input text to be tokenized.
844
-
845
  Returns:
846
  List[str]: A list of tokens representing the tokenized text.
847
  """
848
  text = self.add_bos_eos(text)
849
  text = self.prepad_sequence(text)
850
-
851
  tokens = []
852
  i = 0
853
  while i < len(text):
@@ -868,10 +1430,8 @@ class STLTokenizer(PreTrainedTokenizer):
868
  def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
869
  """
870
  Converts a list of tokens into a list of token IDs.
871
-
872
  Args:
873
  tokens (List[str]): A list of tokens to be converted into IDs.
874
-
875
  Returns:
876
  List[int]: A list of corresponding token IDs.
877
  """
@@ -880,10 +1440,8 @@ class STLTokenizer(PreTrainedTokenizer):
880
  def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
881
  """
882
  Converts a list of token IDs into a list of tokens.
883
-
884
  Args:
885
  ids (List[int]): A list of token IDs to be converted into tokens.
886
-
887
  Returns:
888
  List[str]: A list of corresponding tokens.
889
  """
@@ -892,14 +1450,14 @@ class STLTokenizer(PreTrainedTokenizer):
892
  def encode(self, sequence: str) -> List[int]:
893
  """
894
  Encodes a string sequence into a list of token IDs.
895
-
896
- This method tokenizes the input sequence using the `tokenize` method,
897
- and then converts the resulting tokens into their corresponding token IDs
898
  using the `convert_tokens_to_ids` method.
899
-
900
  Args:
901
  sequence (str): The input sequence (text) to be encoded.
902
-
903
  Returns:
904
  List[int]: A list of token IDs corresponding to the input sequence.
905
  """
@@ -908,8 +1466,8 @@ class STLTokenizer(PreTrainedTokenizer):
908
 
909
  def postpad_sequence(self, sequence, pad_token_id):
910
  """
911
- Fills the sequence up to max_length padding elements
912
- """
913
  num_extra_elements = self.model_max_length - len(sequence) -1
914
  if num_extra_elements > 0:
915
  sequence.extend([pad_token_id] * num_extra_elements)
@@ -918,14 +1476,11 @@ class STLTokenizer(PreTrainedTokenizer):
918
  def decode(self, token_ids: List[int]) -> str:
919
  """
920
  Decodes a list of token IDs into a string of text.
921
-
922
- The method converts the IDs to tokens and joins them to form a string.
923
  It also restores the original spaces or padding tokens if `undo` is True.
924
-
925
  Args:
926
  token_ids (List[int]): A list of token IDs to be decoded.
927
  skip_special_tokens (bool, optional): Whether to skip special tokens during decoding. Defaults to False.
928
-
929
  Returns:
930
  str: The decoded string.
931
  """
@@ -935,16 +1490,13 @@ class STLTokenizer(PreTrainedTokenizer):
935
 
936
  def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
937
  """
938
- Saves the tokenizer's vocabulary to a file.
939
- Useful only when the vocabulary has to be retrieved and is not given
940
  (thus this is not the case: here to further improvements with sentencepiece).
941
-
942
- This method saves the vocabulary to a JSON file in the specified directory.
943
-
944
  Args:
945
  save_directory (str): The directory where the vocabulary file will be saved.
946
  filename_prefix (Optional[str]): An optional prefix for the filename.
947
-
948
  Returns:
949
  Tuple[str]: A tuple containing the path to the saved vocabulary file.
950
  """
@@ -956,7 +1508,6 @@ class STLTokenizer(PreTrainedTokenizer):
956
  def get_vocab(self) -> dict:
957
  """
958
  Retrieves the vocabulary used by the tokenizer.
959
-
960
  Returns:
961
  dict: The vocabulary as a dictionary.
962
  """
@@ -985,7 +1536,6 @@ class STLSinusoidalPositionalEmbedding(nn.Embedding):
985
  out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
986
  out.detach_()
987
  return out
988
-
989
  @torch.no_grad()
990
  def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0) -> torch.Tensor:
991
  """`input_ids_shape` is expected to be [bsz x seqlen]."""
@@ -998,40 +1548,39 @@ class STLSinusoidalPositionalEmbedding(nn.Embedding):
998
  class STLAttention(nn.Module):
999
  """ Multi-Head Attention as depicted from 'Attention is all you need' """
1000
 
1001
- def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0,
1002
  is_decoder: bool = False, bias: bool = False, is_causal: bool = False):
1003
-
1004
  super().__init__()
1005
  self.embed_dim = embed_dim # overall embedding dimension -> to be divided between multiple heads
1006
  self.num_heads = num_heads
1007
  self.dropout = dropout
1008
  self.head_dim = embed_dim // num_heads
1009
- assert (self.head_dim * num_heads) == self.embed_dim
1010
  self.scaling = self.head_dim ** -0.5 # used to normalize values when projected using `W_` matrices
1011
  self.is_decoder = is_decoder
1012
  self.is_causal = is_causal
1013
 
1014
- # 'roleplaying' matrices
1015
- self.W_k = nn.Linear(embed_dim, embed_dim, bias = bias)
1016
  self.W_q = nn.Linear(embed_dim, embed_dim, bias = bias)
1017
  self.W_v = nn.Linear(embed_dim, embed_dim, bias = bias)
1018
 
1019
  # to project the heads' outputs into a single vector
1020
- self.W_o = nn.Linear(embed_dim, embed_dim, bias = bias)
1021
 
1022
 
1023
  def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
1024
  return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
1025
-
1026
-
1027
- def forward(self,
1028
  hidden_states: torch.Tensor, # previous values, passed to the multi-head attn layer
1029
  key_value_states: Optional[torch.Tensor] = None, # different key, value items (used in cross-attn)
1030
- past_key_value: Optional[Tuple[torch.Tensor]] = None, # stores the key and values of previous steps
1031
  attention_mask: Optional[torch.Tensor] = None, # masks non-allowed items (padded or future ones)
1032
  layer_head_mask: Optional[torch.Tensor] = None, # used to de-activate specific attn heads
1033
  output_attentions: bool = False # flag to control the output of the attn values,
1034
- **kwargs,
1035
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
1036
 
1037
  is_cross_attention = key_value_states is not None # cross-attn if key_value_states is not None
@@ -1055,19 +1604,18 @@ class STLAttention(nn.Module):
1055
  else:
1056
  key = self._shape(self.W_k(hidden_states), -1, batch_size)
1057
  value = self._shape(self.W_v(hidden_states), -1, batch_size)
1058
-
1059
  if self.is_decoder:
1060
  past_key_value = (key, value)
1061
-
1062
  proj_shape = (batch_size * self.num_heads, -1, self.head_dim)
1063
 
1064
- query = self._shape(query, tgt_len, batch_size).view(*proj_shape)
1065
  key = key.reshape(*proj_shape)
1066
  value = value.reshape(*proj_shape)
1067
 
1068
  src_len = key.size(1)
1069
 
1070
-
1071
  ######################################################################################################
1072
 
1073
  # 'traditional' attention computation
@@ -1079,7 +1627,7 @@ class STLAttention(nn.Module):
1079
  if attention_mask is not None:
1080
  attn_weights = attn_weights.view(batch_size, self.num_heads, tgt_len, src_len) + attention_mask
1081
  attn_weights = attn_weights.view(batch_size * self.num_heads, tgt_len, src_len)
1082
-
1083
  # Normalize values on the `key` axis (dim=-1)
1084
  attn_weights = F.softmax(attn_weights, dim=-1)
1085
 
@@ -1098,18 +1646,18 @@ class STLAttention(nn.Module):
1098
  attn_output = attn_output.transpose(1, 2)
1099
 
1100
  attn_output = attn_output.reshape(batch_size, tgt_len, self.embed_dim)
1101
- attn_output = self.W_o(attn_output)
1102
 
1103
  return attn_output, None, past_key_value
1104
 
1105
  ####
1106
 
1107
  class STLEncoder():
1108
- def __init__(self,
1109
  embed_dim: int,
1110
  anchor_filename: Optional[str] = None,
1111
  n_vars: int = 3):
1112
-
1113
  self.n_vars = n_vars # passaglielo in input
1114
  self.embed_dim = embed_dim
1115
  self.anchorset_filename = anchor_filename
@@ -1117,8 +1665,8 @@ class STLEncoder():
1117
  self.mu = BaseMeasure(device=self.device)
1118
  self.kernel = StlKernel(self.mu, varn=self.n_vars)
1119
 
1120
- if anchor_filename is None:
1121
- anchor_filename = anchorGeneration(diff_init = True, embed_dim = self.embed_dim, n_vars = self.n_vars)
1122
  anchor_filename+='.pickle'
1123
 
1124
  # TO DO: check on the dimensions of the anchor set and the `embed_dim` and `n_vars` values
@@ -1132,8 +1680,8 @@ class STLEncoder():
1132
  return self.kernel.compute_bag_bag(formula, self.anchor_set)
1133
 
1134
  class STLModel(PreTrainedModel):
1135
- config_class = STLConfig
1136
- base_model_prefix = "model"
1137
  supports_gradient_checkpointing = True
1138
 
1139
  # initializes the weights of `nn.Linear`, `nn.Embedding` and `STLSinusoidalPositionalEmbedding`
@@ -1162,22 +1710,22 @@ class STLModel(PreTrainedModel):
1162
  return dummy_inputs
1163
 
1164
  class STLDecoderBlock(nn.Module):
1165
-
1166
- def __init__(self, embed_dim: int,
1167
  num_decoder_attention_heads: int,
1168
  num_decoder_ffn_dim: int,
1169
  dropout: float = 0.0,
1170
  attention_dropout: float = 0.0,
1171
  activation_dropout: float = 0.0,
1172
  ):
1173
-
1174
  super().__init__()
1175
-
1176
  self.embed_dim = embed_dim
1177
 
1178
- # first block
1179
  self.self_attn = STLAttention(
1180
- embed_dim=self.embed_dim,
1181
  num_heads=num_decoder_attention_heads,
1182
  dropout=dropout,
1183
  is_decoder=True, # not used, debugging purposes
@@ -1234,26 +1782,26 @@ class STLDecoderBlock(nn.Module):
1234
  Whether or not to return the attentions tensors of all attention layers. See `attentions` under
1235
  returned tensors for more detail.
1236
  """
1237
-
1238
  ###################################################################
1239
-
1240
- # BLOCK 1: processing what has been previously generated
1241
 
1242
  # previous state is stored into an auxiliary variable `residual`
1243
  residual = hidden_states
1244
 
1245
- # tries to exploit previous K, V values if there are any
1246
  # (practically picks up to the first 2 values stored in `past_key_value` vector)
1247
  self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
1248
 
1249
  # masked MHSA on the already generated sequence
1250
- # invokes `forward` method to transform the original vector accordingly
1251
  hidden_states, self_attn_weights, present_key_value = self.self_attn.forward(
1252
  hidden_states=hidden_states, # Q
1253
  past_key_value=self_attn_past_key_value, # K, V
1254
  attention_mask=attention_mask, # passed as input of the decoder layer
1255
- layer_head_mask=layer_head_mask, # to deactivate certain attn layers
1256
- output_attentions=output_attentions,
1257
  )
1258
  hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
1259
 
@@ -1268,7 +1816,7 @@ class STLDecoderBlock(nn.Module):
1268
  # BLOCK 2: cross-attn between already generated input and previous information (from the encoder)
1269
 
1270
  # initialize K, Q, attn_weights for this new attn operation
1271
- cross_attn_present_key_value = None
1272
  cross_attn_weights = None
1273
 
1274
  # the important condition is that the encoder carries some information
@@ -1346,7 +1894,7 @@ class STLDecoder(STLModel):
1346
  attention_dropout = config.attention_dropout
1347
  activation_dropout = config.activation_dropout
1348
  decoder_layerdrop = config.decoder_layerdrop
1349
-
1350
  self.dropout = dropout
1351
  self.layerdrop = decoder_layerdrop
1352
  self.padding_idx = pad_token_id
@@ -1355,16 +1903,16 @@ class STLDecoder(STLModel):
1355
 
1356
  # Initialize the input embedding (if not passed already)
1357
  self.embed_tokens = nn.Embedding(decoder_vocab_size, embed_dim, self.padding_idx)
1358
-
1359
  # Initialize positional embedding also
1360
  self.embed_positions = STLSinusoidalPositionalEmbedding(
1361
  max_position_embeddings, embed_dim, self.padding_idx
1362
  )
1363
-
1364
  # Initialize decoder layers (of a prespecified number)
1365
- self.layers = nn.ModuleList([STLDecoderBlock(embed_dim, num_decoder_attention_heads,
1366
- num_decoder_ffn_dim, dropout,
1367
- attention_dropout, activation_dropout)
1368
  for _ in range(num_decoder_layers)])
1369
 
1370
  self.gradient_checkpointing = False
@@ -1386,7 +1934,7 @@ class STLDecoder(STLModel):
1386
  return_dict: Optional[bool] = None,
1387
  **kwargs,
1388
  ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
1389
-
1390
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1391
  output_hidden_states = (
1392
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1509,7 +2057,7 @@ class STLDecoder(STLModel):
1509
  cross_attentions=all_cross_attentions,
1510
  )
1511
 
1512
- ####
1513
 
1514
  class STLForCausalLM(STLModel, GenerationMixin):
1515
  _tied_weights_keys = ["lm_head.weight"]
@@ -1518,7 +2066,7 @@ class STLForCausalLM(STLModel, GenerationMixin):
1518
  config = copy.deepcopy(config)
1519
  config.is_decoder = True
1520
  config.is_encoder_decoder = False
1521
-
1522
  super().__init__(config)
1523
  self.model = STLDecoder(config)
1524
 
@@ -1615,3 +2163,6 @@ class STLForCausalLM(STLModel, GenerationMixin):
1615
  tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
1616
  )
1617
  return reordered_past
 
 
 
 
32
  Seq2SeqModelOutput,
33
  )
34
 
35
+ from .configuration import STLConfig
36
  from nltk.translate.bleu_score import sentence_bleu
37
+ # from stl import *
38
  import networkx as nx
39
  from datasets import load_dataset
40
 
41
+ ### from custom_typing.py
42
+
43
+ realnum = Union[float, int]
44
+
45
+
46
+ ### from stl.py
47
+
48
+ # For tensor functions
49
+ import torch
50
+ from torch import Tensor
51
+ import torch.nn.functional as F
52
+
53
+
54
+ def eventually(x: Tensor, time_span: int) -> Tensor:
55
+ """
56
+ STL operator 'eventually' in 1D.
57
+
58
+ Parameters
59
+ ----------
60
+ x: torch.Tensor
61
+ Signal
62
+ time_span: any numeric type
63
+ Timespan duration
64
+
65
+ Returns
66
+ -------
67
+ torch.Tensor
68
+ A tensor containing the result of the operation.
69
+ """
70
+ return F.max_pool1d(x, kernel_size=time_span, stride=1)
71
+
72
+ class Node:
73
+ """Abstract node class for STL semantics tree."""
74
+
75
+ def __init__(self) -> None:
76
+ # Must be overloaded.
77
+ pass
78
+
79
+ def __str__(self) -> str:
80
+ # Must be overloaded.
81
+ pass
82
+
83
+ def boolean(self, x: Tensor, evaluate_at_all_times: bool = False) -> Tensor:
84
+ """
85
+ Evaluates the boolean semantics at the node.
86
+
87
+ Parameters
88
+ ----------
89
+ x : torch.Tensor, of size N_samples x N_vars x N_sampling_points
90
+ The input signals, stored as a batch tensor with trhee dimensions.
91
+ evaluate_at_all_times: bool
92
+ Whether to evaluate the semantics at all times (True) or
93
+ just at t=0 (False).
94
+
95
+ Returns
96
+ -------
97
+ torch.Tensor
98
+ A tensor with the boolean semantics for the node.
99
+ """
100
+ z: Tensor = self._boolean(x)
101
+ if evaluate_at_all_times:
102
+ return z
103
+ else:
104
+ return self._extract_semantics_at_time_zero(z)
105
+
106
+ def quantitative(
107
+ self,
108
+ x: Tensor,
109
+ normalize: bool = False,
110
+ evaluate_at_all_times: bool = False,
111
+ ) -> Tensor:
112
+ """
113
+ Evaluates the quantitative semantics at the node.
114
+
115
+ Parameters
116
+ ----------
117
+ x : torch.Tensor, of size N_samples x N_vars x N_sampling_points
118
+ The input signals, stored as a batch tensor with three dimensions.
119
+ normalize: bool
120
+ Whether the measure of robustness if normalized (True) or
121
+ not (False). Currently not in use.
122
+ evaluate_at_all_times: bool
123
+ Whether to evaluate the semantics at all times (True) or
124
+ just at t=0 (False).
125
+
126
+ Returns
127
+ -------
128
+ torch.Tensor
129
+ A tensor with the quantitative semantics for the node.
130
+ """
131
+ z: Tensor = self._quantitative(x, normalize)
132
+ if evaluate_at_all_times:
133
+ return z
134
+ else:
135
+ return self._extract_semantics_at_time_zero(z)
136
+
137
+ def set_normalizing_flag(self, value: bool = True) -> None:
138
+ """
139
+ Setter for the 'normalization of robustness of the formula' flag.
140
+ Currently not in use.
141
+ """
142
+
143
+ def time_depth(self) -> int:
144
+ """Returns time depth of bounded temporal operators only."""
145
+ # Must be overloaded.
146
+
147
+ def _quantitative(self, x: Tensor, normalize: bool = False) -> Tensor:
148
+ """Private method equivalent to public one for inner call."""
149
+ # Must be overloaded.
150
+
151
+ def _boolean(self, x: Tensor) -> Tensor:
152
+ """Private method equivalent to public one for inner call."""
153
+ # Must be overloaded.
154
+
155
+ @staticmethod
156
+ def _extract_semantics_at_time_zero(x: Tensor) -> Tensor:
157
+ """Extrapolates the vector of truth values at time zero"""
158
+ return torch.reshape(x[:, 0, 0], (-1,))
159
+
160
+
161
+ class Atom(Node):
162
+ """Atomic formula node; for now of the form X<=t or X>=t"""
163
+
164
+ def __init__(self, var_index: int, threshold: realnum, lte: bool = False) -> None:
165
+ super().__init__()
166
+ self.var_index: int = var_index
167
+ self.threshold: realnum = threshold
168
+ self.lte: bool = lte
169
+
170
+ def __str__(self) -> str:
171
+ s: str = (
172
+ "x_"
173
+ + str(self.var_index)
174
+ + (" <= " if self.lte else " >= ")
175
+ + str(round(self.threshold, 4))
176
+ )
177
+ return s
178
+
179
+ def time_depth(self) -> int:
180
+ return 0
181
+
182
+ def _boolean(self, x: Tensor) -> Tensor:
183
+ # extract tensor of the same dimension as data, but with only one variable
184
+ xj: Tensor = x[:, self.var_index, :]
185
+ xj: Tensor = xj.view(xj.size()[0], 1, -1)
186
+ if self.lte:
187
+ z: Tensor = torch.le(xj, self.threshold)
188
+ else:
189
+ z: Tensor = torch.ge(xj, self.threshold)
190
+ return z
191
+
192
+ def _quantitative(self, x: Tensor, normalize: bool = False) -> Tensor:
193
+ # extract tensor of the same dimension as data, but with only one variable
194
+ xj: Tensor = x[:, self.var_index, :]
195
+ xj: Tensor = xj.view(xj.size()[0], 1, -1)
196
+ if self.lte:
197
+ z: Tensor = -xj + self.threshold
198
+ else:
199
+ z: Tensor = xj - self.threshold
200
+ if normalize:
201
+ z: Tensor = torch.tanh(z)
202
+ return z
203
+
204
+ class Not(Node):
205
+ """Negation node."""
206
+
207
+ def __init__(self, child: Node) -> None:
208
+ super().__init__()
209
+ self.child: Node = child
210
+
211
+ def __str__(self) -> str:
212
+ s: str = "not ( " + self.child.__str__() + " )"
213
+ return s
214
+
215
+ def time_depth(self) -> int:
216
+ return self.child.time_depth()
217
+
218
+ def _boolean(self, x: Tensor) -> Tensor:
219
+ z: Tensor = ~self.child._boolean(x)
220
+ return z
221
+
222
+ def _quantitative(self, x: Tensor, normalize: bool = False) -> Tensor:
223
+ z: Tensor = -self.child._quantitative(x, normalize)
224
+ return z
225
+
226
+
227
+ class And(Node):
228
+ """Conjunction node."""
229
+
230
+ def __init__(self, left_child: Node, right_child: Node) -> None:
231
+ super().__init__()
232
+ self.left_child: Node = left_child
233
+ self.right_child: Node = right_child
234
+
235
+ def __str__(self) -> str:
236
+ s: str = (
237
+ "( "
238
+ + self.left_child.__str__()
239
+ + " and "
240
+ + self.right_child.__str__()
241
+ + " )"
242
+ )
243
+ return s
244
+
245
+ def time_depth(self) -> int:
246
+ return max(self.left_child.time_depth(), self.right_child.time_depth())
247
+
248
+ def _boolean(self, x: Tensor) -> Tensor:
249
+ z1: Tensor = self.left_child._boolean(x)
250
+ z2: Tensor = self.right_child._boolean(x)
251
+ size: int = min(z1.size()[2], z2.size()[2])
252
+ z1: Tensor = z1[:, :, :size]
253
+ z2: Tensor = z2[:, :, :size]
254
+ z: Tensor = torch.logical_and(z1, z2)
255
+ return z
256
+
257
+ def _quantitative(self, x: Tensor, normalize: bool = False) -> Tensor:
258
+ z1: Tensor = self.left_child._quantitative(x, normalize)
259
+ z2: Tensor = self.right_child._quantitative(x, normalize)
260
+ size: int = min(z1.size()[2], z2.size()[2])
261
+ z1: Tensor = z1[:, :, :size]
262
+ z2: Tensor = z2[:, :, :size]
263
+ z: Tensor = torch.min(z1, z2)
264
+ return z
265
+
266
class Not(Node):
    """Negation node.

    NOTE(review): this file contains an earlier, behaviorally identical
    definition of `Not`; in Python the later definition (this one) is the
    one in effect. Consider removing the duplicate at the file level.
    """

    def __init__(self, child: Node) -> None:
        super().__init__()
        self.child: Node = child

    def __str__(self) -> str:
        rendered = "not ( " + self.child.__str__() + " )"
        return rendered

    def time_depth(self) -> int:
        # Same temporal depth as the negated subformula.
        return self.child.time_depth()

    def _boolean(self, x: Tensor) -> Tensor:
        inverted: Tensor = ~self.child._boolean(x)
        return inverted

    def _quantitative(self, x: Tensor, normalize: bool = False) -> Tensor:
        flipped: Tensor = -self.child._quantitative(x, normalize)
        return flipped
287
+
288
+
289
class And(Node):
    """Conjunction node.

    NOTE(review): this file contains an earlier, behaviorally identical
    definition of `And`; in Python the later definition (this one) is the
    one in effect. Consider removing the duplicate at the file level.
    """

    def __init__(self, left_child: Node, right_child: Node) -> None:
        super().__init__()
        self.left_child: Node = left_child
        self.right_child: Node = right_child

    def __str__(self) -> str:
        parts = ["( ", self.left_child.__str__(), " and ", self.right_child.__str__(), " )"]
        return "".join(parts)

    def time_depth(self) -> int:
        return max(self.left_child.time_depth(), self.right_child.time_depth())

    def _boolean(self, x: Tensor) -> Tensor:
        z1: Tensor = self.left_child._boolean(x)
        z2: Tensor = self.right_child._boolean(x)
        # Align the two traces on the shorter time span (time axis is dim 2).
        common = min(z1.size()[2], z2.size()[2])
        z1 = z1[:, :, :common]
        z2 = z2[:, :, :common]
        return torch.logical_and(z1, z2)

    def _quantitative(self, x: Tensor, normalize: bool = False) -> Tensor:
        z1: Tensor = self.left_child._quantitative(x, normalize)
        z2: Tensor = self.right_child._quantitative(x, normalize)
        common = min(z1.size()[2], z2.size()[2])
        z1 = z1[:, :, :common]
        z2 = z2[:, :, :common]
        # Pointwise minimum implements conjunction in the quantitative semantics.
        return torch.min(z1, z2)
326
+ return z
327
+
328
class Or(Node):
    """Disjunction of two STL subformulae."""

    def __init__(self, left_child: Node, right_child: Node) -> None:
        super().__init__()
        self.left_child: Node = left_child
        self.right_child: Node = right_child

    def __str__(self) -> str:
        return (
            "( "
            + self.left_child.__str__()
            + " or "
            + self.right_child.__str__()
            + " )"
        )

    def time_depth(self) -> int:
        # The horizon of a disjunction is set by the deeper operand.
        return max(self.left_child.time_depth(), self.right_child.time_depth())

    def _boolean(self, x: Tensor) -> Tensor:
        left = self.left_child._boolean(x)
        right = self.right_child._boolean(x)
        # Truncate both traces to the common time span (time axis is dim 2).
        horizon = min(left.size()[2], right.size()[2])
        return torch.logical_or(left[:, :, :horizon], right[:, :, :horizon])

    def _quantitative(self, x: Tensor, normalize: bool = False) -> Tensor:
        left = self.left_child._quantitative(x, normalize)
        right = self.right_child._quantitative(x, normalize)
        horizon = min(left.size()[2], right.size()[2])
        # Robustness of a disjunction is the pointwise maximum.
        return torch.max(left[:, :, :horizon], right[:, :, :horizon])
366
+
367
+
368
class Globally(Node):
    """Globally ("always") temporal operator node.

    Holds iff the child formula holds at every time step of the interval
    `[left_time_bound, right_time_bound]` (or at every future step when
    unbounded).

    Args:
        child: subformula the operator is applied to.
        unbound: if True the operator is fully unbounded (no interval printed).
        right_unbound: if True the interval is `[left_time_bound, inf)`.
        left_time_bound: inclusive lower bound of the interval.
        right_time_bound: inclusive upper bound of the interval (stored
            internally as an exclusive bound, hence the +1 below).
        adapt_unbound: when unbounded, compute a running minimum over suffixes
            (one value per time step) instead of a single global minimum.

    Raises:
        ValueError: if the interval is bounded and empty/inverted.
    """

    def __init__(
        self,
        child: Node,
        unbound: bool = False,
        right_unbound: bool = False,
        left_time_bound: int = 0,
        right_time_bound: int = 1,
        adapt_unbound: bool = True,
    ) -> None:
        super().__init__()
        self.child: Node = child
        self.unbound: bool = unbound
        self.right_unbound: bool = right_unbound
        self.left_time_bound: int = left_time_bound
        # Exclusive upper bound internally (mirrors Eventually/Until).
        self.right_time_bound: int = right_time_bound + 1
        self.adapt_unbound: bool = adapt_unbound

        # Fix: validate the temporal interval exactly as Eventually and Until
        # do; previously an inverted interval was silently accepted here.
        if (self.unbound is False) and (self.right_unbound is False) and \
                (self.right_time_bound <= self.left_time_bound):
            raise ValueError("Temporal thresholds are incorrect: right parameter is higher than left parameter")

    def __str__(self) -> str:
        # NOTE(review): this prints the internal (already +1'd) right bound,
        # matching the sibling operators' __str__ — confirm round-trip with the
        # formula parser before changing.
        s_left = "[" + str(self.left_time_bound) + ","
        s_right = str(self.right_time_bound) if not self.right_unbound else "inf"
        s0: str = s_left + s_right + "]" if not self.unbound else ""
        s: str = "always" + s0 + " ( " + self.child.__str__() + " )"
        return s

    def time_depth(self) -> int:
        """Number of time steps this operator needs beyond its child."""
        if self.unbound:
            return self.child.time_depth()
        elif self.right_unbound:
            return self.child.time_depth() + self.left_time_bound
        else:
            return self.child.time_depth() + self.right_time_bound - 1

    def _boolean(self, x: Tensor) -> Tensor:
        # Shift the trace so the interval starts at t=0 (time axis is dim 2).
        z1: Tensor = self.child._boolean(x[:, :, self.left_time_bound:])
        if self.unbound or self.right_unbound:
            if self.adapt_unbound:
                # Running minimum over suffixes: flip, cummin, flip back.
                z: Tensor
                _: Tensor
                z, _ = torch.cummin(torch.flip(z1, [2]), dim=2)
                z: Tensor = torch.flip(z, [2])
            else:
                z: Tensor
                _: Tensor
                z, _ = torch.min(z1, 2, keepdim=True)
        else:
            # Bounded case via duality: always phi == not eventually (not phi).
            z: Tensor = torch.ge(1.0 - eventually((~z1).double(), self.right_time_bound - self.left_time_bound), 0.5)
        return z

    def _quantitative(self, x: Tensor, normalize: bool = False) -> Tensor:
        z1: Tensor = self.child._quantitative(x[:, :, self.left_time_bound:], normalize)
        if self.unbound or self.right_unbound:
            if self.adapt_unbound:
                z: Tensor
                _: Tensor
                z, _ = torch.cummin(torch.flip(z1, [2]), dim=2)
                z: Tensor = torch.flip(z, [2])
            else:
                z: Tensor
                _: Tensor
                z, _ = torch.min(z1, 2, keepdim=True)
        else:
            # Quantitative duality: rob(always phi) = -rob(eventually -phi).
            z: Tensor = -eventually(-z1, self.right_time_bound - self.left_time_bound)
        return z
437
+
438
+
439
+
440
class Eventually(Node):
    """Eventually ("finally") temporal operator node.

    Holds iff the child formula holds at some time step of the interval
    `[left_time_bound, right_time_bound]` (or at some future step when
    unbounded).
    """

    def __init__(
        self,
        child: Node,
        unbound: bool = False,
        right_unbound: bool = False,
        left_time_bound: int = 0,
        right_time_bound: int = 1,
        adapt_unbound: bool = True,
    ) -> None:
        super().__init__()
        self.child: Node = child
        self.unbound: bool = unbound
        self.right_unbound: bool = right_unbound
        self.left_time_bound: int = left_time_bound
        # Exclusive upper bound internally, hence the +1.
        self.right_time_bound: int = right_time_bound + 1
        self.adapt_unbound: bool = adapt_unbound

        # Reject empty/inverted bounded intervals up front.
        if (self.unbound is False) and (self.right_unbound is False) \
                and (self.right_time_bound <= self.left_time_bound):
            raise ValueError("Temporal thresholds are incorrect: right parameter is higher than left parameter")

    def __str__(self) -> str:
        if self.unbound:
            interval = ""
        else:
            upper = "inf" if self.right_unbound else str(self.right_time_bound)
            interval = "[" + str(self.left_time_bound) + "," + upper + "]"
        return "eventually" + interval + " ( " + self.child.__str__() + " )"

    def time_depth(self) -> int:
        # Time steps needed beyond the child's own horizon.
        if self.unbound:
            return self.child.time_depth()
        if self.right_unbound:
            return self.child.time_depth() + self.left_time_bound
        return self.child.time_depth() + self.right_time_bound - 1

    def _boolean(self, x: Tensor) -> Tensor:
        # Shift the trace so the interval starts at t=0 (time axis is dim 2).
        child_vals: Tensor = self.child._boolean(x[:, :, self.left_time_bound:])
        if not (self.unbound or self.right_unbound):
            # Bounded case: sliding-window disjunction via the `eventually` helper.
            window = self.right_time_bound - self.left_time_bound
            return torch.ge(eventually(child_vals.double(), window), 0.5)
        if self.adapt_unbound:
            # Running maximum over suffixes: flip, cummax, flip back.
            out, _ = torch.cummax(torch.flip(child_vals, [2]), dim=2)
            return torch.flip(out, [2])
        out, _ = torch.max(child_vals, 2, keepdim=True)
        return out

    def _quantitative(self, x: Tensor, normalize: bool = False) -> Tensor:
        child_vals: Tensor = self.child._quantitative(x[:, :, self.left_time_bound:], normalize)
        if not (self.unbound or self.right_unbound):
            window = self.right_time_bound - self.left_time_bound
            return eventually(child_vals, window)
        if self.adapt_unbound:
            out, _ = torch.cummax(torch.flip(child_vals, [2]), dim=2)
            return torch.flip(out, [2])
        out, _ = torch.max(child_vals, 2, keepdim=True)
        return out
512
+
513
class Until(Node):
    """Until temporal operator node.

    `left until[a,b] right` holds when `right` eventually holds within the
    interval and `left` holds at every step before that.

    The unbounded case is computed directly with triangular-matrix algebra
    over the time axis; the bounded cases are rewritten into a combination of
    And/Globally/Eventually/(unbounded) Until nodes and delegated to them.
    """

    def __init__(
        self,
        left_child: Node,
        right_child: Node,
        unbound: bool = False,
        right_unbound: bool = False,
        left_time_bound: int = 0,
        right_time_bound: int = 1,
    ) -> None:
        super().__init__()
        self.left_child: Node = left_child
        self.right_child: Node = right_child
        self.unbound: bool = unbound
        self.right_unbound: bool = right_unbound
        self.left_time_bound: int = left_time_bound
        # Exclusive upper bound internally, hence the +1.
        self.right_time_bound: int = right_time_bound + 1

        # Reject empty/inverted bounded intervals up front.
        if (self.unbound is False) and (self.right_unbound is False) and \
                (self.right_time_bound <= self.left_time_bound):
            raise ValueError("Temporal thresholds are incorrect: right parameter is higher than left parameter")

    def __str__(self) -> str:
        s_left = "[" + str(self.left_time_bound) + ","
        s_right = str(self.right_time_bound) if not self.right_unbound else "inf"
        s0: str = s_left + s_right + "]" if not self.unbound else ""
        s: str = "( " + self.left_child.__str__() + " until" + s0 + " " + self.right_child.__str__() + " )"
        return s

    def time_depth(self) -> int:
        # Horizon combines both children's depths plus the interval extent.
        sum_children_depth: int = self.left_child.time_depth() + self.right_child.time_depth()
        if self.unbound:
            return sum_children_depth
        elif self.right_unbound:
            return sum_children_depth + self.left_time_bound
        else:
            return sum_children_depth + self.right_time_bound - 1

    def _boolean(self, x: Tensor) -> Tensor:
        """Boolean satisfaction trace; x is sliced on dim 2 (time) downstream."""
        if self.unbound:
            z1: Tensor = self.left_child._boolean(x)
            z2: Tensor = self.right_child._boolean(x)
            # Align both traces on the shorter common time span.
            size: int = min(z1.size()[2], z2.size()[2])
            z1: Tensor = z1[:, :, :size]
            z2: Tensor = z2[:, :, :size]
            # Build a (start-time x end-time) matrix per signal: row t holds the
            # child's values on the suffix starting at t (tril masks the past).
            z1_rep = torch.repeat_interleave(z1.unsqueeze(2), z1.unsqueeze(2).shape[-1], 2)
            z1_tril = torch.tril(z1_rep.transpose(2, 3), diagonal=-1)
            z1_triu = torch.triu(z1_rep)
            # Cumulative min along each row = "left holds continuously up to t'".
            z1_def = torch.cummin(z1_tril + z1_triu, dim=3)[0]

            z2_rep = torch.repeat_interleave(z2.unsqueeze(2), z2.unsqueeze(2).shape[-1], 2)
            z2_tril = torch.tril(z2_rep.transpose(2, 3), diagonal=-1)
            z2_triu = torch.triu(z2_rep)
            z2_def = z2_tril + z2_triu
            # max over t' of min(left-prefix, right-at-t') gives the until value.
            z: Tensor = torch.max(torch.min(torch.cat([z1_def.unsqueeze(-1), z2_def.unsqueeze(-1)], dim=-1), dim=-1)[0],
                                  dim=-1)[0]
        elif self.right_unbound:
            # Rewrite: G[0,a] left  and  F[a,inf) right  and  F[a,inf)(left U right).
            timed_until: Node = And(Globally(self.left_child, left_time_bound=0, right_time_bound=self.left_time_bound),
                                    And(Eventually(self.right_child, right_unbound=True,
                                                   left_time_bound=self.left_time_bound),
                                        Eventually(Until(self.left_child, self.right_child, unbound=True),
                                                   left_time_bound=self.left_time_bound, right_unbound=True)))
            z: Tensor = timed_until._boolean(x)
        else:
            # Rewrite: G[0,a] left  and  F[a,b] right  and  F[a,inf)(left U right).
            timed_until: Node = And(Globally(self.left_child, left_time_bound=0, right_time_bound=self.left_time_bound),
                                    And(Eventually(self.right_child, left_time_bound=self.left_time_bound,
                                                   right_time_bound=self.right_time_bound - 1),
                                        Eventually(Until(self.left_child, self.right_child, unbound=True),
                                                   left_time_bound=self.left_time_bound, right_unbound=True)))
            z: Tensor = timed_until._boolean(x)
        return z

    def _quantitative(self, x: Tensor, normalize: bool = False) -> Tensor:
        """Robustness trace; mirrors _boolean with min/max over real values."""
        if self.unbound:
            z1: Tensor = self.left_child._quantitative(x, normalize)
            z2: Tensor = self.right_child._quantitative(x, normalize)
            size: int = min(z1.size()[2], z2.size()[2])
            z1: Tensor = z1[:, :, :size]
            z2: Tensor = z2[:, :, :size]

            # The matrix formulation used in _boolean is kept below for
            # reference; the per-suffix loop is equivalent.
            # z1_rep = torch.repeat_interleave(z1.unsqueeze(2), z1.unsqueeze(2).shape[-1], 2)
            # z1_tril = torch.tril(z1_rep.transpose(2, 3), diagonal=-1)
            # z1_triu = torch.triu(z1_rep)
            # z1_def = torch.cummin(z1_tril + z1_triu, dim=3)[0]

            # z2_rep = torch.repeat_interleave(z2.unsqueeze(2), z2.unsqueeze(2).shape[-1], 2)
            # z2_tril = torch.tril(z2_rep.transpose(2, 3), diagonal=-1)
            # z2_triu = torch.triu(z2_rep)
            # z2_def = z2_tril + z2_triu
            # z: Tensor = torch.max(torch.min(torch.cat([z1_def.unsqueeze(-1), z2_def.unsqueeze(-1)], dim=-1), dim=-1)[0],
            #                       dim=-1)[0]
            # For each start time t: max over t' >= t of
            # min(running-min of left on [t, t'], right at t').
            z: Tensor = torch.cat([torch.max(torch.min(
                torch.cat([torch.cummin(z1[:, :, t:].unsqueeze(-1), dim=2)[0], z2[:, :, t:].unsqueeze(-1)], dim=-1),
                dim=-1)[0], dim=2, keepdim=True)[0] for t in range(size)], dim=2)
        elif self.right_unbound:
            # Same rewrite as in _boolean, evaluated quantitatively.
            timed_until: Node = And(Globally(self.left_child, left_time_bound=0, right_time_bound=self.left_time_bound),
                                    And(Eventually(self.right_child, right_unbound=True,
                                                   left_time_bound=self.left_time_bound),
                                        Eventually(Until(self.left_child, self.right_child, unbound=True),
                                                   left_time_bound=self.left_time_bound, right_unbound=True)))
            z: Tensor = timed_until._quantitative(x, normalize=normalize)
        else:
            timed_until: Node = And(Globally(self.left_child, left_time_bound=0, right_time_bound=self.left_time_bound),
                                    And(Eventually(self.right_child, left_time_bound=self.left_time_bound,
                                                   right_time_bound=self.right_time_bound-1),
                                        Eventually(Until(self.left_child, self.right_child, unbound=True),
                                                   left_time_bound=self.left_time_bound, right_unbound=True)))
            z: Tensor = timed_until._quantitative(x, normalize=normalize)
        return z
626
 
627
  # from anchor_set_generation import anchorGeneration
628
 
 
737
  def load_json(path: str) -> Union[Dict, List]:
738
  """
739
  Load a JSON file from the given path.
 
740
  Args:
741
  path (str): The path to the JSON file to be loaded.
742
+
743
  Returns:
744
  Union[Dict, List]: The parsed content of the JSON file, which could be a dictionary or a list.
745
  """
 
800
  def sample(self, nvars):
801
  """
802
  Samples a random formula with distribution defined in class instance parameters
 
803
  Parameters
804
  ----------
805
  nvars : number of variables of input signals
806
  how many variables the formula is expected to consider.
 
807
  Returns
808
  -------
809
  TYPE
810
  A random formula.
 
811
  """
812
  return self._sample_internal_node(nvars)
 
813
  def bag_sample(self, bag_size, nvars):
814
  """
815
  Samples a bag of bag_size formulae
 
816
  Parameters
817
  ----------
818
  bag_size : INT
819
  number of formulae.
820
  nvars : INT
821
  number of vars in formulae.
 
822
  Returns
823
  -------
824
  a list of formulae.
 
825
  """
826
  formulae = []
827
  for _ in range(bag_size):
 
838
  while True:
839
  if nodetype == "not":
840
  n = self._sample_node(nvars)
841
+ node = Not(n)
842
  elif nodetype == "and":
843
  n1 = self._sample_node(nvars)
844
  n2 = self._sample_node(nvars)
845
+ node = And(n1, n2)
846
  elif nodetype == "or":
847
  n1 = self._sample_node(nvars)
848
  n2 = self._sample_node(nvars)
849
+ node = Or(n1, n2)
850
  elif nodetype == "always":
851
  n = self._sample_node(nvars)
852
  unbound, right_unbound, left_time_bound, right_time_bound = self._get_temporal_parameters()
853
+ node = Globally(
854
  n, unbound, right_unbound, left_time_bound, right_time_bound, self.adaptive_unbound_temporal_ops
855
  )
856
  elif nodetype == "eventually":
857
  n = self._sample_node(nvars)
858
  unbound, right_unbound, left_time_bound, right_time_bound = self._get_temporal_parameters()
859
+ node = Eventually(
860
  n, unbound, right_unbound, left_time_bound, right_time_bound, self.adaptive_unbound_temporal_ops
861
  )
862
  elif nodetype == "until":
863
  n1 = self._sample_node(nvars)
864
  n2 = self._sample_node(nvars)
865
  unbound, right_unbound, left_time_bound, right_time_bound = self._get_temporal_parameters()
866
+ node = Until(
867
  n1, n2, unbound, right_unbound, left_time_bound, right_time_bound
868
  )
869
 
 
874
  if rnd.rand() < self.leaf_prob:
875
  # sample a leaf
876
  var, thr, lte = self._get_atom(nvars)
877
+ return Atom(var, thr, lte)
878
  else:
879
  return self._sample_internal_node(nvars)
880
 
 
905
  self, mu0=0.0, sigma0=1.0, mu1=0.0, sigma1=1.0, q=0.1, q0=0.5, device="cpu"
906
  ):
907
  """
 
908
  Parameters
909
  ----------
910
  mu0 : mean of normal distribution of initial state, optional
 
921
  probability of initial sign of derivative. The default is 0.5.
922
  device : 'cpu' or 'cuda', optional
923
  device on which to run the algorithm. The default is 'cpu'.
 
924
  Returns
925
  -------
926
  None.
 
927
  """
928
  self.mu0 = mu0
929
  self.sigma0 = sigma0
 
937
  """
938
  Samples a set of trajectories from the basic measure space, with parameters
939
  passed to the sampler
 
940
  Parameters
941
  ----------
942
  points : INT, optional
 
945
  number of trajectories. The default is 100000.
946
  varn : INT, optional
947
  number of variables per trajectory. The default is 2.
 
 
948
  Returns
949
  -------
950
  signal : samples x varn x points double pytorch tensor
951
  The sampled signals.
 
952
  """
953
  if self.device == "cuda" and not torch.cuda.is_available():
954
  raise RuntimeError("GPU card or CUDA library not available!")
 
1083
  return kernel_matrix.cpu(), rhos1, selfk1, len1
1084
  else:
1085
  return kernel_matrix.cpu()
 
 
1086
  n = self.samples
1087
  p = self.points
1088
  k = len(phis)
 
1144
  kernel_matrix = kernel_matrix / normalize
1145
  return kernel_matrix
1146
 
1147
+ @staticmethod
1148
+ def _normalize(kernel_matrix, selfk1, selfk2):
1149
+ normalize = torch.sqrt(torch.matmul(selfk1, torch.transpose(selfk2, 0, 1)))
1150
+ kernel_matrix = kernel_matrix / normalize
1151
+ return kernel_matrix
1152
+
1153
  def _exponentiate(self, kernel_matrix, selfk1, selfk2, sigma2=None):
1154
  if sigma2 is None:
1155
  sigma2 = self.sigma2
 
1280
  leaf_prob: float = 0.4, # complexity of the generated formula
1281
  cosine_similarity_threshold: float = 0.8 # if two formulae cosine similarity exceeds 0.9, then discard one of the two
1282
  ) -> str:
1283
+
1284
  # initialize STL formula generator
1285
  sampler = StlGenerator(leaf_prob)
1286
+
1287
  # effective anchor set generation
1288
  if diff_init:
1289
+
1290
  # initialize the anchor set with a randomly sampled formula
1291
  diff_anchor_set = [sampler.sample(nvars=n_vars)]
1292
 
 
1302
  while len(diff_anchor_set) < embed_dim:
1303
  # sample the 'remaining' formulae to reach the desired number of `embed_dim` formulae:
1304
  candidate_anchors = sampler.bag_sample(embed_dim - len(diff_anchor_set), nvars = n_vars)
1305
+
1306
  # compute robustness of candidate anchor formulae on the same signals as previous anchor set
1307
  candidate_robs = torch.cat([phi.quantitative(signals, normalize=True).unsqueeze(0) for phi in candidate_anchors], 0)
1308
+
1309
  # compute cosine similarity between current anchor set and candidate new formulae
1310
  cos_simil = torch.tril(normalize(candidate_robs) @ normalize(anchor_rob_vectors).t(), diagonal=-1)
1311
 
1312
  # check which formulae are similar (i.e. greater cosine similarity then threshold) w.r.t. current anchors
1313
  # NOTA: chiedere a gaia se cosine similarities negative vanno ammazzate con un valore assoluto o meno!
1314
  similar_idx = [torch.where(cos_simil[r, :] > cosine_similarity_threshold)[0].tolist() for r in range(cos_simil.shape[0])]
1315
+
1316
  # keep only those who are semantically distant
1317
  keep_idx = list(set(np.arange(len(candidate_anchors)).tolist()).difference(set([i for sublist in similar_idx for i in sublist])))
1318
+
1319
  diff_anchor_set += [copy.deepcopy(candidate_anchors[i]) for i in keep_idx]
1320
+
1321
  # Convert keep_idx to a tensor on the same device as candidate_robs
1322
  keep_idx_tensor = torch.tensor(keep_idx, device=candidate_robs.device)
1323
+
1324
  # Use index_select to pick the relevant rows
1325
  selected_robs = torch.index_select(candidate_robs, 0, keep_idx_tensor)
1326
+
1327
  # Concatenate on the same device
1328
  anchor_rob_vectors = torch.cat([anchor_rob_vectors, copy.deepcopy(selected_robs)], dim=0)
1329
 
1330
  anchor_set = diff_anchor_set[:embed_dim]
1331
+
1332
  else:
1333
+ anchor_set = sampler.bag_sample(bag_size=embed_dim, nvars=n_vars)
1334
 
1335
  filename = f'anchor_set_no_diff_{embed_dim}_dim'
1336
  dump_pickle(filename, anchor_set)
 
1338
 
1339
  ####
1340
 
 
1341
  """
1342
  A custom tokenizer class that extends `PreTrainedTokenizer` to handle a specific vocabulary and tokenization process.
1343
+ This tokenizer can load a vocabulary from a JSON file, tokenize text, convert tokens to IDs,
 
1344
  and handle padding and special tokens.
1345
  """
1346
 
1347
+ def __init__(self, vocab_path: str, unk_token: str = "unk", pad_token: str = "pad",
1348
  bos_token: str = "/s", eos_token: str = "s", model_max_length = 512, *args, **kwargs):
1349
  """
1350
  Initializes the STLTokenizer with a given vocabulary and special tokens.
 
1351
  Args:
1352
  vocab_path (str): The path to the JSON file containing the vocabulary.
1353
  unk_token (str, optional): The token used for unknown words. Defaults to "unk".
 
1362
  self.eos_token = eos_token
1363
  self.model_max_length = model_max_length
1364
  self.id_to_token = {v: k for k, v in self.vocab.items()} # Reverse mapping
1365
+ super().__init__(unk_token=unk_token, pad_token=pad_token, bos_token=bos_token, eos_token=eos_token,
1366
  model_max_length=model_max_length, *args, **kwargs)
1367
 
1368
  @property
1369
  def vocab_size(self) -> int:
1370
  """
1371
  Returns the size of the vocabulary.
 
1372
  Returns:
1373
  int: The number of tokens in the vocabulary.
1374
  """
 
1377
  def prepad_sequence(self, sequence, space_token = ' ', new_space_token = '@', undo = False):
1378
  """
1379
  Replaces spaces in the input sequence with a specified token.
 
1380
  Args:
1381
  sequence (str): The input sequence.
1382
  undo (bool): If True, replace the padding token with spaces. Defaults to False, which pads the spaces.
 
1383
  Returns:
1384
  str: The preprocessed sequence with spaces or padding tokens replaced.
1385
  """
 
1391
  def add_bos_eos(self, sequence: str) -> str:
1392
  """
1393
  Aggiunge i token BOS all'inizio e EOS alla fine della sequenza.
 
1394
  Args:
1395
  sequence (str): La sequenza di input.
 
1396
  Returns:
1397
  str: La sequenza con i token BOS ed EOS.
1398
  """
 
1401
  def tokenize(self, text: str) -> List[str]:
1402
  """
1403
  Tokenizes the input text into a list of tokens.
1404
+ The method preprocesses the input text by replacing spaces with padding tokens and then tries to
 
1405
  find the longest possible match for each substring in the vocabulary.
 
1406
  Args:
1407
  text (str): The input text to be tokenized.
 
1408
  Returns:
1409
  List[str]: A list of tokens representing the tokenized text.
1410
  """
1411
  text = self.add_bos_eos(text)
1412
  text = self.prepad_sequence(text)
 
1413
  tokens = []
1414
  i = 0
1415
  while i < len(text):
 
1430
  def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
1431
  """
1432
  Converts a list of tokens into a list of token IDs.
 
1433
  Args:
1434
  tokens (List[str]): A list of tokens to be converted into IDs.
 
1435
  Returns:
1436
  List[int]: A list of corresponding token IDs.
1437
  """
 
1440
  def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
1441
  """
1442
  Converts a list of token IDs into a list of tokens.
 
1443
  Args:
1444
  ids (List[int]): A list of token IDs to be converted into tokens.
 
1445
  Returns:
1446
  List[str]: A list of corresponding tokens.
1447
  """
 
1450
  def encode(self, sequence: str) -> List[int]:
1451
  """
1452
  Encodes a string sequence into a list of token IDs.
1453
+
1454
+ This method tokenizes the input sequence using the `tokenize` method,
1455
+ and then converts the resulting tokens into their corresponding token IDs
1456
  using the `convert_tokens_to_ids` method.
1457
+
1458
  Args:
1459
  sequence (str): The input sequence (text) to be encoded.
1460
+
1461
  Returns:
1462
  List[int]: A list of token IDs corresponding to the input sequence.
1463
  """
 
1466
 
1467
  def postpad_sequence(self, sequence, pad_token_id):
1468
  """
1469
+ Fills the sequence up to max_length padding elements
1470
+ """
1471
  num_extra_elements = self.model_max_length - len(sequence) -1
1472
  if num_extra_elements > 0:
1473
  sequence.extend([pad_token_id] * num_extra_elements)
 
1476
  def decode(self, token_ids: List[int]) -> str:
1477
  """
1478
  Decodes a list of token IDs into a string of text.
1479
+ The method converts the IDs to tokens and joins them to form a string.
 
1480
  It also restores the original spaces or padding tokens if `undo` is True.
 
1481
  Args:
1482
  token_ids (List[int]): A list of token IDs to be decoded.
1483
  skip_special_tokens (bool, optional): Whether to skip special tokens during decoding. Defaults to False.
 
1484
  Returns:
1485
  str: The decoded string.
1486
  """
 
1490
 
1491
  def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
1492
  """
1493
+ Saves the tokenizer's vocabulary to a file.
1494
+ Useful only when the vocabulary has to be retrieved and is not given
1495
  (thus this is not the case: here to further improvements with sentencepiece).
1496
+ This method saves the vocabulary to a JSON file in the specified directory.
 
 
1497
  Args:
1498
  save_directory (str): The directory where the vocabulary file will be saved.
1499
  filename_prefix (Optional[str]): An optional prefix for the filename.
 
1500
  Returns:
1501
  Tuple[str]: A tuple containing the path to the saved vocabulary file.
1502
  """
 
1508
  def get_vocab(self) -> dict:
1509
  """
1510
  Retrieves the vocabulary used by the tokenizer.
 
1511
  Returns:
1512
  dict: The vocabulary as a dictionary.
1513
  """
 
1536
  out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
1537
  out.detach_()
1538
  return out
 
1539
  @torch.no_grad()
1540
  def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0) -> torch.Tensor:
1541
  """`input_ids_shape` is expected to be [bsz x seqlen]."""
 
1548
  class STLAttention(nn.Module):
1549
  """ Multi-Head Attention as depicted from 'Attention is all you need' """
1550
 
1551
+ def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0,
1552
  is_decoder: bool = False, bias: bool = False, is_causal: bool = False):
1553
+
1554
  super().__init__()
1555
  self.embed_dim = embed_dim # overall embedding dimension -> to be divided between multiple heads
1556
  self.num_heads = num_heads
1557
  self.dropout = dropout
1558
  self.head_dim = embed_dim // num_heads
1559
+ assert (self.head_dim * num_heads) == self.embed_dim
1560
  self.scaling = self.head_dim ** -0.5 # used to normalize values when projected using `W_` matrices
1561
  self.is_decoder = is_decoder
1562
  self.is_causal = is_causal
1563
 
1564
+ # 'roleplaying' matrices
1565
+ self.W_k = nn.Linear(embed_dim, embed_dim, bias = bias)
1566
  self.W_q = nn.Linear(embed_dim, embed_dim, bias = bias)
1567
  self.W_v = nn.Linear(embed_dim, embed_dim, bias = bias)
1568
 
1569
  # to project the heads' outputs into a single vector
1570
+ self.W_o = nn.Linear(embed_dim, embed_dim, bias = bias)
1571
 
1572
 
1573
  def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
1574
  return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
1575
+
1576
+
1577
+ def forward(self,
1578
  hidden_states: torch.Tensor, # previous values, passed to the multi-head attn layer
1579
  key_value_states: Optional[torch.Tensor] = None, # different key, value items (used in cross-attn)
1580
+ past_key_value: Optional[Tuple[torch.Tensor]] = None, # stores the key and values of previous steps
1581
  attention_mask: Optional[torch.Tensor] = None, # masks non-allowed items (padded or future ones)
1582
  layer_head_mask: Optional[torch.Tensor] = None, # used to de-activate specific attn heads
1583
  output_attentions: bool = False # flag to control the output of the attn values,
 
1584
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
1585
 
1586
  is_cross_attention = key_value_states is not None # cross-attn if key_value_states is not None
 
1604
  else:
1605
  key = self._shape(self.W_k(hidden_states), -1, batch_size)
1606
  value = self._shape(self.W_v(hidden_states), -1, batch_size)
 
1607
  if self.is_decoder:
1608
  past_key_value = (key, value)
1609
+
1610
  proj_shape = (batch_size * self.num_heads, -1, self.head_dim)
1611
 
1612
+ query = self._shape(query, tgt_len, batch_size).view(*proj_shape)
1613
  key = key.reshape(*proj_shape)
1614
  value = value.reshape(*proj_shape)
1615
 
1616
  src_len = key.size(1)
1617
 
1618
+
1619
  ######################################################################################################
1620
 
1621
  # 'traditional' attention computation
 
1627
  if attention_mask is not None:
1628
  attn_weights = attn_weights.view(batch_size, self.num_heads, tgt_len, src_len) + attention_mask
1629
  attn_weights = attn_weights.view(batch_size * self.num_heads, tgt_len, src_len)
1630
+
1631
  # Normalize values on the `key` axis (dim=-1)
1632
  attn_weights = F.softmax(attn_weights, dim=-1)
1633
 
 
1646
  attn_output = attn_output.transpose(1, 2)
1647
 
1648
  attn_output = attn_output.reshape(batch_size, tgt_len, self.embed_dim)
1649
+ attn_output = self.W_o(attn_output)
1650
 
1651
  return attn_output, None, past_key_value
1652
 
1653
  ####
1654
 
1655
  class STLEncoder():
1656
+ def __init__(self,
1657
  embed_dim: int,
1658
  anchor_filename: Optional[str] = None,
1659
  n_vars: int = 3):
1660
+
1661
  self.n_vars = n_vars # passaglielo in input
1662
  self.embed_dim = embed_dim
1663
  self.anchorset_filename = anchor_filename
 
1665
  self.mu = BaseMeasure(device=self.device)
1666
  self.kernel = StlKernel(self.mu, varn=self.n_vars)
1667
 
1668
+ if anchor_filename is None:
1669
+ anchor_filename = anchorGeneration(diff_init = True, embed_dim = self.embed_dim, n_vars = self.n_vars)
1670
  anchor_filename+='.pickle'
1671
 
1672
  # TO DO: check on the dimensions of the anchor set and the `embed_dim` and `n_vars` values
 
1680
  return self.kernel.compute_bag_bag(formula, self.anchor_set)
1681
 
1682
  class STLModel(PreTrainedModel):
1683
+ config_class = STLConfig
1684
+ base_model_prefix = "model"
1685
  supports_gradient_checkpointing = True
1686
 
1687
  # initializes the weights of `nn.Linear`, `nn.Embedding` and `STLSinusoidalPositionalEmbedding`
 
1710
  return dummy_inputs
1711
 
1712
  class STLDecoderBlock(nn.Module):
1713
+
1714
+ def __init__(self, embed_dim: int,
1715
  num_decoder_attention_heads: int,
1716
  num_decoder_ffn_dim: int,
1717
  dropout: float = 0.0,
1718
  attention_dropout: float = 0.0,
1719
  activation_dropout: float = 0.0,
1720
  ):
1721
+
1722
  super().__init__()
1723
+
1724
  self.embed_dim = embed_dim
1725
 
1726
+ # first block
1727
  self.self_attn = STLAttention(
1728
+ embed_dim=self.embed_dim,
1729
  num_heads=num_decoder_attention_heads,
1730
  dropout=dropout,
1731
  is_decoder=True, # not used, debugging purposes
 
1782
  Whether or not to return the attentions tensors of all attention layers. See `attentions` under
1783
  returned tensors for more detail.
1784
  """
1785
+
1786
  ###################################################################
1787
+
1788
+ # BLOCK 1: processing what has been previously generated
1789
 
1790
  # previous state is stored into an auxiliary variable `residual`
1791
  residual = hidden_states
1792
 
1793
+ # tries to exploit previous K, V values if there are any
1794
  # (practically picks up to the first 2 values stored in `past_key_value` vector)
1795
  self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
1796
 
1797
  # masked MHSA on the already generated sequence
1798
+ # invokes `forward` method to transform the original vector accordingly
1799
  hidden_states, self_attn_weights, present_key_value = self.self_attn.forward(
1800
  hidden_states=hidden_states, # Q
1801
  past_key_value=self_attn_past_key_value, # K, V
1802
  attention_mask=attention_mask, # passed as input of the decoder layer
1803
+ layer_head_mask=layer_head_mask, # to deactivate certain attn layers
1804
+ output_attentions=output_attentions,
1805
  )
1806
  hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
1807
 
 
1816
  # BLOCK 2: cross-attn between already generated input and previous information (from the encoder)
1817
 
1818
  # initialize K, Q, attn_weights for this new attn operation
1819
+ cross_attn_present_key_value = None
1820
  cross_attn_weights = None
1821
 
1822
  # the important condition is that the encoder carries some information
 
1894
  attention_dropout = config.attention_dropout
1895
  activation_dropout = config.activation_dropout
1896
  decoder_layerdrop = config.decoder_layerdrop
1897
+
1898
  self.dropout = dropout
1899
  self.layerdrop = decoder_layerdrop
1900
  self.padding_idx = pad_token_id
 
1903
 
1904
  # Initialize the input embedding (if not passed already)
1905
  self.embed_tokens = nn.Embedding(decoder_vocab_size, embed_dim, self.padding_idx)
1906
+
1907
  # Initialize positional embedding also
1908
  self.embed_positions = STLSinusoidalPositionalEmbedding(
1909
  max_position_embeddings, embed_dim, self.padding_idx
1910
  )
1911
+
1912
  # Initialize decoder layers (of a prespecified number)
1913
+ self.layers = nn.ModuleList([STLDecoderBlock(embed_dim, num_decoder_attention_heads,
1914
+ num_decoder_ffn_dim, dropout,
1915
+ attention_dropout, activation_dropout)
1916
  for _ in range(num_decoder_layers)])
1917
 
1918
  self.gradient_checkpointing = False
 
1934
  return_dict: Optional[bool] = None,
1935
  **kwargs,
1936
  ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
1937
+
1938
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1939
  output_hidden_states = (
1940
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
 
2057
  cross_attentions=all_cross_attentions,
2058
  )
2059
 
2060
+ ####
2061
 
2062
  class STLForCausalLM(STLModel, GenerationMixin):
2063
  _tied_weights_keys = ["lm_head.weight"]
 
2066
  config = copy.deepcopy(config)
2067
  config.is_decoder = True
2068
  config.is_encoder_decoder = False
2069
+
2070
  super().__init__(config)
2071
  self.model = STLDecoder(config)
2072
 
 
2163
  tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
2164
  )
2165
  return reordered_past
2166
+
2167
+
2168
+