alverciito commited on
Commit
dbd79bd
·
1 Parent(s): 4c7684b

upload safetensors and refactor research files

Browse files
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full list.
Files changed (50) hide show
  1. config.json +16 -0
  2. configurations.py +0 -0
  3. model.py +176 -0
  4. model.safetensors +3 -0
  5. requirements.txt +1 -0
  6. bench.py → research_files/bench.py +0 -0
  7. research_files/benchmark/results/binseg_bert-base-multilingual-cased.json +0 -0
  8. {benchmark → research_files/benchmark}/results/binseg_paraphrase-multilingual-MiniLM-L12-v2.json +0 -0
  9. {benchmark → research_files/benchmark}/results/binseg_sentence_similarity_spanish_es.json +0 -0
  10. research_files/benchmark/results/csim_bert-base-multilingual-cased.json +0 -0
  11. {benchmark → research_files/benchmark}/results/csim_paraphrase-multilingual-MiniLM-L12-v2.json +0 -0
  12. {benchmark → research_files/benchmark}/results/csim_sentence_similarity_spanish_es.json +0 -0
  13. research_files/benchmark/results/pelt_LaBSE.json +0 -0
  14. {benchmark → research_files/benchmark}/results/pelt_bert-base-multilingual-cased.json +0 -0
  15. {benchmark → research_files/benchmark}/results/pelt_paraphrase-multilingual-MiniLM-L12-v2.json +0 -0
  16. {benchmark → research_files/benchmark}/results/pelt_sentence_similarity_spanish_es.json +0 -0
  17. {benchmark → research_files/benchmark}/results/proposed_method.json +0 -0
  18. {benchmark → research_files/benchmark}/results/textile_baseline.json +0 -0
  19. {benchmark → research_files/benchmark}/segmentation_benchmark/__init__.py +0 -0
  20. {benchmark → research_files/benchmark}/segmentation_benchmark/heuristic.py +0 -0
  21. {benchmark → research_files/benchmark}/segmentation_benchmark/load_dataset.py +0 -0
  22. {benchmark → research_files/benchmark}/segmentation_benchmark/metrics.py +0 -0
  23. {benchmark → research_files/benchmark}/segmentation_benchmark/proposed.py +2 -2
  24. {benchmark → research_files/benchmark}/segmentation_benchmark/transformers.py +1 -1
  25. {benchmark → research_files/benchmark}/thresholding_benchmark/benchmark_result_A001_1.json +0 -0
  26. {benchmark → research_files/benchmark}/thresholding_benchmark/benchmark_result_A001_2.json +0 -0
  27. {benchmark → research_files/benchmark}/thresholding_benchmark/benchmark_result_A001_3.json +0 -0
  28. {benchmark → research_files/benchmark}/thresholding_benchmark/benchmark_result_A001_4.json +0 -0
  29. {benchmark → research_files/benchmark}/thresholding_benchmark/benchmark_result_A001_5.json +0 -0
  30. {benchmark → research_files/benchmark}/thresholding_benchmark/benchmark_threshold.py +2 -2
  31. {benchmark → research_files/benchmark}/thresholding_benchmark/print_results.py +0 -0
  32. {benchmark → research_files/benchmark}/wikipedia-es-A002/data-00000-of-00001.arrow +0 -0
  33. {benchmark → research_files/benchmark}/wikipedia-es-A002/dataset_info.json +0 -0
  34. {benchmark → research_files/benchmark}/wikipedia-es-A002/state.json +0 -0
  35. {inference → research_files/inference}/__init__.py +0 -0
  36. {inference → research_files/inference}/config.py +181 -181
  37. {inference → research_files/inference}/load.py +0 -0
  38. {inference → research_files/inference}/model_state.pt +0 -0
  39. {inference → research_files/inference}/pipeline.py +1 -1
  40. {inference → research_files/inference}/tokenizer_32768.json +0 -0
  41. research_files/torch_to_hf.py +27 -0
  42. {train → research_files/train}/config.py +2 -2
  43. {train → research_files/train}/train_logs/config.json +0 -0
  44. {train → research_files/train}/train_logs/logfile.log +0 -0
  45. {train → research_files/train}/train_logs/tensorboard_logs.zip +0 -0
  46. {train → research_files/train}/train_model.py +3 -3
  47. special_tokens_map.json +7 -0
  48. src/dataset/__init__.py +13 -13
  49. src/dataset/config.py +29 -29
  50. src/dataset/dataset.py +199 -199
config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CoseNetTransformer"
4
+ ],
5
+ "dropout": 0.0,
6
+ "emb_dim": 256,
7
+ "model_type": "sentence_transformer",
8
+ "seq_len": ...,
9
+ "torch_dtype": "float32",
10
+ "transformers_version": "4.57.3",
11
+ "vocab_size": 32768,
12
+ "auto_map": {
13
+ "AutoConfig": "configurations.SentenceCoseNetConfig",
14
+ "AutoModel": "model.SentenceCoseNet"
15
+ }
16
+ }
configurations.py ADDED
File without changes
model.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
#                                                           #
#      This file was created by: Alberto Palomo Alonso      #
#    Universidad de Alcalá - Escuela Politécnica Superior   #
#                                                           #
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
# Import statements:
import torch
from src.model.config import ModelConfig
from src.model.cosenet import CosineDistanceLayer, CoSeNet
from src.model.transformers import EncoderBlock, PositionalEncoding, MaskedMeanPooling


class CoseNetTransformer(torch.nn.Module):
    """
    Segmentation network combining Transformer encoders with CoSeNet.

    This model integrates token embeddings and positional encodings with
    a stack of Transformer encoder blocks to produce contextualized
    representations. These representations are then processed by a
    CoSeNet module to perform structured segmentation, followed by a
    cosine-based distance computation.

    The final output is a pair-wise distance matrix suitable for
    segmentation or boundary detection tasks.
    """

    def __init__(self, model_config: ModelConfig, **kwargs):
        """
        Initialize the segmentation network.

        The network is composed of an embedding layer, positional encoding,
        multiple Transformer encoder blocks, a CoSeNet segmentation module,
        and a cosine distance layer.

        Args:
            model_config (ModelConfig): Configuration object containing all
                hyperparameters required to build the model, including
                vocabulary size, model dimensionality, transformer settings,
                and CoSeNet parameters.
            **kwargs: Additional keyword arguments forwarded to
                `torch.nn.Module`.
        """
        super().__init__(**kwargs)
        # True when mask convention is "1 = valid token"; used to decide
        # whether masks must be inverted before pooling / CoSeNet.
        self.valid_padding = model_config.valid_padding

        # Build layers:
        self.embedding = torch.nn.Embedding(
            model_config.vocab_size,
            model_config.model_dim
        )
        self.positional_encoding = PositionalEncoding(
            emb_dim=model_config.model_dim,
            max_len=model_config.max_tokens
        )
        self.cosenet = CoSeNet(
            trainable=model_config.cosenet.trainable,
            init_scale=model_config.cosenet.init_scale
        )
        self.distance_layer = CosineDistanceLayer()
        self.pooling = MaskedMeanPooling(valid_pad=model_config.valid_padding)

        # Build encoder blocks (one per transformer config entry):
        self.encoder_blocks = torch.nn.ModuleList([
            EncoderBlock(
                feature_dim=model_config.model_dim,
                attention_heads=transformer_config.attention_heads,
                feed_forward_multiplier=transformer_config.feed_forward_multiplier,
                dropout=transformer_config.dropout,
                valid_padding=model_config.valid_padding,
                pre_normalize=transformer_config.pre_normalize
            )
            for transformer_config in model_config.transformers
        ])

    def encode(self, x: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
        """
        Encode input sequences into contextualized representations.

        The input token indices are embedded and enriched with positional
        information, then processed by a stack of Transformer encoder
        blocks.

        Args:
            x (torch.Tensor): Input tensor of token indices with shape
                (batch_size, max_tokens).
            mask (torch.Tensor, optional): Optional mask tensor indicating
                valid or padded positions, depending on the configuration
                of the Transformer blocks. Defaults to None. Dimensions should be
                (batch_size, max_tokens).

        Returns:
            torch.Tensor: Contextualized representations with shape
                (batch_size, max_tokens, model_dim).
        """
        # Convert to type:
        x = x.int()
        # Embedding and positional encoding:
        x = self.embedding(x)
        x = self.positional_encoding(x)
        # Check mask inversion.
        # FIX: the original indexed `mask[0, 0]` unconditionally, so calling
        # encode() with the default mask=None raised a TypeError. Guard first.
        # NOTE(review): the `mask[0, 0] == 0` heuristic assumes position
        # (0, 0) of a correctly-oriented mask is always truthy — confirm
        # that convention holds for all callers.
        if mask is not None and mask[0, 0] == 0:
            mask = torch.logical_not(mask)
        # Encode:
        for encoder in self.encoder_blocks:
            x = encoder(x, mask=mask)
        return x

    def forward(self, x: torch.Tensor, mask: torch.Tensor = None, candidate_mask: torch.Tensor = None) -> torch.Tensor:
        """
        Forward pass of the segmentation network.

        The input token indices are embedded and enriched with positional
        information, then processed by a stack of Transformer encoder
        blocks. The resulting representations are segmented using CoSeNet
        and finally transformed into a pair-wise distance representation.

        Args:
            x (torch.Tensor): Input tensor of token indices with shape
                (batch_size, num_sentences, tokens_per_sentence).
            mask (torch.Tensor, optional): Optional mask tensor indicating
                valid or padded positions, depending on the configuration
                of the Transformer blocks. Defaults to None.

                If `valid_padding` is disabled, the mask is inverted before
                being passed to CoSeNet to match its masking convention.

            candidate_mask (torch.Tensor, optional): Optional mask tensor for
                candidate positions in CoSeNet. Defaults to None.

                If `valid_padding` is disabled, the mask is inverted before
                being passed to CoSeNet to match its masking convention.

        Returns:
            torch.Tensor: Output tensor containing pairwise distance values
                derived from the segmented representations.
        """
        # Convert to type:
        x = x.int()

        # Embedding and positional encoding:
        x = self.embedding(x)
        x = self.positional_encoding(x)

        # Flatten (batch, sentences) so the encoders see a 3D tensor:
        _b, _s, _t, _d = x.shape
        x = x.reshape(_b * _s, _t, _d)
        if mask is not None:
            mask = mask.reshape(_b * _s, _t).bool()

        # Encode the sequence:
        for encoder in self.encoder_blocks:
            x = encoder(x, mask=mask)

        # Restore the (batch, sentences) layout and normalize the mask
        # convention for the pooling layer:
        x = x.reshape(_b, _s, _t, _d)
        if mask is not None:
            mask = mask.reshape(_b, _s, _t)
            mask = torch.logical_not(mask) if not self.valid_padding else mask

        # Apply pooling:
        x, mask = self.pooling(x, mask=mask)

        # Compute distances:
        x = self.distance_layer(x)

        # Pass through CoSeNet:
        x = self.cosenet(x, mask=mask)

        # Apply candidate mask if provided (zero out non-candidate positions):
        if candidate_mask is not None:
            candidate_mask = candidate_mask.bool() if not self.valid_padding else torch.logical_not(candidate_mask.bool())
            candidate_mask = candidate_mask.to(device=x.device)
            x = x.masked_fill(candidate_mask, 0)

        return x
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
#                        END OF FILE                        #
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6db78280c80f27b94434a1d1e17296ecddc1d21705ec6be3b8bd0bc49991f27f
3
+ size 44485604
requirements.txt CHANGED
@@ -1,3 +1,4 @@
 
1
  ruptures
2
  sentence-transformers
3
  numpy==2.3.5
 
1
+ safetensors
2
  ruptures
3
  sentence-transformers
4
  numpy==2.3.5
bench.py → research_files/bench.py RENAMED
File without changes
research_files/benchmark/results/binseg_bert-base-multilingual-cased.json ADDED
The diff for this file is too large to render. See raw diff
 
{benchmark → research_files/benchmark}/results/binseg_paraphrase-multilingual-MiniLM-L12-v2.json RENAMED
File without changes
{benchmark → research_files/benchmark}/results/binseg_sentence_similarity_spanish_es.json RENAMED
File without changes
research_files/benchmark/results/csim_bert-base-multilingual-cased.json ADDED
The diff for this file is too large to render. See raw diff
 
{benchmark → research_files/benchmark}/results/csim_paraphrase-multilingual-MiniLM-L12-v2.json RENAMED
File without changes
{benchmark → research_files/benchmark}/results/csim_sentence_similarity_spanish_es.json RENAMED
File without changes
research_files/benchmark/results/pelt_LaBSE.json ADDED
The diff for this file is too large to render. See raw diff
 
{benchmark → research_files/benchmark}/results/pelt_bert-base-multilingual-cased.json RENAMED
File without changes
{benchmark → research_files/benchmark}/results/pelt_paraphrase-multilingual-MiniLM-L12-v2.json RENAMED
File without changes
{benchmark → research_files/benchmark}/results/pelt_sentence_similarity_spanish_es.json RENAMED
File without changes
{benchmark → research_files/benchmark}/results/proposed_method.json RENAMED
File without changes
{benchmark → research_files/benchmark}/results/textile_baseline.json RENAMED
File without changes
{benchmark → research_files/benchmark}/segmentation_benchmark/__init__.py RENAMED
File without changes
{benchmark → research_files/benchmark}/segmentation_benchmark/heuristic.py RENAMED
File without changes
{benchmark → research_files/benchmark}/segmentation_benchmark/load_dataset.py RENAMED
File without changes
{benchmark → research_files/benchmark}/segmentation_benchmark/metrics.py RENAMED
File without changes
{benchmark → research_files/benchmark}/segmentation_benchmark/proposed.py RENAMED
@@ -9,10 +9,10 @@ import os
9
  import json
10
  import numpy as np
11
  import torch
12
- from datasets import tqdm
13
  from .metrics import precision_recall_f1_wd
14
  from .load_dataset import load_dataset
15
- from inference import load_model
16
 
17
 
18
  def evaluate_proposed(
 
9
  import json
10
  import numpy as np
11
  import torch
12
+ from tqdm import tqdm
13
  from .metrics import precision_recall_f1_wd
14
  from .load_dataset import load_dataset
15
+ from research_files.inference import load_model
16
 
17
 
18
  def evaluate_proposed(
{benchmark → research_files/benchmark}/segmentation_benchmark/transformers.py RENAMED
@@ -9,7 +9,7 @@ import os
9
  import json
10
  import numpy as np
11
  import torch
12
- from datasets import tqdm
13
  from .metrics import precision_recall_f1_wd
14
  from .load_dataset import load_dataset
15
  from sentence_transformers import SentenceTransformer
 
9
  import json
10
  import numpy as np
11
  import torch
12
+ from tqdm import tqdm
13
  from .metrics import precision_recall_f1_wd
14
  from .load_dataset import load_dataset
15
  from sentence_transformers import SentenceTransformer
{benchmark → research_files/benchmark}/thresholding_benchmark/benchmark_result_A001_1.json RENAMED
File without changes
{benchmark → research_files/benchmark}/thresholding_benchmark/benchmark_result_A001_2.json RENAMED
File without changes
{benchmark → research_files/benchmark}/thresholding_benchmark/benchmark_result_A001_3.json RENAMED
File without changes
{benchmark → research_files/benchmark}/thresholding_benchmark/benchmark_result_A001_4.json RENAMED
File without changes
{benchmark → research_files/benchmark}/thresholding_benchmark/benchmark_result_A001_5.json RENAMED
File without changes
{benchmark → research_files/benchmark}/thresholding_benchmark/benchmark_threshold.py RENAMED
@@ -10,8 +10,8 @@ import tqdm
10
  import json
11
  from datasets import load_from_disk
12
  from src.model import SegmentationNetwork, MaskedBCELoss, WindowDiffLoss
13
- from src.dataset import SegmentationTokenizer, SentenceSegmenter, TokenizedSegmentationDataset
14
- from train.config import configuration
15
 
16
 
17
  # - # - # - # - # - # - # - # - # - # - # - # - # - # - # - #
 
10
  import json
11
  from datasets import load_from_disk
12
  from src.model import SegmentationNetwork, MaskedBCELoss, WindowDiffLoss
13
+ from dataset import SegmentationTokenizer, SentenceSegmenter, TokenizedSegmentationDataset
14
+ from research_files.train.config import configuration
15
 
16
 
17
  # - # - # - # - # - # - # - # - # - # - # - # - # - # - # - #
{benchmark → research_files/benchmark}/thresholding_benchmark/print_results.py RENAMED
File without changes
{benchmark → research_files/benchmark}/wikipedia-es-A002/data-00000-of-00001.arrow RENAMED
File without changes
{benchmark → research_files/benchmark}/wikipedia-es-A002/dataset_info.json RENAMED
File without changes
{benchmark → research_files/benchmark}/wikipedia-es-A002/state.json RENAMED
File without changes
{inference → research_files/inference}/__init__.py RENAMED
File without changes
{inference → research_files/inference}/config.py RENAMED
@@ -1,181 +1,181 @@
1
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
2
- # #
3
- # This file was created by: Alberto Palomo Alonso #
4
- # Universidad de Alcalá - Escuela Politécnica Superior #
5
- # #
6
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
7
- # Import statements:
8
- from dataclasses import dataclass
9
- from src.model import ModelConfig, CoSeNetConfig, TransformerConfig
10
- from src.dataset import DatasetConfig
11
-
12
-
13
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
14
- # SETUP CONFIGURATION #
15
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
16
- @dataclass
17
- class SetupConfig:
18
- """
19
- Configuration parameters related to the execution environment and logging.
20
-
21
- This configuration controls device selection, checkpointing behavior,
22
- reproducibility settings, and logging paths for an experiment.
23
- """
24
- device_number: int = 0
25
- save_model_each: int = 0
26
- seed: int = None
27
- logging_path: str = None
28
- reload_checkpoint: bool = False
29
-
30
-
31
- def overwrite_setup_config() -> SetupConfig:
32
- """
33
- Create and override the default setup configuration.
34
-
35
- This function customizes execution-level parameters such as logging
36
- paths, checkpoint reloading, and model saving frequency.
37
-
38
- Returns:
39
- SetupConfig: The configured setup configuration object.
40
- """
41
- config = SetupConfig()
42
- config.logging_path = r'/workspace/logs'
43
- config.reload_checkpoint = True
44
- config.save_model_each = 1
45
- return config
46
-
47
-
48
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
49
- # TRAINING CONFIGURATION #
50
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
51
- @dataclass
52
- class TrainConfig:
53
- """
54
- Training configuration container.
55
-
56
- This dataclass aggregates model, dataset, and setup configurations,
57
- together with optimization and training hyperparameters.
58
- """
59
- # Linked configurations:
60
- model_config: ModelConfig | None = None
61
- dataset_config: DatasetConfig | None = None
62
- setup_config: SetupConfig | None = None
63
-
64
- # Training parameters:
65
- batch_size: int = 32
66
- num_epochs: int = 100
67
-
68
- # Optimizer parameters:
69
- learning_rate: float = 1e-4
70
- learning_rate_min: float = 1e-5
71
- weight_decay: float = 1e-8
72
- betas: tuple[float, float] = (0.5, 0.999)
73
-
74
-
75
- def overwrite_train_config() -> TrainConfig:
76
- """
77
- Create and override the default training configuration.
78
-
79
- This function customizes batch size, number of epochs, and optimizer
80
- hyperparameters for the training process.
81
-
82
- Returns:
83
- TrainConfig: The configured training configuration object.
84
- """
85
- config = TrainConfig()
86
- config.batch_size = 4
87
- config.num_epochs = 200
88
- config.learning_rate = 5e-4
89
- config.learning_rate_min = 5e-5
90
- config.weight_decay = 1e-6
91
- return config
92
-
93
-
94
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
95
- # DATASET CONFIGURATION #
96
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
97
- def overwrite_dataset_config() -> DatasetConfig:
98
- """
99
- Create and override the dataset configuration.
100
-
101
- This function sets the file paths and usage percentages for training,
102
- validation, and test datasets.
103
-
104
- Returns:
105
- DatasetConfig: The configured dataset configuration object.
106
- """
107
- config = DatasetConfig()
108
- config.train_data_path = r"/workspace/data/tokens-A000-segmentation"
109
- config.val_data_path = r"/workspace/data/tokens-A001-segmentation"
110
- config.test_data_path = r"/workspace/data/tokens-A002-segmentation"
111
- config.train_percentage = 0.4
112
- config.val_percentage = 0.4
113
- config.test_percentage = 1.0
114
- return config
115
-
116
-
117
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
118
- # MODEL CONFIGURATION #
119
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
120
- def overwrite_model_config() -> ModelConfig:
121
- """
122
- Create and override the model configuration.
123
-
124
- This function defines the architecture-level parameters, including
125
- vocabulary size, embedding dimensionality, CoSeNet settings, and
126
- the stack of Transformer encoder configurations.
127
-
128
- Returns:
129
- ModelConfig: The configured model configuration object.
130
- """
131
- config = ModelConfig()
132
-
133
- # High-level params:
134
- config.vocab_size = 32_768
135
- config.model_dim = 256
136
- config.valid_padding = True
137
-
138
- # CoSeNet params:
139
- config.cosenet = CoSeNetConfig(
140
- trainable=True,
141
- init_scale=5.0
142
- )
143
-
144
- # Transformer params:
145
- config.transformers = [
146
- TransformerConfig(**cfg)
147
- for cfg in [
148
- {
149
- "attention_heads": 16,
150
- "feed_forward_multiplier": 8,
151
- "dropout": 0.0,
152
- "pre_normalize": True
153
- },
154
- {
155
- "attention_heads": 16,
156
- "feed_forward_multiplier": 8,
157
- "dropout": 0.0,
158
- "pre_normalize": True
159
- }
160
- ]
161
- ]
162
-
163
- return config
164
-
165
-
166
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
167
- # WHOLE CONFIGURATION #
168
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
169
- def configuration() -> TrainConfig:
170
- """
171
- Create the experiment configuration
172
- :return: A TrainConfig configuration object
173
- """
174
- config = overwrite_train_config()
175
- config.setup_config = overwrite_setup_config()
176
- config.model_config = overwrite_model_config()
177
- config.dataset_config = overwrite_dataset_config()
178
- return config
179
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
180
- # END OF FILE #
181
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
 
1
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
#                                                           #
#      This file was created by: Alberto Palomo Alonso      #
#    Universidad de Alcalá - Escuela Politécnica Superior   #
#                                                           #
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
# Import statements:
from dataclasses import dataclass
from src.model import ModelConfig, CoSeNetConfig, TransformerConfig
from src.dataset import DatasetConfig


# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
#                    SETUP CONFIGURATION                    #
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
@dataclass
class SetupConfig:
    """
    Configuration parameters related to the execution environment and logging.

    This configuration controls device selection, checkpointing behavior,
    reproducibility settings, and logging paths for an experiment.
    """
    device_number: int = 0
    save_model_each: int = 0
    # FIX: these two fields default to None, so their annotations must be
    # Optional — matching the `X | None` convention already used below.
    seed: int | None = None
    logging_path: str | None = None
    reload_checkpoint: bool = False


def overwrite_setup_config() -> SetupConfig:
    """
    Create and override the default setup configuration.

    This function customizes execution-level parameters such as logging
    paths, checkpoint reloading, and model saving frequency.

    Returns:
        SetupConfig: The configured setup configuration object.
    """
    config = SetupConfig()
    config.logging_path = r'/workspace/logs'
    config.reload_checkpoint = True
    config.save_model_each = 1
    return config


# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
#                   TRAINING CONFIGURATION                  #
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
@dataclass
class TrainConfig:
    """
    Training configuration container.

    This dataclass aggregates model, dataset, and setup configurations,
    together with optimization and training hyperparameters.
    """
    # Linked configurations:
    model_config: ModelConfig | None = None
    dataset_config: DatasetConfig | None = None
    setup_config: SetupConfig | None = None

    # Training parameters:
    batch_size: int = 32
    num_epochs: int = 100

    # Optimizer parameters:
    learning_rate: float = 1e-4
    learning_rate_min: float = 1e-5
    weight_decay: float = 1e-8
    betas: tuple[float, float] = (0.5, 0.999)


def overwrite_train_config() -> TrainConfig:
    """
    Create and override the default training configuration.

    This function customizes batch size, number of epochs, and optimizer
    hyperparameters for the training process.

    Returns:
        TrainConfig: The configured training configuration object.
    """
    config = TrainConfig()
    config.batch_size = 4
    config.num_epochs = 200
    config.learning_rate = 5e-4
    config.learning_rate_min = 5e-5
    config.weight_decay = 1e-6
    return config


# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
#                    DATASET CONFIGURATION                  #
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
def overwrite_dataset_config() -> DatasetConfig:
    """
    Create and override the dataset configuration.

    This function sets the file paths and usage percentages for training,
    validation, and test datasets.

    Returns:
        DatasetConfig: The configured dataset configuration object.
    """
    config = DatasetConfig()
    config.train_data_path = r"/workspace/data/tokens-A000-segmentation"
    config.val_data_path = r"/workspace/data/tokens-A001-segmentation"
    config.test_data_path = r"/workspace/data/tokens-A002-segmentation"
    config.train_percentage = 0.4
    config.val_percentage = 0.4
    config.test_percentage = 1.0
    return config


# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
#                     MODEL CONFIGURATION                   #
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
def overwrite_model_config() -> ModelConfig:
    """
    Create and override the model configuration.

    This function defines the architecture-level parameters, including
    vocabulary size, embedding dimensionality, CoSeNet settings, and
    the stack of Transformer encoder configurations.

    Returns:
        ModelConfig: The configured model configuration object.
    """
    config = ModelConfig()

    # High-level params:
    config.vocab_size = 32_768
    config.model_dim = 256
    config.valid_padding = True

    # CoSeNet params:
    config.cosenet = CoSeNetConfig(
        trainable=True,
        init_scale=5.0
    )

    # Transformer params (two identical pre-norm encoder blocks):
    config.transformers = [
        TransformerConfig(**cfg)
        for cfg in [
            {
                "attention_heads": 16,
                "feed_forward_multiplier": 8,
                "dropout": 0.0,
                "pre_normalize": True
            },
            {
                "attention_heads": 16,
                "feed_forward_multiplier": 8,
                "dropout": 0.0,
                "pre_normalize": True
            }
        ]
    ]

    return config


# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
#                     WHOLE CONFIGURATION                   #
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
def configuration() -> TrainConfig:
    """
    Create the experiment configuration.

    Returns:
        TrainConfig: A fully-populated configuration object linking the
            setup, model, and dataset configurations.
    """
    config = overwrite_train_config()
    config.setup_config = overwrite_setup_config()
    config.model_config = overwrite_model_config()
    config.dataset_config = overwrite_dataset_config()
    return config
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
#                        END OF FILE                        #
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
{inference → research_files/inference}/load.py RENAMED
File without changes
{inference → research_files/inference}/model_state.pt RENAMED
File without changes
{inference → research_files/inference}/pipeline.py RENAMED
@@ -8,7 +8,7 @@
8
  import numpy as np
9
  import torch
10
  from src.model import SegmentationNetwork
11
- from src.dataset import SegmentationTokenizer, SentenceSegmenter
12
 
13
 
14
  # - # - # - # - # - # - # - # - # - # - # - # - # - # - # - #
 
8
  import numpy as np
9
  import torch
10
  from src.model import SegmentationNetwork
11
+ from dataset import SegmentationTokenizer, SentenceSegmenter
12
 
13
 
14
  # - # - # - # - # - # - # - # - # - # - # - # - # - # - # - #
{inference → research_files/inference}/tokenizer_32768.json RENAMED
File without changes
research_files/torch_to_hf.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
#                                                           #
#      This file was created by: Alberto Palomo Alonso      #
#    Universidad de Alcalá - Escuela Politécnica Superior   #
#                                                           #
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
# Import statements:
import os
from research_files.inference import load_model
from safetensors.torch import save_file


def convert_model(save_path: str, model_path: str = None, tokenizer_path: str = None):
    """
    Export the trained model and its tokenizer to Hugging Face format.

    Loads the model via the inference helper, dumps its weights as a
    `model.safetensors` file under `save_path`, and writes the tokenizer
    files alongside it.

    Args:
        save_path (str): Directory where the converted artifacts are written.
        model_path (str, optional): Path of the model checkpoint to load.
            Defaults to None (inference helper's default).
        tokenizer_path (str, optional): Path of the tokenizer to load.
            Defaults to None (inference helper's default).
    """
    loaded_model, loaded_tokenizer, _segmenter = load_model(model_path, tokenizer_path)

    # Serialize the weights to safetensors:
    weights = loaded_model.state_dict()
    target_file = os.path.join(save_path, "model.safetensors")
    save_file(weights, target_file)

    # Persist the tokenizer next to the weights.
    # NOTE(review): reaches into the private `_hf_tokenizer` attribute —
    # consider exposing a public save method on the tokenizer wrapper.
    loaded_tokenizer._hf_tokenizer.save_pretrained(os.path.join(save_path))


if __name__ == "__main__":
    # Convert and save into the current directory:
    convert_model("./")
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
#                        END OF FILE                        #
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
{train → research_files/train}/config.py RENAMED
@@ -7,8 +7,8 @@
7
  # Import statements:
8
  import os
9
  from dataclasses import dataclass
10
- from src.model import ModelConfig, CoSeNetConfig, TransformerConfig
11
- from src.dataset import DatasetConfig
12
 
13
 
14
  # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
 
7
  # Import statements:
8
  import os
9
  from dataclasses import dataclass
10
+ from model import ModelConfig, CoSeNetConfig, TransformerConfig
11
+ from dataset import DatasetConfig
12
 
13
 
14
  # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
{train → research_files/train}/train_logs/config.json RENAMED
File without changes
{train → research_files/train}/train_logs/logfile.log RENAMED
File without changes
{train → research_files/train}/train_logs/tensorboard_logs.zip RENAMED
File without changes
{train → research_files/train}/train_model.py RENAMED
@@ -7,10 +7,10 @@
7
  # Import statements:
8
  import torch
9
  import tqdm
10
- from train.config import configuration, TrainConfig
11
  from src.model import SegmentationNetwork, MaskedBCELoss
12
- from src.dataset import TokenizedSegmentationDataset
13
- from src.dlutils import Setup, train_step, validation_step
14
 
15
 
16
  # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
 
7
  # Import statements:
8
  import torch
9
  import tqdm
10
+ from research_files.train.config import configuration, TrainConfig
11
  from src.model import SegmentationNetwork, MaskedBCELoss
12
+ from dataset import TokenizedSegmentationDataset
13
+ from dlutils import Setup, train_step, validation_step
14
 
15
 
16
  # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
src/dataset/__init__.py CHANGED
@@ -1,13 +1,13 @@
1
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
2
- # #
3
- # This file was created by: Alberto Palomo Alonso #
4
- # Universidad de Alcalá - Escuela Politécnica Superior #
5
- # #
6
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
7
- from .tokenizer import SegmentationTokenizer, SentenceSegmenter
8
- from .dataset import SegmentationDataset
9
- from .tokenized_dataset import TokenizedSegmentationDataset
10
- from .config import DatasetConfig
11
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
12
- # END OF FILE #
13
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
 
1
+ # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
2
+ # #
3
+ # This file was created by: Alberto Palomo Alonso #
4
+ # Universidad de Alcalá - Escuela Politécnica Superior #
5
+ # #
6
+ # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
7
+ from .tokenizer import SegmentationTokenizer, SentenceSegmenter
8
+ from .dataset import SegmentationDataset
9
+ from .tokenized_dataset import TokenizedSegmentationDataset
10
+ from .config import DatasetConfig
11
+ # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
12
+ # END OF FILE #
13
+ # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
src/dataset/config.py CHANGED
@@ -1,29 +1,29 @@
1
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
2
- # #
3
- # This file was created by: Alberto Palomo Alonso #
4
- # Universidad de Alcalá - Escuela Politécnica Superior #
5
- # #
6
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
7
- # Import statements:
8
- from dataclasses import dataclass
9
-
10
-
11
- @dataclass
12
- class DatasetConfig:
13
- # Paths:
14
- train_data_path: str = None
15
- val_data_path: str = None
16
- test_data_path: str = None
17
- # Percentages:
18
- train_percentage: float = 1.0
19
- val_percentage: float = 1.0
20
- test_percentage: float = 1.0
21
- # Other parameters:
22
- num_workers: int = 0
23
- shuffle_train: bool = True
24
- shuffle_val: bool = True
25
-
26
-
27
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
28
- # END OF FILE #
29
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
 
1
+ # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
2
+ # #
3
+ # This file was created by: Alberto Palomo Alonso #
4
+ # Universidad de Alcalá - Escuela Politécnica Superior #
5
+ # #
6
+ # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
7
+ # Import statements:
8
+ from dataclasses import dataclass
9
+
10
+
11
+ @dataclass
12
+ class DatasetConfig:
13
+ # Paths:
14
+ train_data_path: str = None
15
+ val_data_path: str = None
16
+ test_data_path: str = None
17
+ # Percentages:
18
+ train_percentage: float = 1.0
19
+ val_percentage: float = 1.0
20
+ test_percentage: float = 1.0
21
+ # Other parameters:
22
+ num_workers: int = 0
23
+ shuffle_train: bool = True
24
+ shuffle_val: bool = True
25
+
26
+
27
+ # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
28
+ # END OF FILE #
29
+ # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
src/dataset/dataset.py CHANGED
@@ -1,199 +1,199 @@
1
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
2
- # #
3
- # This file was created by: Alberto Palomo Alonso #
4
- # Universidad de Alcalá - Escuela Politécnica Superior #
5
- # #
6
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
7
- # Import statements:
8
- import logging
9
- from torch.utils.data import Dataset, DataLoader
10
- from datasets import Dataset as HfDataset
11
- from datasets import load_from_disk
12
- from .tokenizer import SegmentationTokenizer, SentenceSegmenter
13
-
14
-
15
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
16
- # #
17
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
18
- class SegmentationDataset(Dataset):
19
- def __init__(
20
- self,
21
- huggingface_dataset: str | HfDataset,
22
- tokenizer: SegmentationTokenizer,
23
- segmenter: SentenceSegmenter,
24
- logger: logging.Logger = None,
25
- percentage: float = 1.0,
26
- return_type: type = dict
27
- ):
28
- """
29
- A segmentation dataset takes a huggingface dataset or a path to a dataset on disk with the
30
- wikipedia-segmentation format. It loads the dataset and prepares it for training.
31
-
32
- Wikipedia-segmentation format:
33
- - The dataset is expected to be a huggingface dataset or a path to a dataset on disk.
34
- - The dataset should contain the following fields:
35
- >>> sample = {
36
- >>> 'text': ['Article 1', 'Article 2', ...],
37
- >>> 'titles': ['Title 1', 'Title 2', ...],
38
- >>> 'id': str,
39
- >>> 'words': int
40
- >>> 'paragraphs': int
41
- >>> 'sentences': int
42
- >>> }
43
- - The dataset should be a list of dictionaries, where each dictionary contains the fields above.
44
-
45
- Parameters
46
- ----------
47
- huggingface_dataset : str | HfDataset
48
- A huggingface dataset or a path to a dataset on disk with the wikipedia-segmentation format.
49
-
50
- tokenizer : callable
51
- A tokenizer function that takes a string and returns a list of tokens.
52
-
53
- logger : logging.Logger, optional
54
- Logger instance. If not provided, a null logger will be used.
55
-
56
- percentage : float
57
- Percentage of the dataset to use. Default is 1.0 (100%).
58
-
59
- return_type : type
60
- The return type of __getitem__, either dict or tuple. Default is dict.
61
-
62
- Raises
63
- ------
64
- ValueError
65
- If the huggingface_dataset is not a string or a HfDataset.
66
- ValueError
67
- If the tokenizer is not a callable function or class.
68
- ValueError
69
- If the sentence_tokenizer is not a callable function or class.
70
- ValueError
71
- If the dtype is not a type.
72
-
73
- """
74
- # Null logging:
75
- if not isinstance(logger, logging.Logger):
76
- self.logger = logging.getLogger("null")
77
- self.logger.addHandler(logging.NullHandler())
78
- else:
79
- self.logger = logger
80
-
81
- # Loading:
82
- if isinstance(huggingface_dataset, HfDataset):
83
- self.huggingface_dataset = huggingface_dataset
84
- elif isinstance(huggingface_dataset, str):
85
- self.huggingface_dataset = load_from_disk(huggingface_dataset)
86
- else:
87
- self.logger.error(f'[SegmentationDataset] huggingface_dataset must be either a string or a HfDataset.')
88
- raise ValueError(f'[SegmentationDataset] huggingface_dataset must be either a string or a HfDataset.')
89
- self.logger.info(f'[SegmentationDataset] Loaded dataset: {self.huggingface_dataset}')
90
- self.logger.info(f'[SegmentationDataset] Loaded dataset length: {self.huggingface_dataset.num_rows}')
91
-
92
- # Tokenizer:
93
- if callable(tokenizer):
94
- self.tokenizer = tokenizer
95
- else:
96
- self.logger.error(f'[SegmentationDataset] Tokenizer must be a callable function.')
97
- raise ValueError(f'[SegmentationDataset] Tokenizer must be a callable function.')
98
-
99
- # Segmenter:
100
- if not isinstance(segmenter, SentenceSegmenter):
101
- self.logger.error(f'[SegmentationDataset] Segmenter must be a SentenceSegmenter instance.')
102
- raise ValueError(f'[SegmentationDataset] Segmenter must be a SentenceSegmenter instance.')
103
- else:
104
- self.segmenter = segmenter
105
-
106
- # Percentage:
107
- if not (0.0 < percentage <= 1.0):
108
- self.logger.error(f'[SegmentationDataset] Percentage must be between 0.0 and 1.0.')
109
- raise ValueError(f'[SegmentationDataset] Percentage must be between 0.0 and 1.0.')
110
- else:
111
- self.percentage = percentage
112
-
113
- # Return type:
114
- if not isinstance(return_type, type):
115
- self.logger.error(f'[SegmentationDataset] return_type must be a type.')
116
- raise ValueError(f'[SegmentationDataset] return_type must be a type.')
117
- elif return_type not in [dict, tuple]:
118
- self.logger.error(f'[SegmentationDataset] return_type must be either dict or tuple.')
119
- raise ValueError(f'[SegmentationDataset] return_type must be either dict or tuple.')
120
- else:
121
- self.return_type = return_type
122
-
123
- def get_loader(self, batch_size=8, shuffle=True, num_workers=0, **kwargs) -> DataLoader:
124
- """
125
- Returns a PyTorch DataLoader for this dataset.
126
-
127
- Parameters
128
- ----------
129
- batch_size : int
130
- Number of samples per batch.
131
- shuffle : bool
132
- Whether to shuffle the dataset.
133
- num_workers : int
134
- Number of worker processes.
135
- **kwargs
136
- Additional arguments for DataLoader.
137
-
138
- Returns
139
- -------
140
- [torch.utils.data.DataLoader
141
- Configured DataLoader.
142
- """
143
- # Size handling:
144
- return DataLoader(self, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers,
145
- pin_memory=True, **kwargs)
146
-
147
- def __len__(self) -> int:
148
- """
149
- Returns the number of samples in the dataset.
150
-
151
- Returns
152
- -------
153
- int
154
- Total number of samples.
155
- """
156
- return int(self.huggingface_dataset.num_rows * self.percentage)
157
-
158
- def __getitem__(self, idx) -> dict | tuple:
159
- """
160
- Retrieves a single sample and generates segmentation labels.
161
-
162
- Parameters
163
- ----------
164
- idx : int
165
- Index of the sample.
166
-
167
- Returns
168
- -------
169
- tuple
170
- A tuple or dict (x_i, y_i, mask_x) with noisy input and corresponding target.
171
- """
172
- sample = self.huggingface_dataset[idx]['text']
173
- sentences = self.segmenter(sample)
174
- tokenized = self.tokenizer(sentences['sentences'])
175
-
176
- if self.return_type == tuple:
177
- return (
178
- tokenized['input_ids'], # x
179
- sentences['sentence_boundaries'], # y
180
- tokenized['attention_mask'], # x_mask
181
- sentences['sentence_mask'], # y_mask
182
- sentences['sentence_candidates'], # y_prime_mask
183
- )
184
- elif self.return_type == dict:
185
- return_value = {
186
- 'input': tokenized['input_ids'],
187
- 'input_mask': tokenized['attention_mask'],
188
- 'labels': sentences['sentence_boundaries'],
189
- 'output_mask': sentences['sentence_mask'],
190
- 'candidate_mask': sentences['sentence_candidates']
191
- }
192
- else:
193
- raise ValueError(f'[SegmentationDataset] return_type must be either dict or tuple.')
194
- return return_value
195
-
196
-
197
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
198
- # END OF FILE #
199
- # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
 
1
+ # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
2
+ # #
3
+ # This file was created by: Alberto Palomo Alonso #
4
+ # Universidad de Alcalá - Escuela Politécnica Superior #
5
+ # #
6
+ # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
7
+ # Import statements:
8
+ import logging
9
+ from torch.utils.data import Dataset, DataLoader
10
+ from datasets import Dataset as HfDataset
11
+ from datasets import load_from_disk
12
+ from .tokenizer import SegmentationTokenizer, SentenceSegmenter
13
+
14
+
15
+ # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
16
+ # #
17
+ # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
18
class SegmentationDataset(Dataset):
    def __init__(
            self,
            huggingface_dataset: str | HfDataset,
            tokenizer: SegmentationTokenizer,
            segmenter: SentenceSegmenter,
            logger: logging.Logger | None = None,
            percentage: float = 1.0,
            return_type: type = dict
    ):
        """
        A segmentation dataset takes a huggingface dataset or a path to a dataset on disk with the
        wikipedia-segmentation format. It loads the dataset and prepares it for training.

        Wikipedia-segmentation format:
        - The dataset is expected to be a huggingface dataset or a path to a dataset on disk.
        - The dataset should contain the following fields:
            >>> sample = {
            >>>     'text': ['Article 1', 'Article 2', ...],
            >>>     'titles': ['Title 1', 'Title 2', ...],
            >>>     'id': str,
            >>>     'words': int
            >>>     'paragraphs': int
            >>>     'sentences': int
            >>> }
        - The dataset should be a list of dictionaries, where each dictionary contains the fields above.

        Parameters
        ----------
        huggingface_dataset : str | HfDataset
            A huggingface dataset or a path to a dataset on disk with the wikipedia-segmentation format.

        tokenizer : SegmentationTokenizer
            A callable that takes a list of sentences and returns a mapping with
            'input_ids' and 'attention_mask'.

        segmenter : SentenceSegmenter
            Splits an article into sentences and produces boundary / mask / candidate arrays.

        logger : logging.Logger, optional
            Logger instance. If not provided, a null logger will be used.

        percentage : float
            Percentage of the dataset to use, in (0.0, 1.0]. Default is 1.0 (100%).

        return_type : type
            The return type of __getitem__, either dict or tuple. Default is dict.

        Raises
        ------
        ValueError
            If huggingface_dataset is not a string or a HfDataset, tokenizer is not callable,
            segmenter is not a SentenceSegmenter, percentage is outside (0.0, 1.0], or
            return_type is neither dict nor tuple.
        """
        # Fall back to a null logger so later logging calls are always safe:
        if isinstance(logger, logging.Logger):
            self.logger = logger
        else:
            self.logger = logging.getLogger("null")
            self.logger.addHandler(logging.NullHandler())

        # Loading: accept an in-memory dataset, or a path that is loaded from disk.
        if isinstance(huggingface_dataset, HfDataset):
            self.huggingface_dataset = huggingface_dataset
        elif isinstance(huggingface_dataset, str):
            self.huggingface_dataset = load_from_disk(huggingface_dataset)
        else:
            # Log and raise the same message (previously duplicated literals):
            message = '[SegmentationDataset] huggingface_dataset must be either a string or a HfDataset.'
            self.logger.error(message)
            raise ValueError(message)
        self.logger.info(f'[SegmentationDataset] Loaded dataset: {self.huggingface_dataset}')
        self.logger.info(f'[SegmentationDataset] Loaded dataset length: {self.huggingface_dataset.num_rows}')

        # Tokenizer must be callable:
        if callable(tokenizer):
            self.tokenizer = tokenizer
        else:
            message = '[SegmentationDataset] Tokenizer must be a callable function.'
            self.logger.error(message)
            raise ValueError(message)

        # Segmenter must be a SentenceSegmenter instance:
        if isinstance(segmenter, SentenceSegmenter):
            self.segmenter = segmenter
        else:
            message = '[SegmentationDataset] Segmenter must be a SentenceSegmenter instance.'
            self.logger.error(message)
            raise ValueError(message)

        # Percentage must lie in (0.0, 1.0]:
        if 0.0 < percentage <= 1.0:
            self.percentage = percentage
        else:
            message = '[SegmentationDataset] Percentage must be between 0.0 and 1.0.'
            self.logger.error(message)
            raise ValueError(message)

        # Return type must be exactly dict or tuple:
        if not isinstance(return_type, type):
            message = '[SegmentationDataset] return_type must be a type.'
            self.logger.error(message)
            raise ValueError(message)
        elif return_type not in (dict, tuple):
            message = '[SegmentationDataset] return_type must be either dict or tuple.'
            self.logger.error(message)
            raise ValueError(message)
        else:
            self.return_type = return_type

    def get_loader(self, batch_size=8, shuffle=True, num_workers=0, **kwargs) -> DataLoader:
        """
        Returns a PyTorch DataLoader for this dataset.

        Parameters
        ----------
        batch_size : int
            Number of samples per batch.
        shuffle : bool
            Whether to shuffle the dataset.
        num_workers : int
            Number of worker processes.
        **kwargs
            Additional arguments for DataLoader.

        Returns
        -------
        torch.utils.data.DataLoader
            Configured DataLoader (pin_memory is always enabled).
        """
        return DataLoader(self, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers,
                          pin_memory=True, **kwargs)

    def __len__(self) -> int:
        """
        Returns the number of samples in the dataset, scaled by `percentage`.

        Returns
        -------
        int
            Total number of usable samples.
        """
        return int(self.huggingface_dataset.num_rows * self.percentage)

    def __getitem__(self, idx: int) -> dict | tuple:
        """
        Retrieves a single article, splits it into sentences and tokenizes them.

        Parameters
        ----------
        idx : int
            Index of the sample.

        Returns
        -------
        dict | tuple
            Tokenized input ids and attention mask together with the sentence
            boundary labels, sentence mask and candidate mask produced by the
            segmenter — as a dict or a 5-tuple depending on `return_type`.
        """
        sample = self.huggingface_dataset[idx]['text']
        sentences = self.segmenter(sample)
        tokenized = self.tokenizer(sentences['sentences'])

        if self.return_type is tuple:
            return (
                tokenized['input_ids'],              # x
                sentences['sentence_boundaries'],    # y
                tokenized['attention_mask'],         # x_mask
                sentences['sentence_mask'],          # y_mask
                sentences['sentence_candidates'],    # y_prime_mask
            )
        if self.return_type is dict:
            return {
                'input': tokenized['input_ids'],
                'input_mask': tokenized['attention_mask'],
                'labels': sentences['sentence_boundaries'],
                'output_mask': sentences['sentence_mask'],
                'candidate_mask': sentences['sentence_candidates']
            }
        raise ValueError(f'[SegmentationDataset] return_type must be either dict or tuple.')


# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
#                       END OF FILE                         #
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #