Alverciito
/

wikipedia_segmentation

@@ -1,7 +1,15 @@
 ---
 license: apache-2.0
 ---
 ## Baseline Comparison
 | Category | Model / Method | Spanish Support | Training |
 |---|---|---|----------|

 ---
+library_name: transformers
+pipeline_tag: sentence-similarity
+tags:
+- sentence-embeddings
+- information-retrieval
+- semantic-search
 license: apache-2.0
 ---
+# SentenceCoseNet
 ## Baseline Comparison
 | Category | Model / Method | Spanish Support | Training |
 |---|---|---|----------|

__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
+#                                                           #
+#   This file was created by: Alberto Palomo Alonso         #
+# Universidad de Alcalá - Escuela Politécnica Superior      #
+#                                                           #
+# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
+from .model import SentenceCoseNet, SentenceCoseNetConfig
+# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
+#                        END OF FILE                        #
+# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #

config.json CHANGED Viewed

@@ -1,16 +1,26 @@
 {
-  "architectures": [
-    "CoseNetTransformer"
-  ],
-  "dropout": 0.0,
   "emb_dim": 256,
-  "model_type": "sentence_transformer",
-  "seq_len": ...,
   "torch_dtype": "float32",
   "transformers_version": "4.57.3",
-  "vocab_size": 32768,
   "auto_map": {
-    "AutoConfig": "configurations.SentenceCoseNetConfig",
     "AutoModel": "model.SentenceCoseNet"
   }
 }

 {
+  "architectures": ["SentenceCoseNet"],
+  "model_type": "sentence_cosenet",
+  "vocab_size": 32768,
+  "hidden_size": 256,
   "emb_dim": 256,
+  "max_position_embeddings": 382,
+  "seq_len": 382,
+  "dropout": 0.0,
+  "pad_token_id": 0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
   "torch_dtype": "float32",
   "transformers_version": "4.57.3",
   "auto_map": {
+    "AutoConfig": "model.SentenceCoseNetConfig",
     "AutoModel": "model.SentenceCoseNet"
   }
 }

configurations.py DELETED Viewed

File without changes

model.py CHANGED Viewed

@@ -4,173 +4,269 @@
 # Universidad de Alcalá - Escuela Politécnica Superior      #
 #                                                           #
 # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
-# Import statements:
 import torch
-from src.model.config import ModelConfig
-from src.model.cosenet import CosineDistanceLayer, CoSeNet
-from src.model.transformers import EncoderBlock, PositionalEncoding, MaskedMeanPooling
-class CoseNetTransformer(torch.nn.Module):
     """
-    Segmentation network combining Transformer encoders with CoSeNet.
-    This model integrates token embeddings and positional encodings with
-    a stack of Transformer encoder blocks to produce contextualized
-    representations. These representations are then processed by a
-    CoSeNet module to perform structured segmentation, followed by a
-    cosine-based distance computation.
-    The final output is a pair-wise distance matrix suitable for
-    segmentation or boundary detection tasks.
     """
-    def __init__(self, model_config: ModelConfig, **kwargs):
-        """
-        Initialize the segmentation network.
-        The network is composed of an embedding layer, positional encoding,
-        multiple Transformer encoder blocks, a CoSeNet segmentation module,
-        and a cosine distance layer.
         Args:
-            model_config (ModelConfig): Configuration object containing all
-                hyperparameters required to build the model, including
-                vocabulary size, model dimensionality, transformer settings,
-                and CoSeNet parameters.
-            **kwargs: Additional keyword arguments forwarded to
-                `torch.nn.Module`.
         """
         super().__init__(**kwargs)
-        self.valid_padding = model_config.valid_padding
-        # Build layers:
-        self.embedding = torch.nn.Embedding(
-            model_config.vocab_size,
-            model_config.model_dim
-        )
-        self.positional_encoding = PositionalEncoding(
-            emb_dim=model_config.model_dim,
-            max_len=model_config.max_tokens
-        )
-        self.cosenet = CoSeNet(
-            trainable=model_config.cosenet.trainable,
-            init_scale=model_config.cosenet.init_scale
-        )
-        self.distance_layer = CosineDistanceLayer()
-        self.pooling = MaskedMeanPooling(valid_pad=model_config.valid_padding)
-        # Build encoder blocks:
-        module_list = list()
-        for transformer_config in model_config.transformers:
-            encoder_block = EncoderBlock(
-                feature_dim=model_config.model_dim,
-                attention_heads=transformer_config.attention_heads,
-                feed_forward_multiplier=transformer_config.feed_forward_multiplier,
-                dropout=transformer_config.dropout,
-                valid_padding=model_config.valid_padding,
-                pre_normalize=transformer_config.pre_normalize
-            )
-            module_list.append(encoder_block)
-        self.encoder_blocks = torch.nn.ModuleList(module_list)
-    def encode(self, x: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
         """
-        Encode input sequences into contextualized representations.
-        The input token indices are embedded and enriched with positional
-        information, then processed by a stack of Transformer encoder
-        blocks.
         Args:
-            x (torch.Tensor): Input tensor of token indices with shape
-                (batch_size, max_tokens).
-            mask (torch.Tensor, optional): Optional mask tensor indicating
-                valid or padded positions, depending on the configuration
-                of the Transformer blocks. Defaults to None. Dimensions should be
-                (batch_size, max_tokens).
         """
-        # Convert to type:
-        x = x.int()
-        # Embedding and positional encoding:
-        x = self.embedding(x)
-        x = self.positional_encoding(x)
-        # Check mask inversion:
-        if mask[0, 0] == 0:
-            mask = torch.logical_not(mask)
-        # Encode:
-        for encoder in self.encoder_blocks:
-            x = encoder(x, mask=mask)
-        return x
-    def forward(self, x: torch.Tensor, mask: torch.Tensor = None, candidate_mask: torch.Tensor = None) -> torch.Tensor:
         """
-        Forward pass of the segmentation network.
-        The input token indices are embedded and enriched with positional
-        information, then processed by a stack of Transformer encoder
-        blocks. The resulting representations are segmented using CoSeNet
-        and finally transformed into a pair-wise distance representation.
         Args:
-            x (torch.Tensor): Input tensor of token indices with shape
-                (batch_size, sequence_length).
-            mask (torch.Tensor, optional): Optional mask tensor indicating
-                valid or padded positions, depending on the configuration
-                of the Transformer blocks. Defaults to None.
-                If `valid_padding` is disabled, the mask is inverted before being
-                passed to CoSeNet to match its masking convention.
-            candidate_mask (torch.Tensor, optional): Optional mask tensor for
-                candidate positions in CoSeNet. Defaults to None.
-                If `valid_padding` is disabled, the mask is inverted before being
-                passed to CoSeNet to match its masking convention.
-        Returns:
-            torch.Tensor: Output tensor containing pairwise distance values
-            derived from the segmented representations.
         """
-        # Convert to type:
-        x = x.int()
-        # Embedding and positional encoding:
-        x = self.embedding(x)
-        x = self.positional_encoding(x)
-        # Reshape x and mask:
-        _b, _s, _t, _d = x.shape
-        x = x.reshape(_b * _s, _t, _d)
-        if mask is not None:
-            mask = mask.reshape(_b * _s, _t).bool()
-        # Encode the sequence:
-        for encoder in self.encoder_blocks:
-            x = encoder(x, mask=mask)
-        # Reshape x and mask:
-        x = x.reshape(_b, _s, _t, _d)
-        if mask is not None:
-            mask = mask.reshape(_b, _s, _t)
-            mask = torch.logical_not(mask) if not self.valid_padding else mask
-        # Apply pooling:
-        x, mask = self.pooling(x, mask=mask)
-        # Compute distances:
-        x = self.distance_layer(x)
-        # Pass through CoSeNet:
-        x = self.cosenet(x, mask=mask)
-        # Apply candidate mask if provided:
-        if candidate_mask is not None:
-            candidate_mask = candidate_mask.bool() if not self.valid_padding else torch.logical_not(candidate_mask.bool())
-            candidate_mask = candidate_mask.to(device=x.device)
-            x = x.masked_fill(candidate_mask, 0)
-        return x
 # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
 #                        END OF FILE                        #
 # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #

 # Universidad de Alcalá - Escuela Politécnica Superior      #
 #                                                           #
 # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
 import torch
+from transformers import PreTrainedModel, PretrainedConfig
+from src.model import SegmentationNetwork
+from src.model.config import ModelConfig, TransformerConfig, CoSeNetConfig
class SentenceCoseNetConfig(PretrainedConfig):
    """
    Hyperparameter container for `SentenceCoseNet`.

    Subclasses Hugging Face's `PretrainedConfig`. Every constructor
    argument is stored as an attribute of the same name, and two
    aliases (`hidden_size`, `max_position_embeddings`) are derived
    from `emb_dim` and `seq_len`.

    Attributes:
        model_type (str):
            Class-level identifier, fixed to "sentence_cosenet".
        vocab_size (int):
            Size of the tokenizer vocabulary.
        emb_dim (int):
            Dimensionality of token embeddings.
        seq_len (int):
            Maximum input sequence length.
        dropout (float):
            Dropout probability stored on the config.
        valid_padding (bool):
            Padding-mask convention flag.
        cosenet (dict):
            Settings for the CoSeNet head.
        transformers (list[dict]):
            One settings dictionary per Transformer encoder block.
        hidden_size (int):
            Alias of `emb_dim`.
        max_position_embeddings (int):
            Alias of `seq_len`.
    """

    model_type = "sentence_cosenet"

    def __init__(
        self,
        vocab_size: int = 32768,
        emb_dim: int = 256,
        seq_len: int = 382,
        dropout: float = 0.0,
        valid_padding: bool = True,
        cosenet: dict | None = None,
        transformers: list | None = None,
        **kwargs,
    ):
        """
        Build the configuration.

        Args:
            vocab_size:
                Size of the tokenizer vocabulary.
            emb_dim:
                Dimension of token embeddings.
            seq_len:
                Maximum number of tokens per input sequence.
            dropout:
                Dropout probability stored on the config.
            valid_padding:
                Padding-mask convention flag.
            cosenet:
                Settings dictionary for the CoSeNet head. Any falsy
                value (None or an empty dict) selects the default
                `{"trainable": True, "init_scale": 5.0}`.
            transformers:
                List of per-block settings dictionaries. Any falsy
                value (None or an empty list) selects two identical
                default blocks.
            **kwargs:
                Forwarded unchanged to `PretrainedConfig.__init__`.
        """
        super().__init__(**kwargs)

        # Scalar hyperparameters are stored verbatim.
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.seq_len = seq_len
        self.dropout = dropout
        self.valid_padding = valid_padding

        # Head settings: fall back to the built-in default when falsy.
        if not cosenet:
            cosenet = {"trainable": True, "init_scale": 5.0}
        self.cosenet = cosenet

        # Encoder stack: the default is two independent, identical blocks
        # (a fresh dict is created for each so they never share state).
        if not transformers:
            transformers = [
                {
                    "attention_heads": 16,
                    "feed_forward_multiplier": 8,
                    "dropout": 0.0,
                    "pre_normalize": True,
                }
                for _ in range(2)
            ]
        self.transformers = transformers

        # Aliases under the attribute names commonly used by
        # Hugging Face configs.
        self.hidden_size = emb_dim
        self.max_position_embeddings = seq_len
class SentenceCoseNet(PreTrainedModel):
    """
    Hugging Face `PreTrainedModel` wrapper around `SegmentationNetwork`.

    The wrapped network is stored as `self.model`. Besides `forward`,
    which delegates to that network, the class offers two helpers that
    reuse the network's sub-modules directly:

    - `encode`: token-level contextual embeddings.
    - `get_sentence_embedding`: pooled sentence vectors.

    Intended uses stated by the authors: sentence embeddings, semantic
    search, information retrieval and similarity learning.
    """

    config_class = SentenceCoseNetConfig
    # NOTE(review): the wrapped network lives in `self.model`, not in an
    # attribute called "cosenet" - confirm this prefix is intentional.
    base_model_prefix = "cosenet"

    def __init__(self, config: SentenceCoseNetConfig):
        """
        Build the wrapped network from a Hugging Face config.

        Args:
            config:
                `SentenceCoseNetConfig` instance; it is converted with
                `to_model_config` before being handed to
                `SegmentationNetwork`.
        """
        super().__init__(config)
        internal_config = to_model_config(config)
        self.model = SegmentationNetwork(internal_config)
        # Standard Hugging Face post-construction hook.
        self.post_init()

    def encode(
        self,
        input_ids: torch.Tensor,
        attention_mask=None
    ) -> torch.Tensor:
        """
        Produce token-level contextual embeddings.

        Token ids are cast to int, embedded, given positional encoding
        and then passed through every encoder block in order.

        Args:
            input_ids:
                Tensor of token ids; presumably
                `(batch_size, sequence_length)` - not checked here.
            attention_mask:
                Optional mask handed unchanged to every encoder block
                as `mask`. NOTE(review): the mask polarity expected by
                the blocks is not visible here - confirm.

        Returns:
            torch.Tensor:
                Output of the last encoder block; presumably
                `(batch_size, sequence_length, emb_dim)`.
        """
        network = self.model
        hidden = network.positional_encoding(
            network.embedding(input_ids.int())
        )
        for block in network.encoder_blocks:
            hidden = block(hidden, mask=attention_mask)
        return hidden

    def get_sentence_embedding(
            self,
            input_ids: torch.Tensor,
            attention_mask=None,
    ) -> torch.Tensor:
        """
        Compute one pooled vector per input sequence.

        Runs `encode` and then the wrapped network's own pooling layer.

        Args:
            input_ids (torch.Tensor):
                Token ids, presumably of shape (B, T).
            attention_mask (torch.Tensor, optional):
                Mask forwarded both to `encode` and to the pooling
                layer; presumably of shape (B, T).

        Returns:
            torch.Tensor:
                First element of the pooling layer's output;
                presumably of shape (B, D).
        """
        contextual = self.encode(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # The pooling layer returns a pair; only the first item is used.
        sentence_vectors, _unused = self.model.pooling(
            contextual,
            attention_mask
        )
        return sentence_vectors

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask=None,
        candidate_mask=None,
        **kwargs,
    ):
        """
        Delegate the forward pass to the wrapped `SegmentationNetwork`.

        Args:
            input_ids:
                Token ids, passed to the network as `x`.
                NOTE(review): the shape the network expects is not
                visible here - confirm.
            attention_mask:
                Optional mask, passed to the network as `mask`.
            candidate_mask:
                Optional mask, passed to the network under the same
                name.
            **kwargs:
                Extra keyword arguments forwarded to the network.

        Returns:
            Whatever the wrapped `SegmentationNetwork` returns.
        """
        core_arguments = {
            "x": input_ids,
            "mask": attention_mask,
            "candidate_mask": candidate_mask,
        }
        return self.model(**core_arguments, **kwargs)
def to_model_config(self) -> ModelConfig:
    """
    Translate a Hugging Face style config into the internal `ModelConfig`.

    This is a module-level function even though its only parameter is
    named `self`; it is called with a config object as the argument.

    Args:
        self:
            Object exposing `vocab_size`, `emb_dim`, `valid_padding`,
            `cosenet` (a dict) and `transformers` (an iterable of dicts).

    Returns:
        ModelConfig:
            A new instance with `vocab_size`, `model_dim`,
            `valid_padding`, `cosenet` and `transformers` filled in.
    """
    internal = ModelConfig()

    # Scalar fields; note the rename emb_dim -> model_dim.
    internal.vocab_size = self.vocab_size
    internal.model_dim = self.emb_dim
    internal.valid_padding = self.valid_padding

    # Head settings: the dict is expanded into keyword arguments.
    internal.cosenet = CoSeNetConfig(**self.cosenet)

    # One TransformerConfig per entry, preserving order.
    encoder_settings = []
    for block_kwargs in self.transformers:
        encoder_settings.append(TransformerConfig(**block_kwargs))
    internal.transformers = encoder_settings

    # NOTE(review): `seq_len` and the top-level `dropout` of the source
    # config are not copied here - confirm ModelConfig's own defaults
    # are the intended values.
    return internal
 # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
 #                        END OF FILE                        #
 # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #