File size: 10,913 Bytes
dbd79bd
 
 
 
 
 
 
da14095
 
 
dbd79bd
 
da14095
dbd79bd
da14095
dbd79bd
da14095
 
 
 
dbd79bd
da14095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dbd79bd
 
da14095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dbd79bd
 
da14095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dbd79bd
 
 
da14095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dbd79bd
da14095
dbd79bd
 
da14095
 
 
dbd79bd
da14095
 
 
8068e2f
869db96
da14095
33c844e
dbd79bd
4aa1cf7
 
 
da14095
 
 
 
 
dbd79bd
da14095
dbd79bd
da14095
 
 
dbd79bd
 
da14095
 
 
 
 
 
 
dbd79bd
da14095
 
 
 
 
34f99b8
 
27cdddc
34f99b8
 
 
 
 
 
 
 
 
 
 
dbd79bd
da14095
 
 
 
27cdddc
da14095
dbd79bd
da14095
 
dbd79bd
da14095
 
 
 
 
9a7365b
 
da14095
 
 
 
 
34f99b8
 
 
27cdddc
 
34f99b8
 
 
27cdddc
 
d0a3e2d
 
 
 
dd4f3a8
 
 
 
 
 
d0a3e2d
 
 
 
00f1b20
d0a3e2d
27cdddc
0d019f1
27cdddc
 
 
00f1b20
27cdddc
da14095
 
 
 
 
 
 
 
 
dbd79bd
da14095
 
dbd79bd
da14095
 
 
 
 
 
 
 
 
 
dbd79bd
da14095
 
 
34f99b8
da14095
 
 
 
 
 
dbd79bd
34f99b8
 
 
 
 
 
 
 
 
 
 
 
6faa82b
 
 
34f99b8
 
 
 
 
 
 
 
8068e2f
 
b8b34d9
 
 
 
 
 
8068e2f
 
 
dbd79bd
b8b34d9
8068e2f
dbd79bd
b8b34d9
 
 
8068e2f
b8b34d9
dbd79bd
b8b34d9
dbd79bd
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
#                                                           #
#   This file was created by: Alberto Palomo Alonso         #
# Universidad de Alcalá - Escuela Politécnica Superior      #
#                                                           #
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
import torch
from transformers import PreTrainedModel, PretrainedConfig
from src.model import SegmentationNetwork
from src.model.config import ModelConfig, TransformerConfig, CoSeNetConfig


class SentenceCoseNetConfig(PretrainedConfig):
    """
    Configuration class for SentenceCoseNet.

    This class stores all hyperparameters needed to initialize
    a `SentenceCoseNet` model. It follows Hugging Face's
    `PretrainedConfig` interface so the model can be saved,
    loaded, and shared via the Hub.

    Attributes:
        model_type (str):
            Identifier used by Hugging Face to register the model.
        vocab_size (int):
            Size of the tokenizer vocabulary.
        emb_dim (int):
            Dimensionality of token embeddings.
        seq_len (int):
            Maximum input sequence length supported by the model.
        dropout (float):
            Dropout probability applied in Transformer blocks.
        valid_padding (bool):
            Whether padding tokens are treated as valid positions.
        cosenet (dict):
            Configuration of the cosine-similarity network head.
        transformers (list[dict]):
            List of Transformer encoder block configurations.
    """

    model_type = "sentence_cosenet"

    def __init__(
        self,
        vocab_size: int = 32768,
        emb_dim: int = 256,
        seq_len: int = 382,
        dropout: float = 0.0,
        valid_padding: bool = True,
        cosenet: dict | None = None,
        transformers: list | None = None,
        **kwargs,
    ):
        """
        Initialize SentenceCoseNet configuration.

        Args:
            vocab_size:
                Size of the tokenizer vocabulary.
            emb_dim:
                Dimension of token embeddings.
            seq_len:
                Maximum number of tokens per input sequence.
            dropout:
                Dropout probability used throughout the network.
            valid_padding:
                Whether padded tokens should be considered valid.
            cosenet:
                Optional configuration dictionary for the cosine
                similarity network head. Defaults applied only when
                this argument is ``None``.
            transformers:
                Optional list of dictionaries describing each
                Transformer encoder block. Defaults applied only when
                this argument is ``None``.
            **kwargs:
                Additional keyword arguments passed to
                `PretrainedConfig`.
        """
        super().__init__(**kwargs)

        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.seq_len = seq_len
        self.dropout = dropout
        self.valid_padding = valid_padding

        # Use explicit `is None` checks (not truthiness): an explicitly
        # passed empty dict/list must be preserved, not silently replaced
        # by the built-in defaults (as `cosenet or {...}` would do).
        if cosenet is None:
            cosenet = {
                "trainable": True,
                "init_scale": 5.0
            }
        self.cosenet = cosenet

        if transformers is None:
            transformers = [
                {
                    "attention_heads": 16,
                    "feed_forward_multiplier": 8,
                    "dropout": 0.0,
                    "pre_normalize": True
                },
                {
                    "attention_heads": 16,
                    "feed_forward_multiplier": 8,
                    "dropout": 0.0,
                    "pre_normalize": True
                }
            ]
        self.transformers = transformers

        # Standard HF aliases so generic tooling can read the model size.
        self.hidden_size = emb_dim
        self.max_position_embeddings = seq_len


class SentenceCoseNet(PreTrainedModel):
    """
    Sentence-level encoder model based on CoseNet.

    This class wraps a custom PyTorch segmentation network
    and exposes it as a Hugging Face `PreTrainedModel`,
    enabling interoperability with the Transformers ecosystem.

    The model is intended for:
    - Sentence embeddings
    - Semantic search
    - Information retrieval
    - Similarity learning
    """

    config_class = SentenceCoseNetConfig
    base_model_prefix = "cosenet"

    def __init__(self, config: SentenceCoseNetConfig):
        """
        Initialize the SentenceCoseNet model.

        Args:
            config:
                Instance of `SentenceCoseNetConfig` containing
                model hyperparameters.
        """
        super().__init__(config)

        # Core PyTorch model wrapped by this HF-compatible shell.
        self.model = SegmentationNetwork(self.to_model_config(config))

        # Initialize weights following HF conventions.
        self.post_init()

        # Set evaluation mode by default (inference-oriented wrapper).
        self.model.eval()

    def encode(
        self,
        input_ids: torch.Tensor,
        attention_mask=None
    ) -> torch.Tensor:
        """
        Encode input token sequences into contextualized embeddings.

        This method performs embedding lookup, positional encoding,
        and Transformer-based contextualization, returning token-level
        representations.

        Args:
            input_ids:
                Tensor of token IDs with shape
                `(batch_size, sequence_length)` or
                `(batch_size, sentences, sequence_length)`.
            attention_mask:
                Optional attention mask indicating valid (1) and
                padded (0) positions, with the same leading shape
                as `input_ids`.

        Returns:
            torch.Tensor:
                Contextualized token embeddings with shape
                `(batch_size, sequence_length, emb_dim)`.
        """
        # Select the task, then delegate the shape-dependent dispatch to
        # `call` — previously this logic was duplicated here verbatim.
        self.model.task = 'token_encoding'
        return self.call(input_ids, attention_mask)

    def get_sentence_embedding(
            self,
            input_ids: torch.Tensor,
            attention_mask=None,
            normalize: bool = False,
    ) -> torch.Tensor:
        """
        Compute sentence embeddings for zero-shot transfer and
        information retrieval.

        Args:
            input_ids (torch.Tensor):
                Tensor of shape (B, T)
            attention_mask (torch.Tensor, optional):
                Boolean or binary mask of shape (B, T)
            normalize (bool, optional):
                Whether to L2-normalize the output embeddings.

        Returns:
            torch.Tensor:
                Sentence embeddings of shape (B, D)
        """
        # Select the task, then let `call` handle 2-D vs 3-D inputs.
        self.model.task = 'sentence_encoding'
        output = self.call(input_ids, attention_mask)

        if normalize:
            output = torch.nn.functional.normalize(output, p=2, dim=-1)

        return output

    def similarity(self, embeddings_1: torch.Tensor, embeddings_2: torch.Tensor) -> torch.Tensor:
        """
        Compute cosine similarity scores between two sets of embeddings.

        Args:
            embeddings_1 (torch.Tensor):
                Tensor of shape (B, S, D) containing the first set of
                embeddings concatenated along the first dimension.

            embeddings_2 (torch.Tensor):
                Tensor of shape (B, S, D) containing the second set of
                embeddings concatenated along the first dimension.

        Returns:
            torch.Tensor:
                Similarity scores of shape (B, S)
        """
        # Stack embeddings pairwise: (B, S, 2, D).
        embeddings = torch.stack([embeddings_1, embeddings_2], dim=-2)
        # Pairwise distance matrix from the model head: (B, S, 2, 2).
        embeddings = self.model.distance_layer(embeddings)
        # Average the two off-diagonal entries -> symmetric score (B, S).
        return (embeddings[..., 0, 1] + embeddings[..., 1, 0]) / 2

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask=None,
        candidate_mask=None,
        **kwargs,
    ):
        """
        Forward pass of the SentenceCoseNet model.

        This method delegates execution to the underlying
        `SegmentationNetwork` in segmentation mode.

        Args:
            input_ids:
                Tensor of token IDs with shape
                `(batch_size, sequence_length)`.
            attention_mask:
                Optional attention mask tensor.
            candidate_mask:
                Optional mask indicating candidate segments or spans.
            **kwargs:
                Additional arguments forwarded to the core model.

        Returns:
            Model-specific output as produced by `SegmentationNetwork`.
        """
        self.model.task = 'segmentation'
        return self.model(
            x=input_ids,
            mask=attention_mask,
            candidate_mask=candidate_mask,
            **kwargs,
        )

    def call(self, input_ids: torch.Tensor, attention_mask=None) -> torch.Tensor:
        """
        Internal method to handle different input shapes (task already selected).

        Args:
            input_ids:
                Tensor of token IDs with shape
                `(batch_size, sequence_length)` or
                `(batch_size, sentences, sequence_length)`.
            attention_mask:
                Optional attention mask tensor with the same leading
                shape as `input_ids`.

        Returns:
            torch.Tensor:
                Model output; a 2-D input is lifted to a single-sentence
                3-D batch and the sentence axis squeezed back out.

        Raises:
            ValueError: If `input_ids` is neither 2-D nor 3-D.
        """
        if input_ids.dim() == 2:
            # (B, T) -> (B, 1, T): the core model expects a sentence axis.
            x = input_ids.int().unsqueeze(1)
            mask = attention_mask.unsqueeze(1) if attention_mask is not None else None
            output = self.model(x=x, mask=mask).squeeze(1)
        elif input_ids.dim() == 3:
            x = input_ids.int()
            mask = attention_mask if attention_mask is not None else None
            output = self.model(x=x, mask=mask)
        else:
            raise ValueError("Input tensor must be of shape (Batch, Tokens) or (Batch, Sentences, Tokens).")
        return output

    @staticmethod
    def to_model_config(config: SentenceCoseNetConfig) -> ModelConfig:
        """
        Convert Hugging Face config to internal ModelConfig.

        Args:
            config: HF-style configuration to translate.

        Returns:
            ModelConfig: Internal configuration for `SegmentationNetwork`.
        """
        mc = ModelConfig()

        # Core dimensions.
        mc.vocab_size = config.vocab_size
        mc.model_dim = config.emb_dim
        mc.valid_padding = config.valid_padding
        # NOTE(review): config.seq_len and config.dropout are never
        # forwarded here — confirm whether ModelConfig's defaults are
        # intended, or whether these fields should be copied as well.

        # CoSeNet head configuration.
        mc.cosenet = CoSeNetConfig(**config.cosenet)

        # Transformer encoder stack.
        mc.transformers = [
            TransformerConfig(**cfg)
            for cfg in config.transformers
        ]

        return mc
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
#                        END OF FILE                        #
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #