# NOTE(review): the three lines below were pasted-in GitHub commit metadata
# (author "alverciito", message "fix docstrings", hash 00f1b20) and were not
# valid Python; preserved here as a comment so the file parses.
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
# #
# This file was created by: Alberto Palomo Alonso #
# Universidad de Alcalá - Escuela Politécnica Superior #
# #
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
import torch
from transformers import PreTrainedModel, PretrainedConfig
from src.model import SegmentationNetwork
from src.model.config import ModelConfig, TransformerConfig, CoSeNetConfig
class SentenceCoseNetConfig(PretrainedConfig):
    """
    Hugging Face configuration for `SentenceCoseNet`.

    Holds every hyperparameter required to build a `SentenceCoseNet`
    model. Because it subclasses `PretrainedConfig`, configurations can
    be serialized, reloaded, and distributed through the Hub.

    Attributes:
        model_type (str):
            Registry identifier used by the Transformers library.
        vocab_size (int):
            Number of entries in the tokenizer vocabulary.
        emb_dim (int):
            Width of the token embedding vectors.
        seq_len (int):
            Longest token sequence the model accepts.
        dropout (float):
            Dropout rate used inside the Transformer blocks.
        valid_padding (bool):
            If True, padding tokens count as valid positions.
        cosenet (dict):
            Settings for the cosine-similarity network head.
        transformers (list[dict]):
            Per-block settings for the Transformer encoder stack.
    """
    model_type = "sentence_cosenet"

    def __init__(
            self,
            vocab_size: int = 32768,
            emb_dim: int = 256,
            seq_len: int = 382,
            dropout: float = 0.0,
            valid_padding: bool = True,
            cosenet: dict | None = None,
            transformers: list | None = None,
            **kwargs,
    ):
        """
        Build a SentenceCoseNet configuration.

        Args:
            vocab_size:
                Number of entries in the tokenizer vocabulary.
            emb_dim:
                Width of the token embedding vectors.
            seq_len:
                Longest token sequence the model accepts.
            dropout:
                Dropout rate used throughout the network.
            valid_padding:
                If True, padded tokens count as valid positions.
            cosenet:
                Optional settings dict for the cosine-similarity head;
                falls back to library defaults when falsy.
            transformers:
                Optional list of per-block settings for the encoder
                stack; falls back to a two-block default when falsy.
            **kwargs:
                Forwarded verbatim to `PretrainedConfig`.
        """
        super().__init__(**kwargs)
        # Default head settings, applied when `cosenet` is falsy
        # (None or empty dict), matching truthiness-based fallback.
        default_head = {"trainable": True, "init_scale": 5.0}
        # Default encoder stack: two identical pre-norm blocks.
        default_block = {
            "attention_heads": 16,
            "feed_forward_multiplier": 8,
            "dropout": 0.0,
            "pre_normalize": True,
        }
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.seq_len = seq_len
        self.dropout = dropout
        self.valid_padding = valid_padding
        self.cosenet = cosenet if cosenet else default_head
        self.transformers = (
            transformers if transformers
            else [dict(default_block), dict(default_block)]
        )
        # Aliases expected by generic Hugging Face tooling.
        self.hidden_size = emb_dim
        self.max_position_embeddings = seq_len
class SentenceCoseNet(PreTrainedModel):
    """
    Sentence-level encoder model based on CoseNet.

    This class wraps a custom PyTorch segmentation network
    and exposes it as a Hugging Face `PreTrainedModel`,
    enabling interoperability with the Transformers ecosystem.

    The model is intended for:
    - Sentence embeddings
    - Semantic search
    - Information retrieval
    - Similarity learning
    """
    config_class = SentenceCoseNetConfig
    base_model_prefix = "cosenet"

    def __init__(self, config: SentenceCoseNetConfig):
        """
        Initialize the SentenceCoseNet model.

        Args:
            config:
                Instance of `SentenceCoseNetConfig` containing
                model hyperparameters.
        """
        super().__init__(config)
        # Core PyTorch model
        self.model = SegmentationNetwork(self.to_model_config(config))
        # Initialize weights following HF conventions
        self.post_init()
        # Set evaluation mode by default
        self.model.eval()

    def encode(
            self,
            input_ids: torch.Tensor,
            attention_mask=None
    ) -> torch.Tensor:
        """
        Encode input token sequences into contextualized embeddings.

        This method performs embedding lookup, positional encoding,
        and Transformer-based contextualization, returning token-level
        representations.

        Args:
            input_ids:
                Tensor of token IDs with shape
                `(batch_size, sequence_length)` or
                `(batch_size, sentences, sequence_length)`.
            attention_mask:
                Optional attention mask indicating valid (1) and
                padded (0) positions, with the same leading shape
                as `input_ids`.
        Returns:
            torch.Tensor:
                Contextualized token embeddings with shape
                `(..., sequence_length, emb_dim)`.
        """
        # Set the model task:
        self.model.task = 'token_encoding'
        # Delegate the shape handling to the shared dispatcher
        # (previously this logic was duplicated inline).
        return self.call(input_ids, attention_mask)

    def get_sentence_embedding(
            self,
            input_ids: torch.Tensor,
            attention_mask=None,
            normalize: bool = False,
    ) -> torch.Tensor:
        """
        Compute sentence embeddings for zero-shot transfer and
        information retrieval.

        Args:
            input_ids (torch.Tensor):
                Tensor of shape (B, T)
            attention_mask (torch.Tensor, optional):
                Boolean or binary mask of shape (B, T)
            normalize (bool, optional):
                Whether to L2-normalize the output embeddings.
        Returns:
            torch.Tensor:
                Sentence embeddings of shape (B, D)
        """
        # Set the model task:
        self.model.task = 'sentence_encoding'
        output = self.call(input_ids, attention_mask)
        if normalize:
            output = torch.nn.functional.normalize(output, p=2, dim=-1)
        return output

    def similarity(self, embeddings_1: torch.Tensor, embeddings_2: torch.Tensor) -> torch.Tensor:
        """
        Compute cosine similarity scores between two sets of embeddings.

        Args:
            embeddings_1 (torch.Tensor):
                Tensor of shape (B, S, D) containing the first set of
                embeddings concatenated along the first dimension.
            embeddings_2 (torch.Tensor):
                Tensor of shape (B, S, D) containing the second set of
                embeddings concatenated along the first dimension.
        Returns:
            torch.Tensor:
                Similarity scores of shape (B, S)
        """
        # Stack the two sets pairwise -> (B, S, 2, D)
        embeddings = torch.stack([embeddings_1, embeddings_2], dim=-2)
        # Pairwise distance matrix -> (B, S, 2, 2)
        embeddings = self.model.distance_layer(embeddings)
        # Average the two off-diagonal (cross) entries -> (B, S)
        return (embeddings[..., 0, 1] + embeddings[..., 1, 0]) / 2

    def forward(
            self,
            input_ids: torch.Tensor,
            attention_mask=None,
            candidate_mask=None,
            **kwargs,
    ):
        """
        Forward pass of the SentenceCoseNet model.

        This method delegates execution to the underlying
        `SegmentationNetwork`.

        Args:
            input_ids:
                Tensor of token IDs with shape
                `(batch_size, sequence_length)`.
            attention_mask:
                Optional attention mask tensor.
            candidate_mask:
                Optional mask indicating candidate segments or spans.
            **kwargs:
                Additional arguments forwarded to the core model.
        Returns:
            Model-specific output as produced by `SegmentationNetwork`.
        """
        self.model.task = 'segmentation'
        return self.model(
            x=input_ids,
            mask=attention_mask,
            candidate_mask=candidate_mask,
            **kwargs,
        )

    def call(self, input_ids: torch.Tensor, attention_mask=None) -> torch.Tensor:
        """
        Internal method to handle different input shapes (task already selected).

        A 2-D input `(B, T)` is lifted to `(B, 1, T)` before the core
        model runs and the singleton dimension is squeezed out of the
        result; a 3-D input `(B, S, T)` is passed through unchanged.

        Args:
            input_ids:
                Tensor of token IDs with shape `(B, T)` or `(B, S, T)`.
            attention_mask:
                Optional attention mask tensor with the same leading
                shape as `input_ids`.
        Returns:
            torch.Tensor:
                Output of the core model for the selected task.
        Raises:
            ValueError:
                If `input_ids` is neither 2-D nor 3-D.
        """
        # Convert to type:
        if len(input_ids.shape) == 2:
            x = input_ids.int().unsqueeze(1)
            mask = attention_mask.unsqueeze(1) if attention_mask is not None else None
            output = self.model(x=x, mask=mask).squeeze(1)
        elif len(input_ids.shape) == 3:
            x = input_ids.int()
            mask = attention_mask if attention_mask is not None else None
            output = self.model(x=x, mask=mask)
        else:
            raise ValueError("Input tensor must be of shape (Batch, Tokens) or (Batch, Sentences, Tokens).")
        return output

    @staticmethod
    def to_model_config(config: SentenceCoseNetConfig) -> ModelConfig:
        """
        Convert Hugging Face config to internal ModelConfig.

        Args:
            config:
                The `SentenceCoseNetConfig` to translate.
        Returns:
            ModelConfig:
                Internal configuration consumed by `SegmentationNetwork`.
        """
        mc = ModelConfig()
        # Core dimensions
        mc.vocab_size = config.vocab_size
        mc.model_dim = config.emb_dim
        mc.valid_padding = config.valid_padding
        # CoSeNet config
        mc.cosenet = CoSeNetConfig(**config.cosenet)
        # Transformer stack
        mc.transformers = [
            TransformerConfig(**cfg)
            for cfg in config.transformers
        ]
        return mc
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
# END OF FILE #
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #