| """ | |
| GSLM Model Configuration | |
| """ | |
| import json | |
| import os | |
| from typing import Optional | |
| from transformers.configuration_utils import PretrainedConfig | |
| from transformers.utils import logging | |
| logger = logging.get_logger(__name__) | |
| class GSLMConfig(PretrainedConfig): | |
| """ | |
| Configuration class for GSLM (Generative Spoken Language Model). | |
| This configuration class stores all parameters needed to initialize a GSLMModel. | |
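    Example (a minimal sketch; the import path is hypothetical, since only the
    configuration class is defined in this file):

    ```python
    >>> from gslm.configuration_gslm import GSLMConfig  # hypothetical module path

    >>> # Defaults follow the `transformer_lm_big` architecture.
    >>> configuration = GSLMConfig()
    >>> configuration.d_model
    1024

    >>> # Any field can be overridden at construction time.
    >>> small = GSLMConfig(num_layers=6, d_model=512, dim_feedforward=2048)
    ```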
| """ | |
| model_type = "gslm" | |
| def __init__( | |
        self,
        vocab_size: int = 204,
        d_model: int = 1024,
        nhead: int = 16,
        num_layers: int = 12,
        dim_feedforward: int = 4096,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        max_seq_length: int = 3072,
        pad_idx: int = 204,
        share_input_output_embed: bool = True,
        activation: str = "relu",
        architecture: str = "transformer_lm_big",
        **kwargs,
    ):
| """ | |
| Initialize GSLM configuration. | |
| Args: | |
| vocab_size: Size of the vocabulary | |
| d_model: Dimensionality of the embeddings and hidden states | |
| nhead: Number of attention heads | |
| num_layers: Number of transformer layers | |
| dim_feedforward: Dimensionality of the feedforward network | |
| dropout: Dropout probability | |
| attention_dropout: Dropout probability for attention weights | |
| max_seq_length: Maximum sequence length | |
| pad_idx: Padding token index | |
| share_input_output_embed: Whether to share input and output embeddings | |
| activation: Activation function ("relu" or "gelu") | |
| architecture: Model architecture name | |
| """ | |
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.nhead = nhead
        self.num_layers = num_layers
        self.dim_feedforward = dim_feedforward
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.max_seq_length = max_seq_length
        self.pad_idx = pad_idx
        self.share_input_output_embed = share_input_output_embed
        self.activation = activation
        self.architecture = architecture
        super().__init__(**kwargs)
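

if __name__ == "__main__":
    # Minimal serialization sketch. It uses only the standard `PretrainedConfig`
    # API inherited above: `save_pretrained` writes `config.json` to a directory
    # and `from_pretrained` rebuilds the config from it. The temporary directory
    # is illustrative.
    import tempfile

    config = GSLMConfig(max_seq_length=2048)
    with tempfile.TemporaryDirectory() as tmp_dir:
        config.save_pretrained(tmp_dir)
        reloaded = GSLMConfig.from_pretrained(tmp_dir)

    # Both overridden and default fields survive the round trip.
    assert reloaded.max_seq_length == 2048
    assert reloaded.d_model == 1024
    print(reloaded.to_json_string())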