| """ |
| HuggingFace Configuration for Distilled Speech Encoder. |
| |
| This is a Data2Vec-style bidirectional speech encoder distilled from AuriStream. |
| """ |
|
|
from typing import Optional

from transformers import PretrainedConfig
|
|
|
|
class DistilledSpeechConfig(PretrainedConfig):
    """
    Configuration class for DistilledSpeechModel.

    This is a bidirectional transformer encoder for speech, trained via
    Data2Vec-style distillation from AuriStream models.

    Architecture:
        - 7-layer convolutional feature encoder (16kHz -> 50Hz)
        - N-layer bidirectional transformer with RoPE
        - Optional projection head (for distillation training)

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
            Must be divisible by `num_attention_heads`.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (feed-forward) layer.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the encoder.
        hidden_dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers.
        attention_dropout (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio applied inside the feed-forward activation.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        conv_dim (`tuple`, *optional*):
            Tuple of integers defining the number of channels in each conv layer.
            Must have the same length as `conv_stride` and `conv_kernel`.
        conv_stride (`tuple`, *optional*):
            Tuple of integers defining the stride of each conv layer.
        conv_kernel (`tuple`, *optional*):
            Tuple of integers defining the kernel size of each conv layer.
        conv_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in conv layers.
        feat_extract_norm (`str`, *optional*, defaults to `"group"`):
            Normalization type for first conv layer ("group" or "layer").
        feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
            Activation function for conv layers.
        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
            Dropout for feature projection layer.
        use_rope (`bool`, *optional*, defaults to `True`):
            Whether to use Rotary Position Embeddings (RoPE).
        rope_theta (`float`, *optional*, defaults to 10000.0):
            Base frequency for RoPE.
        mask_time_prob (`float`, *optional*, defaults to 0.065):
            Probability of masking time steps (for training).
        mask_time_length (`int`, *optional*, defaults to 10):
            Length of masked time spans (for training).
        mask_time_min_masks (`int`, *optional*, defaults to 2):
            Minimum number of masked spans per sequence (for training).
        teacher_model_name (`str`, *optional*):
            Identifier of the AuriStream teacher model used for distillation,
            if any. Stored for bookkeeping; `None` when not distilling.
        teacher_hidden_size (`int`, *optional*):
            Hidden dimensionality of the teacher model. Used to size the
            optional projection head; `None` when not distilling.
        sample_rate (`int`, *optional*, defaults to 16000):
            Expected input audio sampling rate in Hz.
    """

    model_type = "distilled_speech"

    def __init__(
        self,
        # Transformer encoder
        hidden_size: int = 768,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 12,
        intermediate_size: int = 3072,
        hidden_act: str = "gelu",
        hidden_dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.0,
        layer_norm_eps: float = 1e-5,
        # Convolutional feature encoder
        conv_dim: tuple = (512, 512, 512, 512, 512, 512, 512),
        conv_stride: tuple = (5, 2, 2, 2, 2, 2, 2),
        conv_kernel: tuple = (10, 3, 3, 3, 3, 2, 2),
        conv_bias: bool = False,
        feat_extract_norm: str = "group",
        feat_extract_activation: str = "gelu",
        feat_proj_dropout: float = 0.0,
        # Positional embeddings
        use_rope: bool = True,
        rope_theta: float = 10000.0,
        # Time masking (training only)
        mask_time_prob: float = 0.065,
        mask_time_length: int = 10,
        mask_time_min_masks: int = 2,
        # Distillation bookkeeping
        teacher_model_name: Optional[str] = None,
        teacher_hidden_size: Optional[int] = None,
        # Audio input
        sample_rate: int = 16000,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Fail fast on inconsistent configs instead of failing obscurely
        # inside the model's layer construction.
        if not (len(conv_dim) == len(conv_stride) == len(conv_kernel)):
            raise ValueError(
                "`conv_dim`, `conv_stride` and `conv_kernel` must have the same "
                f"length, but got lengths {len(conv_dim)}, {len(conv_stride)} "
                f"and {len(conv_kernel)}."
            )
        if hidden_size % num_attention_heads != 0:
            raise ValueError(
                f"`hidden_size` ({hidden_size}) must be divisible by "
                f"`num_attention_heads` ({num_attention_heads})."
            )

        # Transformer encoder
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.layer_norm_eps = layer_norm_eps

        # Convolutional feature encoder (stored as lists so the config
        # round-trips cleanly through JSON serialization).
        self.conv_dim = list(conv_dim)
        self.conv_stride = list(conv_stride)
        self.conv_kernel = list(conv_kernel)
        self.conv_bias = conv_bias
        self.feat_extract_norm = feat_extract_norm
        self.feat_extract_activation = feat_extract_activation
        self.feat_proj_dropout = feat_proj_dropout

        # Positional embeddings
        self.use_rope = use_rope
        self.rope_theta = rope_theta

        # Time masking (training only)
        self.mask_time_prob = mask_time_prob
        self.mask_time_length = mask_time_length
        self.mask_time_min_masks = mask_time_min_masks

        # Distillation bookkeeping
        self.teacher_model_name = teacher_model_name
        self.teacher_hidden_size = teacher_hidden_size

        # Audio input
        self.sample_rate = sample_rate

    @property
    def output_hz(self) -> int:
        """Output frame rate of the model in Hz (sample rate divided by the
        total downsampling factor of the conv stack; 50 Hz with defaults)."""
        stride_product = 1
        for stride in self.conv_stride:
            stride_product *= stride
        return self.sample_rate // stride_product
|
|