# AuriStreamDistillLarge_100M40PredTeacher_bad / configuration_distilled_speech.py
# Uploaded by klemenk ("Upload distilled speech model", commit 6f980ab, 5.97 kB).
"""
HuggingFace Configuration for Distilled Speech Encoder.
This is a Data2Vec-style bidirectional speech encoder distilled from AuriStream.
"""
import math
from typing import Optional

from transformers import PretrainedConfig
class DistilledSpeechConfig(PretrainedConfig):
    """
    Configuration class for DistilledSpeechModel.

    This is a bidirectional transformer encoder for speech, trained via
    Data2Vec-style distillation from AuriStream models.

    Architecture:
        - 7-layer convolutional feature encoder (16kHz -> 50Hz)
        - N-layer bidirectional transformer with RoPE
        - Optional projection head (for distillation training)

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (feed-forward) layer.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the encoder.
        hidden_dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers.
        attention_dropout (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability applied inside the feed-forward activation.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        conv_dim (`tuple`, *optional*):
            Tuple of integers defining the number of channels in each conv layer.
        conv_stride (`tuple`, *optional*):
            Tuple of integers defining the stride of each conv layer.
        conv_kernel (`tuple`, *optional*):
            Tuple of integers defining the kernel size of each conv layer.
        conv_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in conv layers.
        feat_extract_norm (`str`, *optional*, defaults to `"group"`):
            Normalization type for first conv layer ("group" or "layer").
        feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
            Activation function for conv layers.
        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
            Dropout for feature projection layer.
        use_rope (`bool`, *optional*, defaults to `True`):
            Whether to use Rotary Position Embeddings (RoPE).
        rope_theta (`float`, *optional*, defaults to 10000.0):
            Base frequency for RoPE.
        mask_time_prob (`float`, *optional*, defaults to 0.065):
            Probability of masking time steps (for training).
        mask_time_length (`int`, *optional*, defaults to 10):
            Length of masked time spans (for training).
        mask_time_min_masks (`int`, *optional*, defaults to 2):
            Minimum number of masked spans per sequence (for training).
        teacher_model_name (`str`, *optional*):
            Name of the AuriStream teacher model (metadata only; unused at inference).
        teacher_hidden_size (`int`, *optional*):
            Hidden size of the teacher model (metadata only; unused at inference).
        sample_rate (`int`, *optional*, defaults to 16000):
            Expected input audio sample rate in Hz.

    Raises:
        ValueError: If `conv_dim`, `conv_stride` and `conv_kernel` do not all
            describe the same number of convolutional layers.
    """

    model_type = "distilled_speech"

    def __init__(
        self,
        # Transformer architecture
        hidden_size: int = 768,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 12,
        intermediate_size: int = 3072,
        hidden_act: str = "gelu",
        hidden_dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.0,
        layer_norm_eps: float = 1e-5,
        # Convolutional feature encoder
        conv_dim: tuple = (512, 512, 512, 512, 512, 512, 512),
        conv_stride: tuple = (5, 2, 2, 2, 2, 2, 2),
        conv_kernel: tuple = (10, 3, 3, 3, 3, 2, 2),
        conv_bias: bool = False,
        feat_extract_norm: str = "group",
        feat_extract_activation: str = "gelu",
        feat_proj_dropout: float = 0.0,
        # Positional encoding
        use_rope: bool = True,
        rope_theta: float = 10000.0,
        # Masking (for training, disabled by default for inference)
        mask_time_prob: float = 0.065,
        mask_time_length: int = 10,
        mask_time_min_masks: int = 2,
        # Teacher info (for reference, not used in inference)
        teacher_model_name: Optional[str] = None,
        teacher_hidden_size: Optional[int] = None,
        # Audio
        sample_rate: int = 16000,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # The three conv specs jointly define the feature encoder; a length
        # mismatch would silently produce a broken model, so fail fast here
        # (same check as transformers' Wav2Vec2Config).
        if not (len(conv_dim) == len(conv_stride) == len(conv_kernel)):
            raise ValueError(
                "`conv_dim`, `conv_stride` and `conv_kernel` must have the same length, "
                f"but got len(conv_dim)={len(conv_dim)}, len(conv_stride)={len(conv_stride)}, "
                f"len(conv_kernel)={len(conv_kernel)}."
            )

        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.layer_norm_eps = layer_norm_eps
        # Conv encoder (stored as lists so the config round-trips through JSON)
        self.conv_dim = list(conv_dim)
        self.conv_stride = list(conv_stride)
        self.conv_kernel = list(conv_kernel)
        self.conv_bias = conv_bias
        self.feat_extract_norm = feat_extract_norm
        self.feat_extract_activation = feat_extract_activation
        self.feat_proj_dropout = feat_proj_dropout
        # Position encoding
        self.use_rope = use_rope
        self.rope_theta = rope_theta
        # Masking
        self.mask_time_prob = mask_time_prob
        self.mask_time_length = mask_time_length
        self.mask_time_min_masks = mask_time_min_masks
        # Teacher info
        self.teacher_model_name = teacher_model_name
        self.teacher_hidden_size = teacher_hidden_size
        # Audio
        self.sample_rate = sample_rate

    @property
    def output_hz(self) -> int:
        """Output frame rate of the encoder in Hz.

        The conv strides compound multiplicatively along the time axis, so the
        output rate is ``sample_rate`` divided by their product (50 Hz for the
        default 16 kHz / stride-320 configuration).
        """
        return self.sample_rate // math.prod(self.conv_stride)