# AuriStreamDistillLarge_100M40PredTeacher_bad / configuration_distilled_speech.py
# Uploaded by klemenk ("Upload distilled speech model", commit 6f980ab, 5.97 kB).
"""
HuggingFace Configuration for Distilled Speech Encoder.
This is a Data2Vec-style bidirectional speech encoder distilled from AuriStream.
"""
import math
from typing import Optional

from transformers import PretrainedConfig
class DistilledSpeechConfig(PretrainedConfig):
    """
    Configuration class for DistilledSpeechModel.

    This is a bidirectional transformer encoder for speech, trained via
    Data2Vec-style distillation from AuriStream models.

    Architecture:
        - 7-layer convolutional feature encoder (16kHz -> 50Hz)
        - N-layer bidirectional transformer with RoPE
        - Optional projection head (for distillation training)

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (feed-forward) layer.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the encoder.
        hidden_dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers.
        attention_dropout (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability applied inside the feed-forward activation.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        conv_dim (`tuple`, *optional*):
            Tuple of integers defining the number of channels in each conv layer.
        conv_stride (`tuple`, *optional*):
            Tuple of integers defining the stride of each conv layer.
        conv_kernel (`tuple`, *optional*):
            Tuple of integers defining the kernel size of each conv layer.
        conv_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in conv layers.
        feat_extract_norm (`str`, *optional*, defaults to `"group"`):
            Normalization type for first conv layer ("group" or "layer").
        feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
            Activation function for conv layers.
        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
            Dropout for feature projection layer.
        use_rope (`bool`, *optional*, defaults to `True`):
            Whether to use Rotary Position Embeddings (RoPE).
        rope_theta (`float`, *optional*, defaults to 10000.0):
            Base frequency for RoPE.
        mask_time_prob (`float`, *optional*, defaults to 0.065):
            Probability of masking time steps (for training).
        mask_time_length (`int`, *optional*, defaults to 10):
            Length of masked time spans (for training).
        mask_time_min_masks (`int`, *optional*, defaults to 2):
            Minimum number of masked spans per sequence (for training).
        teacher_model_name (`str`, *optional*):
            Name of the AuriStream teacher model (metadata only; unused at inference).
        teacher_hidden_size (`int`, *optional*):
            Hidden size of the teacher model (metadata only; unused at inference).
        sample_rate (`int`, *optional*, defaults to 16000):
            Expected input audio sample rate in Hz.

    Raises:
        ValueError: If `conv_dim`, `conv_stride` and `conv_kernel` do not all
            describe the same number of convolutional layers.
    """

    model_type = "distilled_speech"

    def __init__(
        self,
        # Transformer architecture
        hidden_size: int = 768,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 12,
        intermediate_size: int = 3072,
        hidden_act: str = "gelu",
        hidden_dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.0,
        layer_norm_eps: float = 1e-5,
        # Convolutional feature encoder
        conv_dim: tuple = (512, 512, 512, 512, 512, 512, 512),
        conv_stride: tuple = (5, 2, 2, 2, 2, 2, 2),
        conv_kernel: tuple = (10, 3, 3, 3, 3, 2, 2),
        conv_bias: bool = False,
        feat_extract_norm: str = "group",
        feat_extract_activation: str = "gelu",
        feat_proj_dropout: float = 0.0,
        # Positional encoding
        use_rope: bool = True,
        rope_theta: float = 10000.0,
        # Masking (for training, disabled by default for inference)
        mask_time_prob: float = 0.065,
        mask_time_length: int = 10,
        mask_time_min_masks: int = 2,
        # Teacher info (for reference, not used in inference)
        teacher_model_name: Optional[str] = None,
        teacher_hidden_size: Optional[int] = None,
        # Audio
        sample_rate: int = 16000,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # The three conv specs jointly define the feature encoder; a length
        # mismatch would silently produce a broken model, so fail fast here
        # (same check as transformers' Wav2Vec2Config).
        if not (len(conv_dim) == len(conv_stride) == len(conv_kernel)):
            raise ValueError(
                "`conv_dim`, `conv_stride` and `conv_kernel` must have the same length, "
                f"but got len(conv_dim)={len(conv_dim)}, len(conv_stride)={len(conv_stride)}, "
                f"len(conv_kernel)={len(conv_kernel)}."
            )

        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.layer_norm_eps = layer_norm_eps
        # Conv encoder (stored as lists so the config round-trips through JSON)
        self.conv_dim = list(conv_dim)
        self.conv_stride = list(conv_stride)
        self.conv_kernel = list(conv_kernel)
        self.conv_bias = conv_bias
        self.feat_extract_norm = feat_extract_norm
        self.feat_extract_activation = feat_extract_activation
        self.feat_proj_dropout = feat_proj_dropout
        # Position encoding
        self.use_rope = use_rope
        self.rope_theta = rope_theta
        # Masking
        self.mask_time_prob = mask_time_prob
        self.mask_time_length = mask_time_length
        self.mask_time_min_masks = mask_time_min_masks
        # Teacher info
        self.teacher_model_name = teacher_model_name
        self.teacher_hidden_size = teacher_hidden_size
        # Audio
        self.sample_rate = sample_rate

    @property
    def output_hz(self) -> int:
        """Output frame rate of the encoder in Hz.

        The conv strides compound multiplicatively along the time axis, so the
        output rate is ``sample_rate`` divided by their product (50 Hz for the
        default 16 kHz / stride-320 configuration).
        """
        return self.sample_rate // math.prod(self.conv_stride)