from transformers import HubertConfig, PretrainedConfig


class AVHubertConfig(PretrainedConfig):
    """Configuration for an AV-HuBERT-style audio-visual speech model.

    Groups the hyperparameters of the transformer encoder, the transformer
    decoder, the convolutional feature extractor, the audio/visual front-end,
    and CTC fine-tuning, and derives per-module ``HubertConfig`` objects via
    the ``encoder_config`` and ``decoder_config`` properties.
    """

    model_type: str = "avhubert"

    def __init__(
        self,
        label_rate: int = 100,
        # Transformer encoder.
        encoder_layers: int = 12,
        encoder_embed_dim: int = 768,
        encoder_ffn_embed_dim: int = 3072,
        encoder_attention_heads: int = 12,
        activation_fn: str = "gelu",
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.0,
        encoder_layerdrop: float = 0.0,
        dropout_input: float = 0.0,
        # Convolutional feature extractor and positional convolution.
        conv_dim: tuple[int, ...] = (512, 512, 512, 512, 512, 512, 512),
        conv_stride: tuple[int, ...] = (5, 2, 2, 2, 2, 2, 2),
        conv_kernel: tuple[int, ...] = (10, 3, 3, 3, 3, 2, 2),
        conv_bias: bool = False,
        conv_pos: int = 128,
        conv_pos_groups: int = 16,
        # Audio/visual front-end and modality fusion.
        resnet_relu_type: str = "prelu",
        audio_feat_dim: int = 104,
        modality_fuse: str = "concat",
        # Transformer decoder.
        decoder_embed_dim: int = 768,
        decoder_ffn_embed_dim: int = 3072,
        decoder_layers: int = 6,
        decoder_layerdrop: float = 0.0,
        decoder_attention_heads: int = 4,
        decoder_learned_pos: bool = False,
        decoder_normalize_before: bool = False,
        no_token_positional_embeddings: bool = False,
        decoder_dropout: float = 0.1,
        decoder_attention_dropout: float = 0.1,
        decoder_activation_dropout: float = 0.0,
        max_target_positions: int = 2048,
        share_decoder_input_output_embed: bool = False,
        no_scale_embedding: bool = True,
        sample_rate: int = 25,
        # Fine-tuning and CTC options.
        num_labels: int = 100,
        initializer_range: float = 0.02,
        do_stable_layer_norm: bool = False,
        vocab_size: int | None = None,
        freeze_feature_encoder: bool = False,
        freeze_base_model: bool = False,
        ctc_loss_reduction: str = "mean",
        ctc_zero_infinity: bool = False,
        ctc_loss_weight: float = 0.3,
        special_ids: list[int] | None = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.label_rate = label_rate
        self.encoder_layers = encoder_layers
        self.encoder_embed_dim = encoder_embed_dim
        self.encoder_ffn_embed_dim = encoder_ffn_embed_dim
        self.encoder_attention_heads = encoder_attention_heads
        self.activation_fn = activation_fn
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.encoder_layerdrop = encoder_layerdrop
        self.dropout_input = dropout_input
        self.conv_dim = conv_dim
        self.conv_kernel = conv_kernel
        self.conv_stride = conv_stride
        self.conv_bias = conv_bias
        self.conv_pos = conv_pos
        self.conv_pos_groups = conv_pos_groups
        self.resnet_relu_type = resnet_relu_type
        self.audio_feat_dim = audio_feat_dim
        self.modality_fuse = modality_fuse
        self.decoder_embed_dim = decoder_embed_dim
        self.decoder_ffn_embed_dim = decoder_ffn_embed_dim
        self.decoder_layers = decoder_layers
        self.decoder_layerdrop = decoder_layerdrop
        self.decoder_attention_heads = decoder_attention_heads
        self.decoder_learned_pos = decoder_learned_pos
        self.decoder_normalize_before = decoder_normalize_before
        self.no_token_positional_embeddings = no_token_positional_embeddings
        self.decoder_dropout = decoder_dropout
        self.decoder_attention_dropout = decoder_attention_dropout
        self.decoder_activation_dropout = decoder_activation_dropout
        self.max_target_positions = max_target_positions
        self.share_decoder_input_output_embed = share_decoder_input_output_embed
        self.no_scale_embedding = no_scale_embedding
        self.sample_rate = sample_rate
        self.num_labels = num_labels
        self.initializer_range = initializer_range
        self.do_stable_layer_norm = do_stable_layer_norm
        self.vocab_size = vocab_size
        self.freeze_feature_encoder = freeze_feature_encoder
        self.freeze_base_model = freeze_base_model
        self.ctc_loss_reduction = ctc_loss_reduction
        self.ctc_zero_infinity = ctc_zero_infinity
        self.ctc_loss_weight = ctc_loss_weight
        self.special_ids = special_ids

    @property
    def encoder_config(self) -> HubertConfig:
        """``HubertConfig`` view of the encoder hyperparameters."""
        # Keyword arguments that are not named HubertConfig parameters
        # (max_position_embeddings, learned_pos, share_input_output_embed)
        # flow through PretrainedConfig's **kwargs handling and are stored
        # as plain attributes on the resulting config.
        return HubertConfig(
            hidden_size=self.encoder_embed_dim,
            num_hidden_layers=self.encoder_layers,
            num_attention_heads=self.encoder_attention_heads,
            intermediate_size=self.encoder_ffn_embed_dim,
            hidden_act=self.activation_fn,
            hidden_dropout=self.dropout,
            activation_dropout=self.activation_dropout,
            attention_dropout=self.attention_dropout,
            layerdrop=self.encoder_layerdrop,
            conv_dim=self.conv_dim,
            conv_kernel=self.conv_kernel,
            conv_stride=self.conv_stride,
            conv_bias=self.conv_bias,
            num_conv_pos_embeddings=self.conv_pos,
            num_conv_pos_embedding_groups=self.conv_pos_groups,
            feat_extract_activation="gelu",
            do_stable_layer_norm=self.do_stable_layer_norm,
            max_position_embeddings=self.max_target_positions,
            learned_pos=self.decoder_learned_pos,
            share_input_output_embed=self.share_decoder_input_output_embed,
        )

    @property
    def decoder_config(self) -> HubertConfig:
        """``HubertConfig`` view of the decoder hyperparameters."""
        return HubertConfig(
            hidden_size=self.decoder_embed_dim,
            num_hidden_layers=self.decoder_layers,
            num_attention_heads=self.decoder_attention_heads,
            intermediate_size=self.decoder_ffn_embed_dim,
            hidden_act=self.activation_fn,
            hidden_dropout=self.decoder_dropout,
            activation_dropout=self.decoder_activation_dropout,
            attention_dropout=self.decoder_attention_dropout,
            layerdrop=self.decoder_layerdrop,
            conv_dim=self.conv_dim,
            conv_kernel=self.conv_kernel,
            conv_stride=self.conv_stride,
            conv_bias=self.conv_bias,
            num_conv_pos_embeddings=self.conv_pos,
            num_conv_pos_embedding_groups=self.conv_pos_groups,
            feat_extract_activation="gelu",
            do_stable_layer_norm=self.do_stable_layer_norm,
            max_position_embeddings=self.max_target_positions,
            learned_pos=self.decoder_learned_pos,
            share_input_output_embed=self.share_decoder_input_output_embed,
        )
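

# A minimal usage sketch (an illustration, not part of the original module):
# instantiate the config and derive the per-module HubertConfig objects.
# Keyword arguments that HubertConfig does not name (learned_pos,
# share_input_output_embed, max_position_embeddings) end up as plain
# attributes via PretrainedConfig's **kwargs passthrough.
if __name__ == "__main__":
    config = AVHubertConfig(encoder_layers=6, decoder_layers=3)
    enc, dec = config.encoder_config, config.decoder_config
    print(enc.num_hidden_layers, enc.hidden_size)  # 6 768
    print(dec.num_hidden_layers, dec.hidden_size)  # 3 768
    print(dec.learned_pos)  # False (stored through the kwargs passthrough)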