|
|
|
|
|
|
|
|
from transformers import PretrainedConfig |
|
|
|
|
|
class EATConfig(PretrainedConfig):
    """Configuration for an EAT model.

    Holds the hyperparameters used to build the model: transformer
    dimensions (``embed_dim``, ``depth``, ``num_heads``), patch/spectrogram
    layout (``patch_size``, ``stride``, ``in_chans``, ``mel_bins``,
    ``max_length``, ``img_size``), regularisation rates (the various
    ``*_drop*`` parameters), normalisation options, and the task head size
    (``num_classes``).  ``model_variant`` selects between variants such as
    ``"pretrain"``.  Extra keyword arguments are forwarded to
    ``PretrainedConfig``.

    NOTE(review): parameter semantics above are inferred from names only
    (e.g. ``mel_bins`` presumably is the number of mel-filterbank bins) —
    confirm against the model implementation that consumes this config.
    """

    # Identifier used by the transformers auto-class machinery.
    model_type = "eat"

    def __init__(
        self,
        embed_dim=768,
        depth=12,
        num_heads=12,
        patch_size=16,
        stride=16,
        in_chans=1,
        mel_bins=128,
        max_length=768,
        num_classes=527,
        model_variant="pretrain",
        mlp_ratio=4.0,
        qkv_bias=True,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        activation_dropout=0.0,
        post_mlp_drop=0.0,
        start_drop_path_rate=0.0,
        end_drop_path_rate=0.0,
        layer_norm_first=False,
        norm_eps=1e-6,
        norm_affine=True,
        fixed_positions=True,
        img_size=(1024, 128),
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Store every hyperparameter verbatim as an instance attribute.
        # A name/value table keeps the attribute list easy to scan and
        # guarantees each parameter is recorded under its own name.
        for attr_name, attr_value in (
            # Core transformer / patch-embedding geometry.
            ("embed_dim", embed_dim),
            ("depth", depth),
            ("num_heads", num_heads),
            ("patch_size", patch_size),
            ("stride", stride),
            ("in_chans", in_chans),
            ("mel_bins", mel_bins),
            ("max_length", max_length),
            ("num_classes", num_classes),
            ("model_variant", model_variant),
            # MLP / attention / dropout settings.
            ("mlp_ratio", mlp_ratio),
            ("qkv_bias", qkv_bias),
            ("drop_rate", drop_rate),
            ("attn_drop_rate", attn_drop_rate),
            ("activation_dropout", activation_dropout),
            ("post_mlp_drop", post_mlp_drop),
            ("start_drop_path_rate", start_drop_path_rate),
            ("end_drop_path_rate", end_drop_path_rate),
            # Normalisation and positional-embedding options.
            ("layer_norm_first", layer_norm_first),
            ("norm_eps", norm_eps),
            ("norm_affine", norm_affine),
            ("fixed_positions", fixed_positions),
            # Expected input spectrogram size (time, frequency) — TODO
            # confirm axis order against the patch-embedding code.
            ("img_size", img_size),
        ):
            setattr(self, attr_name, attr_value)
|
|