| |
|
| | |
| | model: Emotion2vec |
| | model_conf: |
| | _name: data2vec_multi |
| | activation_dropout: 0.0 |
| | adversarial_hidden_dim: 128 |
| | adversarial_training: false |
| | adversarial_weight: 0.1 |
| | attention_dropout: 0.1 |
| | average_top_k_layers: 16 |
| | batch_norm_target_layer: false |
| | clone_batch: 12 |
| | cls_loss: 1.0 |
| | cls_type: chunk |
| | d2v_loss: 1.0 |
| | decoder_group: false |
| | depth: 8 |
| | dropout_input: 0.0 |
| | ema_anneal_end_step: 20000 |
| | ema_decay: 0.9997 |
| | ema_encoder_only: false |
| | ema_end_decay: 1.0 |
| | ema_same_dtype: true |
| | embed_dim: 1024 |
| | encoder_dropout: 0.1 |
| | end_drop_path_rate: 0.0 |
| | end_of_block_targets: false |
| | instance_norm_target_layer: true |
| | instance_norm_targets: false |
| | layer_norm_first: false |
| | layer_norm_target_layer: false |
| | layer_norm_targets: false |
| | layerdrop: 0.0 |
| | log_norms: true |
| | loss_beta: 0.0 |
| | loss_scale: null |
| | mae_init: false |
| | max_update: 100000 |
| | min_pred_var: 0.01 |
| | min_target_var: 0.1 |
| | mlp_ratio: 4.0 |
| | normalize: true |
| | modalities: |
| | _name: null |
| | audio: |
| | add_masks: false |
| | alibi_max_pos: null |
| | alibi_scale: 1.0 |
| | conv_pos_depth: 5 |
| | conv_pos_groups: 16 |
| | conv_pos_pre_ln: false |
| | conv_pos_width: 95 |
| | decoder: |
| | add_positions_all: false |
| | add_positions_masked: false |
| | decoder_dim: 768 |
| | decoder_groups: 16 |
| | decoder_kernel: 7 |
| | decoder_layers: 4 |
| | decoder_residual: true |
| | input_dropout: 0.1 |
| | projection_layers: 1 |
| | projection_ratio: 2.0 |
| | ema_local_encoder: false |
| | encoder_zero_mask: true |
| | end_drop_path_rate: 0.0 |
| | extractor_mode: layer_norm |
| | feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]' |
| | init_extra_token_zero: true |
| | inverse_mask: false |
| | keep_masked_pct: 0.0 |
| | learned_alibi: false |
| | learned_alibi_scale: true |
| | learned_alibi_scale_per_head: true |
| | learned_alibi_scale_per_layer: false |
| | local_grad_mult: 1.0 |
| | mask_channel_length: 64 |
| | mask_channel_prob: 0.0 |
| | mask_dropout: 0.0 |
| | mask_length: 5 |
| | mask_noise_std: 0.01 |
| | mask_prob: 0.55 |
| | mask_prob_adjust: 0.1 |
| | mask_prob_min: null |
| | model_depth: 8 |
| | num_alibi_heads: 16 |
| | num_extra_tokens: 10 |
| | prenet_depth: 4 |
| | prenet_dropout: 0.1 |
| | prenet_layerdrop: 0.0 |
| | remove_masks: false |
| | start_drop_path_rate: 0.0 |
| | type: AUDIO |
| | use_alibi_encoder: true |
| | image: |
| | add_masks: false |
| | alibi_dims: 2 |
| | alibi_distance: manhattan |
| | alibi_max_pos: null |
| | alibi_scale: 1.0 |
| | decoder: |
| | add_positions_all: false |
| | add_positions_masked: false |
| | decoder_dim: 384 |
| | decoder_groups: 16 |
| | decoder_kernel: 5 |
| | decoder_layers: 5 |
| | decoder_residual: true |
| | input_dropout: 0.1 |
| | projection_layers: 1 |
| | projection_ratio: 2.0 |
| | ema_local_encoder: false |
| | embed_dim: 768 |
| | enc_dec_transformer: false |
| | encoder_zero_mask: true |
| | end_drop_path_rate: 0.0 |
| | fixed_positions: true |
| | in_chans: 3 |
| | init_extra_token_zero: true |
| | input_size: 224 |
| | inverse_mask: false |
| | keep_masked_pct: 0.0 |
| | learned_alibi: false |
| | learned_alibi_scale: false |
| | learned_alibi_scale_per_head: false |
| | learned_alibi_scale_per_layer: false |
| | local_grad_mult: 1.0 |
| | mask_channel_length: 64 |
| | mask_channel_prob: 0.0 |
| | mask_dropout: 0.0 |
| | mask_length: 5 |
| | mask_noise_std: 0.01 |
| | mask_prob: 0.7 |
| | mask_prob_adjust: 0.0 |
| | mask_prob_min: null |
| | model_depth: 8 |
| | num_alibi_heads: 16 |
| | num_extra_tokens: 0 |
| | patch_size: 16 |
| | prenet_depth: 4 |
| | prenet_dropout: 0.0 |
| | prenet_layerdrop: 0.0 |
| | remove_masks: false |
| | start_drop_path_rate: 0.0 |
| | transformer_decoder: false |
| | type: IMAGE |
| | use_alibi_encoder: false |
| | text: |
| | add_masks: false |
| | alibi_max_pos: null |
| | alibi_scale: 1.0 |
| | decoder: |
| | add_positions_all: false |
| | add_positions_masked: false |
| | decoder_dim: 384 |
| | decoder_groups: 16 |
| | decoder_kernel: 5 |
| | decoder_layers: 5 |
| | decoder_residual: true |
| | input_dropout: 0.1 |
| | projection_layers: 1 |
| | projection_ratio: 2.0 |
| | dropout: 0.1 |
| | ema_local_encoder: false |
| | encoder_zero_mask: true |
| | end_drop_path_rate: 0.0 |
| | init_extra_token_zero: true |
| | inverse_mask: false |
| | keep_masked_pct: 0.0 |
| | layernorm_embedding: true |
| | learned_alibi: false |
| | learned_alibi_scale: false |
| | learned_alibi_scale_per_head: false |
| | learned_alibi_scale_per_layer: false |
| | learned_pos: true |
| | local_grad_mult: 1.0 |
| | mask_channel_length: 64 |
| | mask_channel_prob: 0.0 |
| | mask_dropout: 0.0 |
| | mask_length: 5 |
| | mask_noise_std: 0.01 |
| | mask_prob: 0.7 |
| | mask_prob_adjust: 0.0 |
| | mask_prob_min: null |
| | max_source_positions: 512 |
| | model_depth: 8 |
| | no_scale_embedding: true |
| | no_token_positional_embeddings: false |
| | num_alibi_heads: 16 |
| | num_extra_tokens: 0 |
| | prenet_depth: 4 |
| | prenet_dropout: 0.0 |
| | prenet_layerdrop: 0.0 |
| | remove_masks: false |
| | start_drop_path_rate: 0.0 |
| | type: TEXT |
| | use_alibi_encoder: false |
| | norm_affine: true |
| | norm_eps: 1.0e-05 |
| | num_heads: 16 |
| | post_mlp_drop: 0.1 |
| | recon_loss: 0.0 |
| | seed: 1 |
| | shared_decoder: null |
| | skip_ema: false |
| | start_drop_path_rate: 0.0 |
| | supported_modality: AUDIO |
| |
|
| | tokenizer: CharTokenizer |
| | tokenizer_conf: |
| | unk_symbol: <unk> |
| | split_with_space: true |
| |
|
| | scope_map: |
| | - 'd2v_model.' |
| | - none |
| |
|
| |
|
| |
|