diff --git a/AudioEmoDetect_v1_onnx/.gitattributes b/AudioEmoDetect_v1_onnx/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/AudioEmoDetect_v1_onnx/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/AudioEmoDetect_v1_onnx/AudioEmoDetect_v1.config b/AudioEmoDetect_v1_onnx/AudioEmoDetect_v1.config new file mode 100644 index 0000000000000000000000000000000000000000..72e24d5e47258a0cb018571a0b97249ffa9e03d7 --- /dev/null +++ b/AudioEmoDetect_v1_onnx/AudioEmoDetect_v1.config @@ -0,0 +1,93 @@ +HubertConfig { + "_name_or_path": "PrachiPatel/AudioEmoDetect_v1", + "activation_dropout": 0.1, + "apply_spec_augment": true, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "group", + "feat_proj_dropout": 0.1, + "feat_proj_layer_norm": true, + "final_dropout": 0.1, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "id2label": { + "0": "Anger", + "1": "Disgust", + "2": "Fear", + "3": "Happiness", + "4": "Neutral", + "5": "Sadness" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1, + "LABEL_2": 2, + "LABEL_3": 3, + "LABEL_4": 4, + "LABEL_5": 5 + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.1, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 12, + "pad_token_id": 0, + "pooling_mode": "mean", + "tokenizer_class": "Wav2Vec2CTCTokenizer", + "torch_dtype": "float32", + "transformers_version": "4.41.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/AudioEmoDetect_v1_onnx/AudioEmoDetect_v1.feature b/AudioEmoDetect_v1_onnx/AudioEmoDetect_v1.feature new file mode 100644 index 0000000000000000000000000000000000000000..dc9d64b4acf34285cec979c8b63084b2f4bf6790 --- /dev/null +++ b/AudioEmoDetect_v1_onnx/AudioEmoDetect_v1.feature @@ -0,0 +1,9 @@ +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": false, + "sampling_rate": 16000 +} diff --git a/AudioEmoDetect_v1_onnx/AudioEmoDetect_v1.onnx b/AudioEmoDetect_v1_onnx/AudioEmoDetect_v1.onnx new file mode 100644 index 0000000000000000000000000000000000000000..7aef5fce99f332deeb60cf857e6b748ebc0b1011 --- /dev/null +++ b/AudioEmoDetect_v1_onnx/AudioEmoDetect_v1.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b62b4b3e7290be70d72440cf70765005d675695243a03a5781ede4678c05111 +size 378573700 diff --git a/AudioEmoDetect_v1_onnx/AudioEmoDetect_v1.yaml b/AudioEmoDetect_v1_onnx/AudioEmoDetect_v1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..13f7323f49c23d814d105f939cc866fea1e61206 --- /dev/null +++ b/AudioEmoDetect_v1_onnx/AudioEmoDetect_v1.yaml @@ -0,0 +1,14 @@ +feature_extractor: Wav2Vec2FeatureExtractor +full_name: PrachiPatel/AudioEmoDetect_v1 +labels: + 0: Anger + 1: Disgust + 2: Fear + 3: Happiness + 4: Neutral + 5: Sadness +model: HubertForSequenceClassification +sampling_rate: 16000 +shape: +- 1 +- 320000 diff --git a/AudioEmoDetect_v1_onnx/source.txt b/AudioEmoDetect_v1_onnx/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..de39cdb3691345098107615783662f1b248f15b1 --- /dev/null +++ b/AudioEmoDetect_v1_onnx/source.txt @@ -0,0 +1 @@ +https://huggingface.co/steveway/AudioEmoDetect_v1_onnx \ No newline at end of file diff --git a/audio-emotion-detection-onnx/.gitattributes b/audio-emotion-detection-onnx/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/audio-emotion-detection-onnx/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/audio-emotion-detection-onnx/README.md b/audio-emotion-detection-onnx/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7b95401dc46245ac339fc25059d4a56d90b4cde5 --- /dev/null +++ b/audio-emotion-detection-onnx/README.md @@ -0,0 +1,3 @@ +--- +license: apache-2.0 +--- diff --git a/audio-emotion-detection-onnx/config.json b/audio-emotion-detection-onnx/config.json new file mode 100644 index 0000000000000000000000000000000000000000..83af6f285bfbfa705efa515438fa31bf4a52bcac --- /dev/null +++ b/audio-emotion-detection-onnx/config.json @@ -0,0 +1,133 @@ +{ + "_name_or_path": "Hatman/audio-emotion-detection", + "activation_dropout": 0.05, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 256, + "contrastive_logits_temperature": 0.1, + "conv_bias": true, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "mean", + "ctc_zero_infinity": true, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": true, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "layer", + "feat_proj_dropout": 0.05, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.05, + "hidden_size": 1024, + "id2label": { + "0": "Angry", + "1": "Disgusted", + "2": "Fearful", + "3": "Happy", + "4": "Neutral", + "5": "Sad", + "6": "Suprised" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "label2id": { + "Angry": "0", + "Disgusted": "1", + "Fearful": "2", + "Happy": "3", + "Neutral": "4", + "Sad": "5", + "Suprised": "6" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.05, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "num_adapter_layers": 3, + "num_attention_heads": 16, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 24, + "num_negatives": 100, + "output_hidden_size": 1024, + "pad_token_id": 0, + "proj_codevector_dim": 256, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "transformers_version": "4.37.2", + "use_weighted_layer_sum": false, + "vocab_size": 33, + "xvector_output_dim": 512 +} diff --git a/audio-emotion-detection-onnx/onnx/model_quantized.onnx b/audio-emotion-detection-onnx/onnx/model_quantized.onnx new file mode 100644 index 0000000000000000000000000000000000000000..a5b70f57cdb76544d81cd1d27dfe9ac127cdd915 --- /dev/null +++ b/audio-emotion-detection-onnx/onnx/model_quantized.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08a04319bc2a171a948f23375fb2397c7f99c4fbd9a984325c0ad9082964d8b5 +size 1263408071 diff --git a/audio-emotion-detection-onnx/preprocessor_config.json b/audio-emotion-detection-onnx/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9f99bcabcbeaf80e6791d79c9cb6cd68c6e7ae95 --- /dev/null +++ b/audio-emotion-detection-onnx/preprocessor_config.json @@ -0,0 +1,10 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "processor_class": "Wav2Vec2ProcessorWithLM", + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/audio-emotion-detection-onnx/source.txt b/audio-emotion-detection-onnx/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbcc0c0f91e672f5d135ca6ca3fbbd83ee066f87 --- /dev/null +++ b/audio-emotion-detection-onnx/source.txt @@ -0,0 +1 @@ +https://huggingface.co/Aroganta/audio-emotion-detection-onnx \ No newline at end of file diff --git a/unispeech-sat-emotion-russian-resd_onnx/.gitattributes b/unispeech-sat-emotion-russian-resd_onnx/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/unispeech-sat-emotion-russian-resd_onnx/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/unispeech-sat-emotion-russian-resd_onnx/source.txt b/unispeech-sat-emotion-russian-resd_onnx/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..c086214a3720a0f38e609241376fbf3cb3235d7e --- /dev/null +++ b/unispeech-sat-emotion-russian-resd_onnx/source.txt @@ -0,0 +1 @@ +https://huggingface.co/steveway/unispeech-sat-emotion-russian-resd_onnx \ No newline at end of file diff --git a/unispeech-sat-emotion-russian-resd_onnx/unispeech-sat-emotion-russian-resd.config b/unispeech-sat-emotion-russian-resd_onnx/unispeech-sat-emotion-russian-resd.config new file mode 100644 index 0000000000000000000000000000000000000000..1bde6d3305ac513fdd85c38689edeb5251f51d4d --- /dev/null +++ b/unispeech-sat-emotion-russian-resd_onnx/unispeech-sat-emotion-russian-resd.config @@ -0,0 +1,127 @@ +UniSpeechSatConfig { + "_name_or_path": "Aniemore/unispeech-sat-emotion-russian-resd", + "activation_dropout": 0.05, + "apply_spec_augment": true, + "architectures": [ + "UniSpeechSatForSequenceClassification" + ], + "attention_dropout": 0.05, + "bos_token_id": 1, + "classifier_proj_size": 768, + "codevector_dim": 768, + "contrastive_logits_temperature": 0.1, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": true, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "layer", + "feat_proj_dropout": 0.05, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.05, + "finetuning_task": [ + "unispeech_sat_classification" + ], + "hidden_act": "gelu", + "hidden_dropout": 0.05, + "hidden_size": 1024, + "id2label": { + "0": "anger", + "1": "disgust", + "2": "enthusiasm", + "3": "fear", + "4": "happiness", + "5": "neutral", + "6": "sadness" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "label2id": { + "anger": 0, + "disgust": 1, + "enthusiasm": 2, + "fear": 3, + "happiness": 4, + "neutral": 5, + "sadness": 6 + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.05, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "unispeech-sat", + "num_attention_heads": 16, + "num_clusters": 504, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 24, + "num_negatives": 100, + "pad_token_id": 0, + "pooling_mode": "mean", + "problem_type": "single_label_classification", + "proj_codevector_dim": 768, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "tokenizer_class": "Wav2Vec2CTCTokenizer", + "torch_dtype": "float32", + "transformers_version": "4.41.1", + "use_weighted_layer_sum": false, + "vocab_size": 40, + "xvector_output_dim": 512 +} diff --git a/unispeech-sat-emotion-russian-resd_onnx/unispeech-sat-emotion-russian-resd.feature b/unispeech-sat-emotion-russian-resd_onnx/unispeech-sat-emotion-russian-resd.feature new file mode 100644 index 0000000000000000000000000000000000000000..7c6e5bc4c80dd64242e48f6297826d4fa682abd5 --- /dev/null +++ b/unispeech-sat-emotion-russian-resd_onnx/unispeech-sat-emotion-russian-resd.feature @@ -0,0 +1,9 @@ +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/unispeech-sat-emotion-russian-resd_onnx/unispeech-sat-emotion-russian-resd.onnx b/unispeech-sat-emotion-russian-resd_onnx/unispeech-sat-emotion-russian-resd.onnx new file mode 100644 index 0000000000000000000000000000000000000000..28c659fc8a07a6ddbc367772b4a233f9664d4bec --- /dev/null +++ b/unispeech-sat-emotion-russian-resd_onnx/unispeech-sat-emotion-russian-resd.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c52c43e1933d1be1f8c2e1e10c4cf63f3f66802453d1764b374f0bd9e2cc6e91 +size 1265552756 diff --git a/unispeech-sat-emotion-russian-resd_onnx/unispeech-sat-emotion-russian-resd.processor b/unispeech-sat-emotion-russian-resd_onnx/unispeech-sat-emotion-russian-resd.processor new file mode 100644 index 0000000000000000000000000000000000000000..66aeb17a20c5744a591f46b492197977397ddb7c --- /dev/null +++ b/unispeech-sat-emotion-russian-resd_onnx/unispeech-sat-emotion-russian-resd.processor @@ -0,0 +1,21 @@ +Wav2Vec2Processor: +- feature_extractor: Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} + +- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='Aniemore/unispeech-sat-emotion-russian-resd', vocab_size=40, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': '', 'pad_token': ''}, clean_up_tokenization_spaces=True), added_tokens_decoder={ + 0: AddedToken("", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False), + 1: AddedToken("", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False), + 2: AddedToken("", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False), + 3: AddedToken("", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False), +} + +{ + "processor_class": "Wav2Vec2Processor" +} diff --git a/unispeech-sat-emotion-russian-resd_onnx/unispeech-sat-emotion-russian-resd.tokens b/unispeech-sat-emotion-russian-resd_onnx/unispeech-sat-emotion-russian-resd.tokens new file mode 100644 index 0000000000000000000000000000000000000000..105919a969361bd2994f735cede694770caad511 --- /dev/null +++ b/unispeech-sat-emotion-russian-resd_onnx/unispeech-sat-emotion-russian-resd.tokens @@ -0,0 +1,40 @@ +0: +1: +2: +3: +4: '|' +5: '''' +6: '-' +7: а +8: б +9: в +10: г +11: д +12: е +13: ж +14: з +15: и +16: й +17: к +18: л +19: м +20: н +21: о +22: п +23: р +24: с +25: т +26: у +27: ф +28: х +29: ц +30: ч +31: ш +32: щ +33: ъ +34: ы +35: ь +36: э +37: ю +38: я +39: ё diff --git a/unispeech-sat-emotion-russian-resd_onnx/unispeech-sat-emotion-russian-resd.yaml b/unispeech-sat-emotion-russian-resd_onnx/unispeech-sat-emotion-russian-resd.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0aed8834740163724c00038f0aa9bbb78d1573ea --- /dev/null +++ b/unispeech-sat-emotion-russian-resd_onnx/unispeech-sat-emotion-russian-resd.yaml @@ -0,0 +1,15 @@ +feature_extractor: Wav2Vec2FeatureExtractor +full_name: Aniemore/unispeech-sat-emotion-russian-resd +labels: + 0: anger + 1: disgust + 2: enthusiasm + 3: fear + 4: happiness + 5: neutral + 6: sadness +model: UniSpeechSatForSequenceClassification +sampling_rate: 16000 +shape: +- 1 +- 320000 diff --git a/urdu-audio-emotions_onnx/.gitattributes b/urdu-audio-emotions_onnx/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/urdu-audio-emotions_onnx/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/urdu-audio-emotions_onnx/source.txt b/urdu-audio-emotions_onnx/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..67a021025226e653e1c2f4166983ad72838f9d8b --- /dev/null +++ b/urdu-audio-emotions_onnx/source.txt @@ -0,0 +1 @@ +https://huggingface.co/steveway/urdu-audio-emotions_onnx \ No newline at end of file diff --git a/urdu-audio-emotions_onnx/urdu-audio-emotions.config b/urdu-audio-emotions_onnx/urdu-audio-emotions.config new file mode 100644 index 0000000000000000000000000000000000000000..c8d2f22f8f2da995eb29b4cabe34bcc7a49e152f --- /dev/null +++ b/urdu-audio-emotions_onnx/urdu-audio-emotions.config @@ -0,0 +1,129 @@ +Wav2Vec2Config { + "_name_or_path": "Talha/urdu-audio-emotions", + "activation_dropout": 0.0, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 768, + "contrastive_logits_temperature": 0.1, + "conv_bias": true, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": true, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "layer", + "feat_proj_dropout": 0.1, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 1024, + "id2label": { + "0": "Angry", + "1": "Happy", + "2": "Neutral", + "3": "Sad" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "label2id": { + "Angry": "0", + "Happy": "1", + "Neutral": "2", + "Sad": "3" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.1, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.075, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "num_adapter_layers": 3, + "num_attention_heads": 16, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 24, + "num_negatives": 100, + "output_hidden_size": 1024, + "pad_token_id": 0, + "proj_codevector_dim": 768, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "torch_dtype": "float32", + "transformers_version": "4.41.1", + "use_weighted_layer_sum": false, + "vocab_size": 32, + "xvector_output_dim": 512 +} diff --git a/urdu-audio-emotions_onnx/urdu-audio-emotions.feature b/urdu-audio-emotions_onnx/urdu-audio-emotions.feature new file mode 100644 index 0000000000000000000000000000000000000000..7c6e5bc4c80dd64242e48f6297826d4fa682abd5 --- /dev/null +++ b/urdu-audio-emotions_onnx/urdu-audio-emotions.feature @@ -0,0 +1,9 @@ +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/urdu-audio-emotions_onnx/urdu-audio-emotions.onnx b/urdu-audio-emotions_onnx/urdu-audio-emotions.onnx new file mode 100644 index 0000000000000000000000000000000000000000..f1b9d74108ff2be32daf241ca98a2128ca3b950b --- /dev/null +++ b/urdu-audio-emotions_onnx/urdu-audio-emotions.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7623de49f735de6bd611721738c495575192efa1bd4ac04bf8409d0e9a78d2d5 +size 1263404950 diff --git a/urdu-audio-emotions_onnx/urdu-audio-emotions.yaml b/urdu-audio-emotions_onnx/urdu-audio-emotions.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3f0ecf3229567b715f1456f55e2d81bd7ba65b40 --- /dev/null +++ b/urdu-audio-emotions_onnx/urdu-audio-emotions.yaml @@ -0,0 +1,12 @@ +feature_extractor: Wav2Vec2FeatureExtractor +full_name: Talha/urdu-audio-emotions +labels: + 0: Angry + 1: Happy + 2: Neutral + 3: Sad +model: Wav2Vec2ForSequenceClassification +sampling_rate: 16000 +shape: +- 1 +- 320000 diff --git a/w2v2c_ko_emotion_onnx/.gitattributes b/w2v2c_ko_emotion_onnx/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/w2v2c_ko_emotion_onnx/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/w2v2c_ko_emotion_onnx/source.txt b/w2v2c_ko_emotion_onnx/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..2fea633511f52c03b41a3cd9d08bb9e5a266ec05 --- /dev/null +++ b/w2v2c_ko_emotion_onnx/source.txt @@ -0,0 +1 @@ +https://huggingface.co/steveway/w2v2c_ko_emotion_onnx \ No newline at end of file diff --git a/w2v2c_ko_emotion_onnx/w2v2c_ko_emotion.config b/w2v2c_ko_emotion_onnx/w2v2c_ko_emotion.config new file mode 100644 index 0000000000000000000000000000000000000000..297be57dd58fe68d32af5a5b9fbcfaf9461a0e0f --- /dev/null +++ b/w2v2c_ko_emotion_onnx/w2v2c_ko_emotion.config @@ -0,0 +1,124 @@ +Wav2Vec2ConformerConfig { + "_name_or_path": "JUNGWJ/w2v2c_ko_emotion", + "activation_dropout": 0.1, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ConformerForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 256, + "conformer_conv_dropout": 0.1, + "contrastive_logits_temperature": 0.1, + "conv_bias": false, + "conv_depthwise_kernel_size": 31, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "mean", + "ctc_zero_infinity": true, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": true, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "layer", + "feat_proj_dropout": 0.0, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.1, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "id2label": { + "0": "Happy", + "1": "Sad", + "2": "Angry", + "3": "Anxious" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "Angry": "2", + "Anxious": "3", + "Happy": "0", + "Sad": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 64, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.05, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "max_source_positions": 5000, + "model_type": "wav2vec2-conformer", + "num_adapter_layers": 3, + "num_attention_heads": 12, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 12, + "num_negatives": 100, + "output_hidden_size": 768, + "pad_token_id": 0, + "position_embeddings_type": "relative", + "proj_codevector_dim": 256, + "rotary_embedding_base": 10000, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "torch_dtype": "float32", + "transformers_version": "4.41.1", + "use_weighted_layer_sum": false, + "vocab_size": 111, + "xvector_output_dim": 512 +} diff --git a/w2v2c_ko_emotion_onnx/w2v2c_ko_emotion.feature b/w2v2c_ko_emotion_onnx/w2v2c_ko_emotion.feature new file mode 100644 index 0000000000000000000000000000000000000000..0be68ab30a4dc0a635948b99096fa6887785ddb2 --- /dev/null +++ b/w2v2c_ko_emotion_onnx/w2v2c_ko_emotion.feature @@ -0,0 +1,9 @@ +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/w2v2c_ko_emotion_onnx/w2v2c_ko_emotion.onnx b/w2v2c_ko_emotion_onnx/w2v2c_ko_emotion.onnx new file mode 100644 index 0000000000000000000000000000000000000000..ff1d139233d3dd9225e7be35dc9e05ca84184935 --- /dev/null +++ b/w2v2c_ko_emotion_onnx/w2v2c_ko_emotion.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e094a84c8c33b9dd3d9b7d8e8a83d42e30b2ae9fe51bc682ce33a12b20ab663b +size 732226369 diff --git a/w2v2c_ko_emotion_onnx/w2v2c_ko_emotion.yaml b/w2v2c_ko_emotion_onnx/w2v2c_ko_emotion.yaml new file mode 100644 index 0000000000000000000000000000000000000000..774d851689e164df9823d9e854e7248b68f8e504 --- /dev/null +++ b/w2v2c_ko_emotion_onnx/w2v2c_ko_emotion.yaml @@ -0,0 +1,12 @@ +feature_extractor: Wav2Vec2FeatureExtractor +full_name: JUNGWJ/w2v2c_ko_emotion +labels: + 0: Happy + 1: Sad + 2: Angry + 3: Anxious +model: Wav2Vec2ConformerForSequenceClassification +sampling_rate: 16000 +shape: +- 1 +- 320000 diff --git a/wav2vec2-base-Speech_Emotion_Recognition_onnx/.gitattributes b/wav2vec2-base-Speech_Emotion_Recognition_onnx/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/wav2vec2-base-Speech_Emotion_Recognition_onnx/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/wav2vec2-base-Speech_Emotion_Recognition_onnx/source.txt b/wav2vec2-base-Speech_Emotion_Recognition_onnx/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..dfe49903c0e7c4182e5e5a83be2897abef8e9013 --- /dev/null +++ b/wav2vec2-base-Speech_Emotion_Recognition_onnx/source.txt @@ -0,0 +1 @@ +wav2vec2-base-Speech_Emotion_Recognition_onnx \ No newline at end of file diff --git a/wav2vec2-base-Speech_Emotion_Recognition_onnx/wav2vec2-base-Speech_Emotion_Recognition.config b/wav2vec2-base-Speech_Emotion_Recognition_onnx/wav2vec2-base-Speech_Emotion_Recognition.config new file mode 100644 index 0000000000000000000000000000000000000000..4c61f0b9f3fe69c6d90eb5833d74ff2b6d47fd61 --- /dev/null +++ b/wav2vec2-base-Speech_Emotion_Recognition_onnx/wav2vec2-base-Speech_Emotion_Recognition.config @@ -0,0 +1,134 @@ +Wav2Vec2Config { + "_name_or_path": "DunnBC22/wav2vec2-base-Speech_Emotion_Recognition", + "activation_dropout": 0.0, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 256, + "contrastive_logits_temperature": 0.1, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.1, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "freeze_feat_extract_train": true, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "SAD", + "1": "ANGRY", + "2": "DISGUST", + "3": "FEAR", + "4": "HAPPY", + "5": "NEUTRAL" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "ANGRY": "1", + "DISGUST": "2", + "FEAR": "3", + "HAPPY": "4", + "NEUTRAL": "5", + "SAD": "0" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "no_mask_channel_overlap": false, + "no_mask_time_overlap": false, + "num_adapter_layers": 3, + "num_attention_heads": 12, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 12, + "num_negatives": 100, + "output_hidden_size": 768, + "pad_token_id": 0, + "proj_codevector_dim": 256, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "torch_dtype": "float32", + "transformers_version": "4.41.1", + "use_weighted_layer_sum": false, + "vocab_size": 32, + "xvector_output_dim": 512 +} diff --git a/wav2vec2-base-Speech_Emotion_Recognition_onnx/wav2vec2-base-Speech_Emotion_Recognition.feature b/wav2vec2-base-Speech_Emotion_Recognition_onnx/wav2vec2-base-Speech_Emotion_Recognition.feature new file mode 100644 index 0000000000000000000000000000000000000000..8b0fc1ee98d994e693f7f5affb8e3d387b66b793 --- /dev/null +++ b/wav2vec2-base-Speech_Emotion_Recognition_onnx/wav2vec2-base-Speech_Emotion_Recognition.feature @@ -0,0 +1,9 @@ +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 16000 +} diff --git a/wav2vec2-base-Speech_Emotion_Recognition_onnx/wav2vec2-base-Speech_Emotion_Recognition.onnx b/wav2vec2-base-Speech_Emotion_Recognition_onnx/wav2vec2-base-Speech_Emotion_Recognition.onnx new file mode 100644 index 0000000000000000000000000000000000000000..92750f0da7932699d3b23f9bb5f4e8b32898b9a8 --- /dev/null +++ b/wav2vec2-base-Speech_Emotion_Recognition_onnx/wav2vec2-base-Speech_Emotion_Recognition.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b11485a7a5bf84f0cf74b116f1d94bccbc2181a30dd7f86c8ce44d2f38d612f +size 378583063 diff --git a/wav2vec2-base-Speech_Emotion_Recognition_onnx/wav2vec2-base-Speech_Emotion_Recognition.yaml b/wav2vec2-base-Speech_Emotion_Recognition_onnx/wav2vec2-base-Speech_Emotion_Recognition.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8dd1e9df334d972a655aa445cb0f3851c316df59 --- /dev/null +++ b/wav2vec2-base-Speech_Emotion_Recognition_onnx/wav2vec2-base-Speech_Emotion_Recognition.yaml @@ -0,0 +1,14 @@ +feature_extractor: Wav2Vec2FeatureExtractor +full_name: DunnBC22/wav2vec2-base-Speech_Emotion_Recognition +labels: + 0: SAD + 1: ANGRY + 2: DISGUST + 3: FEAR + 4: HAPPY + 5: NEUTRAL +model: Wav2Vec2ForSequenceClassification +sampling_rate: 16000 +shape: +- 1 +- 320000 diff --git a/wav2vec2-base-finetuned-emodb_onnx/.gitattributes b/wav2vec2-base-finetuned-emodb_onnx/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/wav2vec2-base-finetuned-emodb_onnx/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/wav2vec2-base-finetuned-emodb_onnx/source.txt b/wav2vec2-base-finetuned-emodb_onnx/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..d895ffa19fc36e9516024f407c62cf407014f526 --- /dev/null +++ b/wav2vec2-base-finetuned-emodb_onnx/source.txt @@ -0,0 +1 @@ +https://huggingface.co/steveway/wav2vec2-base-finetuned-emodb_onnx \ No newline at end of file diff --git a/wav2vec2-base-finetuned-emodb_onnx/wav2vec2-base-finetuned-emodb.config b/wav2vec2-base-finetuned-emodb_onnx/wav2vec2-base-finetuned-emodb.config new file mode 100644 index 0000000000000000000000000000000000000000..8bac77b7fda09e241d2e5ead9dd10fd0608b07cb --- /dev/null +++ b/wav2vec2-base-finetuned-emodb_onnx/wav2vec2-base-finetuned-emodb.config @@ -0,0 +1,132 @@ +Wav2Vec2Config { + "_name_or_path": "Hamzaaa/wav2vec2-base-finetuned-emodb", + "activation_dropout": 0.0, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 256, + "contrastive_logits_temperature": 0.1, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.1, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "freeze_feat_extract_train": true, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "anger", + "1": "disgust", + "2": "fear", + "3": "happiness", + "4": "sadness" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "anger": 0, + "disgust": 1, + "fear": 2, + "happiness": 3, + "sadness": 4 + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "no_mask_channel_overlap": false, + "no_mask_time_overlap": false, + "num_adapter_layers": 3, + "num_attention_heads": 12, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 12, + "num_negatives": 100, + "output_hidden_size": 768, + "pad_token_id": 0, + "proj_codevector_dim": 256, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "torch_dtype": "float32", + "transformers_version": "4.41.1", + "use_weighted_layer_sum": false, + "vocab_size": 32, + "xvector_output_dim": 512 +} diff --git a/wav2vec2-base-finetuned-emodb_onnx/wav2vec2-base-finetuned-emodb.feature b/wav2vec2-base-finetuned-emodb_onnx/wav2vec2-base-finetuned-emodb.feature new file mode 100644 index 0000000000000000000000000000000000000000..8b0fc1ee98d994e693f7f5affb8e3d387b66b793 --- /dev/null +++ b/wav2vec2-base-finetuned-emodb_onnx/wav2vec2-base-finetuned-emodb.feature @@ -0,0 +1,9 @@ +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 16000 +} diff --git a/wav2vec2-base-finetuned-emodb_onnx/wav2vec2-base-finetuned-emodb.onnx b/wav2vec2-base-finetuned-emodb_onnx/wav2vec2-base-finetuned-emodb.onnx new file mode 100644 index 0000000000000000000000000000000000000000..8c3b9dc1332b447053b9329f75cb3580943d6fdc --- /dev/null +++ b/wav2vec2-base-finetuned-emodb_onnx/wav2vec2-base-finetuned-emodb.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dc008450e379642afc3197dbc7c2a19fb614e43e9f45527bed2a3e99d782d18 +size 378582035 diff --git a/wav2vec2-base-finetuned-emodb_onnx/wav2vec2-base-finetuned-emodb.yaml b/wav2vec2-base-finetuned-emodb_onnx/wav2vec2-base-finetuned-emodb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..edea4ecba9c476c7735b5130b02a3b2637c0d243 --- /dev/null +++ b/wav2vec2-base-finetuned-emodb_onnx/wav2vec2-base-finetuned-emodb.yaml @@ -0,0 +1,13 @@ +feature_extractor: Wav2Vec2FeatureExtractor +full_name: Hamzaaa/wav2vec2-base-finetuned-emodb +labels: + 0: anger + 1: disgust + 2: fear + 3: happiness + 4: sadness +model: Wav2Vec2ForSequenceClassification +sampling_rate: 16000 +shape: +- 1 +- 320000 diff --git a/wav2vec2-base-finetuned-sentiment-mesd-v9_onnx/.gitattributes b/wav2vec2-base-finetuned-sentiment-mesd-v9_onnx/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/wav2vec2-base-finetuned-sentiment-mesd-v9_onnx/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/wav2vec2-base-finetuned-sentiment-mesd-v9_onnx/source.txt b/wav2vec2-base-finetuned-sentiment-mesd-v9_onnx/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b168a7253806dbea2309b91a3b90a9d6fccde91 --- /dev/null +++ b/wav2vec2-base-finetuned-sentiment-mesd-v9_onnx/source.txt @@ -0,0 +1 @@ +https://huggingface.co/steveway/wav2vec2-base-finetuned-sentiment-mesd-v9_onnx \ No newline at end of file diff --git a/wav2vec2-base-finetuned-sentiment-mesd-v9_onnx/wav2vec2-base-finetuned-sentiment-mesd-v9.config b/wav2vec2-base-finetuned-sentiment-mesd-v9_onnx/wav2vec2-base-finetuned-sentiment-mesd-v9.config new file mode 100644 index 0000000000000000000000000000000000000000..353019f0bd57149fd1c455b122e28cd1c4508579 --- /dev/null +++ b/wav2vec2-base-finetuned-sentiment-mesd-v9_onnx/wav2vec2-base-finetuned-sentiment-mesd-v9.config @@ -0,0 +1,134 @@ +Wav2Vec2Config { + "_name_or_path": "DrishtiSharma/wav2vec2-base-finetuned-sentiment-mesd-v9", + "activation_dropout": 0.0, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 256, + "contrastive_logits_temperature": 0.1, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.1, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "freeze_feat_extract_train": true, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "Anger", + "1": "Disgust", + "2": "Fear", + "3": "Happiness", + "4": "Neutral", + "5": "Sadness" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "Anger": "0", + "Disgust": "1", + "Fear": "2", + "Happiness": "3", + "Neutral": "4", + "Sadness": "5" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "no_mask_channel_overlap": false, + "no_mask_time_overlap": false, + "num_adapter_layers": 3, + "num_attention_heads": 12, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 12, + "num_negatives": 100, + "output_hidden_size": 768, + "pad_token_id": 0, + "proj_codevector_dim": 256, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "torch_dtype": "float32", + "transformers_version": "4.41.1", + "use_weighted_layer_sum": false, + "vocab_size": 32, + "xvector_output_dim": 512 +} diff --git a/wav2vec2-base-finetuned-sentiment-mesd-v9_onnx/wav2vec2-base-finetuned-sentiment-mesd-v9.feature b/wav2vec2-base-finetuned-sentiment-mesd-v9_onnx/wav2vec2-base-finetuned-sentiment-mesd-v9.feature new file mode 100644 index 0000000000000000000000000000000000000000..8b0fc1ee98d994e693f7f5affb8e3d387b66b793 --- /dev/null +++ b/wav2vec2-base-finetuned-sentiment-mesd-v9_onnx/wav2vec2-base-finetuned-sentiment-mesd-v9.feature @@ -0,0 +1,9 @@ +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 16000 +} diff --git a/wav2vec2-base-finetuned-sentiment-mesd-v9_onnx/wav2vec2-base-finetuned-sentiment-mesd-v9.onnx b/wav2vec2-base-finetuned-sentiment-mesd-v9_onnx/wav2vec2-base-finetuned-sentiment-mesd-v9.onnx new file mode 100644 index 0000000000000000000000000000000000000000..75ca3e946d0b242b613c0e6a132697b7409f9d6e --- /dev/null +++ b/wav2vec2-base-finetuned-sentiment-mesd-v9_onnx/wav2vec2-base-finetuned-sentiment-mesd-v9.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15aadf5ccc0418390fdee951caad43430e2adf6256b7f57b2c5eb0e515421efd +size 378583063 diff --git a/wav2vec2-base-finetuned-sentiment-mesd-v9_onnx/wav2vec2-base-finetuned-sentiment-mesd-v9.yaml b/wav2vec2-base-finetuned-sentiment-mesd-v9_onnx/wav2vec2-base-finetuned-sentiment-mesd-v9.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2e2d32e4f2e127fe95bf60eba6303ddb2d9b1971 --- /dev/null +++ b/wav2vec2-base-finetuned-sentiment-mesd-v9_onnx/wav2vec2-base-finetuned-sentiment-mesd-v9.yaml @@ -0,0 +1,14 @@ +feature_extractor: Wav2Vec2FeatureExtractor +full_name: DrishtiSharma/wav2vec2-base-finetuned-sentiment-mesd-v9 +labels: + 0: Anger + 1: Disgust + 2: Fear + 3: Happiness + 4: Neutral + 5: Sadness +model: Wav2Vec2ForSequenceClassification +sampling_rate: 16000 +shape: +- 1 +- 320000 diff --git a/wav2vec2-large-emotion-detection-german_onnx/.gitattributes b/wav2vec2-large-emotion-detection-german_onnx/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/wav2vec2-large-emotion-detection-german_onnx/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/wav2vec2-large-emotion-detection-german_onnx/source.txt b/wav2vec2-large-emotion-detection-german_onnx/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc8dbb4742a35ee53655097cb9e4066dfcaefed6 --- /dev/null +++ b/wav2vec2-large-emotion-detection-german_onnx/source.txt @@ -0,0 +1 @@ +https://huggingface.co/steveway/wav2vec2-large-emotion-detection-german_onnx \ No newline at end of file diff --git a/wav2vec2-large-emotion-detection-german_onnx/wav2vec2-large-emotion-detection-german.config b/wav2vec2-large-emotion-detection-german_onnx/wav2vec2-large-emotion-detection-german.config new file mode 100644 index 0000000000000000000000000000000000000000..345ba67febfa8881297f4a700ce57f7729d57757 --- /dev/null +++ b/wav2vec2-large-emotion-detection-german_onnx/wav2vec2-large-emotion-detection-german.config @@ -0,0 +1,131 @@ +Wav2Vec2Config { + "_name_or_path": "padmalcom/wav2vec2-large-emotion-detection-german", + "activation_dropout": 0.1, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForSpeechClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 256, + "contrastive_logits_temperature": 0.1, + "conv_bias": true, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": true, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "layer", + "feat_proj_dropout": 0.1, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.1, + "finetuning_task": "wav2vec2_clf", + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "id2label": { + "0": "anger", + "1": "boredom", + "2": "disgust", + "3": "fear", + "4": "happiness", + "5": "sadness", + "6": "neutral" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "label2id": { + "anger": 0, + "boredom": 1, + "disgust": 2, + "fear": 3, + "happiness": 4, + "neutral": 6, + "sadness": 5 + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.1, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "wav2vec2", + "num_adapter_layers": 3, + "num_attention_heads": 16, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 24, + "num_negatives": 100, + "output_hidden_size": 1024, + "pad_token_id": 0, + "pooling_mode": "mean", + "problem_type": "single_label_classification", + "proj_codevector_dim": 256, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "torch_dtype": "float32", + "transformers_version": "4.41.1", + "use_weighted_layer_sum": false, + "vocab_size": 32, + "xvector_output_dim": 512 +} diff --git a/wav2vec2-large-emotion-detection-german_onnx/wav2vec2-large-emotion-detection-german.feature b/wav2vec2-large-emotion-detection-german_onnx/wav2vec2-large-emotion-detection-german.feature new file mode 100644 index 0000000000000000000000000000000000000000..0be68ab30a4dc0a635948b99096fa6887785ddb2 --- /dev/null +++ b/wav2vec2-large-emotion-detection-german_onnx/wav2vec2-large-emotion-detection-german.feature @@ -0,0 +1,9 @@ +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/wav2vec2-large-emotion-detection-german_onnx/wav2vec2-large-emotion-detection-german.onnx b/wav2vec2-large-emotion-detection-german_onnx/wav2vec2-large-emotion-detection-german.onnx new file mode 100644 index 0000000000000000000000000000000000000000..735793c84477d43c76391b5a3e3672e5cd4a8466 --- /dev/null +++ b/wav2vec2-large-emotion-detection-german_onnx/wav2vec2-large-emotion-detection-german.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ccb3dce44226c65ae9a961595944a833fb3b027d6cfe5f2865f00e77faeda4d +size 1263408034 diff --git a/wav2vec2-large-emotion-detection-german_onnx/wav2vec2-large-emotion-detection-german.processor b/wav2vec2-large-emotion-detection-german_onnx/wav2vec2-large-emotion-detection-german.processor new file mode 100644 index 0000000000000000000000000000000000000000..04efc329378be8b4d1bf5cec8c6119dbd4cbc8cf --- /dev/null +++ b/wav2vec2-large-emotion-detection-german_onnx/wav2vec2-large-emotion-detection-german.processor @@ -0,0 +1,21 @@ +Wav2Vec2Processor: +- feature_extractor: Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": true, + "sampling_rate": 16000 +} + +- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='padmalcom/wav2vec2-large-emotion-detection-german', vocab_size=32, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': '', 'pad_token': ''}, clean_up_tokenization_spaces=True), added_tokens_decoder={ + 0: AddedToken("", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False), + 1: AddedToken("", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False), + 2: AddedToken("", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False), + 3: AddedToken("", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False), +} + +{ + "processor_class": "Wav2Vec2Processor" +} diff --git a/wav2vec2-large-emotion-detection-german_onnx/wav2vec2-large-emotion-detection-german.tokens b/wav2vec2-large-emotion-detection-german_onnx/wav2vec2-large-emotion-detection-german.tokens new file mode 100644 index 0000000000000000000000000000000000000000..f13d48d8183c87f226e5a1ffed4f27fd7001742a --- /dev/null +++ b/wav2vec2-large-emotion-detection-german_onnx/wav2vec2-large-emotion-detection-german.tokens @@ -0,0 +1,32 @@ +0: +1: +2: +3: +4: '|' +5: E +6: T +7: A +8: O +9: N +10: I +11: H +12: S +13: R +14: D +15: L +16: U +17: M +18: W +19: C +20: F +21: G +22: Y +23: P +24: B +25: V +26: K +27: '''' +28: X +29: J +30: Q +31: Z diff --git a/wav2vec2-large-emotion-detection-german_onnx/wav2vec2-large-emotion-detection-german.yaml b/wav2vec2-large-emotion-detection-german_onnx/wav2vec2-large-emotion-detection-german.yaml new file mode 100644 index 0000000000000000000000000000000000000000..734404266d19ad51c1f3f6fc832baac430ffc1aa --- /dev/null +++ b/wav2vec2-large-emotion-detection-german_onnx/wav2vec2-large-emotion-detection-german.yaml @@ -0,0 +1,15 @@ +feature_extractor: Wav2Vec2FeatureExtractor +full_name: padmalcom/wav2vec2-large-emotion-detection-german +labels: + 0: anger + 1: boredom + 2: disgust + 3: fear + 4: happiness + 5: sadness + 6: neutral +model: Wav2Vec2ForSequenceClassification +sampling_rate: 16000 +shape: +- 1 +- 320000 diff --git a/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/.gitattributes b/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/source.txt b/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..d88f344bff07190ff6c2ec6b1d302107a12d0c9e --- /dev/null +++ b/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/source.txt @@ -0,0 +1 @@ +https://huggingface.co/steveway/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx \ No newline at end of file diff --git a/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/wav2vec2-large-robust-12-ft-emotion-msp-dim.config b/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/wav2vec2-large-robust-12-ft-emotion-msp-dim.config new file mode 100644 index 0000000000000000000000000000000000000000..a1533cafa7a6c24303a582dd40a720bffdd43eff --- /dev/null +++ b/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/wav2vec2-large-robust-12-ft-emotion-msp-dim.config @@ -0,0 +1,123 @@ +Wav2Vec2Config { + "_name_or_path": "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim", + "activation_dropout": 0.1, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForSpeechClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 768, + "contrastive_logits_temperature": 0.1, + "conv_bias": true, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": true, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "layer", + "feat_proj_dropout": 0.1, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.1, + "finetuning_task": "wav2vec2_reg", + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "id2label": { + "0": "arousal", + "1": "dominance", + "2": "valence" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "label2id": { + "arousal": 0, + "dominance": 1, + "valence": 2 + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.1, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "wav2vec2", + "num_adapter_layers": 3, + "num_attention_heads": 16, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 12, + "num_negatives": 100, + "output_hidden_size": 1024, + "pad_token_id": 0, + "pooling_mode": "mean", + "problem_type": "regression", + "proj_codevector_dim": 768, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "torch_dtype": "float32", + "transformers_version": "4.41.1", + "use_weighted_layer_sum": false, + "vocab_size": null, + "xvector_output_dim": 512 +} diff --git a/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/wav2vec2-large-robust-12-ft-emotion-msp-dim.feature b/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/wav2vec2-large-robust-12-ft-emotion-msp-dim.feature new file mode 100644 index 0000000000000000000000000000000000000000..0be68ab30a4dc0a635948b99096fa6887785ddb2 --- /dev/null +++ b/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/wav2vec2-large-robust-12-ft-emotion-msp-dim.feature @@ -0,0 +1,9 @@ +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/wav2vec2-large-robust-12-ft-emotion-msp-dim.onnx b/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/wav2vec2-large-robust-12-ft-emotion-msp-dim.onnx new file mode 100644 index 0000000000000000000000000000000000000000..1b4af890dec5856d54c74d66ba8203a041ade468 --- /dev/null +++ b/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/wav2vec2-large-robust-12-ft-emotion-msp-dim.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba86ebca4174bf036182063e09495f7d523b8924a1a9c621f3daa1a0ec0b3595 +size 661664947 diff --git a/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/wav2vec2-large-robust-12-ft-emotion-msp-dim.processor b/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/wav2vec2-large-robust-12-ft-emotion-msp-dim.processor new file mode 100644 index 0000000000000000000000000000000000000000..15bd74397c18b8188cb90fc62d699fdd4553745c --- /dev/null +++ b/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/wav2vec2-large-robust-12-ft-emotion-msp-dim.processor @@ -0,0 +1,21 @@ +Wav2Vec2Processor: +- feature_extractor: Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": true, + "sampling_rate": 16000 +} + +- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim', vocab_size=0, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': '', 'pad_token': ''}, clean_up_tokenization_spaces=True), added_tokens_decoder={ + 0: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), + 1: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), + 2: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), + 3: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), +} + +{ + "processor_class": "Wav2Vec2Processor" +} diff --git a/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/wav2vec2-large-robust-12-ft-emotion-msp-dim.tokens b/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/wav2vec2-large-robust-12-ft-emotion-msp-dim.tokens new file mode 100644 index 0000000000000000000000000000000000000000..3ed10f9b14a51549d8c544423f664ee686f5a48c --- /dev/null +++ b/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/wav2vec2-large-robust-12-ft-emotion-msp-dim.tokens @@ -0,0 +1,4 @@ +0: +1: +2: +3: diff --git a/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/wav2vec2-large-robust-12-ft-emotion-msp-dim.yaml b/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/wav2vec2-large-robust-12-ft-emotion-msp-dim.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ad57123ba212e30040e870fc86b1e710078dbfd8 --- /dev/null +++ b/wav2vec2-large-robust-12-ft-emotion-msp-dim_onnx/wav2vec2-large-robust-12-ft-emotion-msp-dim.yaml @@ -0,0 +1,11 @@ +feature_extractor: Wav2Vec2FeatureExtractor +full_name: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim +labels: + 0: arousal + 1: dominance + 2: valence +model: EmotionModel +sampling_rate: 16000 +shape: +- 1 +- 320000 diff --git a/wav2vec2-lg-xlsr-en-speech-emotion-recognition-onnx/.gitattributes b/wav2vec2-lg-xlsr-en-speech-emotion-recognition-onnx/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..24e3ac88994e5a782a1f7286ca2dcffd66874da5 --- /dev/null +++ b/wav2vec2-lg-xlsr-en-speech-emotion-recognition-onnx/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +onnx/*.model filter=lfs diff=lfs merge=lfs -text diff --git a/wav2vec2-lg-xlsr-en-speech-emotion-recognition-onnx/README.md b/wav2vec2-lg-xlsr-en-speech-emotion-recognition-onnx/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7be5fc7f47d5db027d120b8024982df93db95b74 --- /dev/null +++ b/wav2vec2-lg-xlsr-en-speech-emotion-recognition-onnx/README.md @@ -0,0 +1,3 @@ +--- +license: mit +--- diff --git a/wav2vec2-lg-xlsr-en-speech-emotion-recognition-onnx/config.json b/wav2vec2-lg-xlsr-en-speech-emotion-recognition-onnx/config.json new file mode 100644 index 0000000000000000000000000000000000000000..605cf8fe6833c77e7bd454a9a64f9952e6b5a238 --- /dev/null +++ b/wav2vec2-lg-xlsr-en-speech-emotion-recognition-onnx/config.json @@ -0,0 +1,107 @@ +{ + "_name_or_path": "jonatasgrosman/wav2vec2-large-xlsr-53-english", + "activation_dropout": 0.05, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "codevector_dim": 256, + "contrastive_logits_temperature": 0.1, + "conv_bias": true, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "mean", + "ctc_zero_infinity": true, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": true, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "layer", + "feat_proj_dropout": 0.05, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "finetuning_task": "wav2vec2_clf", + "gradient_checkpointing": true, + "hidden_act": "gelu", + "hidden_dropout": 0.05, + "hidden_size": 1024, + "id2label": { + "0": "angry", + "1": "calm", + "2": "disgust", + "3": "fearful", + "4": "happy", + "5": "neutral", + "6": "sad", + "7": "surprised" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "label2id": { + "angry": 0, + "calm": 1, + "disgust": 2, + "fearful": 3, + "happy": 4, + "neutral": 5, + "sad": 6, + "surprised": 7 + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.05, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "num_attention_heads": 16, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 24, + "num_negatives": 100, + "pad_token_id": 0, + "pooling_mode": "mean", + "problem_type": "single_label_classification", + "proj_codevector_dim": 256, + "transformers_version": "4.8.2", + "vocab_size": 33 +} diff --git a/wav2vec2-lg-xlsr-en-speech-emotion-recognition-onnx/onnx/model_quantized.onnx b/wav2vec2-lg-xlsr-en-speech-emotion-recognition-onnx/onnx/model_quantized.onnx new file mode 100644 index 0000000000000000000000000000000000000000..e82cfe4ee5580c050e9b98d62dc777b6af8125cc --- /dev/null +++ b/wav2vec2-lg-xlsr-en-speech-emotion-recognition-onnx/onnx/model_quantized.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e6c875f2bb1fe16d29005d6f57dda37e64e27401724353ff0d0c6b1f22d0a23 +size 1263409099 diff --git a/wav2vec2-lg-xlsr-en-speech-emotion-recognition-onnx/preprocessor_config.json b/wav2vec2-lg-xlsr-en-speech-emotion-recognition-onnx/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..73caa151574001d3d495fae897e1d38968249712 --- /dev/null +++ b/wav2vec2-lg-xlsr-en-speech-emotion-recognition-onnx/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/wav2vec2-lg-xlsr-en-speech-emotion-recognition-onnx/source.txt b/wav2vec2-lg-xlsr-en-speech-emotion-recognition-onnx/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..9bf1b11f403d52e83909735f4a16716ef95fb8da --- /dev/null +++ b/wav2vec2-lg-xlsr-en-speech-emotion-recognition-onnx/source.txt @@ -0,0 +1 @@ +https://huggingface.co/Aroganta/wav2vec2-lg-xlsr-en-speech-emotion-recognition-onnx \ No newline at end of file diff --git a/wav2vec2-lg-xlsr-en-speech-emotion-recognition_onnx/gitattributes b/wav2vec2-lg-xlsr-en-speech-emotion-recognition_onnx/gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/wav2vec2-lg-xlsr-en-speech-emotion-recognition_onnx/gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/wav2vec2-lg-xlsr-en-speech-emotion-recognition_onnx/source.txt b/wav2vec2-lg-xlsr-en-speech-emotion-recognition_onnx/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a38b05aa97b73d147e3dd99bd242c16d4e27d2e --- /dev/null +++ b/wav2vec2-lg-xlsr-en-speech-emotion-recognition_onnx/source.txt @@ -0,0 +1 @@ +https://huggingface.co/steveway/wav2vec2-lg-xlsr-en-speech-emotion-recognition_onnx \ No newline at end of file diff --git a/wav2vec2-lg-xlsr-en-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-en-speech-emotion-recognition.config b/wav2vec2-lg-xlsr-en-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-en-speech-emotion-recognition.config new file mode 100644 index 0000000000000000000000000000000000000000..82f5d474a3329a735cb3061205f7a835517bfc95 --- /dev/null +++ b/wav2vec2-lg-xlsr-en-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-en-speech-emotion-recognition.config @@ -0,0 +1,139 @@ +Wav2Vec2Config { + "_name_or_path": "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition", + "activation_dropout": 0.05, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 256, + "contrastive_logits_temperature": 0.1, + "conv_bias": true, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "mean", + "ctc_zero_infinity": true, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": true, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "layer", + "feat_proj_dropout": 0.05, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "finetuning_task": "wav2vec2_clf", + "gradient_checkpointing": true, + "hidden_act": "gelu", + "hidden_dropout": 0.05, + "hidden_size": 1024, + "id2label": { + "0": "angry", + "1": "calm", + "2": "disgust", + "3": "fearful", + "4": "happy", + "5": "neutral", + "6": "sad", + "7": "surprised" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "label2id": { + "angry": 0, + "calm": 1, + "disgust": 2, + "fearful": 3, + "happy": 4, + "neutral": 5, + "sad": 6, + "surprised": 7 + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.05, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "num_adapter_layers": 3, + "num_attention_heads": 16, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 24, + "num_negatives": 100, + "output_hidden_size": 1024, + "pad_token_id": 0, + "pooling_mode": "mean", + "problem_type": "single_label_classification", + "proj_codevector_dim": 256, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "transformers_version": "4.41.1", + "use_weighted_layer_sum": false, + "vocab_size": 33, + "xvector_output_dim": 512 +} diff --git a/wav2vec2-lg-xlsr-en-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-en-speech-emotion-recognition.feature b/wav2vec2-lg-xlsr-en-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-en-speech-emotion-recognition.feature new file mode 100644 index 0000000000000000000000000000000000000000..0be68ab30a4dc0a635948b99096fa6887785ddb2 --- /dev/null +++ b/wav2vec2-lg-xlsr-en-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-en-speech-emotion-recognition.feature @@ -0,0 +1,9 @@ +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/wav2vec2-lg-xlsr-en-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-en-speech-emotion-recognition.onnx b/wav2vec2-lg-xlsr-en-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-en-speech-emotion-recognition.onnx new file mode 100644 index 0000000000000000000000000000000000000000..b545473307b64d9b6afa2491f5fbcfc644f39045 --- /dev/null +++ b/wav2vec2-lg-xlsr-en-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-en-speech-emotion-recognition.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b60ff8a70d81410ae56755140ba8b0456b6eb59f517da9ec69ce6d78103416ff +size 1263409062 diff --git a/wav2vec2-lg-xlsr-en-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-en-speech-emotion-recognition.yaml b/wav2vec2-lg-xlsr-en-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-en-speech-emotion-recognition.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b56158a32b06ea7bbc1da476de135c869a8695f0 --- /dev/null +++ b/wav2vec2-lg-xlsr-en-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-en-speech-emotion-recognition.yaml @@ -0,0 +1,16 @@ +feature_extractor: Wav2Vec2FeatureExtractor +full_name: ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition +labels: + 0: angry + 1: calm + 2: disgust + 3: fearful + 4: happy + 5: neutral + 6: sad + 7: surprised +model: Wav2Vec2ForSequenceClassification +sampling_rate: 16000 +shape: +- 1 +- 320000 diff --git a/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/.gitattributes b/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/source.txt b/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ab9327681492a7a3f29e4bcc4c2c53068beb12d --- /dev/null +++ b/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/source.txt @@ -0,0 +1 @@ +https://huggingface.co/steveway/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx \ No newline at end of file diff --git a/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-fr-speech-emotion-recognition.config b/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-fr-speech-emotion-recognition.config new file mode 100644 index 0000000000000000000000000000000000000000..6279adac6a5e76c02dcbbd6592e5b91ac6a39a12 --- /dev/null +++ b/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-fr-speech-emotion-recognition.config @@ -0,0 +1,130 @@ +Wav2Vec2Config { + "_name_or_path": "Lajavaness/wav2vec2-lg-xlsr-fr-speech-emotion-recognition", + "activation_dropout": 0.05, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 256, + "contrastive_logits_temperature": 0.1, + "conv_bias": true, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "mean", + "ctc_zero_infinity": true, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": true, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "layer", + "feat_proj_dropout": 0.05, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.05, + "hidden_size": 1024, + "id2label": { + "0": "Pleased", + "1": "Relaxed", + "2": "Neutral", + "3": "Sad", + "4": "Tension" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "label2id": { + "E1": 0, + "E2": 1, + "E3": 2, + "E4": 3, + "E6": 4 + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.05, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "num_adapter_layers": 3, + "num_attention_heads": 16, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 24, + "num_negatives": 100, + "output_hidden_size": 1024, + "pad_token_id": 0, + "proj_codevector_dim": 256, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "torch_dtype": "float32", + "transformers_version": "4.41.1", + "use_weighted_layer_sum": false, + "vocab_size": 59, + "xvector_output_dim": 512 +} diff --git a/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-fr-speech-emotion-recognition.feature b/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-fr-speech-emotion-recognition.feature new file mode 100644 index 0000000000000000000000000000000000000000..6399a8dec4c059345467cc4f232bf1c49004dd11 --- /dev/null +++ b/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-fr-speech-emotion-recognition.feature @@ -0,0 +1,10 @@ +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "processor_class": "Wav2Vec2Processor", + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-fr-speech-emotion-recognition.onnx b/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-fr-speech-emotion-recognition.onnx new file mode 100644 index 0000000000000000000000000000000000000000..483f9b8ac7634767e65f289365eb09d36b86aac7 --- /dev/null +++ b/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-fr-speech-emotion-recognition.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a0187561d6b29afde6f80a4cad3852705104ab41aa00a6f358be38ccd6ba560 +size 1263405978 diff --git a/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-fr-speech-emotion-recognition.processor b/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-fr-speech-emotion-recognition.processor new file mode 100644 index 0000000000000000000000000000000000000000..b5f981d023bc3d5842ea9ba066dbb5521437ae1b --- /dev/null +++ b/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-fr-speech-emotion-recognition.processor @@ -0,0 +1,22 @@ +Wav2Vec2Processor: +- feature_extractor: Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "processor_class": "Wav2Vec2Processor", + "return_attention_mask": true, + "sampling_rate": 16000 +} + +- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='Lajavaness/wav2vec2-lg-xlsr-fr-speech-emotion-recognition', vocab_size=59, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': '', 'pad_token': ''}, clean_up_tokenization_spaces=True), added_tokens_decoder={ + 0: AddedToken("", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False), + 1: AddedToken("", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False), + 2: AddedToken("", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False), + 3: AddedToken("", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False), +} + +{ + "processor_class": "Wav2Vec2Processor" +} diff --git a/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-fr-speech-emotion-recognition.tokens b/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-fr-speech-emotion-recognition.tokens new file mode 100644 index 0000000000000000000000000000000000000000..c35e017fa7827990b3da676bc1bd211911b8a071 --- /dev/null +++ b/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-fr-speech-emotion-recognition.tokens @@ -0,0 +1,59 @@ +0: +1: +2: +3: +4: '|' +5: '''' +6: '-' +7: a +8: b +9: c +10: d +11: e +12: f +13: g +14: h +15: i +16: j +17: k +18: l +19: m +20: n +21: o +22: p +23: q +24: r +25: s +26: t +27: u +28: v +29: w +30: x +31: y +32: z +33: à +34: á +35: â +36: ä +37: ç +38: è +39: é +40: ê +41: ë +42: í +43: î +44: ï +45: ñ +46: ó +47: ô +48: ö +49: ù +50: ú +51: û +52: ü +53: ć +54: č +55: ō +56: œ +57: š +58: ș diff --git a/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-fr-speech-emotion-recognition.yaml b/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-fr-speech-emotion-recognition.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d311b48efe4624563f5c520349bc8290429be16d --- /dev/null +++ b/wav2vec2-lg-xlsr-fr-speech-emotion-recognition_onnx/wav2vec2-lg-xlsr-fr-speech-emotion-recognition.yaml @@ -0,0 +1,13 @@ +feature_extractor: Wav2Vec2FeatureExtractor +full_name: Lajavaness/wav2vec2-lg-xlsr-fr-speech-emotion-recognition +labels: + 0: Pleased + 1: Relaxed + 2: Neutral + 3: Sad + 4: Tension +model: Wav2Vec2ForSequenceClassification +sampling_rate: 16000 +shape: +- 1 +- 320000 diff --git a/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP_onnx/.gitattributes b/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP_onnx/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP_onnx/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP_onnx/source.txt b/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP_onnx/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4dbd414d9537d7441772781a88d7e5c996bf016 --- /dev/null +++ b/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP_onnx/source.txt @@ -0,0 +1 @@ +https://huggingface.co/steveway/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP_onnx \ No newline at end of file diff --git a/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP_onnx/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP.config b/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP_onnx/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP.config new file mode 100644 index 0000000000000000000000000000000000000000..2e72c8dbf9f8db04dbe1aabdedd4ce8d1f2637a2 --- /dev/null +++ b/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP_onnx/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP.config @@ -0,0 +1,130 @@ +Wav2Vec2Config { + "_name_or_path": "canlinzhang/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP", + "activation_dropout": 0.0, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 256, + "contrastive_logits_temperature": 0.1, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.1, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "freeze_feat_extract_train": true, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "neu", + "1": "ang", + "2": "sad", + "3": "hap" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "ang": 1, + "hap": 3, + "neu": 0, + "sad": 2 + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "no_mask_channel_overlap": false, + "no_mask_time_overlap": false, + "num_adapter_layers": 3, + "num_attention_heads": 12, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 12, + "num_negatives": 100, + "output_hidden_size": 768, + "pad_token_id": 0, + "proj_codevector_dim": 256, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "torch_dtype": "float32", + "transformers_version": "4.41.1", + "use_weighted_layer_sum": false, + "vocab_size": 32, + "xvector_output_dim": 512 +} diff --git a/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP_onnx/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP.feature b/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP_onnx/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP.feature new file mode 100644 index 0000000000000000000000000000000000000000..8b0fc1ee98d994e693f7f5affb8e3d387b66b793 --- /dev/null +++ b/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP_onnx/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP.feature @@ -0,0 +1,9 @@ +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 16000 +} diff --git a/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP_onnx/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP.onnx b/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP_onnx/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP.onnx new file mode 100644 index 0000000000000000000000000000000000000000..797d135851569c37b8d95c11f4c823acf2d42276 --- /dev/null +++ b/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP_onnx/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f972a615732d3b72e6472b4ba9307b8799c700e03c9b178306e47f3ee496845f +size 378581007 diff --git a/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP_onnx/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP.yaml b/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP_onnx/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ab0c91027c5fb05cd7563e6544b3d0ddbb31cba0 --- /dev/null +++ b/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP_onnx/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP.yaml @@ -0,0 +1,12 @@ +feature_extractor: Wav2Vec2FeatureExtractor +full_name: canlinzhang/wav2vec2_speech_emotion_recognition_trained_on_IEMOCAP +labels: + 0: neu + 1: ang + 2: sad + 3: hap +model: Wav2Vec2ForSequenceClassification +sampling_rate: 16000 +shape: +- 1 +- 320000 diff --git a/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee_onnx/.gitattributes b/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee_onnx/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee_onnx/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee_onnx/source.txt b/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee_onnx/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..94f4ae1612141f211312f7e08271da8203b12ab1 --- /dev/null +++ b/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee_onnx/source.txt @@ -0,0 +1,2 @@ + +https://huggingface.co/steveway/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee_onnx \ No newline at end of file diff --git a/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee_onnx/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee.config b/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee_onnx/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee.config new file mode 100644 index 0000000000000000000000000000000000000000..5d166e5f1b76fac4c5cf69db969ccd8a14f0a020 --- /dev/null +++ b/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee_onnx/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee.config @@ -0,0 +1,133 @@ +Wav2Vec2Config { + "_name_or_path": "Hamzaaa/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee", + "activation_dropout": 0.0, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 256, + "contrastive_logits_temperature": 0.1, + "conv_bias": true, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "mean", + "ctc_zero_infinity": true, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": true, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "layer", + "feat_proj_dropout": 0.0, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "finetuning_task": "wav2vec2_clf", + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 1024, + "id2label": { + "0": "anger", + "1": "disgust", + "2": "fear", + "3": "happiness", + "4": "sadness" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "label2id": { + "anger": 0, + "disgust": 1, + "fear": 2, + "happiness": 3, + "sadness": 4 + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.1, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "num_adapter_layers": 3, + "num_attention_heads": 16, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 24, + "num_negatives": 100, + "output_hidden_size": 1024, + "pad_token_id": 54, + "pooling_mode": "mean", + "problem_type": "single_label_classification", + "proj_codevector_dim": 256, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "torch_dtype": "float32", + "transformers_version": "4.41.1", + "use_weighted_layer_sum": false, + "vocab_size": 55, + "xvector_output_dim": 512 +} diff --git a/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee_onnx/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee.feature b/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee_onnx/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee.feature new file mode 100644 index 0000000000000000000000000000000000000000..8b0fc1ee98d994e693f7f5affb8e3d387b66b793 --- /dev/null +++ b/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee_onnx/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee.feature @@ -0,0 +1,9 @@ +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 16000 +} diff --git a/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee_onnx/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee.onnx b/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee_onnx/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee.onnx new file mode 100644 index 0000000000000000000000000000000000000000..73122dc451c7e5e479a6e6e09f0ccb5dba42791c --- /dev/null +++ b/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee_onnx/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5e94acc528d4a6828ecda9e9cbb6597f8ef9e5a383653f3082243044763c249 +size 1263405978 diff --git a/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee_onnx/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee.yaml b/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee_onnx/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eceabbb57e83e6dd33b00486ef9028f371262e67 --- /dev/null +++ b/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee_onnx/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee.yaml @@ -0,0 +1,13 @@ +feature_extractor: Wav2Vec2FeatureExtractor +full_name: Hamzaaa/xlsr-wav2vec-speech-emotion-recognition-finetuned-Savee +labels: + 0: anger + 1: disgust + 2: fear + 3: happiness + 4: sadness +model: Wav2Vec2ForSequenceClassification +sampling_rate: 16000 +shape: +- 1 +- 320000 diff --git a/xlsr-wav2vec-speech-emotion-recognition_onnx/.gitattributes b/xlsr-wav2vec-speech-emotion-recognition_onnx/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/xlsr-wav2vec-speech-emotion-recognition_onnx/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/xlsr-wav2vec-speech-emotion-recognition_onnx/source.txt b/xlsr-wav2vec-speech-emotion-recognition_onnx/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..88f87701f71e0443807f67f2c421650de97fea87 --- /dev/null +++ b/xlsr-wav2vec-speech-emotion-recognition_onnx/source.txt @@ -0,0 +1 @@ +https://huggingface.co/steveway/xlsr-wav2vec-speech-emotion-recognition_onnx \ No newline at end of file diff --git a/xlsr-wav2vec-speech-emotion-recognition_onnx/xlsr-wav2vec-speech-emotion-recognition.config b/xlsr-wav2vec-speech-emotion-recognition_onnx/xlsr-wav2vec-speech-emotion-recognition.config new file mode 100644 index 0000000000000000000000000000000000000000..bbc1985b8344da840e4fdd3f8147734119a0e879 --- /dev/null +++ b/xlsr-wav2vec-speech-emotion-recognition_onnx/xlsr-wav2vec-speech-emotion-recognition.config @@ -0,0 +1,134 @@ +Wav2Vec2Config { + "_name_or_path": "harshit345/xlsr-wav2vec-speech-emotion-recognition", + "activation_dropout": 0.0, + "adapter_attn_dim": null, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForSpeechClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 256, + "contrastive_logits_temperature": 0.1, + "conv_bias": true, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "mean", + "ctc_zero_infinity": true, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": true, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "layer", + "feat_proj_dropout": 0.0, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "finetuning_task": "wav2vec2_clf", + "gradient_checkpointing": true, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 1024, + "id2label": { + "0": "anger", + "1": "disgust", + "2": "fear", + "3": "happiness", + "4": "sadness" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "label2id": { + "anger": 0, + "disgust": 1, + "fear": 2, + "happiness": 3, + "sadness": 4 + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.1, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "num_adapter_layers": 3, + "num_attention_heads": 16, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 24, + "num_negatives": 100, + "output_hidden_size": 1024, + "pad_token_id": 54, + "pooling_mode": "mean", + "problem_type": "single_label_classification", + "proj_codevector_dim": 256, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "torch_dtype": "float32", + "transformers_version": "4.41.1", + "use_weighted_layer_sum": false, + "vocab_size": 55, + "xvector_output_dim": 512 +} diff --git a/xlsr-wav2vec-speech-emotion-recognition_onnx/xlsr-wav2vec-speech-emotion-recognition.feature b/xlsr-wav2vec-speech-emotion-recognition_onnx/xlsr-wav2vec-speech-emotion-recognition.feature new file mode 100644 index 0000000000000000000000000000000000000000..0be68ab30a4dc0a635948b99096fa6887785ddb2 --- /dev/null +++ b/xlsr-wav2vec-speech-emotion-recognition_onnx/xlsr-wav2vec-speech-emotion-recognition.feature @@ -0,0 +1,9 @@ +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/xlsr-wav2vec-speech-emotion-recognition_onnx/xlsr-wav2vec-speech-emotion-recognition.onnx b/xlsr-wav2vec-speech-emotion-recognition_onnx/xlsr-wav2vec-speech-emotion-recognition.onnx new file mode 100644 index 0000000000000000000000000000000000000000..856514b600e9c028b6d2ad2af491270ff675ac0b --- /dev/null +++ b/xlsr-wav2vec-speech-emotion-recognition_onnx/xlsr-wav2vec-speech-emotion-recognition.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e86f0b7ea1da2ba9af9c00177449b57f5cba8a3d9c9d79f5348df86a05c1d4d +size 1263405978 diff --git a/xlsr-wav2vec-speech-emotion-recognition_onnx/xlsr-wav2vec-speech-emotion-recognition.yaml b/xlsr-wav2vec-speech-emotion-recognition_onnx/xlsr-wav2vec-speech-emotion-recognition.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dec898489593084ad39e95723f542a97a899865f --- /dev/null +++ b/xlsr-wav2vec-speech-emotion-recognition_onnx/xlsr-wav2vec-speech-emotion-recognition.yaml @@ -0,0 +1,13 @@ +feature_extractor: Wav2Vec2FeatureExtractor +full_name: harshit345/xlsr-wav2vec-speech-emotion-recognition +labels: + 0: anger + 1: disgust + 2: fear + 3: happiness + 4: sadness +model: Wav2Vec2ForSequenceClassification +sampling_rate: 16000 +shape: +- 1 +- 320000