{
  "architectures": [
    "ESSModel"
  ],
  "model": {
    "conformer": {
      "attention_dropout_p": 0.2,
      "conv_dropout_p": 0.2,
      "conv_expansion_factor": 2,
      "conv_kernel_size": 7,
      "decoder_dim": 256,
      "encoder_dim": 256,
      "feed_forward_dropout_p": 0.2,
      "feed_forward_expansion_factor": 4,
      "half_step_residual": true,
      "num_attention_heads": 2,
      "num_decode_layers": 6,
      "num_encode_layers": 4
    },
    "max_seq_len": 1000,
    "mode": "train",
    "num_emotion": 10,
    "reference_encoder": {
      "dropout": 0.2,
      "encoder_dim": 128
    },
    "variance_embedding": {
      "energy_quantization": "linear",
      "n_bins": 256,
      "pitch_quantization": "linear"
    },
    "variance_predictor": {
      "dropout": 0.5,
      "filter_size": 256,
      "kernel_size": 3
    },
    "vocoder": {
      "model": "HiFi-GAN",
      "speaker": "tth"
    }
  },
  "model_type": "emofs2",
  "preprocess": {
    "dataset": "vlsp2023emo",
    "emotion2id": {
      "angry": 3,
      "happy": 1,
      "neutral": 0,
      "sad": 2,
      "surprise": 4
    },
    "id2emotion": {
      "0": "neutral",
      "1": "happy",
      "2": "sad",
      "3": "angry",
      "4": "surprise"
    },
    "path": {
      "corpus_path": "./data/pretrained_tts_dataset/tuyendv.dict",
      "lexicon_path": "../datasets/ess-vlsp2023-lexicon/lexicon.dict",
      "preprocessed_path": "../datasets/ess-vlsp2023-emo-processed-phoneme-level",
      "raw_path": "./data/pretrained_tts_dataset_raw"
    },
    "preprocessing": {
      "audio": {
        "max_wav_value": 32768.0,
        "sampling_rate": 22050
      },
      "energy": {
        "feature": "phoneme_level",
        "normalization": true
      },
      "mel": {
        "mel_fmax": 8000,
        "mel_fmin": 0,
        "n_mel_channels": 80
      },
      "pitch": {
        "feature": "phoneme_level",
        "normalization": true
      },
      "stft": {
        "filter_length": 1024,
        "hop_length": 256,
        "win_length": 1024
      },
      "text": {
        "language": "en",
        "text_cleaners": []
      },
      "val_size": 512
    },
    "smoothing_label": 0.1
  },
  "torch_dtype": "float32",
  "train": null,
  "transformers_version": "4.35.2"
}