Text-to-Speech
ONNX
zero-shot
multilingual
Approximetal committed on
Commit
04dbd13
·
verified ·
1 Parent(s): b993b85

Update config.json

Browse files
Files changed (1) hide show
  1. config.json +157 -3
config.json CHANGED
@@ -1,4 +1,158 @@
1
  {
2
- "name": "lemas-tts",
3
- "type": "multilingual"
4
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {
2
+ "root": "pretrained_models",
3
+ "structure": {
4
+ "ckpts": {
5
+ "multilingual_grl": {
6
+ "path": "pretrained_models/ckpts/multilingual_grl",
7
+ "description": "Main LEMAS-TTS model (multilingual_grl)",
8
+ "files": [
9
+ {
10
+ "path": "pretrained_models/ckpts/multilingual_grl/multilingual_grl.safetensors",
11
+ "info": "Unconditional/text-conditioned non-autoregressive model weights with GRL, supporting multilingual TTS and editing (default Edit Model in Gradio)"
12
+ }
13
+ ]
14
+ },
15
+ "multilingual_prosody": {
16
+ "path": "pretrained_models/ckpts/multilingual_prosody",
17
+ "description": "non-autoregressive variant with an additional prosody encoder",
18
+ "files": [
19
+ {
20
+ "path": "pretrained_models/ckpts/multilingual_prosody/multilingual_prosody.safetensors",
21
+ "info": "Multilingual model weights with global prosody conditioning enabled (selectable in the model menu)"
22
+ }
23
+ ]
24
+ },
25
+ "prosody_encoder": {
26
+ "path": "pretrained_models/ckpts/prosody_encoder",
27
+ "description": "Pretssel / UnitY2-style prosody encoder used by the non-autoregressive prosody backend",
28
+ "files": [
29
+ {
30
+ "path": "pretrained_models/ckpts/prosody_encoder/pretssel_cfg.json",
31
+ "info": "Architecture configuration for the Pretssel/UnitY2 prosody encoder (dimensions, number of layers, etc.)"
32
+ },
33
+ {
34
+ "path": "pretrained_models/ckpts/prosody_encoder/prosody_encoder_UnitY2.pt",
35
+ "info": "Prosody encoder weights, used to extract global prosody embeddings from reference audio"
36
+ }
37
+ ]
38
+ },
39
+ "vocos-mel-24khz": {
40
+ "path": "pretrained_models/ckpts/vocos-mel-24khz",
41
+ "description": "Vocos neural vocoder (mel → 24kHz waveform)",
42
+ "files": [
43
+ {
44
+ "path": "pretrained_models/ckpts/vocos-mel-24khz/config.yaml",
45
+ "info": "Vocos vocoder configuration defining mel feature dimensions and network architecture"
46
+ },
47
+ {
48
+ "path": "pretrained_models/ckpts/vocos-mel-24khz/pytorch_model.bin",
49
+ "info": "Main Vocos vocoder weights, used to decode mel features in the CFM backend"
50
+ },
51
+ {
52
+ "path": "pretrained_models/ckpts/vocos-mel-24khz/README.md",
53
+ "info": "Documentation for the vocoder (origin, usage, and notes)"
54
+ }
55
+ ]
56
+ }
57
+ },
58
+ "data": {
59
+ "multilingual_grl": {
60
+ "path": "pretrained_models/data/multilingual_grl",
61
+ "description": "Text vocabulary for the multilingual_grl model",
62
+ "files": [
63
+ {
64
+ "path": "pretrained_models/data/multilingual_grl/vocab.txt",
65
+ "info": "phone vocabulary corresponding to the text embeddings of the multilingual_grl model"
66
+ }
67
+ ]
68
+ },
69
+ "multilingual_prosody": {
70
+ "path": "pretrained_models/data/multilingual_prosody",
71
+ "description": "Text vocabulary for the multilingual_prosody model",
72
+ "files": [
73
+ {
74
+ "path": "pretrained_models/data/multilingual_prosody/vocab.txt",
75
+ "info": "Shared text vocabulary used by the multilingual_prosody model"
76
+ }
77
+ ]
78
+ }
79
+ },
80
+ "demos": {
81
+ "root": "pretrained_models/demos",
82
+ "files": [
83
+ {
84
+ "path": "pretrained_models/demos/test.wav",
85
+ "info": "Simple test audio used for quick validation of the Gradio script"
86
+ }
87
+ ],
88
+ "lemas_edit_test": {
89
+ "path": "pretrained_models/demos/lemas_edit_test",
90
+ "description": "Audio samples and alignment annotations for LEMAS-Edit demos",
91
+ "subdirs": {
92
+ "vocals": {
93
+ "path": "pretrained_models/demos/lemas_edit_test/vocals",
94
+ "files": [
95
+ {
96
+ "path": "pretrained_models/demos/lemas_edit_test/vocals/en_AUD0000000214_S0001522.mp3",
97
+ "info": "English demo audio used for AR/NAR editing examples"
98
+ },
99
+ {
100
+ "path": "pretrained_models/demos/lemas_edit_test/vocals/zh_emilia_zh_0008385782.mp3",
101
+ "info": "Chinese demo audio used for multilingual editing examples"
102
+ }
103
+ ]
104
+ },
105
+ "align": {
106
+ "path": "pretrained_models/demos/lemas_edit_test/align",
107
+ "files": [
108
+ {
109
+ "path": "pretrained_models/demos/lemas_edit_test/align/en_AUD0000000214_S0001522.json",
110
+ "info": "MMS alignment JSON for the English demo, including intervals, words, and modified_index used for editing"
111
+ },
112
+ {
113
+ "path": "pretrained_models/demos/lemas_edit_test/align/zh_emilia_zh_0008385782.json",
114
+ "info": "MMS alignment JSON for the Chinese demo"
115
+ }
116
+ ]
117
+ }
118
+ }
119
+ }
120
+ },
121
+ "uvr5": {
122
+ "path": "pretrained_models/uvr5",
123
+ "description": "Kim Vocal UVR5 models and configurations (used for denoising)",
124
+ "files": [
125
+ {
126
+ "path": "pretrained_models/uvr5/Kim_Vocal_1.onnx",
127
+ "info": "Main UVR5 model (ONNX format) for vocal/accompaniment separation and denoising"
128
+ },
129
+ {
130
+ "path": "pretrained_models/uvr5/MDX-Net-Kim-Vocal1.json",
131
+ "info": "UVR5 model architecture and inference configuration (channels, frame length, etc.)"
132
+ },
133
+ {
134
+ "path": "pretrained_models/uvr5/model_data.json",
135
+ "info": "UVR5 metadata including presets, model list, and default parameters"
136
+ },
137
+ {
138
+ "path": "pretrained_models/uvr5/model_name_mapper.json",
139
+ "info": "Mapping from internal UVR5 model names to human-readable names for frontend selection"
140
+ }
141
+ ]
142
+ },
143
+ "whisperx": {
144
+ "path": "pretrained_models/whisperx",
145
+ "description": "WhisperX VAD and segmentation model assets",
146
+ "files": [
147
+ {
148
+ "path": "pretrained_models/whisperx/whisperx-vad-segmentation.bin",
149
+ "info": "WhisperX voice activity detection (VAD) weights used for long-audio segmentation and ASR alignment assistance"
150
+ },
151
+ {
152
+ "path": "pretrained_models/whisperx/whisperx-vad-segmentation.bak",
153
+ "info": "Backup or legacy version of the VAD model, kept for safety"
154
+ }
155
+ ]
156
+ }
157
+ }
158
+ }