Text-to-Speech
ONNX
zero-shot
multilingual
Approximetal committed on
Commit
b1fd273
·
verified ·
1 Parent(s): 97a5545

Update config.json

Browse files
Files changed (1) hide show
  1. config.json +186 -4
config.json CHANGED
@@ -1,4 +1,186 @@
1
- {
2
- "name": "lemas-edit",
3
- "type": "multilingual"
4
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "root": "pretrained_models",
3
+ "structure": {
4
+ "files": [
5
+ {
6
+ "path": "pretrained_models/denoiser_model.onnx",
7
+ "info": "Audio denoising model (DeepFilterNet)"
8
+ }
9
+ ],
10
+ "ckpts": {
11
+ "autoregressive": {
12
+ "path": "pretrained_models/ckpts/autoregressive",
13
+ "description": "Autoregressive neural codec editing model and its dependencies",
14
+ "files": [
15
+ {
16
+ "path": "pretrained_models/ckpts/autoregressive/config.json",
17
+ "info": "Model architecture and hyperparameter configuration for the autoregressive editing model"
18
+ },
19
+ {
20
+ "path": "pretrained_models/ckpts/autoregressive/multilingual_330M.pth",
21
+ "info": "Main weights of the multilingual autoregressive codec editing model (~330M parameters), used when Edit Model=autoregressive"
22
+ },
23
+ {
24
+ "path": "pretrained_models/ckpts/autoregressive/encodec_4cb2048_giga.th",
25
+ "info": "Neural codec encoder (AudioTokenizer) with 4 codebooks × 2048 codewords, used to decode discrete codes into 16kHz waveforms"
26
+ },
27
+ {
28
+ "path": "pretrained_models/ckpts/autoregressive/dac_SR_8codes_2048_hop960_speech.pth",
29
+ "info": "DAC super-resolution model (AudioSR), used to reconstruct audio from 16kHz codec to 48kHz or target sample rate for the AR backend"
30
+ }
31
+ ]
32
+ },
33
+ "multilingual_grl": {
34
+ "path": "pretrained_models/ckpts/multilingual_grl",
35
+ "description": "Main LEMAS-TTS model (multilingual_grl)",
36
+ "files": [
37
+ {
38
+ "path": "pretrained_models/ckpts/multilingual_grl/multilingual_grl.safetensors",
39
+ "info": "Unconditional/text-conditioned non-autoregressive model weights with GRL, supporting multilingual TTS and editing (default Edit Model in Gradio)"
40
+ }
41
+ ]
42
+ },
43
+ "multilingual_prosody": {
44
+ "path": "pretrained_models/ckpts/multilingual_prosody",
45
+ "description": "Non-autoregressive variant with an additional prosody encoder",
46
+ "files": [
47
+ {
48
+ "path": "pretrained_models/ckpts/multilingual_prosody/multilingual_prosody.safetensors",
49
+ "info": "Multilingual model weights with global prosody conditioning enabled (selectable in the model menu)"
50
+ }
51
+ ]
52
+ },
53
+ "prosody_encoder": {
54
+ "path": "pretrained_models/ckpts/prosody_encoder",
55
+ "description": "Pretssel / UnitY2-style prosody encoder used by the non-autoregressive prosody backend",
56
+ "files": [
57
+ {
58
+ "path": "pretrained_models/ckpts/prosody_encoder/pretssel_cfg.json",
59
+ "info": "Architecture configuration for the Pretssel/UnitY2 prosody encoder (dimensions, number of layers, etc.)"
60
+ },
61
+ {
62
+ "path": "pretrained_models/ckpts/prosody_encoder/prosody_encoder_UnitY2.pt",
63
+ "info": "Prosody encoder weights, used to extract global prosody embeddings from reference audio"
64
+ }
65
+ ]
66
+ },
67
+ "vocos-mel-24khz": {
68
+ "path": "pretrained_models/ckpts/vocos-mel-24khz",
69
+ "description": "Vocos neural vocoder (mel → 24kHz waveform)",
70
+ "files": [
71
+ {
72
+ "path": "pretrained_models/ckpts/vocos-mel-24khz/config.yaml",
73
+ "info": "Vocos vocoder configuration defining mel feature dimensions and network architecture"
74
+ },
75
+ {
76
+ "path": "pretrained_models/ckpts/vocos-mel-24khz/pytorch_model.bin",
77
+ "info": "Main Vocos vocoder weights, used to decode mel features in the CFM backend"
78
+ },
79
+ {
80
+ "path": "pretrained_models/ckpts/vocos-mel-24khz/README.md",
81
+ "info": "Documentation for the vocoder (origin, usage, and notes)"
82
+ }
83
+ ]
84
+ }
85
+ },
86
+ "data": {
87
+ "multilingual_grl": {
88
+ "path": "pretrained_models/data/multilingual_grl",
89
+ "description": "Text vocabulary for the multilingual_grl model",
90
+ "files": [
91
+ {
92
+ "path": "pretrained_models/data/multilingual_grl/vocab.txt",
93
+ "info": "Phone vocabulary corresponding to the text embeddings of the multilingual_grl model"
94
+ }
95
+ ]
96
+ },
97
+ "multilingual_prosody": {
98
+ "path": "pretrained_models/data/multilingual_prosody",
99
+ "description": "Text vocabulary for the multilingual_prosody model",
100
+ "files": [
101
+ {
102
+ "path": "pretrained_models/data/multilingual_prosody/vocab.txt",
103
+ "info": "Shared text vocabulary used by the multilingual_prosody model"
104
+ }
105
+ ]
106
+ }
107
+ },
108
+ "demos": {
109
+ "root": "pretrained_models/demos",
110
+ "files": [
111
+ {
112
+ "path": "pretrained_models/demos/test.wav",
113
+ "info": "Simple test audio used for quick validation of the Gradio script"
114
+ }
115
+ ],
116
+ "lemas_edit_test": {
117
+ "path": "pretrained_models/demos/lemas_edit_test",
118
+ "description": "Audio samples and alignment annotations for LEMAS-Edit demos",
119
+ "subdirs": {
120
+ "vocals": {
121
+ "path": "pretrained_models/demos/lemas_edit_test/vocals",
122
+ "files": [
123
+ {
124
+ "path": "pretrained_models/demos/lemas_edit_test/vocals/en_AUD0000000214_S0001522.mp3",
125
+ "info": "English demo audio used for AR/NAR editing examples"
126
+ },
127
+ {
128
+ "path": "pretrained_models/demos/lemas_edit_test/vocals/zh_emilia_zh_0008385782.mp3",
129
+ "info": "Chinese demo audio used for multilingual editing examples"
130
+ }
131
+ ]
132
+ },
133
+ "align": {
134
+ "path": "pretrained_models/demos/lemas_edit_test/align",
135
+ "files": [
136
+ {
137
+ "path": "pretrained_models/demos/lemas_edit_test/align/en_AUD0000000214_S0001522.json",
138
+ "info": "MMS alignment JSON for the English demo, including intervals, words, and modified_index used for editing"
139
+ },
140
+ {
141
+ "path": "pretrained_models/demos/lemas_edit_test/align/zh_emilia_zh_0008385782.json",
142
+ "info": "MMS alignment JSON for the Chinese demo"
143
+ }
144
+ ]
145
+ }
146
+ }
147
+ }
148
+ },
149
+ "uvr5": {
150
+ "path": "pretrained_models/uvr5",
151
+ "description": "Kim Vocal UVR5 models and configurations (used for denoising)",
152
+ "files": [
153
+ {
154
+ "path": "pretrained_models/uvr5/Kim_Vocal_1.onnx",
155
+ "info": "Main UVR5 model (ONNX format) for vocal/accompaniment separation and denoising"
156
+ },
157
+ {
158
+ "path": "pretrained_models/uvr5/MDX-Net-Kim-Vocal1.json",
159
+ "info": "UVR5 model architecture and inference configuration (channels, frame length, etc.)"
160
+ },
161
+ {
162
+ "path": "pretrained_models/uvr5/model_data.json",
163
+ "info": "UVR5 metadata including presets, model list, and default parameters"
164
+ },
165
+ {
166
+ "path": "pretrained_models/uvr5/model_name_mapper.json",
167
+ "info": "Mapping from internal UVR5 model names to human-readable names for frontend selection"
168
+ }
169
+ ]
170
+ },
171
+ "whisperx": {
172
+ "path": "pretrained_models/whisperx",
173
+ "description": "WhisperX VAD and segmentation model assets",
174
+ "files": [
175
+ {
176
+ "path": "pretrained_models/whisperx/whisperx-vad-segmentation.bin",
177
+ "info": "WhisperX voice activity detection (VAD) weights used for long-audio segmentation and ASR alignment assistance"
178
+ },
179
+ {
180
+ "path": "pretrained_models/whisperx/whisperx-vad-segmentation.bak",
181
+ "info": "Backup or legacy version of the VAD model, kept for safety"
182
+ }
183
+ ]
184
+ }
185
+ }
186
+ }