Text-to-Speech
ONNX
zero-shot
multilingual
File size: 6,422 Bytes
b993b85
04dbd13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
{
  "root": "pretrained_models",
  "structure": {
    "ckpts": {
      "multilingual_grl": {
        "path": "pretrained_models/ckpts/multilingual_grl",
        "description": "Main LEMAS-TTS model (multilingual_grl)",
        "files": [
          {
            "path": "pretrained_models/ckpts/multilingual_grl/multilingual_grl.safetensors",
            "info": "Unconditional/text-conditioned non-autoregressive model weights with GRL, supporting multilingual TTS and editing (default Edit Model in Gradio)"
          }
        ]
      },
      "multilingual_prosody": {
        "path": "pretrained_models/ckpts/multilingual_prosody",
        "description": "Non-autoregressive variant with an additional prosody encoder",
        "files": [
          {
            "path": "pretrained_models/ckpts/multilingual_prosody/multilingual_prosody.safetensors",
            "info": "Multilingual model weights with global prosody conditioning enabled (selectable in the model menu)"
          }
        ]
      },
      "prosody_encoder": {
        "path": "pretrained_models/ckpts/prosody_encoder",
        "description": "Pretssel / UnitY2-style prosody encoder used by the non-autoregressive prosody backend",
        "files": [
          {
            "path": "pretrained_models/ckpts/prosody_encoder/pretssel_cfg.json",
            "info": "Architecture configuration for the Pretssel/UnitY2 prosody encoder (dimensions, number of layers, etc.)"
          },
          {
            "path": "pretrained_models/ckpts/prosody_encoder/prosody_encoder_UnitY2.pt",
            "info": "Prosody encoder weights, used to extract global prosody embeddings from reference audio"
          }
        ]
      },
      "vocos-mel-24khz": {
        "path": "pretrained_models/ckpts/vocos-mel-24khz",
        "description": "Vocos neural vocoder (mel → 24kHz waveform)",
        "files": [
          {
            "path": "pretrained_models/ckpts/vocos-mel-24khz/config.yaml",
            "info": "Vocos vocoder configuration defining mel feature dimensions and network architecture"
          },
          {
            "path": "pretrained_models/ckpts/vocos-mel-24khz/pytorch_model.bin",
            "info": "Main Vocos vocoder weights, used to decode mel features in the CFM backend"
          },
          {
            "path": "pretrained_models/ckpts/vocos-mel-24khz/README.md",
            "info": "Documentation for the vocoder (origin, usage, and notes)"
          }
        ]
      }
    },
    "data": {
      "multilingual_grl": {
        "path": "pretrained_models/data/multilingual_grl",
        "description": "Text vocabulary for the multilingual_grl model",
        "files": [
          {
            "path": "pretrained_models/data/multilingual_grl/vocab.txt",
            "info": "Phone vocabulary corresponding to the text embeddings of the multilingual_grl model"
          }
        ]
      },
      "multilingual_prosody": {
        "path": "pretrained_models/data/multilingual_prosody",
        "description": "Text vocabulary for the multilingual_prosody model",
        "files": [
          {
            "path": "pretrained_models/data/multilingual_prosody/vocab.txt",
            "info": "Shared text vocabulary used by the multilingual_prosody model"
          }
        ]
      }
    },
    "demos": {
      "root": "pretrained_models/demos",
      "files": [
        {
          "path": "pretrained_models/demos/test.wav",
          "info": "Simple test audio used for quick validation of the Gradio script"
        }
      ],
      "lemas_edit_test": {
        "path": "pretrained_models/demos/lemas_edit_test",
        "description": "Audio samples and alignment annotations for LEMAS-Edit demos",
        "subdirs": {
          "vocals": {
            "path": "pretrained_models/demos/lemas_edit_test/vocals",
            "files": [
              {
                "path": "pretrained_models/demos/lemas_edit_test/vocals/en_AUD0000000214_S0001522.mp3",
                "info": "English demo audio used for AR/NAR editing examples"
              },
              {
                "path": "pretrained_models/demos/lemas_edit_test/vocals/zh_emilia_zh_0008385782.mp3",
                "info": "Chinese demo audio used for multilingual editing examples"
              }
            ]
          },
          "align": {
            "path": "pretrained_models/demos/lemas_edit_test/align",
            "files": [
              {
                "path": "pretrained_models/demos/lemas_edit_test/align/en_AUD0000000214_S0001522.json",
                "info": "MMS alignment JSON for the English demo, including intervals, words, and modified_index used for editing"
              },
              {
                "path": "pretrained_models/demos/lemas_edit_test/align/zh_emilia_zh_0008385782.json",
                "info": "MMS alignment JSON for the Chinese demo"
              }
            ]
          }
        }
      }
    },
    "uvr5": {
      "path": "pretrained_models/uvr5",
      "description": "Kim Vocal UVR5 models and configurations (used for denoising)",
      "files": [
        {
          "path": "pretrained_models/uvr5/Kim_Vocal_1.onnx",
          "info": "Main UVR5 model (ONNX format) for vocal/accompaniment separation and denoising"
        },
        {
          "path": "pretrained_models/uvr5/MDX-Net-Kim-Vocal1.json",
          "info": "UVR5 model architecture and inference configuration (channels, frame length, etc.)"
        },
        {
          "path": "pretrained_models/uvr5/model_data.json",
          "info": "UVR5 metadata including presets, model list, and default parameters"
        },
        {
          "path": "pretrained_models/uvr5/model_name_mapper.json",
          "info": "Mapping from internal UVR5 model names to human-readable names for frontend selection"
        }
      ]
    },
    "whisperx": {
      "path": "pretrained_models/whisperx",
      "description": "WhisperX VAD and segmentation model assets",
      "files": [
        {
          "path": "pretrained_models/whisperx/whisperx-vad-segmentation.bin",
          "info": "WhisperX voice activity detection (VAD) weights used for long-audio segmentation and ASR alignment assistance"
        },
        {
          "path": "pretrained_models/whisperx/whisperx-vad-segmentation.bak",
          "info": "Backup or legacy version of the VAD model, kept for safety"
        }
      ]
    }
  }
}