{
"root": "pretrained_models",
"structure": {
"files": [
{
"path": "pretrained_models/denoiser_model.onnx",
"info": "Audio denoising model (DeepFilterNet)"
}
],
"ckpts": {
"autoregressive": {
"path": "pretrained_models/ckpts/autoregressive",
"description": "Autoregressive neural codec editing model and its dependencies",
"files": [
{
"path": "pretrained_models/ckpts/autoregressive/config.json",
"info": "Model architecture and hyperparameter configuration for the autoregressive editing model"
},
{
"path": "pretrained_models/ckpts/autoregressive/multilingual_330M.pth",
"info": "Main weights of the multilingual autoregressive codec editing model (~330M parameters), used when Edit Model=autoregressive"
},
{
"path": "pretrained_models/ckpts/autoregressive/encodec_4cb2048_giga.th",
"info": "Neural codec encoder (AudioTokenizer) with 4 codebooks × 2048 codewords, used to decode discrete codes into 16kHz waveforms"
},
{
"path": "pretrained_models/ckpts/autoregressive/dac_SR_8codes_2048_hop960_speech.pth",
"info": "DAC super-resolution model (AudioSR), used to reconstruct audio from 16kHz codec to 48kHz or target sample rate for the AR backend"
}
]
},
"multilingual_grl": {
"path": "pretrained_models/ckpts/multilingual_grl",
"description": "Main LEMAS-TTS model (multilingual_grl)",
"files": [
{
"path": "pretrained_models/ckpts/multilingual_grl/multilingual_grl.safetensors",
"info": "Unconditional/text-conditioned non-autoregressive model weights with GRL, supporting multilingual TTS and editing (default Edit Model in Gradio)"
}
]
},
"multilingual_prosody": {
"path": "pretrained_models/ckpts/multilingual_prosody",
      "description": "Non-autoregressive variant with an additional prosody encoder",
"files": [
{
"path": "pretrained_models/ckpts/multilingual_prosody/multilingual_prosody.safetensors",
"info": "Multilingual model weights with global prosody conditioning enabled (selectable in the model menu)"
}
]
},
"prosody_encoder": {
"path": "pretrained_models/ckpts/prosody_encoder",
"description": "Pretssel / UnitY2-style prosody encoder used by the non-autoregressive prosody backend",
"files": [
{
"path": "pretrained_models/ckpts/prosody_encoder/pretssel_cfg.json",
"info": "Architecture configuration for the Pretssel/UnitY2 prosody encoder (dimensions, number of layers, etc.)"
},
{
"path": "pretrained_models/ckpts/prosody_encoder/prosody_encoder_UnitY2.pt",
"info": "Prosody encoder weights, used to extract global prosody embeddings from reference audio"
}
]
},
"vocos-mel-24khz": {
"path": "pretrained_models/ckpts/vocos-mel-24khz",
"description": "Vocos neural vocoder (mel → 24kHz waveform)",
"files": [
{
"path": "pretrained_models/ckpts/vocos-mel-24khz/config.yaml",
"info": "Vocos vocoder configuration defining mel feature dimensions and network architecture"
},
{
"path": "pretrained_models/ckpts/vocos-mel-24khz/pytorch_model.bin",
"info": "Main Vocos vocoder weights, used to decode mel features in the CFM backend"
},
{
"path": "pretrained_models/ckpts/vocos-mel-24khz/README.md",
"info": "Documentation for the vocoder (origin, usage, and notes)"
}
]
}
},
"data": {
"multilingual_grl": {
"path": "pretrained_models/data/multilingual_grl",
"description": "Text vocabulary for the multilingual_grl model",
"files": [
{
"path": "pretrained_models/data/multilingual_grl/vocab.txt",
          "info": "Phone vocabulary corresponding to the text embeddings of the multilingual_grl model"
}
]
},
"multilingual_prosody": {
"path": "pretrained_models/data/multilingual_prosody",
"description": "Text vocabulary for the multilingual_prosody model",
"files": [
{
"path": "pretrained_models/data/multilingual_prosody/vocab.txt",
"info": "Shared text vocabulary used by the multilingual_prosody model"
}
]
}
},
"demos": {
"root": "pretrained_models/demos",
"files": [
{
"path": "pretrained_models/demos/test.wav",
        "info": "Simple test audio used for quick validation of the Gradio script"
}
],
"lemas_edit_test": {
"path": "pretrained_models/demos/lemas_edit_test",
"description": "Audio samples and alignment annotations for LEMAS-Edit demos",
"subdirs": {
"vocals": {
"path": "pretrained_models/demos/lemas_edit_test/vocals",
"files": [
{
"path": "pretrained_models/demos/lemas_edit_test/vocals/en_AUD0000000214_S0001522.mp3",
"info": "English demo audio used for AR/NAR editing examples"
},
{
"path": "pretrained_models/demos/lemas_edit_test/vocals/zh_emilia_zh_0008385782.mp3",
"info": "Chinese demo audio used for multilingual editing examples"
}
]
},
"align": {
"path": "pretrained_models/demos/lemas_edit_test/align",
"files": [
{
"path": "pretrained_models/demos/lemas_edit_test/align/en_AUD0000000214_S0001522.json",
"info": "MMS alignment JSON for the English demo, including intervals, words, and modified_index used for editing"
},
{
"path": "pretrained_models/demos/lemas_edit_test/align/zh_emilia_zh_0008385782.json",
"info": "MMS alignment JSON for the Chinese demo"
}
]
}
}
}
},
"uvr5": {
"path": "pretrained_models/uvr5",
"description": "Kim Vocal UVR5 models and configurations (used for denoising)",
"files": [
{
"path": "pretrained_models/uvr5/Kim_Vocal_1.onnx",
"info": "Main UVR5 model (ONNX format) for vocal/accompaniment separation and denoising"
},
{
"path": "pretrained_models/uvr5/MDX-Net-Kim-Vocal1.json",
"info": "UVR5 model architecture and inference configuration (channels, frame length, etc.)"
},
{
"path": "pretrained_models/uvr5/model_data.json",
"info": "UVR5 metadata including presets, model list, and default parameters"
},
{
"path": "pretrained_models/uvr5/model_name_mapper.json",
"info": "Mapping from internal UVR5 model names to human-readable names for frontend selection"
}
]
},
"whisperx": {
"path": "pretrained_models/whisperx",
"description": "WhisperX VAD and segmentation model assets",
"files": [
{
"path": "pretrained_models/whisperx/whisperx-vad-segmentation.bin",
"info": "WhisperX voice activity detection (VAD) weights used for long-audio segmentation and ASR alignment assistance"
},
{
"path": "pretrained_models/whisperx/whisperx-vad-segmentation.bak",
"info": "Backup or legacy version of the VAD model, kept for safety"
}
]
}
}
}