Text-to-Speech
ONNX
zero-shot
multilingual
LEMAS-TTS / config.json
Approximetal's picture
Update config.json
04dbd13 verified
{
"root": "pretrained_models",
"structure": {
"ckpts": {
"multilingual_grl": {
"path": "pretrained_models/ckpts/multilingual_grl",
"description": "Main LEMAS-TTS model (multilingual_grl)",
"files": [
{
"path": "pretrained_models/ckpts/multilingual_grl/multilingual_grl.safetensors",
"info": "Unconditional/text-conditioned non-autoregressive model weights with GRL, supporting multilingual TTS and editing (default Edit Model in Gradio)"
}
]
},
"multilingual_prosody": {
"path": "pretrained_models/ckpts/multilingual_prosody",
      "description": "Non-autoregressive variant with an additional prosody encoder",
"files": [
{
"path": "pretrained_models/ckpts/multilingual_prosody/multilingual_prosody.safetensors",
"info": "Multilingual model weights with global prosody conditioning enabled (selectable in the model menu)"
}
]
},
"prosody_encoder": {
"path": "pretrained_models/ckpts/prosody_encoder",
"description": "Pretssel / UnitY2-style prosody encoder used by the non-autoregressive prosody backend",
"files": [
{
"path": "pretrained_models/ckpts/prosody_encoder/pretssel_cfg.json",
"info": "Architecture configuration for the Pretssel/UnitY2 prosody encoder (dimensions, number of layers, etc.)"
},
{
"path": "pretrained_models/ckpts/prosody_encoder/prosody_encoder_UnitY2.pt",
"info": "Prosody encoder weights, used to extract global prosody embeddings from reference audio"
}
]
},
"vocos-mel-24khz": {
"path": "pretrained_models/ckpts/vocos-mel-24khz",
"description": "Vocos neural vocoder (mel → 24kHz waveform)",
"files": [
{
"path": "pretrained_models/ckpts/vocos-mel-24khz/config.yaml",
"info": "Vocos vocoder configuration defining mel feature dimensions and network architecture"
},
{
"path": "pretrained_models/ckpts/vocos-mel-24khz/pytorch_model.bin",
"info": "Main Vocos vocoder weights, used to decode mel features in the CFM backend"
},
{
"path": "pretrained_models/ckpts/vocos-mel-24khz/README.md",
"info": "Documentation for the vocoder (origin, usage, and notes)"
}
]
}
},
"data": {
"multilingual_grl": {
"path": "pretrained_models/data/multilingual_grl",
"description": "Text vocabulary for the multilingual_grl model",
"files": [
{
"path": "pretrained_models/data/multilingual_grl/vocab.txt",
          "info": "Phone vocabulary corresponding to the text embeddings of the multilingual_grl model"
}
]
},
"multilingual_prosody": {
"path": "pretrained_models/data/multilingual_prosody",
"description": "Text vocabulary for the multilingual_prosody model",
"files": [
{
"path": "pretrained_models/data/multilingual_prosody/vocab.txt",
"info": "Shared text vocabulary used by the multilingual_prosody model"
}
]
}
},
"demos": {
"root": "pretrained_models/demos",
"files": [
{
"path": "pretrained_models/demos/test.wav",
          "info": "Simple test audio used for quick validation of the Gradio script"
}
],
"lemas_edit_test": {
"path": "pretrained_models/demos/lemas_edit_test",
"description": "Audio samples and alignment annotations for LEMAS-Edit demos",
"subdirs": {
"vocals": {
"path": "pretrained_models/demos/lemas_edit_test/vocals",
"files": [
{
"path": "pretrained_models/demos/lemas_edit_test/vocals/en_AUD0000000214_S0001522.mp3",
"info": "English demo audio used for AR/NAR editing examples"
},
{
"path": "pretrained_models/demos/lemas_edit_test/vocals/zh_emilia_zh_0008385782.mp3",
"info": "Chinese demo audio used for multilingual editing examples"
}
]
},
"align": {
"path": "pretrained_models/demos/lemas_edit_test/align",
"files": [
{
"path": "pretrained_models/demos/lemas_edit_test/align/en_AUD0000000214_S0001522.json",
"info": "MMS alignment JSON for the English demo, including intervals, words, and modified_index used for editing"
},
{
"path": "pretrained_models/demos/lemas_edit_test/align/zh_emilia_zh_0008385782.json",
"info": "MMS alignment JSON for the Chinese demo"
}
]
}
}
}
},
"uvr5": {
"path": "pretrained_models/uvr5",
"description": "Kim Vocal UVR5 models and configurations (used for denoising)",
"files": [
{
"path": "pretrained_models/uvr5/Kim_Vocal_1.onnx",
"info": "Main UVR5 model (ONNX format) for vocal/accompaniment separation and denoising"
},
{
"path": "pretrained_models/uvr5/MDX-Net-Kim-Vocal1.json",
"info": "UVR5 model architecture and inference configuration (channels, frame length, etc.)"
},
{
"path": "pretrained_models/uvr5/model_data.json",
"info": "UVR5 metadata including presets, model list, and default parameters"
},
{
"path": "pretrained_models/uvr5/model_name_mapper.json",
"info": "Mapping from internal UVR5 model names to human-readable names for frontend selection"
}
]
},
"whisperx": {
"path": "pretrained_models/whisperx",
"description": "WhisperX VAD and segmentation model assets",
"files": [
{
"path": "pretrained_models/whisperx/whisperx-vad-segmentation.bin",
"info": "WhisperX voice activity detection (VAD) weights used for long-audio segmentation and ASR alignment assistance"
},
{
"path": "pretrained_models/whisperx/whisperx-vad-segmentation.bak",
"info": "Backup or legacy version of the VAD model, kept for safety"
}
]
}
}
}