mesko-tts / config.json
mesklintech's picture
Publish BioVoice-TTS sparse energy checkpoint and model card
424c56c verified
{
"audio": {
"sample_rate": 24000,
"n_fft": 1024,
"win_length": 1024,
"hop_length": 256,
"n_mels": 80,
"f_min": 0.0,
"f_max": 12000.0,
"trim_db": 32.0,
"pitch_fmin": 50.0,
"pitch_fmax": 600.0
},
"dataset": {
"train_manifest": "data/features/ljspeech/train/normalized_manifest.jsonl",
"eval_manifest": "data/features/ljspeech/eval/normalized_manifest.jsonl",
"feature_dir": "data/features/ljspeech",
"max_text_tokens": 256,
"max_mel_frames": 2048,
"min_duration_seconds": 0.5,
"max_duration_seconds": 20.0,
"num_workers": 0
},
"semantic": {
"vocab_size": 39,
"d_model": 256,
"num_heads": 4,
"low_rank": 32,
"top_k": 12,
"local_window": 32,
"memory_candidates": 8,
"landmark_count": 8,
"content_memory_candidates": 8,
"laminar_steps": 2,
"laminar_eta": 0.1,
"max_positions": 512
},
"speaker": {
"input_dim": 80,
"conv_channels": 128,
"embedding_dim": 192,
"low_rank": 24,
"top_k": 10,
"local_window": 24
},
"prosody": {
"d_model": 256,
"hidden_dim": 128,
"pitch_bins": 128
},
"acoustic": {
"d_model": 256,
"speaker_dim": 192,
"prosody_dim": 3,
"n_mels": 80,
"low_rank": 32,
"top_k": 24,
"local_window": 48,
"chunk_size": 24,
"streaming_cache_frames": 96
},
"vocoder": {
"n_mels": 80,
"channels": 128,
"residual_layers": 6,
"upsample_scales": [
8,
5,
3,
2
],
"sample_rate": 24000
},
"training": {
"seed": 7,
"epochs": 50,
"batch_size": 4,
"learning_rate": 0.0002,
"weight_decay": 0.01,
"warmup_steps": 1000,
"grad_clip": 1.0,
"grad_accum_steps": 1,
"precision": "fp32",
"log_every": 10,
"eval_every": 500,
"save_every": 1000,
"output_dir": "artifacts/ljspeech_tts",
"num_nodes": 1,
"devices": 1,
"distributed_backend": "gloo"
}
}