mesko-tts / config.json

Publish BioVoice-TTS sparse energy checkpoint and model card

424c56c verified 1 day ago

1.96 kB

{
  "audio": {
    "sample_rate": 24000,
    "n_fft": 1024,
    "win_length": 1024,
    "hop_length": 256,
    "n_mels": 80,
    "f_min": 0.0,
    "f_max": 12000.0,
    "trim_db": 32.0,
    "pitch_fmin": 50.0,
    "pitch_fmax": 600.0
  },
  "dataset": {
    "train_manifest": "data/features/ljspeech/train/normalized_manifest.jsonl",
    "eval_manifest": "data/features/ljspeech/eval/normalized_manifest.jsonl",
    "feature_dir": "data/features/ljspeech",
    "max_text_tokens": 256,
    "max_mel_frames": 2048,
    "min_duration_seconds": 0.5,
    "max_duration_seconds": 20.0,
    "num_workers": 0
  },
  "semantic": {
    "vocab_size": 39,
    "d_model": 256,
    "num_heads": 4,
    "low_rank": 32,
    "top_k": 12,
    "local_window": 32,
    "memory_candidates": 8,
    "landmark_count": 8,
    "content_memory_candidates": 8,
    "laminar_steps": 2,
    "laminar_eta": 0.1,
    "max_positions": 512
  },
  "speaker": {
    "input_dim": 80,
    "conv_channels": 128,
    "embedding_dim": 192,
    "low_rank": 24,
    "top_k": 10,
    "local_window": 24
  },
  "prosody": {
    "d_model": 256,
    "hidden_dim": 128,
    "pitch_bins": 128
  },
  "acoustic": {
    "d_model": 256,
    "speaker_dim": 192,
    "prosody_dim": 3,
    "n_mels": 80,
    "low_rank": 32,
    "top_k": 24,
    "local_window": 48,
    "chunk_size": 24,
    "streaming_cache_frames": 96
  },
  "vocoder": {
    "n_mels": 80,
    "channels": 128,
    "residual_layers": 6,
    "upsample_scales": [
      8,
      5,
      3,
      2
    ],
    "sample_rate": 24000
  },
  "training": {
    "seed": 7,
    "epochs": 50,
    "batch_size": 4,
    "learning_rate": 0.0002,
    "weight_decay": 0.01,
    "warmup_steps": 1000,
    "grad_clip": 1.0,
    "grad_accum_steps": 1,
    "precision": "fp32",
    "log_every": 10,
    "eval_every": 500,
    "save_every": 1000,
    "output_dir": "artifacts/ljspeech_tts",
    "num_nodes": 1,
    "devices": 1,
    "distributed_backend": "gloo"
  }
}