| { | |
| "audio": { | |
| "sample_rate": 24000, | |
| "n_fft": 1024, | |
| "win_length": 1024, | |
| "hop_length": 256, | |
| "n_mels": 80, | |
| "f_min": 0.0, | |
| "f_max": 12000.0, | |
| "trim_db": 32.0, | |
| "pitch_fmin": 50.0, | |
| "pitch_fmax": 600.0 | |
| }, | |
| "dataset": { | |
| "train_manifest": "data/features/ljspeech/train/normalized_manifest.jsonl", | |
| "eval_manifest": "data/features/ljspeech/eval/normalized_manifest.jsonl", | |
| "feature_dir": "data/features/ljspeech", | |
| "max_text_tokens": 256, | |
| "max_mel_frames": 2048, | |
| "min_duration_seconds": 0.5, | |
| "max_duration_seconds": 20.0, | |
| "num_workers": 0 | |
| }, | |
| "semantic": { | |
| "vocab_size": 39, | |
| "d_model": 256, | |
| "num_heads": 4, | |
| "low_rank": 32, | |
| "top_k": 12, | |
| "local_window": 32, | |
| "memory_candidates": 8, | |
| "landmark_count": 8, | |
| "content_memory_candidates": 8, | |
| "laminar_steps": 2, | |
| "laminar_eta": 0.1, | |
| "max_positions": 512 | |
| }, | |
| "speaker": { | |
| "input_dim": 80, | |
| "conv_channels": 128, | |
| "embedding_dim": 192, | |
| "low_rank": 24, | |
| "top_k": 10, | |
| "local_window": 24 | |
| }, | |
| "prosody": { | |
| "d_model": 256, | |
| "hidden_dim": 128, | |
| "pitch_bins": 128 | |
| }, | |
| "acoustic": { | |
| "d_model": 256, | |
| "speaker_dim": 192, | |
| "prosody_dim": 3, | |
| "n_mels": 80, | |
| "low_rank": 32, | |
| "top_k": 24, | |
| "local_window": 48, | |
| "chunk_size": 24, | |
| "streaming_cache_frames": 96 | |
| }, | |
| "vocoder": { | |
| "n_mels": 80, | |
| "channels": 128, | |
| "residual_layers": 6, | |
| "upsample_scales": [ | |
| 8, | |
| 5, | |
| 3, | |
| 2 | |
| ], | |
| "sample_rate": 24000 | |
| }, | |
| "training": { | |
| "seed": 7, | |
| "epochs": 50, | |
| "batch_size": 4, | |
| "learning_rate": 0.0002, | |
| "weight_decay": 0.01, | |
| "warmup_steps": 1000, | |
| "grad_clip": 1.0, | |
| "grad_accum_steps": 1, | |
| "precision": "fp32", | |
| "log_every": 10, | |
| "eval_every": 500, | |
| "save_every": 1000, | |
| "output_dir": "artifacts/ljspeech_tts", | |
| "num_nodes": 1, | |
| "devices": 1, | |
| "distributed_backend": "gloo" | |
| } | |
| } |