{
  "audio": {
    "sample_rate": 24000,
    "n_fft": 1024,
    "win_length": 1024,
    "hop_length": 256,
    "n_mels": 80,
    "f_min": 0.0,
    "f_max": 12000.0,
    "trim_db": 32.0,
    "pitch_fmin": 50.0,
    "pitch_fmax": 600.0
  },
  "dataset": {
    "train_manifest": "data/features/ljspeech/train/normalized_manifest.jsonl",
    "eval_manifest": "data/features/ljspeech/eval/normalized_manifest.jsonl",
    "feature_dir": "data/features/ljspeech",
    "max_text_tokens": 256,
    "max_mel_frames": 2048,
    "min_duration_seconds": 0.5,
    "max_duration_seconds": 20.0,
    "num_workers": 0
  },
  "semantic": {
    "vocab_size": 39,
    "d_model": 256,
    "num_heads": 4,
    "low_rank": 32,
    "top_k": 12,
    "local_window": 32,
    "memory_candidates": 8,
    "landmark_count": 8,
    "content_memory_candidates": 8,
    "laminar_steps": 2,
    "laminar_eta": 0.1,
    "max_positions": 512
  },
  "speaker": {
    "input_dim": 80,
    "conv_channels": 128,
    "embedding_dim": 192,
    "low_rank": 24,
    "top_k": 10,
    "local_window": 24
  },
  "prosody": {
    "d_model": 256,
    "hidden_dim": 128,
    "pitch_bins": 128
  },
  "acoustic": {
    "d_model": 256,
    "speaker_dim": 192,
    "prosody_dim": 3,
    "n_mels": 80,
    "low_rank": 32,
    "top_k": 24,
    "local_window": 48,
    "chunk_size": 24,
    "streaming_cache_frames": 96
  },
  "vocoder": {
    "n_mels": 80,
    "channels": 128,
    "residual_layers": 6,
    "upsample_scales": [8, 8, 2, 2],
    "sample_rate": 24000
  },
  "training": {
    "seed": 7,
    "epochs": 50,
    "batch_size": 4,
    "learning_rate": 0.0002,
    "weight_decay": 0.01,
    "warmup_steps": 1000,
    "grad_clip": 1.0,
    "grad_accum_steps": 1,
    "precision": "fp32",
    "log_every": 10,
    "eval_every": 500,
    "save_every": 1000,
    "output_dir": "artifacts/ljspeech_tts",
    "num_nodes": 1,
    "devices": 1,
    "distributed_backend": "gloo"
  }
}