File size: 1,962 Bytes

424c56c

{
  "audio": {
    "sample_rate": 24000,
    "n_fft": 1024,
    "win_length": 1024,
    "hop_length": 256,
    "n_mels": 80,
    "f_min": 0.0,
    "f_max": 12000.0,
    "trim_db": 32.0,
    "pitch_fmin": 50.0,
    "pitch_fmax": 600.0
  },
  "dataset": {
    "train_manifest": "data/features/ljspeech/train/normalized_manifest.jsonl",
    "eval_manifest": "data/features/ljspeech/eval/normalized_manifest.jsonl",
    "feature_dir": "data/features/ljspeech",
    "max_text_tokens": 256,
    "max_mel_frames": 2048,
    "min_duration_seconds": 0.5,
    "max_duration_seconds": 20.0,
    "num_workers": 0
  },
  "semantic": {
    "vocab_size": 39,
    "d_model": 256,
    "num_heads": 4,
    "low_rank": 32,
    "top_k": 12,
    "local_window": 32,
    "memory_candidates": 8,
    "landmark_count": 8,
    "content_memory_candidates": 8,
    "laminar_steps": 2,
    "laminar_eta": 0.1,
    "max_positions": 512
  },
  "speaker": {
    "input_dim": 80,
    "conv_channels": 128,
    "embedding_dim": 192,
    "low_rank": 24,
    "top_k": 10,
    "local_window": 24
  },
  "prosody": {
    "d_model": 256,
    "hidden_dim": 128,
    "pitch_bins": 128
  },
  "acoustic": {
    "d_model": 256,
    "speaker_dim": 192,
    "prosody_dim": 3,
    "n_mels": 80,
    "low_rank": 32,
    "top_k": 24,
    "local_window": 48,
    "chunk_size": 24,
    "streaming_cache_frames": 96
  },
  "vocoder": {
    "n_mels": 80,
    "channels": 128,
    "residual_layers": 6,
    "upsample_scales": [
      8,
      4,
      4,
      2
    ],
    "sample_rate": 24000
  },
  "training": {
    "seed": 7,
    "epochs": 50,
    "batch_size": 4,
    "learning_rate": 0.0002,
    "weight_decay": 0.01,
    "warmup_steps": 1000,
    "grad_clip": 1.0,
    "grad_accum_steps": 1,
    "precision": "fp32",
    "log_every": 10,
    "eval_every": 500,
    "save_every": 1000,
    "output_dir": "artifacts/ljspeech_tts",
    "num_nodes": 1,
    "devices": 1,
    "distributed_backend": "gloo"
  }
}