{
  "audio": {
    "sample_rate": 24000,
    "n_fft": 1024,
    "win_length": 1024,
    "hop_length": 256,
    "n_mels": 80,
    "f_min": 0.0,
    "f_max": 12000.0,
    "trim_db": 32.0,
    "pitch_fmin": 50.0,
    "pitch_fmax": 600.0
  },
  "dataset": {
    "train_manifest": "data/features/ljspeech/train/normalized_manifest.jsonl",
    "eval_manifest": "data/features/ljspeech/eval/normalized_manifest.jsonl",
    "feature_dir": "data/features/ljspeech",
    "max_text_tokens": 256,
    "max_mel_frames": 2048,
    "min_duration_seconds": 0.5,
    "max_duration_seconds": 20.0,
    "num_workers": 0
  },
  "semantic": {
    "vocab_size": 39,
    "d_model": 256,
    "num_heads": 4,
    "low_rank": 32,
    "top_k": 12,
    "local_window": 32,
    "memory_candidates": 8,
    "landmark_count": 8,
    "content_memory_candidates": 8,
    "laminar_steps": 2,
    "laminar_eta": 0.1,
    "max_positions": 512
  },
  "speaker": {
    "input_dim": 80,
    "conv_channels": 128,
    "embedding_dim": 192,
    "low_rank": 24,
    "top_k": 10,
    "local_window": 24
  },
  "prosody": {
    "d_model": 256,
    "hidden_dim": 128,
    "pitch_bins": 128
  },
  "acoustic": {
    "d_model": 256,
    "speaker_dim": 192,
    "prosody_dim": 3,
    "n_mels": 80,
    "low_rank": 32,
    "top_k": 24,
    "local_window": 48,
    "chunk_size": 24,
    "streaming_cache_frames": 96
  },
  "vocoder": {
    "n_mels": 80,
    "channels": 128,
    "residual_layers": 6,
    "upsample_scales": [
      8,
      8,
      2,
      2
    ],
    "sample_rate": 24000
  },
  "training": {
    "seed": 7,
    "epochs": 50,
    "batch_size": 4,
    "learning_rate": 0.0002,
    "weight_decay": 0.01,
    "warmup_steps": 1000,
    "grad_clip": 1.0,
    "grad_accum_steps": 1,
    "precision": "fp32",
    "log_every": 10,
    "eval_every": 500,
    "save_every": 1000,
    "output_dir": "artifacts/ljspeech_tts",
    "num_nodes": 1,
    "devices": 1,
    "distributed_backend": "gloo"
  }
}