| { |
| "args": null, |
| "audio_embedding_dim": 1024, |
| "audio_embedding_dropout": 0.0, |
| "audio_max_length": 20.0, |
| "audio_min_length": 2.0, |
| "audio_pad_token": 2050, |
| "audio_positional_embedding_dropout": 0.0, |
| "audio_vocab_size": "2048", |
| "batch_size": 100, |
| "clipping_update_period": 1000, |
| "codebook_weight": "[2,1,1,1]", |
| "d_model": 1024, |
| "dataset": "gigaspeech", |
| "dataset_dir": "/data/scratch/pyp/datasets/gigaspeech_phn_enc_manifest/xl", |
| "drop_long": 1, |
| "dynamic_batching": 1, |
| "early_stop_step": 3200, |
| "early_stop_threshold": -1.0, |
| "empty_token": 2048, |
| "encodec_folder_name": "encodec_16khz_4codebooks", |
| "encodec_sr": 50, |
| "eog": 2049, |
| "eos": 2051, |
| "exp_dir": "/data/scratch/pyp/exp_pyp/VoiceCraft/gigaspeech/tts_enhanced_330M", |
| "gradient_accumulation_steps": 24, |
| "gradient_clip_val": 1.0, |
| "load_model_from": null, |
| "lr": 1e-05, |
| "manifest_name": "manifest_large16khz_lessambi", |
| "mask_len_max": 600, |
| "mask_len_min": 1, |
| "mask_sample_dist": "poisson1", |
| "max_mask_portion": 0.9, |
| "max_n_spans": 3, |
| "max_num_tokens": 50000, |
| "min_gap": 5, |
| "n_codebooks": 4, |
| "n_special": 4, |
| "nhead": 16, |
| "num_buckets": 10, |
| "num_decoder_layers": 24, |
| "num_epochs": 10, |
| "num_steps": 500000, |
| "num_workers": 8, |
| "optimizer_name": "AdamW", |
| "pad_x": 0, |
| "phn2num": { |
| "!": 17, |
| "\"": 97, |
| ",": 64, |
| ".": 77, |
| "1": 80, |
| ":": 93, |
| ";": 81, |
| "<MUSIC>": 39, |
| "<NOISE>": 52, |
| "<OTHER>": 60, |
| "<SIL>": 53, |
| "?": 78, |
| "_": 15, |
| "a\u026a": 48, |
| "a\u026a\u0259": 56, |
| "a\u026a\u025a": 2, |
| "a\u028a": 36, |
| "b": 20, |
| "d": 72, |
| "d\u0292": 57, |
| "e": 85, |
| "e\u026a": 6, |
| "f": 69, |
| "h": 14, |
| "i": 27, |
| "i\u0259": 42, |
| "i\u02d0": 68, |
| "i\u02d0\u02d0": 51, |
| "j": 67, |
| "k": 41, |
| "kh": 84, |
| "l": 63, |
| "m": 9, |
| "n": 23, |
| "n\u02b2": 8, |
| "o": 86, |
| "o\u028a": 25, |
| "o\u02d0": 74, |
| "o\u02d0\u0279": 40, |
| "p": 34, |
| "q": 96, |
| "r": 79, |
| "s": 66, |
| "t": 73, |
| "t\u0255": 87, |
| "t\u0283": 75, |
| "t\u02b0": 94, |
| "u": 1, |
| "u\u02d0": 47, |
| "v": 31, |
| "w": 19, |
| "x": 4, |
| "z": 22, |
| "\u00a1": 98, |
| "\u00ab": 88, |
| "\u00bb": 89, |
| "\u00bf": 95, |
| "\u00e6": 32, |
| "\u00e6\u00e6": 50, |
| "\u00e7": 10, |
| "\u00f0": 7, |
| "\u014b": 58, |
| "\u0250": 70, |
| "\u0250\u0250": 71, |
| "\u0251": 61, |
| "\u0251\u02d0": 0, |
| "\u0251\u02d0\u0279": 44, |
| "\u0252": 83, |
| "\u0254": 3, |
| "\u0254\u026a": 13, |
| "\u0254\u02d0": 29, |
| "\u0254\u02d0\u0279": 33, |
| "\u0259": 54, |
| "\u0259l": 16, |
| "\u0259\u028a": 90, |
| "\u025a": 35, |
| "\u025b": 18, |
| "\u025b\u0279": 11, |
| "\u025b\u02d0": 82, |
| "\u025c\u02d0": 21, |
| "\u0261": 49, |
| "\u0261\u02b2": 37, |
| "\u026a": 65, |
| "\u026a\u0279": 76, |
| "\u026a\u02d0": 100, |
| "\u026c": 46, |
| "\u026f": 91, |
| "\u0279": 5, |
| "\u027e": 24, |
| "\u0283": 26, |
| "\u028a": 43, |
| "\u028a\u0279": 28, |
| "\u028c": 38, |
| "\u0292": 55, |
| "\u0294": 59, |
| "\u0303": 45, |
| "\u0329": 12, |
| "\u03b8": 30, |
| "\u1d7b": 62, |
| "\u2014": 99, |
| "\u2026": 92 |
| }, |
| "phn_folder_name": "phonemes", |
| "precision": "float16", |
| "print_every_n_steps": 800, |
| "pseudo_epoch_size": 3000, |
| "reduce_lr_start_epoch": 4, |
| "reduce_lr_start_step": 3000, |
| "reduced_eog": 1, |
| "resume": false, |
| "seed": 1, |
| "shuffle_mask_embedding": 0, |
| "special_first": 0, |
| "tb_write_every_n_steps": 100, |
| "text_embedding_dropout": 0.0, |
| "text_max_length": 400, |
| "text_min_length": 10.0, |
| "text_pad_token": 120, |
| "text_positional_embedding_dropout": 0.0, |
| "text_vocab_size": 120, |
| "trm_dropout": 0.0, |
| "val_every_n_steps": 3200, |
| "val_max_num_tokens": 6000, |
| "warmup_fraction": 0.1, |
| "weight_decay": 0.0 |
| } |