# Hyperparameters shared by the WaveRNN vocoder and the Tacotron TTS model.
# Every setting is a plain module-level constant; derived values (fft_bins,
# voc_seq_len) are computed from the constants they depend on.

# CONFIG -----------------------------------------------------------------------------------------------------------#

# Here are the input and output data paths (Note: you can override wav_path in preprocess.py)
# NOTE(review): both paths are absolute and specific to one Windows machine -
# change them to match the local checkout before running anything.
wav_path = 'E:\\智能语音处理系统\\Noise-suppression-and-speech-recognition-systems-master\\WaveRNNModel\\data\\LJSpeech-1.1\\wavs'
data_path = 'E:\\智能语音处理系统\\Noise-suppression-and-speech-recognition-systems-master\\WaveRNNModel\\data'

# model ids are separate - that way you can use a new tts with an old wavernn and vice versa
# NB: expect undefined behaviour if models were trained on different DSP settings
voc_model_id = 'ljspeech_mol'
tts_model_id = 'ljspeech_lsa_smooth_attention'

# set this to True if you are only interested in WaveRNN
ignore_tts = False


# DSP --------------------------------------------------------------------------------------------------------------#

# Settings for all models
sample_rate = 22050
n_fft = 2048
fft_bins = n_fft // 2 + 1
num_mels = 80
hop_length = 275                    # 12.5ms - in line with Tacotron 2 paper
win_length = 1100                   # 50ms - same reason as above
fmin = 40
min_level_db = -100
ref_level_db = 20
bits = 9                            # bit depth of signal
mu_law = True                       # Recommended to suppress noise if using raw bits in hp.voc_mode below
peak_norm = False                   # Normalise to the peak of each wav file


# WAVERNN / VOCODER ------------------------------------------------------------------------------------------------#

# Model Hparams
voc_mode = 'MOL'                    # either 'RAW' (softmax on raw bits) or 'MOL' (sample from mixture of logistics)
voc_upsample_factors = (5, 5, 11)   # NB - this needs to correctly factorise hop_length
voc_rnn_dims = 512
voc_fc_dims = 512
voc_compute_dims = 128
voc_res_out_dims = 128
voc_res_blocks = 10

# Training
voc_batch_size = 32
voc_lr = 1e-4
voc_checkpoint_every = 25_000
voc_gen_at_checkpoint = 5           # number of samples to generate at each checkpoint
voc_total_steps = 1_000_000         # Total number of training steps
voc_test_samples = 50               # How many unseen samples to put aside for testing
voc_pad = 2                         # this will pad the input so that the resnet can 'see' wider than input length
voc_seq_len = hop_length * 5        # must be a multiple of hop_length
voc_clip_grad_norm = 4              # set to None if no gradient clipping needed

# Generating / Synthesizing
voc_gen_batched = True              # very fast (realtime+) single utterance batched generation
voc_target = 11_000                 # target number of samples to be generated in each batch entry
voc_overlap = 550                   # number of samples for crossfading between batches


# TACOTRON/TTS -----------------------------------------------------------------------------------------------------#

# Model Hparams
tts_embed_dims = 256                # embedding dimension for the graphemes/phoneme inputs
tts_encoder_dims = 128
tts_decoder_dims = 256
tts_postnet_dims = 128
tts_encoder_K = 16
tts_lstm_dims = 512
tts_postnet_K = 8
tts_num_highways = 4
tts_dropout = 0.5
tts_cleaner_names = ['english_cleaners']
tts_stop_threshold = -3.4           # Value below which audio generation ends.
                                    # For example, for a range of [-4, 4], this
                                    # will terminate the sequence at the first
                                    # frame that has all values < -3.4

# Training
# Full progressive schedule, kept for reference (currently disabled):
#tts_schedule = [(7,  1e-3,  10_000,  32),   # progressive training schedule
#                (5,  1e-4, 100_000,  32),   # (r, lr, step, batch_size)
#                (2,  1e-4, 180_000,  16),
#                (2,  1e-4, 350_000,  8)]

# Active schedule: only the first stage is enabled; the later stages below
# are commented out.
tts_schedule = [(7,  1e-3,  10_000,  32)]    # progressive training schedule
                #(5,  1e-4, 100_000,  64),   # (r, lr, step, batch_size)
                #(2,  1e-4, 180_000,  64),
                #(2,  1e-4, 350_000,  64)]

tts_max_mel_len = 1250              # if you have a couple of extremely long spectrograms you might want to use this
tts_bin_lengths = True              # bins the spectrogram lengths before sampling in data loader - speeds up training
tts_clip_grad_norm = 1.0            # clips the gradient norm to prevent explosion - set to None if not needed
tts_checkpoint_every = 2_000        # checkpoints the model every X steps
# TODO: tts_phoneme_prob = 0.0        # [0 <-> 1] probability for feeding model phonemes vs graphemes


# ------------------------------------------------------------------------------------------------------------------#