# File size: 3,605 Bytes

# Set the random seeds so that results can be reproduced.
# `seed` is the single source of truth for all four RNG calls below; override
# it (e.g. through the loader's overrides) to reseed everything consistently.
# Each `!apply:` entry calls the named function with the listed arguments
# when the file is loaded.
seed: 1986
__set_seed1: !apply:random.seed [!ref <seed>]
__set_seed2: !apply:numpy.random.seed [!ref <seed>]
__set_seed3: !apply:torch.manual_seed [!ref <seed>]
__set_seed4: !apply:torch.cuda.manual_seed_all [!ref <seed>]

# fixed params
# Shared scalars; other sections pull them in with `!ref <name>`.
# Audio sample rate; referenced by hift, feat_extractor and compute_f0.
sample_rate: 24000
# llm_input_size, llm_output_size and qwen_pretrain_path are not referenced
# anywhere else in this file.
llm_input_size: 896
llm_output_size: 896
# Referenced by flow.spk_embed_dim.
spk_embed_dim: 192
qwen_pretrain_path: ''
# Passed to the flow model as input_frame_rate.
# NOTE(review): presumably tokens per second — confirm.
token_frame_rate: 12.5
# Referenced by the flow model and by the estimator's static_chunk_size.
# 12.5 * 4 = 50, which equals sample_rate / hop_size (24000 / 480) with the
# values used in feat_extractor.
token_mel_ratio: 4

# stream related params
# Streaming inference chunk size, in tokens.
chunk_size: 5
# Number of left chunks used by the streaming flow decoder;
# a value < 0 means all left chunks are used.
num_decoding_left_chunks: -1


# Flow model. `!new:` instantiates the named class, passing the nested keys as
# constructor keyword arguments; `!ref <name>` pulls in a top-level value.
flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
    input_size: 512
    output_size: 80
    spk_embed_dim: !ref <spk_embed_dim>
    output_type: 'mel'
    vocab_size: 20480
    input_frame_rate: !ref <token_frame_rate>
    only_mask_loss: true
    token_mel_ratio: !ref <token_mel_ratio>
    pre_lookahead_len: 3
    encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
        output_size: 512
        attention_heads: 8
        linear_units: 2048
        num_blocks: 6
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        normalize_before: true
        input_layer: 'linear'
        pos_enc_layer_type: 'rel_pos_espnet'
        selfattention_layer_type: 'rel_selfattn'
        input_size: 512
        # NOTE(review): hard-coded, and currently equal to token_mel_ratio (4).
        # Presumably the two must stay in sync — confirm before changing either.
        upsample_stride: 4
        use_cnn_module: false
        macaron_style: false
        # Encoder chunk size is the streaming chunk size, expressed in tokens.
        static_chunk_size: !ref <chunk_size>
    decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
        in_channels: 240
        n_spks: 1
        spk_emb_dim: 80
        cfm_params: !new:omegaconf.DictConfig
            content:
                # Written with an explicit mantissa dot so that YAML 1.1
                # resolvers (e.g. stock PyYAML) also read it as a float
                # instead of the string '1e-06'.
                sigma_min: 1.0e-06
                solver: 'euler'
                t_scheduler: 'cosine'
                training_cfg_rate: 0.2
                inference_cfg_rate: 0.7
                reg_loss_type: 'l1'
        estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
            # NOTE(review): 320 = 4 * 80; presumably four 80-channel inputs
            # concatenated — confirm against the decoder implementation.
            in_channels: 320
            out_channels: 80
            channels: [256]
            dropout: 0.0
            attention_head_dim: 64
            n_blocks: 4
            num_mid_blocks: 12
            num_heads: 8
            act_fn: 'gelu'
            # chunk_size (in tokens) times token_mel_ratio: 5 * 4 = 20 with the
            # current values.
            static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
            num_decoding_left_chunks: !ref <num_decoding_left_chunks>

# HiFT generator. `!new:` instantiates the named class with the nested keys as
# constructor keyword arguments.
hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
    in_channels: 80
    base_channels: 512
    nb_harmonics: 8
    sampling_rate: !ref <sample_rate>
    nsf_alpha: 0.1
    nsf_sigma: 0.003
    nsf_voiced_threshold: 10
    # 8 * 5 * 3 = 120; multiplied by istft_params.hop_len (4) this gives 480,
    # the same number as feat_extractor.hop_size.
    # NOTE(review): presumably these must stay consistent — confirm.
    upsample_rates: [8, 5, 3]
    upsample_kernel_sizes: [16, 11, 7]
    istft_params:
        n_fft: 16
        hop_len: 4
    resblock_kernel_sizes: [3, 7, 11]
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    source_resblock_kernel_sizes: [7, 7, 11]
    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    lrelu_slope: 0.1
    audio_limit: 0.99
    # Nested F0 predictor, also built via `!new:`. Its in_channels (80) matches
    # the generator's in_channels above.
    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
        num_class: 1
        in_channels: 80
        cond_channels: 512

# Mel-spectrogram extractor. `!name:` yields the named function with the keys
# below pre-bound as keyword arguments (it is not called at load time).
# NOTE(review): the first path component `Matcha-TTS` contains a hyphen, so it
# is not a valid Python identifier; it can only resolve through a string-based
# import of a directory with that exact name. Confirm it resolves in the
# target environment.
feat_extractor: !name:Matcha-TTS.matcha.utils.audio.mel_spectrogram
    n_fft: 1920
    num_mels: 80
    sampling_rate: !ref <sample_rate>
    # 480 samples per hop at sample_rate 24000 gives 50 frames per second.
    hop_size: 480
    win_size: 1920
    fmin: 0
    fmax: 8000
    center: false
# Dataset processor that computes mel features. `!name:` pre-binds the keyword
# arguments below to the named function.
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
    feat_extractor: !ref <feat_extractor>
    # Taken from the top-level key so it cannot drift from the value the flow
    # model receives.
    token_mel_ratio: !ref <token_mel_ratio>
# Dataset processor that computes F0. `!name:` pre-binds the keyword arguments
# below to the named function.
compute_f0: !name:cosyvoice.dataset.processor.compute_f0
    sample_rate: !ref <sample_rate>
    # NOTE(review): duplicates feat_extractor.hop_size (480); presumably the two
    # must match so F0 and mel frames line up — confirm.
    hop_size: 480