Spaces:

mnhatdaous
/

learnable-speech

Sleeping

App Files Files Community

primepake committed on Jul 14

Commit

0f2bd14

1 Parent(s): 55ac664

add contrastive training code

Browse files

Files changed (7) hide show

speech/cosyvoice/flow/flow_matching.py +5 -1
speech/cosyvoice2.yaml +0 -206
speech/examples/magicdata-read/cosyvoice/conf +0 -1
speech/examples/magicdata-read/cosyvoice/cosyvoice +0 -1
speech/examples/magicdata-read/cosyvoice/local/prepare_data.py +0 -52
speech/examples/magicdata-read/cosyvoice/tools +0 -1
speech/examples/magicdata-read/cosyvoice/tts_text.json +0 -18

speech/cosyvoice/flow/flow_matching.py CHANGED Viewed

@@ -299,8 +299,12 @@ class ConditionalCFM(BASECFM):
             print('contrastive_loss: ', contrastive_loss)
         else:
             contrastive_loss = torch.tensor(0.0, device=fm_loss.device)
-        loss = fm_loss - self.lambda_weight * contrastive_loss
+        print("fm_loss: ", fm_loss)
+        contrastive_loss = self.lambda_weight * contrastive_loss
+        print('contrastive_loss: ', contrastive_loss)
+        loss = fm_loss - contrastive_loss
         return loss, y

speech/cosyvoice2.yaml DELETED Viewed

@@ -1,206 +0,0 @@
-# set random seed, so that you may reproduce your result.
-__set_seed1: !apply:random.seed [1986]
-__set_seed2: !apply:numpy.random.seed [1986]
-__set_seed3: !apply:torch.manual_seed [1986]
-__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
-# fixed params
-sample_rate: 24000
-llm_input_size: 896
-llm_output_size: 896
-spk_embed_dim: 192
-qwen_pretrain_path: ''
-token_frame_rate: 25
-token_mel_ratio: 2
-# stream related params
-chunk_size: 25 # streaming inference chunk size, in token
-num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
-# model params
-# for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users may find all corresponding classes/functions in one single yaml.
-# for system/third_party class/function, we do not require this.
-llm: !new:cosyvoice.llm.llm.Qwen2LM
-    llm_input_size: !ref <llm_input_size>
-    llm_output_size: !ref <llm_output_size>
-    speech_token_size: 6561
-    length_normalized_loss: True
-    lsm_weight: 0
-    mix_ratio: [5, 15]
-    llm: !new:cosyvoice.llm.llm.Qwen2Encoder
-        pretrain_path: !ref <qwen_pretrain_path>
-    sampling: !name:cosyvoice.utils.common.ras_sampling
-        top_p: 0.8
-        top_k: 25
-        win_size: 10
-        tau_r: 0.1
-flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
-    input_size: 512
-    output_size: 80
-    spk_embed_dim: !ref <spk_embed_dim>
-    output_type: 'mel'
-    vocab_size: 6561
-    input_frame_rate: !ref <token_frame_rate>
-    only_mask_loss: True
-    token_mel_ratio: !ref <token_mel_ratio>
-    pre_lookahead_len: 3
-    encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
-        output_size: 512
-        attention_heads: 8
-        linear_units: 2048
-        num_blocks: 6
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.1
-        normalize_before: True
-        input_layer: 'linear'
-        pos_enc_layer_type: 'rel_pos_espnet'
-        selfattention_layer_type: 'rel_selfattn'
-        input_size: 512
-        use_cnn_module: False
-        macaron_style: False
-        static_chunk_size: !ref <chunk_size>
-    decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
-        in_channels: 240
-        n_spks: 1
-        spk_emb_dim: 80
-        cfm_params: !new:omegaconf.DictConfig
-            content:
-                sigma_min: 1e-06
-                solver: 'euler'
-                t_scheduler: 'cosine'
-                training_cfg_rate: 0.2
-                inference_cfg_rate: 0.7
-                reg_loss_type: 'l1'
-        estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
-            in_channels: 320
-            out_channels: 80
-            channels: [256]
-            dropout: 0.0
-            attention_head_dim: 64
-            n_blocks: 4
-            num_mid_blocks: 12
-            num_heads: 8
-            act_fn: 'gelu'
-            static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
-            num_decoding_left_chunks: !ref <num_decoding_left_chunks>
-hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
-    in_channels: 80
-    base_channels: 512
-    nb_harmonics: 8
-    sampling_rate: !ref <sample_rate>
-    nsf_alpha: 0.1
-    nsf_sigma: 0.003
-    nsf_voiced_threshold: 10
-    upsample_rates: [8, 5, 3]
-    upsample_kernel_sizes: [16, 11, 7]
-    istft_params:
-        n_fft: 16
-        hop_len: 4
-    resblock_kernel_sizes: [3, 7, 11]
-    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-    source_resblock_kernel_sizes: [7, 7, 11]
-    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-    lrelu_slope: 0.1
-    audio_limit: 0.99
-    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
-        num_class: 1
-        in_channels: 80
-        cond_channels: 512
-# gan related module
-mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
-    n_fft: 1920
-    num_mels: 80
-    sampling_rate: !ref <sample_rate>
-    hop_size: 480
-    win_size: 1920
-    fmin: 0
-    fmax: null
-    center: False
-hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
-    generator: !ref <hift>
-    discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
-        mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
-        mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator
-    mel_spec_transform: [
-        !ref <mel_spec_transform1>
-    ]
-individual_file_opener: !name:cosyvoice.dataset.processor.individual_file_opener
-# processor functions
-parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
-get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
-    token_path: !ref <qwen_pretrain_path>
-    skip_special_tokens: True
-allowed_special: 'all'
-tokenize: !name:cosyvoice.dataset.processor.tokenize
-    get_tokenizer: !ref <get_tokenizer>
-    allowed_special: !ref <allowed_special>
-filter: !name:cosyvoice.dataset.processor.filter
-    max_length: 40960
-    min_length: 100
-    token_max_length: 200
-    token_min_length: 1
-resample: !name:cosyvoice.dataset.processor.resample
-    resample_rate: !ref <sample_rate>
-truncate: !name:cosyvoice.dataset.processor.truncate
-    truncate_length: 24480 # must be a multiple of hop_size
-feat_extractor: !name:matcha.utils.audio.mel_spectrogram
-    n_fft: 1920
-    num_mels: 80
-    sampling_rate: !ref <sample_rate>
-    hop_size: 480
-    win_size: 1920
-    fmin: 0
-    fmax: 8000
-    center: False
-compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
-    feat_extractor: !ref <feat_extractor>
-compute_f0: !name:cosyvoice.dataset.processor.compute_f0
-    sample_rate: !ref <sample_rate>
-    hop_size: 480
-parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
-    normalize: True
-shuffle: !name:cosyvoice.dataset.processor.shuffle
-    shuffle_size: 1000
-sort: !name:cosyvoice.dataset.processor.sort
-    sort_size: 500  # sort_size should be less than shuffle_size
-batch: !name:cosyvoice.dataset.processor.batch
-    batch_type: 'dynamic'
-    max_frames_in_batch: 2000
-padding: !name:cosyvoice.dataset.processor.padding
-    use_spk_embedding: False # change to True during sft
-# dataset processor pipeline
-data_pipeline: [
-    !ref <individual_file_opener>,
-    !ref <tokenize>,
-    !ref <filter>,
-    !ref <resample>,
-    !ref <compute_fbank>,
-    !ref <parse_embedding>,
-    !ref <shuffle>,
-    !ref <sort>,
-    !ref <batch>,
-    !ref <padding>,
-]
-# llm flow train conf
-train_conf:
-    optim: adamw
-    optim_conf:
-        lr: 1e-5 # change to 1e-5 during sft
-    scheduler: constantlr # change to constantlr during sft
-    scheduler_conf:
-        warmup_steps: 2500
-    max_epoch: 200
-    grad_clip: 1
-    accum_grad: 1
-    log_interval: 100
-    save_per_step: -1

speech/examples/magicdata-read/cosyvoice/conf DELETED Viewed

@@ -1 +0,0 @@
-../../libritts/cosyvoice/conf

speech/examples/magicdata-read/cosyvoice/cosyvoice DELETED Viewed

@@ -1 +0,0 @@
-../../../cosyvoice

speech/examples/magicdata-read/cosyvoice/local/prepare_data.py DELETED Viewed

@@ -1,52 +0,0 @@
-import argparse
-import logging
-import os
-from tqdm import tqdm
-logger = logging.getLogger()
-def main():
-    utt2wav, utt2text, utt2spk, spk2utt = {}, {}, {}, {}
-    with open(os.path.join(args.src_dir, "TRANS.txt"), "r") as f:
-        lines = f.readlines()[1:]
-        lines = [l.split('\t') for l in lines]
-    for wav, spk, content in tqdm(lines):
-        wav, spk, content = wav.strip(), spk.strip(), content.strip()
-        content = content.replace('[FIL]', '')
-        content = content.replace('[SPK]', '')
-        wav = os.path.join(args.src_dir, spk, wav)
-        if not os.path.exists(wav):
-            continue
-        utt = os.path.basename(wav).replace('.wav', '')
-        utt2wav[utt] = wav
-        utt2text[utt] = content
-        utt2spk[utt] = spk
-        if spk not in spk2utt:
-            spk2utt[spk] = []
-        spk2utt[spk].append(utt)
-    with open('{}/wav.scp'.format(args.des_dir), 'w') as f:
-        for k, v in utt2wav.items():
-            f.write('{} {}\n'.format(k, v))
-    with open('{}/text'.format(args.des_dir), 'w') as f:
-        for k, v in utt2text.items():
-            f.write('{} {}\n'.format(k, v))
-    with open('{}/utt2spk'.format(args.des_dir), 'w') as f:
-        for k, v in utt2spk.items():
-            f.write('{} {}\n'.format(k, v))
-    with open('{}/spk2utt'.format(args.des_dir), 'w') as f:
-        for k, v in spk2utt.items():
-            f.write('{} {}\n'.format(k, ' '.join(v)))
-    return
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--src_dir',
-                        type=str)
-    parser.add_argument('--des_dir',
-                        type=str)
-    args = parser.parse_args()
-    main()

speech/examples/magicdata-read/cosyvoice/tools DELETED Viewed

@@ -1 +0,0 @@
-../../../tools

speech/examples/magicdata-read/cosyvoice/tts_text.json DELETED Viewed

@@ -1,18 +0,0 @@
-{
-  "38_5718_20170915093303": [
-    "我想这出最好歌曲把歌词发到网上请别人帮我作曲急急",
-    "叫他明天早上差五分儿九点去机场"
-  ],
-  "38_5721_20170915091235": [
-    "变温室调到零下两度档",
-    "交谈中请勿轻信汇款信息陌生电话请勿使用外挂软件"
-  ],
-  "38_5733_20170915130323": [
-    "这是老鹰乐队的一首经典歌曲",
-    "我急用这段音乐我自己找到一段但是有现场杂音"
-  ],
-  "38_5836_20170916221414": [
-    "给我播一个陶喆的专辑",
-    "这套餐好贵呀我发这么多短信贵死了"
-  ]
-}