Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

README.md +96 -3
campplus.onnx +3 -0
cosyvoice.yaml +202 -0
flow.pt +3 -0
hift.pt +3 -0
llm.pt +3 -0
speech_tokenizer_v1.onnx +3 -0

README.md CHANGED Viewed

@@ -1,3 +1,96 @@
----
-license: apache-2.0
----

+# CosyEdit: Unlocking End-to-End Speech Editing Capability from Zero-Shot Text-to-Speech Models
+## Highlight 🔥
+**CosyEdit** is an End-to-End Speech Editing model built upon the powerful **CosyVoice** zero-shot text-to-speech model.
+<p align="center" style="margin-bottom:8px;">
+    <img src="./asset/CosyEditLogo.png" width="320">
+</p>
+<p align="center">
+    <a href="https://cjy1018.github.io/CosyEditDemoPage/" target="_blank" >🎧 Demo Page</a>
+    &nbsp;&nbsp;|&nbsp;&nbsp;
+    <a href="https://arxiv.org/abs/2601.05329" target="_blank">📜 Paper</a>
+    &nbsp;&nbsp;|&nbsp;&nbsp;
+    <a href="https://huggingface.co/CJY/CosyEdit" target="_blank">🤗 HuggingFace</a>
+    &nbsp;&nbsp;|&nbsp;&nbsp;
+    <a href="https://www.modelscope.cn/models/CJY1018/CosyEdit" target="_blank">🤖 ModelScope</a>
+</p>
+### Key Advantages
+- **Comfortable Speech Editing ☕**: No external speech–text alignment tools, no complex editing algorithms—everything is handled by an end-to-end model, just one-step editing.
+- **Native Multi-Span Editing ✂️**: Natively supports insertion, deletion, and substitution across multiple spans within a single utterance, all completed in one inference pass.
+- **Low-Cost, High-Performance ⚡**: Unlocks strong speech editing capabilities from existing zero-shot TTS models, delivering competitive performance with small model size and minimal training cost.
+## Install
+### Clone and Install
+- Clone the repo
+    ``` sh
+    git clone --recursive https://github.com/CJY1018/CosyEdit.git
+    # If cloning the submodules failed due to network issues, re-run the following commands until they succeed
+    cd CosyEdit
+    git submodule update --init --recursive
+    ```
+- Install Conda: please see https://docs.conda.io/en/latest/miniconda.html
+- Create Conda env:
+    ``` sh
+    conda create -n cosyedit -y python=3.10
+    conda activate cosyedit
+    pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
+    # If you encounter sox compatibility issues
+    # ubuntu
+    sudo apt-get install sox libsox-dev
+    # centos
+    sudo yum install sox sox-devel
+    ```
+### Model Download
+You can download the pretrained models by running the following code. The pretrained models will be saved in the `pretrained_models` directory.
+``` python
+# modelscope SDK model download
+from modelscope import snapshot_download
+snapshot_download('CJY1018/CosyEdit', local_dir='pretrained_models/CosyEdit')
+# for overseas users, huggingface SDK model download
+from huggingface_hub import snapshot_download
+snapshot_download('CJY/CosyEdit', local_dir='pretrained_models/CosyEdit')
+```
+### Basic Usage
+Follow the code in `example.py` for detailed usage of CosyEdit.
+```sh
+python example.py
+```
+💡 CosyEdit is fully compatible with the CosyVoice codebase. This repository supports both speech editing with CosyEdit and speech synthesis using the original CosyVoice TTS models.
+## Acknowledgments
+We thank the following open-source projects for their support:
+1. We borrowed a lot of code from [CosyVoice](https://github.com/FunAudioLLM/CosyVoice).
+2. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet).
+## Citations
+If you find this work useful in your research, please consider citing our paper:
+``` bibtex
+@article{chen2026cosyedit,
+  title={CosyEdit: Unlocking End-to-End Speech Editing Capability from Zero-Shot Text-to-Speech Models},
+  author={Chen, Junyang and Jia, Yuhang and Wang, Hui and Zhou, Jiaming and Han, Yaxin and Feng, Mengying and Qin, Yong},
+  journal={arXiv preprint arXiv:2601.05329},
+  year={2026}
+}
+```
+## Disclaimer
+The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal.

campplus.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
+size 28303423

cosyvoice.yaml ADDED Viewed

	@@ -0,0 +1,202 @@

+# set random seed, so that you may reproduce your result.
+__set_seed1: !apply:random.seed [1984]
+__set_seed2: !apply:numpy.random.seed [1984]
+__set_seed3: !apply:torch.manual_seed [1984]
+__set_seed4: !apply:torch.cuda.manual_seed_all [1984]
+# fixed params
+sample_rate: 22050
+text_encoder_input_size: 512
+llm_input_size: 1024
+llm_output_size: 1024
+spk_embed_dim: 192
+# model params
+# for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users can find every corresponding class/function from this single yaml.
+# for system/third_party class/function, we do not require this.
+llm: !new:cosyvoice.llm.llm.TransformerLM
+    text_encoder_input_size: !ref <text_encoder_input_size>
+    llm_input_size: !ref <llm_input_size>
+    llm_output_size: !ref <llm_output_size>
+    text_token_size: 51866
+    speech_token_size: 4096
+    length_normalized_loss: True
+    lsm_weight: 0
+    spk_embed_dim: !ref <spk_embed_dim>
+    text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
+        input_size: !ref <text_encoder_input_size>
+        output_size: 1024
+        attention_heads: 16
+        linear_units: 4096
+        num_blocks: 6
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.0
+        normalize_before: True
+        input_layer: 'linear'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        use_cnn_module: False
+        macaron_style: False
+        use_dynamic_chunk: False
+        use_dynamic_left_chunk: False
+        static_chunk_size: 1
+    llm: !new:cosyvoice.transformer.encoder.TransformerEncoder
+        input_size: !ref <llm_input_size>
+        output_size: !ref <llm_output_size>
+        attention_heads: 16
+        linear_units: 4096
+        num_blocks: 14
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.0
+        input_layer: 'linear_legacy'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        static_chunk_size: 1
+    sampling: !name:cosyvoice.utils.common.ras_sampling_edit
+        top_p: 0.8
+        top_k: 25
+        win_size: 10
+        tau_r: 0.1
+flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
+    input_size: 512
+    output_size: 80
+    spk_embed_dim: !ref <spk_embed_dim>
+    output_type: 'mel'
+    vocab_size: 4096
+    input_frame_rate: 50
+    only_mask_loss: True
+    encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
+        output_size: 512
+        attention_heads: 8
+        linear_units: 2048
+        num_blocks: 6
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.1
+        normalize_before: True
+        input_layer: 'linear'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        input_size: 512
+        use_cnn_module: False
+        macaron_style: False
+    length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator
+        channels: 80
+        sampling_ratios: [1, 1, 1, 1]
+    decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM
+        in_channels: 240
+        n_spks: 1
+        spk_emb_dim: 80
+        cfm_params: !new:omegaconf.DictConfig
+            content:
+                sigma_min: 1e-06
+                solver: 'euler'
+                t_scheduler: 'cosine'
+                training_cfg_rate: 0.2
+                inference_cfg_rate: 0.7
+                reg_loss_type: 'l1'
+        estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
+            in_channels: 320
+            out_channels: 80
+            channels: [256, 256]
+            dropout: 0.0
+            attention_head_dim: 64
+            n_blocks: 4
+            num_mid_blocks: 12
+            num_heads: 8
+            act_fn: 'gelu'
+hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
+    in_channels: 80
+    base_channels: 512
+    nb_harmonics: 8
+    sampling_rate: !ref <sample_rate>
+    nsf_alpha: 0.1
+    nsf_sigma: 0.003
+    nsf_voiced_threshold: 10
+    upsample_rates: [8, 8]
+    upsample_kernel_sizes: [16, 16]
+    istft_params:
+        n_fft: 16
+        hop_len: 4
+    resblock_kernel_sizes: [3, 7, 11]
+    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+    source_resblock_kernel_sizes: [7, 11]
+    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
+    lrelu_slope: 0.1
+    audio_limit: 0.99
+    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
+        num_class: 1
+        in_channels: 80
+        cond_channels: 512
+# processor functions
+parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
+get_tokenizer: !name:whisper.tokenizer.get_tokenizer
+    multilingual: True
+    num_languages: 100
+    language: 'en'
+    task: 'transcribe'
+allowed_special: 'all'
+tokenize: !name:cosyvoice.dataset.processor.tokenize
+    get_tokenizer: !ref <get_tokenizer>
+    allowed_special: !ref <allowed_special>
+filter: !name:cosyvoice.dataset.processor.filter
+    max_length: 40960
+    min_length: 0
+    token_max_length: 200
+    token_min_length: 1
+resample: !name:cosyvoice.dataset.processor.resample
+    resample_rate: !ref <sample_rate>
+feat_extractor: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1024
+    num_mels: 80
+    sampling_rate: !ref <sample_rate>
+    hop_size: 256
+    win_size: 1024
+    fmin: 0
+    fmax: 8000
+    center: False
+compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
+    feat_extractor: !ref <feat_extractor>
+parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
+    normalize: True
+shuffle: !name:cosyvoice.dataset.processor.shuffle
+    shuffle_size: 1000
+sort: !name:cosyvoice.dataset.processor.sort
+    sort_size: 500  # sort_size should be less than shuffle_size
+batch: !name:cosyvoice.dataset.processor.batch
+    batch_type: 'dynamic'
+    max_frames_in_batch: 2000
+padding: !name:cosyvoice.dataset.processor.padding
+# dataset processor pipeline
+data_pipeline: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <filter>,
+    !ref <resample>,
+    !ref <compute_fbank>,
+    !ref <parse_embedding>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <batch>,
+    !ref <padding>,
+]
+# train conf
+train_conf:
+    optim: adam
+    optim_conf:
+        lr: 0.001
+    scheduler: warmuplr
+    scheduler_conf:
+        warmup_steps: 2500
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 2
+    log_interval: 100
+    save_per_step: -1

flow.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5ffc0f21c903c7b05142afa2ac1532759c74d1a9ecb89eb2d77ebe12774c0897
+size 419806842

hift.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91e679b6ca1eff71187ffb4f3ab0444935594cdcc20a9bd12afad111ef8d6012
+size 81896716

llm.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ef6363850ae1125789887223417700ef8995181c3c80f3cc0331d6a3860dc522
+size 1242973052

speech_tokenizer_v1.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:23b5a723ed9143aebfd9ffda14ac4c21231f31c35ef837b6a13bb9e5488abb1e
+size 522624269