diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..6e6ab021c8378714c422bf4f51e60213d7d9a69f 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+OpenVoice-RKNN2/result.wav filter=lfs diff=lfs merge=lfs -text
+OpenVoice-RKNN2/src2.wav filter=lfs diff=lfs merge=lfs -text
+OpenVoice-RKNN2/target.wav filter=lfs diff=lfs merge=lfs -text
+OpenVoice-RKNN2/tone_clone_model.rknn filter=lfs diff=lfs merge=lfs -text
diff --git a/OpenVoice-RKNN2/.gitattributes b/OpenVoice-RKNN2/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..282d8d7a1b4889e06a2925fd96ea206f02623b46
--- /dev/null
+++ b/OpenVoice-RKNN2/.gitattributes
@@ -0,0 +1,39 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+result.wav filter=lfs diff=lfs merge=lfs -text
+src2.wav filter=lfs diff=lfs merge=lfs -text
+target.wav filter=lfs diff=lfs merge=lfs -text
+tone_clone_model.rknn filter=lfs diff=lfs merge=lfs -text
diff --git a/OpenVoice-RKNN2/README.md b/OpenVoice-RKNN2/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b395ff064c4df0831530d702e0288acad5833a67
--- /dev/null
+++ b/OpenVoice-RKNN2/README.md
@@ -0,0 +1,3 @@
+---
+license: agpl-3.0
+---
diff --git a/OpenVoice-RKNN2/configuration.json b/OpenVoice-RKNN2/configuration.json
new file mode 100644
index 0000000000000000000000000000000000000000..3e33566b0d976167bd5f15801ef7005d59143e2f
--- /dev/null
+++ b/OpenVoice-RKNN2/configuration.json
@@ -0,0 +1,57 @@
+{
+ "_version_": "v2",
+ "data": {
+ "sampling_rate": 22050,
+ "filter_length": 1024,
+ "hop_length": 256,
+ "win_length": 1024,
+ "n_speakers": 0
+ },
+ "model": {
+ "zero_g": true,
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [
+ 3,
+ 7,
+ 11
+ ],
+ "resblock_dilation_sizes": [
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ]
+ ],
+ "upsample_rates": [
+ 8,
+ 8,
+ 2,
+ 2
+ ],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [
+ 16,
+ 16,
+ 4,
+ 4
+ ],
+ "gin_channels": 256
+ }
+}
\ No newline at end of file
diff --git a/OpenVoice-RKNN2/convert_rknn.py b/OpenVoice-RKNN2/convert_rknn.py
new file mode 100644
index 0000000000000000000000000000000000000000..61a18ea0d00706cf0f08fd0bfe7c3fee19104a44
--- /dev/null
+++ b/OpenVoice-RKNN2/convert_rknn.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import datetime
+import argparse
+from rknn.api import RKNN
+from sys import exit
+
+# 模型配置
+MODELS = {
+ 'tone_clone': 'tone_clone_model.onnx',
+ 'tone_color_extract': 'tone_color_extract_model.onnx',
+}
+
+TARGET_AUDIO_LENS = [1024]
+
+SOURCE_AUDIO_LENS = [1024]
+
+AUDIO_DIM = 513
+
+QUANTIZE=False
+detailed_performance_log = True
+
+def convert_model(model_type):
+    """Convert the model of the given type from ONNX to RKNN format. Returns True on success."""
+    if model_type not in MODELS:
+        print(f"错误: 不支持的模型类型 {model_type}")
+        return False
+
+    onnx_model = MODELS[model_type]
+    rknn_model = onnx_model.replace(".onnx",".rknn")
+
+    if model_type == 'tone_clone':
+        shapes = [
+            [
+                [1, 513, target_audio_len], # audio
+                [1], # audio_length
+                [1, 256, 1], # src_tone
+                [1, 256, 1], # dest_tone
+                [1], # tau
+            ] for target_audio_len in TARGET_AUDIO_LENS
+        ]
+    elif model_type == 'tone_color_extract':
+        shapes = [
+            [
+                [1, source_audio_len, 513], # audio
+            ] for source_audio_len in SOURCE_AUDIO_LENS
+        ]
+    # shapes = None
+
+    timedate_iso = datetime.datetime.now().isoformat()
+
+    rknn = RKNN(verbose=True)
+    rknn.config(
+        quantized_dtype='w8a8',
+        quantized_algorithm='normal',
+        quantized_method='channel',
+        quantized_hybrid_level=0,
+        target_platform='rk3588',
+        quant_img_RGB2BGR = False,
+        float_dtype='float16',
+        optimization_level=3,
+        custom_string=f"converted by: qq: 232004040, email: 2302004040@qq.com at {timedate_iso}",
+        remove_weight=False,
+        compress_weight=False,
+        inputs_yuv_fmt=None,
+        single_core_mode=False,
+        dynamic_input=shapes,
+        model_pruning=False,
+        op_target=None,
+        quantize_weight=False,
+        remove_reshape=False,
+        sparse_infer=False,
+        enable_flash_attention=False,
+        # disable_rules=['convert_gemm_by_exmatmul']
+    )
+
+    print(f"开始转换 {model_type} 模型...")
+    ret = rknn.load_onnx(model=onnx_model)
+    if ret != 0:
+        print("加载ONNX模型失败")
+        return False
+
+    ret = rknn.build(do_quantization=QUANTIZE, rknn_batch_size=None)  # honor the module-level QUANTIZE flag (still False) instead of a second hard-coded literal; NOTE(review): True would also require a calibration dataset
+    if ret != 0:
+        print("构建RKNN模型失败")
+        return False
+
+    ret = rknn.export_rknn(rknn_model)
+    if ret != 0:
+        print("导出RKNN模型失败")
+        return False
+
+    print(f"成功转换模型: {rknn_model}")
+    return True
+
+def main():
+ parser = argparse.ArgumentParser(description='转换ONNX模型到RKNN格式')
+ parser.add_argument('model_type', nargs='?', default='all',
+ choices=['all', 'tone_clone', 'tone_color_extract'],
+ help='要转换的模型类型 (默认: all)')
+
+ args = parser.parse_args()
+
+ if args.model_type == 'all':
+ # 转换所有模型
+ for model_type in MODELS.keys():
+ if not convert_model(model_type):
+ print(f"转换 {model_type} 失败")
+ else:
+ # 转换指定模型
+ if not convert_model(args.model_type):
+ print(f"转换 {args.model_type} 失败")
+
+if __name__ == '__main__':
+ main()
+
+
diff --git a/OpenVoice-RKNN2/export_onnx.py b/OpenVoice-RKNN2/export_onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c07670b9100725943b2bca2b67ec7f4d97f6f56
--- /dev/null
+++ b/OpenVoice-RKNN2/export_onnx.py
@@ -0,0 +1,127 @@
+import torch
+import torch.nn as nn
+from openvoice.api import ToneColorConverter
+from openvoice.models import SynthesizerTrn
+import os
+
+os.chdir(os.path.dirname(os.path.abspath(__file__)))
+
+class ToneColorExtractWrapper(nn.Module):
+ def __init__(self, model):
+ super().__init__()
+ self.model = model
+
+ def forward(self, audio):
+ # audio: [1, source_audio_len, 513]
+ # 将mel谱图转置为模型需要的格式 [1, 513, source_audio_len]
+ audio = audio.contiguous()
+ # 提取声纹
+ g = self.model.ref_enc(audio)
+ # 扩展最后一维
+ # g = g.unsqueeze(-1) # [1, 256, 1]
+ return g
+
+class ToneCloneWrapper(nn.Module):
+ def __init__(self, model):
+ super().__init__()
+ self.model = model
+
+ def forward(self, audio, audio_lengths, src_tone, dest_tone, tau):
+ # 确保张量连续
+ audio = audio.contiguous()
+ src_tone = src_tone.contiguous()
+ dest_tone = dest_tone.contiguous()
+
+ # 语音转换
+ o_hat, _, _ = self.model.voice_conversion(
+ audio,
+ audio_lengths,
+ sid_src=src_tone,
+ sid_tgt=dest_tone,
+ tau=tau[0]
+ )
+ return o_hat
+
+def export_models(ckpt_path, output_dir, target_audio_lens, source_audio_lens):
+    """
+    Export the tone-color extraction and tone-clone models to ONNX format.
+
+    Args:
+        ckpt_path: Path to the checkpoint directory (expects config.json and checkpoint.pth)
+        output_dir: Directory that receives the exported .onnx files
+        target_audio_lens: List of target (clone input) spectrogram lengths to export with
+        source_audio_lens: List of source (extractor input) spectrogram lengths to export with
+    """
+
+    # Load the tone color converter (CPU only; export does not need a GPU)
+    device = "cpu"
+    converter = ToneColorConverter(f'{ckpt_path}/config.json', device=device)
+    converter.load_ckpt(f'{ckpt_path}/checkpoint.pth')
+
+    # Create the output directory if it does not exist yet
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Export the tone-color extraction model
+    extract_wrapper = ToneColorExtractWrapper(converter.model)
+    extract_wrapper.eval()
+
+    for source_len in source_audio_lens:
+        dummy_input = torch.randn(1, source_len, 513).contiguous()
+        output_path = f"{output_dir}/tone_color_extract_model.onnx"  # NOTE(review): same path every iteration — only the last source_len's export survives
+
+        torch.onnx.export(
+            extract_wrapper,
+            dummy_input,
+            output_path,
+            input_names=['input'],
+            output_names=['tone_embedding'],
+            dynamic_axes={
+                'input': {1: 'source_audio_len'},
+            },
+            opset_version=11,
+            do_constant_folding=True,
+            verbose=True
+        )
+        print(f"Exported tone extract model to {output_path}")
+
+    # Export the tone-clone (voice conversion) model
+    clone_wrapper = ToneCloneWrapper(converter.model)
+    clone_wrapper.eval()
+
+    for target_len in target_audio_lens:
+        dummy_inputs = (
+            torch.randn(1, 513, target_len).contiguous(), # audio
+            torch.LongTensor([target_len]), # audio_lengths
+            torch.randn(1, 256, 1).contiguous(), # src_tone
+            torch.randn(1, 256, 1).contiguous(), # dest_tone
+            torch.FloatTensor([0.3]) # tau
+        )
+
+        output_path = f"{output_dir}/tone_clone_model.onnx"  # NOTE(review): same path every iteration — only the last target_len's export survives
+
+        torch.onnx.export(
+            clone_wrapper,
+            dummy_inputs,
+            output_path,
+            input_names=['audio', 'audio_length', 'src_tone', 'dest_tone', 'tau'],
+            output_names=['converted_audio'],
+            dynamic_axes={
+                'audio': {2: 'target_audio_len'},
+            },
+            opset_version=17,
+            do_constant_folding=True,
+            verbose=True
+        )
+        print(f"Exported tone clone model to {output_path}")
\ No newline at end of file
diff --git a/OpenVoice-RKNN2/result.wav b/OpenVoice-RKNN2/result.wav
new file mode 100644
index 0000000000000000000000000000000000000000..385edef7b2ea4f811cb0082a9afbbe60805a7ff6
--- /dev/null
+++ b/OpenVoice-RKNN2/result.wav
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d11ad289cc5014994086548874fd145ac67c41eb9b91fdd822ad6bd05a40c90f
+size 393260
diff --git a/OpenVoice-RKNN2/source.txt b/OpenVoice-RKNN2/source.txt
new file mode 100644
index 0000000000000000000000000000000000000000..eaa39fbaed6a638b7466276451601a0490d5738c
--- /dev/null
+++ b/OpenVoice-RKNN2/source.txt
@@ -0,0 +1 @@
+https://huggingface.co/happyme531/OpenVoice-RKNN2
\ No newline at end of file
diff --git a/OpenVoice-RKNN2/src2.wav b/OpenVoice-RKNN2/src2.wav
new file mode 100644
index 0000000000000000000000000000000000000000..0ebbef7f3def597ccdb5d20a700d9d299e182872
--- /dev/null
+++ b/OpenVoice-RKNN2/src2.wav
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:baf4ce666c5fa88e052381e0c33543be3015bf2f47154ac3925ee67c963c0a12
+size 1712078
diff --git a/OpenVoice-RKNN2/target.wav b/OpenVoice-RKNN2/target.wav
new file mode 100644
index 0000000000000000000000000000000000000000..932b758345f87e434de83e74ffdbf303b1a8a096
--- /dev/null
+++ b/OpenVoice-RKNN2/target.wav
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c63d1b5cb444f3611a271d1c24d04363f5bdd73fb5745bc6b61e1c925a8f6084
+size 2165838
diff --git a/OpenVoice-RKNN2/test_rknn.py b/OpenVoice-RKNN2/test_rknn.py
new file mode 100644
index 0000000000000000000000000000000000000000..60777db16a72729148d363b474395356cae4c36e
--- /dev/null
+++ b/OpenVoice-RKNN2/test_rknn.py
@@ -0,0 +1,327 @@
+from typing import Callable
+import numpy as np
+import onnxruntime as ort
+import os
+from rknnlite.api import RKNNLite
+import json
+import os
+import time
+
+class HParams:
+ def __init__(self, **kwargs):
+ for k, v in kwargs.items():
+ if type(v) == dict:
+ v = HParams(**v)
+ self[k] = v
+
+ def keys(self):
+ return self.__dict__.keys()
+
+ def items(self):
+ return self.__dict__.items()
+
+ def values(self):
+ return self.__dict__.values()
+
+ def __len__(self):
+ return len(self.__dict__)
+
+ def __getitem__(self, key):
+ return getattr(self, key)
+
+ def __setitem__(self, key, value):
+ return setattr(self, key, value)
+
+ def __contains__(self, key):
+ return key in self.__dict__
+
+ def __repr__(self):
+ return self.__dict__.__repr__()
+
+ @staticmethod
+ def load_from_file(file_path:str):
+ if not os.path.exists(file_path):
+ raise FileNotFoundError(f"Can not found the configuration file \"{file_path}\"")
+ with open(file_path, "r", encoding="utf-8") as f:
+ hps = json.load(f)
+ return HParams(**hps)
+
+class BaseClassForOnnxInfer():
+ @staticmethod
+ def create_onnx_infer(infer_factor:Callable, onnx_model_path:str, providers:list, session_options:ort.SessionOptions = None, onnx_params:dict = None):
+ if not os.path.exists(onnx_model_path):
+ raise FileNotFoundError(f"Can not found the onnx model file \"{onnx_model_path}\"")
+ session = ort.InferenceSession(onnx_model_path, sess_options=BaseClassForOnnxInfer.adjust_onnx_session_options(session_options), providers=providers, **(onnx_params or {}))
+ fn = infer_factor(session)
+ fn.__session = session
+ return fn
+
+ @staticmethod
+ def get_def_onnx_session_options():
+ session_options = ort.SessionOptions()
+ session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+ return session_options
+
+ @staticmethod
+ def adjust_onnx_session_options(session_options:ort.SessionOptions = None):
+ return session_options or BaseClassForOnnxInfer.get_def_onnx_session_options()
+
+class OpenVoiceToneClone_ONNXRKNN(BaseClassForOnnxInfer):
+
+ PreferredProviders = ['CPUExecutionProvider']
+
+ def __init__(self, model_path, execution_provider:str = None, verbose:bool = False, onnx_session_options:ort.SessionOptions = None, onnx_params:dict = None, target_length:int = 1024):
+ '''
+ Create the instance of the tone cloner
+
+ Args:
+ model_path (str): The path of the folder which contains the model
+            execution_provider (str): The execution provider for onnxruntime, such as CPUExecutionProvider, CUDAExecutionProvider, etc. You may also use the short forms CPU or CUDA. If None, the constructor will choose the best available one automatically
+            verbose (bool): Set True to show more detailed information while working
+ onnx_session_options (onnxruntime.SessionOptions): The custom options for onnx session
+ onnx_params (dict): Other parameters you want to pass to the onnxruntime.InferenceSession constructor
+ target_length (int): The target length for padding/truncating spectrogram, defaults to 1024
+
+ Returns:
+            OpenVoiceToneClone_ONNXRKNN: The instance of the tone cloner
+ '''
+ self.__verbose = verbose
+ self.__target_length = target_length
+
+ if verbose:
+ print("Loading the configuration...")
+ config_path = os.path.join(model_path, "configuration.json")
+ self.__hparams = HParams.load_from_file(config_path)
+
+ execution_provider = f"{execution_provider}ExecutionProvider" if (execution_provider is not None) and (not execution_provider.endswith("ExecutionProvider")) else execution_provider
+ available_providers = ort.get_available_providers()
+ # self.__execution_providers = [execution_provider if execution_provider in available_providers else next((provider for provider in MeloTTS_ONNX.PreferredProviders if provider in available_providers), 'CPUExecutionProvider')]
+ self.__execution_providers = ['CPUExecutionProvider']
+ if verbose:
+ print("Creating onnx session for tone color extractor...")
+ def se_infer_factor(session):
+ return lambda **kwargs: session.run(None, kwargs)[0]
+ self.__se_infer = self.create_onnx_infer(se_infer_factor, os.path.join(model_path, "tone_color_extract_model.onnx"), self.__execution_providers, onnx_session_options, onnx_params)
+
+ if verbose:
+ print("Creating RKNNLite session for tone clone ...")
+ # 初始化RKNNLite
+ self.__tc_rknn = RKNNLite(verbose=verbose)
+ # 加载RKNN模型
+ ret = self.__tc_rknn.load_rknn(os.path.join(model_path, "tone_clone_model.rknn"))
+ if ret != 0:
+ raise RuntimeError("Failed to load RKNN model")
+ # 初始化运行时
+ ret = self.__tc_rknn.init_runtime()
+ if ret != 0:
+ raise RuntimeError("Failed to init RKNN runtime")
+
+ def __del__(self):
+ """释放RKNN资源"""
+ if hasattr(self, '_OpenVoiceToneClone_ONNXRKNN__tc_rknn'):
+ self.__tc_rknn.release()
+
+ hann_window = {}
+
+    def __spectrogram_numpy(self, y, n_fft, sampling_rate, hop_size, win_size, onesided=True):  # sampling_rate is unused; kept for signature parity with callers
+        if self.__verbose:
+            if np.min(y) < -1.1:
+                print("min value is ", np.min(y))
+            if np.max(y) > 1.1:
+                print("max value is ", np.max(y))
+
+        # reflection padding so the frames are centered on the hop grid
+        y = np.pad(
+            y,
+            int((n_fft - hop_size) / 2),
+            mode="reflect",
+        )
+
+        # build (and cache) the Hann window per (dtype, size) pair
+        win_key = f"{str(y.dtype)}-{win_size}"
+        if win_key not in OpenVoiceToneClone_ONNXRKNN.hann_window:  # fixed: was `if True or win_key not in hann_window` — the `True or` disabled the cache and hid that bare `hann_window` is not in scope here
+            OpenVoiceToneClone_ONNXRKNN.hann_window[win_key] = np.hanning(win_size + 1)[:-1].astype(y.dtype)
+        window = OpenVoiceToneClone_ONNXRKNN.hann_window[win_key]
+
+        # short-time Fourier transform, frame by frame
+        y_len = y.shape[0]
+        win_len = window.shape[0]
+        count = int((y_len - win_len) // hop_size) + 1
+        spec = np.empty((count, int(win_len / 2) + 1 if onesided else (int(win_len / 2) + 1) * 2, 2))
+        start = 0
+        end = start + win_len
+        idx = 0
+        while end <= y_len:
+            segment = y[start:end]
+            frame = segment * window
+            step_result = np.fft.rfft(frame) if onesided else np.fft.fft(frame)
+            spec[idx] = np.column_stack((step_result.real, step_result.imag))
+            start = start + hop_size
+            end = start + win_len
+            idx += 1
+
+        # magnitude from the real/imag parts (epsilon avoids sqrt(0))
+        spec = np.sqrt(np.sum(np.square(spec), axis=-1) + 1e-6)
+
+        return np.array([spec], dtype=np.float32)
+
+ def extract_tone_color(self, audio:np.array):
+ '''
+ Extract the tone color from an audio
+
+ Args:
+ audio (numpy.array): The data of the audio
+
+ Returns:
+ numpy.array: The tone color vector
+ '''
+ hps = self.__hparams
+ y = self.to_mono(audio.astype(np.float32))
+ spec = self.__spectrogram_numpy(y, hps.data.filter_length,
+ hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
+ )
+
+ if self.__verbose:
+ print("spec shape", spec.shape)
+ return self.__se_infer(input=spec).reshape(1,256,1)
+
+ def mix_tone_color(self, colors:list):
+ '''
+ Mix multi tone colors to a single one
+
+ Args:
+ color (list[numpy.array]): The list of the tone colors you want to mix. Each element should be the result of extract_tone_color.
+
+ Returns:
+ numpy.array: The tone color vector
+ '''
+ return np.stack(colors).mean(axis=0)
+
+ def tone_clone(self, audio:np.array, target_tone_color:np.array, tau=0.3):
+ '''
+ Clone the tone
+
+ Args:
+ audio (numpy.array): The data of the audio that will be changed the tone
+ target_tone_color (numpy.array): The tone color that you want to clone. It should be the result of the extract_tone_color or mix_tone_color.
+            tau (float): The tau value forwarded to the voice-conversion model (defaults to 0.3)
+
+ Returns:
+ numpy.array: The dest audio
+ '''
+ assert (target_tone_color.shape == (1,256,1)), "The target tone color must be an array with shape (1,256,1)"
+ hps = self.__hparams
+ src = self.to_mono(audio.astype(np.float32))
+ src = self.__spectrogram_numpy(src, hps.data.filter_length,
+ hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
+ )
+ src_tone = self.__se_infer(input=src).reshape(1,256,1)
+
+ src = np.transpose(src, (0, 2, 1))
+ # 记录原始长度
+ original_length = src.shape[2]
+
+ # Pad或截断到固定长度
+ if original_length > self.__target_length:
+ if self.__verbose:
+ print(f"Input length {original_length} exceeds target length {self.__target_length}, truncating...")
+ src = src[:, :, :self.__target_length]
+ elif original_length < self.__target_length:
+ if self.__verbose:
+ print(f"Input length {original_length} is less than target length {self.__target_length}, padding...")
+ pad_width = ((0, 0), (0, 0), (0, self.__target_length - original_length))
+ src = np.pad(src, pad_width, mode='constant', constant_values=0)
+
+ src_length = np.array([self.__target_length], dtype=np.int64) # 使用固定长度
+
+ if self.__verbose:
+ print("src shape", src.shape)
+ print("src_length shape", src_length.shape)
+ print("src_tone shape", src_tone.shape)
+ print("target_tone_color shape", target_tone_color.shape)
+ print("tau", tau)
+
+ # 准备RKNNLite的输入
+ inputs = [
+ src,
+ src_length,
+ src_tone,
+ target_tone_color,
+ np.array([tau], dtype=np.float32)
+ ]
+
+ # 使用RKNNLite进行推理
+ outputs = self.__tc_rknn.inference(inputs=inputs)
+ res = outputs[0][0, 0] # 获取第一个输出的第一个样本
+
+ generated_multiplier = 262144 / 1024
+ # 如果原始输入较短,则截取掉padding部分
+ if original_length < self.__target_length:
+ res = res[:int(original_length * generated_multiplier)]
+
+ if self.__verbose:
+ print("res shape", res.shape)
+ return res
+
+ def to_mono(self, audio:np.array):
+ '''
+ Change the audio to be a mono audio
+
+ Args:
+ audio (numpy.array): The source audio
+
+ Returns:
+ numpy.array: The mono audio data
+ '''
+ return np.mean(audio, axis=1) if len(audio.shape) > 1 else audio
+
+ def resample(self, audio:np.array, original_rate:int):
+ '''
+ Resample the audio to match the model. It is used for changing the sample rate of the audio.
+
+ Args:
+ audio (numpy.array): The source audio you want to resample.
+ original_rate (int): The original sample rate of the source audio
+
+ Returns:
+ numpy.array: The dest data of the audio after resample
+ '''
+ audio = self.to_mono(audio)
+ target_rate = self.__hparams.data.sampling_rate
+ duration = audio.shape[0] / original_rate
+ target_length = int(duration * target_rate)
+ time_original = np.linspace(0, duration, num=audio.shape[0])
+ time_target = np.linspace(0, duration, num=target_length)
+ resampled_data = np.interp(time_target, time_original, audio)
+ return resampled_data
+
+ @property
+ def sample_rate(self):
+ '''
+ The sample rate of the tone cloning result
+ '''
+ return self.__hparams.data.sampling_rate
+
+
+tc = OpenVoiceToneClone_ONNXRKNN(".",verbose=True)
+import soundfile
+
+tgt = soundfile.read("target.wav", dtype='float32')
+tgt = tc.resample(tgt[0], tgt[1])
+
+# 计时extract_tone_color
+start_time = time.time()
+tgt_tone_color = tc.extract_tone_color(tgt)
+extract_time = time.time() - start_time
+print(f"提取音色特征耗时: {extract_time:.2f}秒")
+
+src = soundfile.read("src2.wav", dtype='float32')
+src = tc.resample(src[0], src[1])
+
+# 计时tone_clone
+start_time = time.time()
+result = tc.tone_clone(src, tgt_tone_color)
+clone_time = time.time() - start_time
+print(f"克隆音色耗时: {clone_time:.2f}秒")
+
+soundfile.write("result.wav", result, tc.sample_rate)
diff --git a/OpenVoice-RKNN2/tone_clone_model.onnx b/OpenVoice-RKNN2/tone_clone_model.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..809914fba8c77b53a1e8b7ff7222d5d4389e6bfe
--- /dev/null
+++ b/OpenVoice-RKNN2/tone_clone_model.onnx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:896195b84b0cb87a828bb8cab06577e9c024356bc9727b1a8f4174154bc0affa
+size 157196170
diff --git a/OpenVoice-RKNN2/tone_clone_model.rknn b/OpenVoice-RKNN2/tone_clone_model.rknn
new file mode 100644
index 0000000000000000000000000000000000000000..de7e5965423f36a62c98a31763eb61cfd887c1da
--- /dev/null
+++ b/OpenVoice-RKNN2/tone_clone_model.rknn
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7cd7dc3385c55ca610580edaba263510091314be35ae4688a1c076afe9e5d84a
+size 108102277
diff --git a/OpenVoice-RKNN2/tone_color_extract_model.onnx b/OpenVoice-RKNN2/tone_color_extract_model.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..48b87cc4db9fe1ef0fcb89f99004856820eef626
--- /dev/null
+++ b/OpenVoice-RKNN2/tone_color_extract_model.onnx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e91c2cb696e199d2519ed8b62ca6e3c8e42cb99ca13955dd6e188051486e681c
+size 3364792
diff --git a/OpenVoice/.gitattributes b/OpenVoice/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b
--- /dev/null
+++ b/OpenVoice/.gitattributes
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
diff --git a/OpenVoice/README.md b/OpenVoice/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f344e2510e24ac1d6d7c15cbdd52d156944063d0
--- /dev/null
+++ b/OpenVoice/README.md
@@ -0,0 +1,33 @@
+---
+license: mit
+tags:
+- audio
+- text-to-speech
+- instant-voice-cloning
+language:
+- en
+- zh
+inference: false
+---
+
+# OpenVoice
+
+
+
+OpenVoice, a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
+
+
+
+### Features
+- **Accurate Tone Color Cloning.** OpenVoice can accurately clone the reference tone color and generate speech in multiple languages and accents.
+- **Flexible Voice Style Control.** OpenVoice enables granular control over voice styles, such as emotion and accent, as well as other style parameters including rhythm, pauses, and intonation.
+- **Zero-shot Cross-lingual Voice Cloning.** Neither of the language of the generated speech nor the language of the reference speech needs to be presented in the massive-speaker multi-lingual training dataset.
+
+### How to Use
+Please see [usage](https://github.com/myshell-ai/OpenVoice/blob/main/docs/USAGE.md) for detailed instructions.
+
+### Links
+- [Github](https://github.com/myshell-ai/OpenVoice)
+- [HFDemo](https://huggingface.co/spaces/myshell-ai/OpenVoice)
+- [Discord](https://discord.gg/myshell)
+
diff --git a/OpenVoice/checkpoints/base_speakers/EN/checkpoint.pth b/OpenVoice/checkpoints/base_speakers/EN/checkpoint.pth
new file mode 100644
index 0000000000000000000000000000000000000000..fb7c26af57011437a02ebb1c4fe8ed307cc30f21
--- /dev/null
+++ b/OpenVoice/checkpoints/base_speakers/EN/checkpoint.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1db1ae1a5c8ded049bd1536051489aefbfad4a5077c01c2257e9e88fa1bb8422
+size 160467309
diff --git a/OpenVoice/checkpoints/base_speakers/EN/config.json b/OpenVoice/checkpoints/base_speakers/EN/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f7309ad10eae3c160ea0ef44261372c4f3364587
--- /dev/null
+++ b/OpenVoice/checkpoints/base_speakers/EN/config.json
@@ -0,0 +1,145 @@
+{
+ "data": {
+ "text_cleaners": [
+ "cjke_cleaners2"
+ ],
+ "sampling_rate": 22050,
+ "filter_length": 1024,
+ "hop_length": 256,
+ "win_length": 1024,
+ "n_mel_channels": 80,
+ "add_blank": true,
+ "cleaned_text": true,
+ "n_speakers": 10
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "n_layers_trans_flow": 3,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [
+ 3,
+ 7,
+ 11
+ ],
+ "resblock_dilation_sizes": [
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ]
+ ],
+ "upsample_rates": [
+ 8,
+ 8,
+ 2,
+ 2
+ ],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [
+ 16,
+ 16,
+ 4,
+ 4
+ ],
+ "n_layers_q": 3,
+ "use_spectral_norm": false,
+ "gin_channels": 256
+ },
+ "symbols": [
+ "_",
+ ",",
+ ".",
+ "!",
+ "?",
+ "-",
+ "~",
+ "\u2026",
+ "N",
+ "Q",
+ "a",
+ "b",
+ "d",
+ "e",
+ "f",
+ "g",
+ "h",
+ "i",
+ "j",
+ "k",
+ "l",
+ "m",
+ "n",
+ "o",
+ "p",
+ "s",
+ "t",
+ "u",
+ "v",
+ "w",
+ "x",
+ "y",
+ "z",
+ "\u0251",
+ "\u00e6",
+ "\u0283",
+ "\u0291",
+ "\u00e7",
+ "\u026f",
+ "\u026a",
+ "\u0254",
+ "\u025b",
+ "\u0279",
+ "\u00f0",
+ "\u0259",
+ "\u026b",
+ "\u0265",
+ "\u0278",
+ "\u028a",
+ "\u027e",
+ "\u0292",
+ "\u03b8",
+ "\u03b2",
+ "\u014b",
+ "\u0266",
+ "\u207c",
+ "\u02b0",
+ "`",
+ "^",
+ "#",
+ "*",
+ "=",
+ "\u02c8",
+ "\u02cc",
+ "\u2192",
+ "\u2193",
+ "\u2191",
+ " "
+ ],
+ "speakers": {
+ "default": 1,
+ "whispering": 2,
+ "shouting": 3,
+ "excited": 4,
+ "cheerful": 5,
+ "terrified": 6,
+ "angry": 7,
+ "sad": 8,
+ "friendly": 9
+ }
+}
\ No newline at end of file
diff --git a/OpenVoice/checkpoints/base_speakers/EN/en_default_se.pth b/OpenVoice/checkpoints/base_speakers/EN/en_default_se.pth
new file mode 100644
index 0000000000000000000000000000000000000000..319d7eb4bee7b785a47f4e6191c2132dec12abcf
--- /dev/null
+++ b/OpenVoice/checkpoints/base_speakers/EN/en_default_se.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cab24002eec738d0fe72cb73a34e57fbc3999c1bd4a1670a7b56ee4e3590ac9
+size 1789
diff --git a/OpenVoice/checkpoints/base_speakers/EN/en_style_se.pth b/OpenVoice/checkpoints/base_speakers/EN/en_style_se.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c2fd50abf058f6ab65879395b62fb7e3c0289b47
--- /dev/null
+++ b/OpenVoice/checkpoints/base_speakers/EN/en_style_se.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f698153be5004b90a8642d1157c89cae7dd296752a3276450ced6a17b8b98a9
+size 1783
diff --git a/OpenVoice/checkpoints/base_speakers/ZH/checkpoint.pth b/OpenVoice/checkpoints/base_speakers/ZH/checkpoint.pth
new file mode 100644
index 0000000000000000000000000000000000000000..fcadb5c222e9ea92fc9ada4920249fc65cad1692
--- /dev/null
+++ b/OpenVoice/checkpoints/base_speakers/ZH/checkpoint.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de9fb0eb749f3254130fe0172fcbb20e75f88a9b16b54dd0b73cac0dc40da7d9
+size 160467309
diff --git a/OpenVoice/checkpoints/base_speakers/ZH/config.json b/OpenVoice/checkpoints/base_speakers/ZH/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..130256092fb8ad00f938149bf8aa1a62aae30023
--- /dev/null
+++ b/OpenVoice/checkpoints/base_speakers/ZH/config.json
@@ -0,0 +1,137 @@
+{
+ "data": {
+ "text_cleaners": [
+ "cjke_cleaners2"
+ ],
+ "sampling_rate": 22050,
+ "filter_length": 1024,
+ "hop_length": 256,
+ "win_length": 1024,
+ "n_mel_channels": 80,
+ "add_blank": true,
+ "cleaned_text": true,
+ "n_speakers": 10
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "n_layers_trans_flow": 3,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [
+ 3,
+ 7,
+ 11
+ ],
+ "resblock_dilation_sizes": [
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ]
+ ],
+ "upsample_rates": [
+ 8,
+ 8,
+ 2,
+ 2
+ ],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [
+ 16,
+ 16,
+ 4,
+ 4
+ ],
+ "n_layers_q": 3,
+ "use_spectral_norm": false,
+ "gin_channels": 256
+ },
+ "symbols": [
+ "_",
+ ",",
+ ".",
+ "!",
+ "?",
+ "-",
+ "~",
+ "\u2026",
+ "N",
+ "Q",
+ "a",
+ "b",
+ "d",
+ "e",
+ "f",
+ "g",
+ "h",
+ "i",
+ "j",
+ "k",
+ "l",
+ "m",
+ "n",
+ "o",
+ "p",
+ "s",
+ "t",
+ "u",
+ "v",
+ "w",
+ "x",
+ "y",
+ "z",
+ "\u0251",
+ "\u00e6",
+ "\u0283",
+ "\u0291",
+ "\u00e7",
+ "\u026f",
+ "\u026a",
+ "\u0254",
+ "\u025b",
+ "\u0279",
+ "\u00f0",
+ "\u0259",
+ "\u026b",
+ "\u0265",
+ "\u0278",
+ "\u028a",
+ "\u027e",
+ "\u0292",
+ "\u03b8",
+ "\u03b2",
+ "\u014b",
+ "\u0266",
+ "\u207c",
+ "\u02b0",
+ "`",
+ "^",
+ "#",
+ "*",
+ "=",
+ "\u02c8",
+ "\u02cc",
+ "\u2192",
+ "\u2193",
+ "\u2191",
+ " "
+ ],
+ "speakers": {
+ "default": 0
+ }
+}
\ No newline at end of file
diff --git a/OpenVoice/checkpoints/base_speakers/ZH/zh_default_se.pth b/OpenVoice/checkpoints/base_speakers/ZH/zh_default_se.pth
new file mode 100644
index 0000000000000000000000000000000000000000..471841ae84a31aae1c8e25c1ef4548b3e87a32bb
--- /dev/null
+++ b/OpenVoice/checkpoints/base_speakers/ZH/zh_default_se.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b62e8264962059b8a84dd00b29e2fcccc92f5d3be90eec67dfa082c0cf58ccf
+size 1789
diff --git a/OpenVoice/checkpoints/converter/checkpoint.pth b/OpenVoice/checkpoints/converter/checkpoint.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c38ff17666bae2bae4236f85bfe2284f4885b31a
--- /dev/null
+++ b/OpenVoice/checkpoints/converter/checkpoint.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89ae83aa4e3668fef64b388b789ff7b0ce0def9f801069edfc18a00ea420748d
+size 131327338
diff --git a/OpenVoice/checkpoints/converter/config.json b/OpenVoice/checkpoints/converter/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a163d4254b637e9fd489712db40c15aeacda169e
--- /dev/null
+++ b/OpenVoice/checkpoints/converter/config.json
@@ -0,0 +1,57 @@
+{
+ "data": {
+ "sampling_rate": 22050,
+ "filter_length": 1024,
+ "hop_length": 256,
+ "win_length": 1024,
+ "n_speakers": 0
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [
+ 3,
+ 7,
+ 11
+ ],
+ "resblock_dilation_sizes": [
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ]
+ ],
+ "upsample_rates": [
+ 8,
+ 8,
+ 2,
+ 2
+ ],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [
+ 16,
+ 16,
+ 4,
+ 4
+ ],
+ "n_layers_q": 3,
+ "use_spectral_norm": false,
+ "gin_channels": 256
+ }
+}
\ No newline at end of file
diff --git a/OpenVoice/source.txt b/OpenVoice/source.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ee8bafdd583140880210fddf28a3e3bb8743f03c
--- /dev/null
+++ b/OpenVoice/source.txt
@@ -0,0 +1 @@
+https://huggingface.co/myshell-ai/OpenVoice
\ No newline at end of file
diff --git a/OpenVoiceV2/.DS_Store b/OpenVoiceV2/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..a5280b1eb27f9d0da429dfca861860e80b6fc471
Binary files /dev/null and b/OpenVoiceV2/.DS_Store differ
diff --git a/OpenVoiceV2/.gitattributes b/OpenVoiceV2/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..dab9a4e17afd2ef39d90ccb0b40ef2786fe77422
--- /dev/null
+++ b/OpenVoiceV2/.gitattributes
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
diff --git a/OpenVoiceV2/README.md b/OpenVoiceV2/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b900a52c61ef1799a4d38d174d043506719ed069
--- /dev/null
+++ b/OpenVoiceV2/README.md
@@ -0,0 +1,116 @@
+---
+license: mit
+tags:
+- audio
+- text-to-speech
+- instant-voice-cloning
+language:
+- en
+- zh
+inference: false
+---
+
+# OpenVoice V2
+
+
+
+
+In April 2024, we release OpenVoice V2, which includes all features in V1 and has:
+
+1. Better Audio Quality. OpenVoice V2 adopts a different training strategy that delivers better audio quality.
+
+2. Native Multi-lingual Support. English, Spanish, French, Chinese, Japanese and Korean are natively supported in OpenVoice V2.
+
+3. Free Commercial Use. Starting from April 2024, both V2 and V1 are released under MIT License. Free for commercial use.
+
+
+
+
+### Features
+- **Accurate Tone Color Cloning.** OpenVoice can accurately clone the reference tone color and generate speech in multiple languages and accents.
+- **Flexible Voice Style Control.** OpenVoice enables granular control over voice styles, such as emotion and accent, as well as other style parameters including rhythm, pauses, and intonation.
+- **Zero-shot Cross-lingual Voice Cloning.** Neither the language of the generated speech nor the language of the reference speech needs to be present in the massive-speaker multi-lingual training dataset.
+
+### How to Use
+Please see [usage](https://github.com/myshell-ai/OpenVoice/blob/main/docs/USAGE.md) for detailed instructions.
+
+# Usage
+
+## Table of Content
+
+- [Quick Use](#quick-use): directly use OpenVoice without installation.
+- [Linux Install](#linux-install): for researchers and developers only.
+ - [V1](#openvoice-v1)
+ - [V2](#openvoice-v2)
+- [Install on Other Platforms](#install-on-other-platforms): unofficial installation guide contributed by the community
+
+## Quick Use
+
+The input speech audio of OpenVoice can be in **Any Language**. OpenVoice can clone the voice in that speech audio, and use the voice to speak in multiple languages. For quick use, we recommend you to try the already deployed services:
+
+- [British English](https://app.myshell.ai/widget/vYjqae)
+- [American English](https://app.myshell.ai/widget/nEFFJf)
+- [Indian English](https://app.myshell.ai/widget/V3iYze)
+- [Australian English](https://app.myshell.ai/widget/fM7JVf)
+- [Spanish](https://app.myshell.ai/widget/NNFFVz)
+- [French](https://app.myshell.ai/widget/z2uyUz)
+- [Chinese](https://app.myshell.ai/widget/fU7nUz)
+- [Japanese](https://app.myshell.ai/widget/IfIB3u)
+- [Korean](https://app.myshell.ai/widget/q6ZjIn)
+
+## Linux Install
+
+This section is only for developers and researchers who are familiar with Linux, Python and PyTorch. Clone this repo, and run
+
+```
+conda create -n openvoice python=3.9
+conda activate openvoice
+git clone git@github.com:myshell-ai/OpenVoice.git
+cd OpenVoice
+pip install -e .
+```
+
+No matter if you are using V1 or V2, the above installation is the same.
+
+### OpenVoice V1
+
+Download the checkpoint from [here](https://myshell-public-repo-host.s3.amazonaws.com/openvoice/checkpoints_1226.zip) and extract it to the `checkpoints` folder.
+
+**1. Flexible Voice Style Control.**
+Please see [`demo_part1.ipynb`](https://github.com/myshell-ai/OpenVoice/blob/main/demo_part1.ipynb) for an example usage of how OpenVoice enables flexible style control over the cloned voice.
+
+**2. Cross-Lingual Voice Cloning.**
+Please see [`demo_part2.ipynb`](https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb) for an example for languages seen or unseen in the MSML training set.
+
+**3. Gradio Demo.** We provide a minimalist local gradio demo here. We strongly suggest that users look into `demo_part1.ipynb`, `demo_part2.ipynb` and the [QnA](QA.md) if they run into issues with the gradio demo. Launch a local gradio demo with `python -m openvoice_app --share`.
+
+### OpenVoice V2
+
+Download the checkpoint from [here](https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/checkpoints_v2_0417.zip) and extract it to the `checkpoints_v2` folder.
+
+Install [MeloTTS](https://github.com/myshell-ai/MeloTTS):
+```
+pip install git+https://github.com/myshell-ai/MeloTTS.git
+python -m unidic download
+```
+
+**Demo Usage.** Please see [`demo_part3.ipynb`](https://github.com/myshell-ai/OpenVoice/blob/main/demo_part3.ipynb) for example usage of OpenVoice V2. Now it natively supports English, Spanish, French, Chinese, Japanese and Korean.
+
+
+## Install on Other Platforms
+
+This section provides the unofficial installation guides by open-source contributors in the community:
+
+- Windows
+ - [Guide](https://github.com/Alienpups/OpenVoice/blob/main/docs/USAGE_WINDOWS.md) by [@Alienpups](https://github.com/Alienpups)
+ - You are welcome to contribute if you have a better installation guide. We will list you here.
+- Docker
+ - [Guide](https://github.com/StevenJSCF/OpenVoice/blob/update-docs/docs/DF_USAGE.md) by [@StevenJSCF](https://github.com/StevenJSCF)
+ - You are welcome to contribute if you have a better installation guide. We will list you here.
+
+
+### Links
+- [Github](https://github.com/myshell-ai/OpenVoice)
+- [HFDemo](https://huggingface.co/spaces/myshell-ai/OpenVoiceV2)
+- [Discord](https://discord.gg/myshell)
+
diff --git a/OpenVoiceV2/base_speakers/.DS_Store b/OpenVoiceV2/base_speakers/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..6c304615878a84fe8d7a73535eaac6fa6d110cde
Binary files /dev/null and b/OpenVoiceV2/base_speakers/.DS_Store differ
diff --git a/OpenVoiceV2/base_speakers/ses/en-au.pth b/OpenVoiceV2/base_speakers/ses/en-au.pth
new file mode 100644
index 0000000000000000000000000000000000000000..83cc04fcfc206148e9295f78fba4060b230d3c2e
--- /dev/null
+++ b/OpenVoiceV2/base_speakers/ses/en-au.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e9782233deef51fc5289d05ad4dd4ce12b196e282eccf6b6db6256bbd02daaa
+size 1701
diff --git a/OpenVoiceV2/base_speakers/ses/en-br.pth b/OpenVoiceV2/base_speakers/ses/en-br.pth
new file mode 100644
index 0000000000000000000000000000000000000000..848b00486ea77ea4c71a7bc90b5cf126ad3e1695
--- /dev/null
+++ b/OpenVoiceV2/base_speakers/ses/en-br.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2bf5a88025cfd10473b25d65d5c0e608338ce4533059c5f9a3383e69c812d389
+size 1701
diff --git a/OpenVoiceV2/base_speakers/ses/en-default.pth b/OpenVoiceV2/base_speakers/ses/en-default.pth
new file mode 100644
index 0000000000000000000000000000000000000000..fcee330f862e5dcb54227718b1d23cdbd15fef50
--- /dev/null
+++ b/OpenVoiceV2/base_speakers/ses/en-default.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e4139de3bc2ea162f45a5a5f9559b710686c9689749b5ab8945ee5e2a082d154
+size 1783
diff --git a/OpenVoiceV2/base_speakers/ses/en-india.pth b/OpenVoiceV2/base_speakers/ses/en-india.pth
new file mode 100644
index 0000000000000000000000000000000000000000..3ab96e9468282789c3f554a3d7ae50f996e06cd6
--- /dev/null
+++ b/OpenVoiceV2/base_speakers/ses/en-india.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad03d946757e95fe9e13239aa4b11071d98f22316f604f34b1a0b4bdf41cda48
+size 1701
diff --git a/OpenVoiceV2/base_speakers/ses/en-newest.pth b/OpenVoiceV2/base_speakers/ses/en-newest.pth
new file mode 100644
index 0000000000000000000000000000000000000000..88082ac29f6bc6799a6f42bfeec082ba6ce2a90f
--- /dev/null
+++ b/OpenVoiceV2/base_speakers/ses/en-newest.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a3798229b1114f0e9cc137b33211809def7dda5a8a9398d5a112c0b42699177
+size 1692
diff --git a/OpenVoiceV2/base_speakers/ses/en-us.pth b/OpenVoiceV2/base_speakers/ses/en-us.pth
new file mode 100644
index 0000000000000000000000000000000000000000..30bdaacfb81611de2e3c396b22d18c3f7be6eede
--- /dev/null
+++ b/OpenVoiceV2/base_speakers/ses/en-us.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d092d4af0815a4bfbc6105b65621ab68dc4c61b2f55044d8a66968a34947c32
+size 1701
diff --git a/OpenVoiceV2/base_speakers/ses/es.pth b/OpenVoiceV2/base_speakers/ses/es.pth
new file mode 100644
index 0000000000000000000000000000000000000000..9a3eb78a6ca0a51c5ec8afda4ef1c764fac6e509
--- /dev/null
+++ b/OpenVoiceV2/base_speakers/ses/es.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8cece8853fb75b9f5217a1f5cda9807bac92a3e4c4547fc651e404d05deff63
+size 1692
diff --git a/OpenVoiceV2/base_speakers/ses/fr.pth b/OpenVoiceV2/base_speakers/ses/fr.pth
new file mode 100644
index 0000000000000000000000000000000000000000..d19dbe324e7a370821f429cab679693da85b79e6
--- /dev/null
+++ b/OpenVoiceV2/base_speakers/ses/fr.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a01f6d30a73efa368c288a542a522a2bcdd4e2ec5589d8646b307cf8e2ad9ae
+size 1692
diff --git a/OpenVoiceV2/base_speakers/ses/jp.pth b/OpenVoiceV2/base_speakers/ses/jp.pth
new file mode 100644
index 0000000000000000000000000000000000000000..ebbc7e9391c8e83884c84762a4aa4b6d4bc04cd8
--- /dev/null
+++ b/OpenVoiceV2/base_speakers/ses/jp.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b645ff428de4a57a22122318968f1e6127ac81fda2e2aa66062deccd3864416
+size 1692
diff --git a/OpenVoiceV2/base_speakers/ses/kr.pth b/OpenVoiceV2/base_speakers/ses/kr.pth
new file mode 100644
index 0000000000000000000000000000000000000000..96e0c00f26c451e5d4f719a683a569fbbb1c3e0c
--- /dev/null
+++ b/OpenVoiceV2/base_speakers/ses/kr.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f501479d6072741a396725bec79144653e9f4a5381b85901e29683aa169795df
+size 1692
diff --git a/OpenVoiceV2/base_speakers/ses/zh.pth b/OpenVoiceV2/base_speakers/ses/zh.pth
new file mode 100644
index 0000000000000000000000000000000000000000..8d89210dd6a6bad7f9484cf86d48ba5d40ac51f1
--- /dev/null
+++ b/OpenVoiceV2/base_speakers/ses/zh.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b353de562700c13faacf096ecfc0adcafd26e6704a9feef572be1279714e031
+size 1692
diff --git a/OpenVoiceV2/converter/checkpoint.pth b/OpenVoiceV2/converter/checkpoint.pth
new file mode 100644
index 0000000000000000000000000000000000000000..fa2f9421735901fd3db22a904f07b5a591faad7d
--- /dev/null
+++ b/OpenVoiceV2/converter/checkpoint.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9652c27e92b6b2a91632590ac9962ef7ae2b712e5c5b7f4c34ec55ee2b37ab9e
+size 131320490
diff --git a/OpenVoiceV2/converter/config.json b/OpenVoiceV2/converter/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..12a6e581e6f602fc60aa910be24aa0fc27b27855
--- /dev/null
+++ b/OpenVoiceV2/converter/config.json
@@ -0,0 +1,57 @@
+{
+ "_version_": "v2",
+ "data": {
+ "sampling_rate": 22050,
+ "filter_length": 1024,
+ "hop_length": 256,
+ "win_length": 1024,
+ "n_speakers": 0
+ },
+ "model": {
+ "zero_g": true,
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [
+ 3,
+ 7,
+ 11
+ ],
+ "resblock_dilation_sizes": [
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ]
+ ],
+ "upsample_rates": [
+ 8,
+ 8,
+ 2,
+ 2
+ ],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [
+ 16,
+ 16,
+ 4,
+ 4
+ ],
+ "gin_channels": 256
+ }
+}
\ No newline at end of file
diff --git a/OpenVoiceV2/languages.txt b/OpenVoiceV2/languages.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d843bae14542b8b0188b315ec44d80f1365c2986
--- /dev/null
+++ b/OpenVoiceV2/languages.txt
@@ -0,0 +1,6 @@
+English
+Spanish
+French
+Chinese
+Japanese
+Korean
\ No newline at end of file
diff --git a/OpenVoiceV2/source.txt b/OpenVoiceV2/source.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7d1d0807f155cafa0b10c876139bb851a0d42d35
--- /dev/null
+++ b/OpenVoiceV2/source.txt
@@ -0,0 +1 @@
+https://huggingface.co/myshell-ai/OpenVoiceV2
\ No newline at end of file
diff --git a/openvoice-tunner-v2/.gitattributes b/openvoice-tunner-v2/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b
--- /dev/null
+++ b/openvoice-tunner-v2/.gitattributes
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
diff --git a/openvoice-tunner-v2/README.md b/openvoice-tunner-v2/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6a19ac9f0c18b452f6b91559ab927fd199e5e970
--- /dev/null
+++ b/openvoice-tunner-v2/README.md
@@ -0,0 +1,7 @@
+
+This is a simple copy of the tuner for openvoice v2
+
+https://github.com/myshell-ai/OpenVoice
+---
+license: mit
+---
diff --git a/openvoice-tunner-v2/checkpoint.pth b/openvoice-tunner-v2/checkpoint.pth
new file mode 100644
index 0000000000000000000000000000000000000000..fa2f9421735901fd3db22a904f07b5a591faad7d
--- /dev/null
+++ b/openvoice-tunner-v2/checkpoint.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9652c27e92b6b2a91632590ac9962ef7ae2b712e5c5b7f4c34ec55ee2b37ab9e
+size 131320490
diff --git a/openvoice-tunner-v2/config.json b/openvoice-tunner-v2/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3e33566b0d976167bd5f15801ef7005d59143e2f
--- /dev/null
+++ b/openvoice-tunner-v2/config.json
@@ -0,0 +1,57 @@
+{
+ "_version_": "v2",
+ "data": {
+ "sampling_rate": 22050,
+ "filter_length": 1024,
+ "hop_length": 256,
+ "win_length": 1024,
+ "n_speakers": 0
+ },
+ "model": {
+ "zero_g": true,
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [
+ 3,
+ 7,
+ 11
+ ],
+ "resblock_dilation_sizes": [
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ]
+ ],
+ "upsample_rates": [
+ 8,
+ 8,
+ 2,
+ 2
+ ],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [
+ 16,
+ 16,
+ 4,
+ 4
+ ],
+ "gin_channels": 256
+ }
+}
\ No newline at end of file
diff --git a/openvoice-tunner-v2/source.txt b/openvoice-tunner-v2/source.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b72f11f37181e154604dc0c85151e9f80234036c
--- /dev/null
+++ b/openvoice-tunner-v2/source.txt
@@ -0,0 +1 @@
+https://huggingface.co/daswer123/openvoice-tunner-v2
\ No newline at end of file
diff --git a/openvoice_tone_clone_onnx/.gitattributes b/openvoice_tone_clone_onnx/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..886ac0cb117b79f59f5815e645143cedb7602eb2
--- /dev/null
+++ b/openvoice_tone_clone_onnx/.gitattributes
@@ -0,0 +1,38 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bin.* filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*.tfevents* filter=lfs diff=lfs merge=lfs -text
+*.db* filter=lfs diff=lfs merge=lfs -text
+*.ark* filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.gguf* filter=lfs diff=lfs merge=lfs -text
+*.ggml filter=lfs diff=lfs merge=lfs -text
+*.llamafile* filter=lfs diff=lfs merge=lfs -text
+*.pt2 filter=lfs diff=lfs merge=lfs -text
diff --git a/openvoice_tone_clone_onnx/README.md b/openvoice_tone_clone_onnx/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8e64bc7d36d9bd5b5c0df9d19e3e5a205135f168
--- /dev/null
+++ b/openvoice_tone_clone_onnx/README.md
@@ -0,0 +1,86 @@
+# MeloTTS and OpenVoice Tone Clone by ONNX
+[中文](./README_cn.md)
+
+## Introduction
+This is an implementation of Melo TTS and OpenVoice Tone Clone by onnxruntime.
+We restructured the text utilities to speed things up.
+We converted the models into ONNX format. The models are stored in this repository via Git LFS.
+
+We have only implemented the zh-mix-en language so far. Other languages will come soon.
+
+## Usage
+
+```python
+# tts demo
+from melo_onnx import MeloTTX_ONNX
+import soundfile
+
+model_path = "path/to/folder/of/model_tts"
+tts = MeloTTX_ONNX(model_path)
+audio = tts.speak("今天天气真nice。", tts.speakers[0])
+
+soundfile.write("path/of/result.wav", audio, samplerate=tts.sample_rate)
+
+```
+
+```python
+# Tone clone demo
+from melo_onnx import OpenVoiceToneClone_ONNX
+tc = OpenVoiceToneClone_ONNX("path/to/folder/of/model_tone_clone")
+import soundfile
+tgt = soundfile.read("path/of/audio_for_tone_color", dtype='float32')
+tgt = tc.resample(tgt[0], tgt[1])
+tgt_tone_color = tc.extract_tone_color(tgt)
+src = soundfile.read("path/of/audio_to_change_tone", dtype='float32')
+src = tc.resample(src[0], src[1])
+result = tc.tone_clone(src, tgt_tone_color)
+soundfile.write("path/of/result.wav", result, tc.sample_rate)
+```
+
+### The parameters of the constructor of MeloTTX_ONNX:
+- **model_path** str. The path of the folder that stores the model.
+- **execution_provider** str. The device for onnxruntime, e.g. CUDA, CPU, or others. If it is None, the library will choose the better of CUDA and CPU.
+- **verbose** bool. Set True to display detailed information while the library is working.
+- **onnx_session_options** onnxruntime.SessionOptions. You can set up special options for the ONNX session.
+- **onnx_params** dict. The other parameters you want to pass into onnxruntime.InferenceSession.
+
+### The parameters of the MeloTTX_ONNX.speak:
+- **text** str. The text you want to synthesize.
+- **speaker** str. The speaker you want to use.
+- **speed** float. The speed of the speech
+- **sdp_ratio** float.
+- **noise_scale** float.
+- **noise_scale_w** float.
+- **pbar** function. Such as tqdm
+- **Returns** numpy.array. The data of the result audio
+
+### Some useful property of the instance of the MeloTTS:
+- **speakers**: [str]. Readonly. The available speakers
+- **sample_rate**: int. Readonly. The sample rate of the synthesis result
+- **language**: str. Readonly. The language of the current model
+
+### The parameters of the constructor of OpenVoiceToneClone_ONNX:
+- **model_path** str. The path of the folder that stores the model.
+- **execution_provider** str. The device for onnxruntime, e.g. CUDA, CPU, or others. If it is None, the library will choose the better of CUDA and CPU.
+- **verbose** bool. Set True to display detailed information while the library is working.
+- **onnx_session_options** onnxruntime.SessionOptions. You can set up special options for the ONNX session.
+- **onnx_params** dict. The other parameters you want to pass into onnxruntime.InferenceSession.
+
+### The parameters of OpenVoiceToneClone_ONNX.extract_tone_color
+- **audio** numpy.array. The data of the audio
+- **Returns** numpy.array. The tone color vector
+
+### The parameters of OpenVoiceToneClone_ONNX.mix_tone_color
+- **color** list[numpy.array]. The list of the tone colors you want to mix. Each element should be the result of extract_tone_color.
+- **Returns** numpy.array. The tone color vector
+
+### The parameters of OpenVoiceToneClone_ONNX.tone_clone
+- **audio** numpy.array. The data of the audio that will be changed the tone
+- **target_tone_color** numpy.array. The tone color that you want to clone. It should be the result of the extract_tone_color or mix_tone_color.
+- **tau** float
+- **Returns** numpy.array. The resulting audio after tone cloning.
+
+### The parameters of OpenVoiceToneClone_ONNX.resample
+- **audio** numpy.array. The source audio you want to resample.
+- **original_rate** int. The original sample rate of the source audio
+- **Returns** numpy.array. The resampled audio data.
diff --git a/openvoice_tone_clone_onnx/README_cn.md b/openvoice_tone_clone_onnx/README_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..f96ed55c501577200681609640f65419ea39a3f0
--- /dev/null
+++ b/openvoice_tone_clone_onnx/README_cn.md
@@ -0,0 +1,87 @@
+# MeloTTS by ONNX
+[English](./README.md)
+
+## 介绍
+这是MeloTTS和OpenVoice音色克隆的ONNX运行时的实现,以用于在运行时加速整个TTS的过程。
+目前在Intel Core i7 10代1.3GHz处理器上,测试“今天天气真nice”的合成速度约为0.95s。
+
+我们重构了文本处理工具集,让模块的加载和运行可以尽可能提速。
+我们将用到的模型转换成了onnx格式,并归并到同一目录下。模型文件目前以大文件形式存放在本仓库的models文件夹下面。使用时需要自行下载。
+
+我们目前仅实现了zh-mix-en(即中英混合)语言。其他的语言支持在后续逐步补充。
+
+## 使用方法
+
+```python
+# tts demo
+from melo_onnx import MeloTTX_ONNX
+import soundfile
+
+model_path = "path/to/folder/of/model_tts"
+tts = MeloTTX_ONNX(model_path)
+audio = tts.speak("今天天气真nice。", tts.speakers[0])
+
+soundfile.write("path/of/result.wav", audio, samplerate=tts.sample_rate)
+
+```
+
+```python
+# Tone clone demo
+from melo_onnx import OpenVoiceToneClone_ONNX
+tc = OpenVoiceToneClone_ONNX("path/to/folder/of/model_tone_clone")
+import soundfile
+tgt = soundfile.read("path/of/audio_for_tone_color", dtype='float32')
+tgt = tc.resample(tgt[0], tgt[1])
+tgt_tone_color = tc.extract_tone_color(tgt)
+src = soundfile.read("path/of/audio_to_change_tone", dtype='float32')
+src = tc.resample(src[0], src[1])
+result = tc.tone_clone(src, tgt_tone_color)
+soundfile.write("path/of/result.wav", result, tc.sample_rate)
+```
+
+### MeloTTX_ONNX构造函数的参数如下:
+- **model_path** str. 模型存放目录的路径.
+- **execution_provider** str. onnxruntime使用的设备,比如CUDA、CPU、等等。如果这个参数传入None,系统会在CUDA和CPU中选择一个最佳的设备。
+- **verbose** bool. 设置True表示在系统工作中输出详细的信息,一般可用来做调试。
+- **onnx_session_options** onnxruntime.SessionOptions. 用来传入指定的onnx推理会话参数
+- **onnx_params** dict. 你想在onnxruntime.InferenceSession构造时传入的其他参数
+
+### MeloTTX_ONNX.speak方法的参数如下:
+- **text** str. 需要进行合成的文本
+- **speaker** str. 你想使用的发音者
+- **speed** float. 语音的速度
+- **sdp_ratio** float.
+- **noise_scale** float.
+- **noise_scale_w** float.
+- **pbar** function. Such as tqdm
+
+### MeloTTS实例的可用属性:
+- **speakers**: [str]. 只读。有效的发音者列表
+- **sample_rate**: int. 只读。合成结果的采样率
+- **language**: str. 只读。当前使用的模型的语言
+
+### OpenVoiceToneClone_ONNX构造函数的参数如下:
+- **model_path** str. 模型存放目录的路径.
+- **execution_provider** str. onnxruntime使用的设备,比如CUDA、CPU、等等。如果这个参数传入None,系统会在CUDA和CPU中选择一个最佳的设备。
+- **verbose** bool. 设置True表示在系统工作中输出详细的信息,一般可用来做调试。
+- **onnx_session_options** onnxruntime.SessionOptions. 用来传入指定的onnx推理会话参数
+- **onnx_params** dict. 你想在onnxruntime.InferenceSession构造时传入的其他参数
+
+### OpenVoiceToneClone_ONNX.extract_tone_color方法的参数如下
+- **audio** numpy.array. 要提取音色的音频数据
+- **Returns** numpy.array. 提取到的音色数据
+
+### OpenVoiceToneClone_ONNX.mix_tone_color方法的参数如下
+- **color** list[numpy.array]. 要混合的音色的列表。列表中的每个元素都必须是extract_tone_color方法的返回值。
+- **Returns** numpy.array. 混合得到的音色
+
+### OpenVoiceToneClone_ONNX.tone_clone方法的参数如下
+- **audio** numpy.array. 要改变音色的原始音频数据
+- **target_tone_color** numpy.array. 目标音色数据。这个参数必须是extract_tone_color或mix_tone_color的返回值
+- **tau** float
+- **Returns** numpy.array. 修改音色后的音频数据
+
+### OpenVoiceToneClone_ONNX.resample方法的参数如下
+- **audio** numpy.array. 要重采样的音频的原始数据
+- **original_rate** int. 原始音频的原始采样率
+- **Returns** numpy.array. 重采样得到的音频数据
diff --git a/openvoice_tone_clone_onnx/configuration.json b/openvoice_tone_clone_onnx/configuration.json
new file mode 100644
index 0000000000000000000000000000000000000000..31cefc55be6bfe0f2d79fab9cde9876cc71f1660
--- /dev/null
+++ b/openvoice_tone_clone_onnx/configuration.json
@@ -0,0 +1,58 @@
+{
+ "task":"text-to-speech",
+ "data": {
+ "sampling_rate": 22050,
+ "filter_length": 1024,
+ "hop_length": 256,
+ "win_length": 1024,
+ "n_speakers": 0
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [
+ 3,
+ 7,
+ 11
+ ],
+ "resblock_dilation_sizes": [
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ]
+ ],
+ "upsample_rates": [
+ 8,
+ 8,
+ 2,
+ 2
+ ],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [
+ 16,
+ 16,
+ 4,
+ 4
+ ],
+ "n_layers_q": 3,
+ "use_spectral_norm": false,
+ "gin_channels": 256
+ }
+}
\ No newline at end of file
diff --git a/openvoice_tone_clone_onnx/source.txt b/openvoice_tone_clone_onnx/source.txt
new file mode 100644
index 0000000000000000000000000000000000000000..19c26c9087b481ea231a1670e1f0f595727ed4f7
--- /dev/null
+++ b/openvoice_tone_clone_onnx/source.txt
@@ -0,0 +1 @@
+https://huggingface.co/seasonstudio/openvoice_tone_clone_onnx
\ No newline at end of file
diff --git a/openvoice_tone_clone_onnx/tone_clone_model.onnx b/openvoice_tone_clone_onnx/tone_clone_model.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..dcedae0a1571648e640aa63093e7549148463363
--- /dev/null
+++ b/openvoice_tone_clone_onnx/tone_clone_model.onnx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea01c404712bc208694f79613488212e8cb7ad171c58361c22b645ca8b380197
+size 127891564
diff --git a/openvoice_tone_clone_onnx/tone_color_extract_model.onnx b/openvoice_tone_clone_onnx/tone_color_extract_model.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..d11ca80b648a566650ee037add3521a553ae7a9e
--- /dev/null
+++ b/openvoice_tone_clone_onnx/tone_color_extract_model.onnx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a90ce9048c8ad43ef3ebeb15ea5a1837ffa28669d61d78f749ccc2a648710747
+size 3257992