Upload 10 files

Browse files

Files changed (11) hide show

.gitattributes +4 -0
configuration.json +57 -0
convert_rknn.py +118 -0
export_onnx.py +127 -0
result.wav +3 -0
src2.wav +3 -0
target.wav +3 -0
test_rknn.py +327 -0
tone_clone_model.onnx +3 -0
tone_clone_model.rknn +3 -0
tone_color_extract_model.onnx +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+result.wav filter=lfs diff=lfs merge=lfs -text
+src2.wav filter=lfs diff=lfs merge=lfs -text
+target.wav filter=lfs diff=lfs merge=lfs -text
+tone_clone_model.rknn filter=lfs diff=lfs merge=lfs -text

configuration.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "_version_": "v2",
+  "data": {
+    "sampling_rate": 22050,
+    "filter_length": 1024,
+    "hop_length": 256,
+    "win_length": 1024,
+    "n_speakers": 0
+  },
+  "model": {
+    "zero_g": true,
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0.1,
+    "resblock": "1",
+    "resblock_kernel_sizes": [
+      3,
+      7,
+      11
+    ],
+    "resblock_dilation_sizes": [
+      [
+        1,
+        3,
+        5
+      ],
+      [
+        1,
+        3,
+        5
+      ],
+      [
+        1,
+        3,
+        5
+      ]
+    ],
+    "upsample_rates": [
+      8,
+      8,
+      2,
+      2
+    ],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [
+      16,
+      16,
+      4,
+      4
+    ],
+    "gin_channels": 256
+  }
+}

convert_rknn.py ADDED Viewed

	@@ -0,0 +1,118 @@

+#!/usr/bin/env python
+# coding: utf-8
+import datetime
+import argparse
+from rknn.api import RKNN
+from sys import exit
+# Model configuration: maps each supported model type to the ONNX file that is converted.
+MODELS = {
+    'tone_clone': 'tone_clone_model.onnx',
+    'tone_color_extract': 'tone_color_extract_model.onnx',
+}
+# Lengths used by convert_model to build one dynamic_input shape set per entry.
+TARGET_AUDIO_LENS = [1024]
+SOURCE_AUDIO_LENS = [1024]
+# NOTE(review): presumably the number of spectrogram bins (filter_length / 2 + 1) — confirm.
+AUDIO_DIM = 513
+# NOTE(review): QUANTIZE and detailed_performance_log are not read anywhere in the visible script — confirm whether they are still needed.
+QUANTIZE=False
+detailed_performance_log = True
+def convert_model(model_type):
+    """转换指定类型的模型到RKNN格式"""
+    if model_type not in MODELS:
+        print(f"错误: 不支持的模型类型 {model_type}")
+        return False
+    onnx_model = MODELS[model_type]
+    rknn_model = onnx_model.replace(".onnx",".rknn")
+    if model_type == 'tone_clone':
+        shapes = [
+            [
+                [1, 513, target_audio_len], # audio
+                [1], # audio_length
+                [1, 256, 1], # src_tone
+                [1, 256, 1], # dest_tone
+                [1], # tau
+            ] for target_audio_len in TARGET_AUDIO_LENS
+        ]
+    elif model_type == 'tone_color_extract':
+        shapes = [
+            [
+                [1, source_audio_len, 513], # audio
+            ] for source_audio_len in SOURCE_AUDIO_LENS
+        ]
+        # shapes = None
+    timedate_iso = datetime.datetime.now().isoformat()
+    rknn = RKNN(verbose=True)
+    rknn.config(
+        quantized_dtype='w8a8',
+        quantized_algorithm='normal',
+        quantized_method='channel',
+        quantized_hybrid_level=0,
+        target_platform='rk3588',
+        quant_img_RGB2BGR = False,
+        float_dtype='float16',
+        optimization_level=3,
+        custom_string=f"converted by: qq: 232004040, email: 2302004040@qq.com at {timedate_iso}",
+        remove_weight=False,
+        compress_weight=False,
+        inputs_yuv_fmt=None,
+        single_core_mode=False,
+        dynamic_input=shapes,
+        model_pruning=False,
+        op_target=None,
+        quantize_weight=False,
+        remove_reshape=False,
+        sparse_infer=False,
+        enable_flash_attention=False,
+        #  disable_rules=['convert_gemm_by_exmatmul']
+    )
+    print(f"开始转换 {model_type} 模型...")
+    ret = rknn.load_onnx(model=onnx_model)
+    if ret != 0:
+        print("加载ONNX模型失败")
+        return False
+    ret = rknn.build(do_quantization=False, rknn_batch_size=None)
+    if ret != 0:
+        print("构建RKNN模型失败")
+        return False
+    ret = rknn.export_rknn(rknn_model)
+    if ret != 0:
+        print("导出RKNN模型失败")
+        return False
+    print(f"成功转换模型: {rknn_model}")
+    return True
+def main():
+    parser = argparse.ArgumentParser(description='转换ONNX模型到RKNN格式')
+    parser.add_argument('model_type', nargs='?', default='all',
+                      choices=['all', 'tone_clone', 'tone_color_extract'],
+                      help='要转换的模型类型 (默认: all)')
+    args = parser.parse_args()
+    if args.model_type == 'all':
+        # 转换所有模型
+        for model_type in MODELS.keys():
+            if not convert_model(model_type):
+                print(f"转换 {model_type} 失败")
+    else:
+        # 转换指定模型
+        if not convert_model(args.model_type):
+            print(f"转换 {args.model_type} 失败")
+if __name__ == '__main__':
+    main()

export_onnx.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import torch
+import torch.nn as nn
+from openvoice.api import ToneColorConverter
+from openvoice.models import SynthesizerTrn
+import os
+os.chdir(os.path.dirname(os.path.abspath(__file__)))
+class ToneColorExtractWrapper(nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+    def forward(self, audio):
+        # audio: [1, source_audio_len, 513]
+        # 将mel谱图转置为模型需要的格式 [1, 513, source_audio_len]
+        audio = audio.contiguous()
+        # 提取声纹
+        g = self.model.ref_enc(audio)
+        # 扩展最后一维
+        # g = g.unsqueeze(-1)  # [1, 256, 1]
+        return g
+class ToneCloneWrapper(nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+    def forward(self, audio, audio_lengths, src_tone, dest_tone, tau):
+        # 确保张量连续
+        audio = audio.contiguous()
+        src_tone = src_tone.contiguous()
+        dest_tone = dest_tone.contiguous()
+        # 语音转换
+        o_hat, _, _ = self.model.voice_conversion(
+            audio,
+            audio_lengths,
+            sid_src=src_tone,
+            sid_tgt=dest_tone,
+            tau=tau[0]
+        )
+        return o_hat
+def export_models(ckpt_path, output_dir, target_audio_lens, source_audio_lens):
+    """
+    导出音色提取和克隆模型为ONNX格式
+    Args:
+        ckpt_path: 模型检查点路径
+        output_dir: 输出目录
+        target_audio_lens: 目标音频长度列表
+        source_audio_lens: 源音频长度列表
+    """
+    # 加载模型
+    device = "cpu"
+    converter = ToneColorConverter(f'{ckpt_path}/config.json', device=device)
+    converter.load_ckpt(f'{ckpt_path}/checkpoint.pth')
+    # 创建输出目录
+    os.makedirs(output_dir, exist_ok=True)
+    # 导出音色提取模型
+    extract_wrapper = ToneColorExtractWrapper(converter.model)
+    extract_wrapper.eval()
+    for source_len in source_audio_lens:
+        dummy_input = torch.randn(1, source_len, 513).contiguous()
+        output_path = f"{output_dir}/tone_color_extract_model.onnx"
+        torch.onnx.export(
+            extract_wrapper,
+            dummy_input,
+            output_path,
+            input_names=['input'],
+            output_names=['tone_embedding'],
+            dynamic_axes={
+                'input': {1: 'source_audio_len'},
+            },
+            opset_version=11,
+            do_constant_folding=True,
+            verbose=True
+        )
+        print(f"Exported tone extract model to {output_path}")
+    # 导出音色克隆模型
+    clone_wrapper = ToneCloneWrapper(converter.model)
+    clone_wrapper.eval()
+    for target_len in target_audio_lens:
+        dummy_inputs = (
+            torch.randn(1, 513, target_len).contiguous(),  # audio
+            torch.LongTensor([target_len]),   # audio_lengths
+            torch.randn(1, 256, 1).contiguous(),          # src_tone
+            torch.randn(1, 256, 1).contiguous(),          # dest_tone
+            torch.FloatTensor([0.3])         # tau
+        )
+        output_path = f"{output_dir}/tone_clone_model.onnx"
+        torch.onnx.export(
+            clone_wrapper,
+            dummy_inputs,
+            output_path,
+            input_names=['audio', 'audio_length', 'src_tone', 'dest_tone', 'tau'],
+            output_names=['converted_audio'],
+            dynamic_axes={
+                'audio': {2: 'target_audio_len'},
+            },
+            opset_version=17,
+            do_constant_folding=True,
+            verbose=True
+        )
+        print(f"Exported tone clone model to {output_path}")
+if __name__ == "__main__":
+    # Example usage
+    TARGET_AUDIO_LENS = [1024]  # target lengths; adjust as needed
+    SOURCE_AUDIO_LENS = [1024]  # source lengths; adjust as needed
+    export_models(
+        ckpt_path="checkpoints_v2/converter",
+        output_dir="onnx_models",
+        target_audio_lens=TARGET_AUDIO_LENS,
+        source_audio_lens=SOURCE_AUDIO_LENS
+    )

result.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d11ad289cc5014994086548874fd145ac67c41eb9b91fdd822ad6bd05a40c90f
+size 393260

src2.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:baf4ce666c5fa88e052381e0c33543be3015bf2f47154ac3925ee67c963c0a12
+size 1712078

target.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c63d1b5cb444f3611a271d1c24d04363f5bdd73fb5745bc6b61e1c925a8f6084
+size 2165838

test_rknn.py ADDED Viewed

	@@ -0,0 +1,327 @@

+from typing import Callable
+import numpy as np
+import onnxruntime as ort
+import os
+from rknnlite.api import RKNNLite
+import json
+import os
+import time
+class HParams:
+    def __init__(self, **kwargs):
+        for k, v in kwargs.items():
+            if type(v) == dict:
+                v = HParams(**v)
+            self[k] = v
+    def keys(self):
+        return self.__dict__.keys()
+    def items(self):
+        return self.__dict__.items()
+    def values(self):
+        return self.__dict__.values()
+    def __len__(self):
+        return len(self.__dict__)
+    def __getitem__(self, key):
+        return getattr(self, key)
+    def __setitem__(self, key, value):
+        return setattr(self, key, value)
+    def __contains__(self, key):
+        return key in self.__dict__
+    def __repr__(self):
+        return self.__dict__.__repr__()
+    @staticmethod
+    def load_from_file(file_path:str):
+        if not os.path.exists(file_path):
+            raise FileNotFoundError(f"Can not found the configuration file \"{file_path}\"")
+        with open(file_path, "r", encoding="utf-8") as f:
+            hps = json.load(f)
+            return HParams(**hps)
+class BaseClassForOnnxInfer():
+    @staticmethod
+    def create_onnx_infer(infer_factor:Callable, onnx_model_path:str, providers:list, session_options:ort.SessionOptions = None, onnx_params:dict = None):
+        if not os.path.exists(onnx_model_path):
+            raise FileNotFoundError(f"Can not found the onnx model file \"{onnx_model_path}\"")
+        session = ort.InferenceSession(onnx_model_path, sess_options=BaseClassForOnnxInfer.adjust_onnx_session_options(session_options), providers=providers, **(onnx_params or {}))
+        fn = infer_factor(session)
+        fn.__session = session
+        return fn
+    @staticmethod
+    def get_def_onnx_session_options():
+        session_options = ort.SessionOptions()
+        session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+        return session_options
+    @staticmethod
+    def adjust_onnx_session_options(session_options:ort.SessionOptions = None):
+        return session_options or BaseClassForOnnxInfer.get_def_onnx_session_options()
+class OpenVoiceToneClone_ONNXRKNN(BaseClassForOnnxInfer):
+    PreferredProviders = ['CPUExecutionProvider']
+    def __init__(self, model_path, execution_provider:str = None, verbose:bool = False, onnx_session_options:ort.SessionOptions = None, onnx_params:dict = None, target_length:int = 1024):
+        '''
+        Create the instance of the tone cloner
+        Args:
+            model_path (str): The path of the folder which contains the model
+            execution_provider (str): The provider that onnxruntime used. Such as CPUExecutionProvider, CUDAExecutionProvider, etc. Or you can use CPU, CUDA as short one. If it is None, the constructor will choose a best one automaticlly
+            verbose (bool): Set True to show more detail informations when working
+            onnx_session_options (onnxruntime.SessionOptions): The custom options for onnx session
+            onnx_params (dict): Other parameters you want to pass to the onnxruntime.InferenceSession constructor
+            target_length (int): The target length for padding/truncating spectrogram, defaults to 1024
+        Returns:
+            OpenVoiceToneClone_ONNX: The instance of the tone cloner
+        '''
+        self.__verbose = verbose
+        self.__target_length = target_length
+        if verbose:
+            print("Loading the configuration...")
+        config_path = os.path.join(model_path, "configuration.json")
+        self.__hparams = HParams.load_from_file(config_path)
+        execution_provider = f"{execution_provider}ExecutionProvider" if (execution_provider is not None) and (not execution_provider.endswith("ExecutionProvider")) else execution_provider
+        available_providers = ort.get_available_providers()
+        # self.__execution_providers = [execution_provider if execution_provider in available_providers else next((provider for provider in MeloTTS_ONNX.PreferredProviders if provider in available_providers), 'CPUExecutionProvider')]
+        self.__execution_providers = ['CPUExecutionProvider']
+        if verbose:
+            print("Creating onnx session for tone color extractor...")
+        def se_infer_factor(session):
+            return lambda **kwargs: session.run(None, kwargs)[0]
+        self.__se_infer = self.create_onnx_infer(se_infer_factor, os.path.join(model_path, "tone_color_extract_model.onnx"), self.__execution_providers, onnx_session_options, onnx_params)
+        if verbose:
+            print("Creating RKNNLite session for tone clone ...")
+        # 初始化RKNNLite
+        self.__tc_rknn = RKNNLite(verbose=verbose)
+        # 加载RKNN模型
+        ret = self.__tc_rknn.load_rknn(os.path.join(model_path, "tone_clone_model.rknn"))
+        if ret != 0:
+            raise RuntimeError("Failed to load RKNN model")
+        # 初始化运行时
+        ret = self.__tc_rknn.init_runtime()
+        if ret != 0:
+            raise RuntimeError("Failed to init RKNN runtime")
+    def __del__(self):
+        """释放RKNN资源"""
+        if hasattr(self, '_OpenVoiceToneClone_ONNXRKNN__tc_rknn'):
+            self.__tc_rknn.release()
+    hann_window = {}
+    def __spectrogram_numpy(self, y, n_fft, sampling_rate, hop_size, win_size, onesided=True):
+        if self.__verbose:
+            if np.min(y) < -1.1:
+                print("min value is ", np.min(y))
+            if np.max(y) > 1.1:
+                print("max value is ", np.max(y))
+        # 填充
+        y = np.pad(
+            y,
+            int((n_fft - hop_size) / 2),
+            mode="reflect",
+        )
+        # 生成汉宁窗
+        win_key = f"{str(y.dtype)}-{win_size}"
+        if True or win_key not in hann_window:
+            OpenVoiceToneClone_ONNXRKNN.hann_window[win_key] = np.hanning(win_size + 1)[:-1].astype(y.dtype)
+        window = OpenVoiceToneClone_ONNXRKNN.hann_window[win_key]
+        # 短时傅里叶变换
+        y_len = y.shape[0]
+        win_len = window.shape[0]
+        count = int((y_len - win_len) // hop_size) + 1
+        spec = np.empty((count, int(win_len / 2) + 1 if onesided else (int(win_len / 2) + 1) * 2, 2))
+        start = 0
+        end = start + win_len
+        idx = 0
+        while end <= y_len:
+            segment = y[start:end]
+            frame = segment * window
+            step_result = np.fft.rfft(frame) if onesided else np.fft.fft(frame)
+            spec[idx] = np.column_stack((step_result.real, step_result.imag))
+            start = start + hop_size
+            end = start + win_len
+            idx += 1
+        # 合并实部虚部
+        spec = np.sqrt(np.sum(np.square(spec), axis=-1) + 1e-6)
+        return np.array([spec], dtype=np.float32)
+    def extract_tone_color(self, audio:np.array):
+        '''
+        Extract the tone color from an audio
+        Args:
+            audio (numpy.array): The data of the audio
+        Returns:
+            numpy.array: The tone color vector
+        '''
+        hps = self.__hparams
+        y = self.to_mono(audio.astype(np.float32))
+        spec = self.__spectrogram_numpy(y, hps.data.filter_length,
+                                    hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
+                                    )
+        if self.__verbose:
+            print("spec shape", spec.shape)
+        return self.__se_infer(input=spec).reshape(1,256,1)
+    def mix_tone_color(self, colors:list):
+        '''
+        Mix multi tone colors to a single one
+        Args:
+            color (list[numpy.array]): The list of the tone colors you want to mix. Each element should be the result of extract_tone_color.
+        Returns:
+            numpy.array: The tone color vector
+        '''
+        return np.stack(colors).mean(axis=0)
+    def tone_clone(self, audio:np.array, target_tone_color:np.array, tau=0.3):
+        '''
+        Clone the tone
+        Args:
+            audio (numpy.array): The data of the audio that will be changed the tone
+            target_tone_color (numpy.array): The tone color that you want to clone. It should be the result of the extract_tone_color or mix_tone_color.
+            tau (float):
+        Returns:
+            numpy.array: The dest audio
+        '''
+        assert (target_tone_color.shape == (1,256,1)), "The target tone color must be an array with shape (1,256,1)"
+        hps = self.__hparams
+        src = self.to_mono(audio.astype(np.float32))
+        src = self.__spectrogram_numpy(src, hps.data.filter_length,
+                                      hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
+                                      )
+        src_tone = self.__se_infer(input=src).reshape(1,256,1)
+        src = np.transpose(src, (0, 2, 1))
+        # 记录原始长度
+        original_length = src.shape[2]
+        # Pad或截断到固定长度
+        if original_length > self.__target_length:
+            if self.__verbose:
+                print(f"Input length {original_length} exceeds target length {self.__target_length}, truncating...")
+            src = src[:, :, :self.__target_length]
+        elif original_length < self.__target_length:
+            if self.__verbose:
+                print(f"Input length {original_length} is less than target length {self.__target_length}, padding...")
+            pad_width = ((0, 0), (0, 0), (0, self.__target_length - original_length))
+            src = np.pad(src, pad_width, mode='constant', constant_values=0)
+        src_length = np.array([self.__target_length], dtype=np.int64)  # 使用固定长度
+        if self.__verbose:
+            print("src shape", src.shape)
+            print("src_length shape", src_length.shape)
+            print("src_tone shape", src_tone.shape)
+            print("target_tone_color shape", target_tone_color.shape)
+            print("tau", tau)
+        # 准备RKNNLite的输入
+        inputs = [
+            src,
+            src_length,
+            src_tone,
+            target_tone_color,
+            np.array([tau], dtype=np.float32)
+        ]
+        # 使用RKNNLite进行推理
+        outputs = self.__tc_rknn.inference(inputs=inputs)
+        res = outputs[0][0, 0]  # 获取第一个输出的第一个样本
+        generated_multiplier = 262144 / 1024
+        # 如果原始输入较短,则截取掉padding部分
+        if original_length < self.__target_length:
+            res = res[:int(original_length * generated_multiplier)]
+        if self.__verbose:
+            print("res shape", res.shape)
+        return res
+    def to_mono(self, audio:np.array):
+        '''
+        Change the audio to be a mono audio
+        Args:
+            audio (numpy.array): The source audio
+        Returns:
+            numpy.array: The mono audio data
+        '''
+        return np.mean(audio, axis=1) if len(audio.shape) > 1 else audio
+    def resample(self, audio:np.array, original_rate:int):
+        '''
+        Resample the audio to match the model. It is used for changing the sample rate of the audio.
+        Args:
+            audio (numpy.array): The source audio you want to resample.
+            original_rate (int): The original sample rate of the source audio
+        Returns:
+            numpy.array: The dest data of the audio after resample
+        '''
+        audio = self.to_mono(audio)
+        target_rate = self.__hparams.data.sampling_rate
+        duration = audio.shape[0] / original_rate
+        target_length = int(duration * target_rate)
+        time_original = np.linspace(0, duration, num=audio.shape[0])
+        time_target = np.linspace(0, duration, num=target_length)
+        resampled_data = np.interp(time_target, time_original, audio)
+        return resampled_data
+    @property
+    def sample_rate(self):
+        '''
+        The sample rate of the tone cloning result
+        '''
+        return self.__hparams.data.sampling_rate
+# Demo script: load the models from the current directory, clone the tone of
+# target.wav onto src2.wav, and write the result to result.wav.
+tc = OpenVoiceToneClone_ONNXRKNN(".",verbose=True)
+import soundfile
+# tgt[0] is the audio data and tgt[1] its sample rate; resample to the model's rate.
+tgt = soundfile.read("target.wav", dtype='float32')
+tgt = tc.resample(tgt[0], tgt[1])
+# Time extract_tone_color
+start_time = time.time()
+tgt_tone_color = tc.extract_tone_color(tgt)
+extract_time = time.time() - start_time
+print(f"提取音色特征耗时: {extract_time:.2f}秒")
+src = soundfile.read("src2.wav", dtype='float32')
+src = tc.resample(src[0], src[1])
+# Time tone_clone
+start_time = time.time()
+result = tc.tone_clone(src, tgt_tone_color)
+clone_time = time.time() - start_time
+print(f"克隆音色耗时: {clone_time:.2f}秒")
+soundfile.write("result.wav", result, tc.sample_rate)

tone_clone_model.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:896195b84b0cb87a828bb8cab06577e9c024356bc9727b1a8f4174154bc0affa
+size 157196170

tone_clone_model.rknn ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7cd7dc3385c55ca610580edaba263510091314be35ae4688a1c076afe9e5d84a
+size 108102277

tone_color_extract_model.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e91c2cb696e199d2519ed8b62ca6e3c8e42cb99ca13955dd6e188051486e681c
+size 3364792