OpenVoice & OpenVoiceV2
Browse files. This view is limited to 50 files because it contains too many changes. See the raw diff.
- .gitattributes +4 -0
- OpenVoice-RKNN2/.gitattributes +39 -0
- OpenVoice-RKNN2/README.md +3 -0
- OpenVoice-RKNN2/configuration.json +57 -0
- OpenVoice-RKNN2/convert_rknn.py +118 -0
- OpenVoice-RKNN2/export_onnx.py +127 -0
- OpenVoice-RKNN2/result.wav +3 -0
- OpenVoice-RKNN2/source.txt +1 -0
- OpenVoice-RKNN2/src2.wav +3 -0
- OpenVoice-RKNN2/target.wav +3 -0
- OpenVoice-RKNN2/test_rknn.py +327 -0
- OpenVoice-RKNN2/tone_clone_model.onnx +3 -0
- OpenVoice-RKNN2/tone_clone_model.rknn +3 -0
- OpenVoice-RKNN2/tone_color_extract_model.onnx +3 -0
- OpenVoice/.gitattributes +35 -0
- OpenVoice/README.md +33 -0
- OpenVoice/checkpoints/base_speakers/EN/checkpoint.pth +3 -0
- OpenVoice/checkpoints/base_speakers/EN/config.json +145 -0
- OpenVoice/checkpoints/base_speakers/EN/en_default_se.pth +3 -0
- OpenVoice/checkpoints/base_speakers/EN/en_style_se.pth +3 -0
- OpenVoice/checkpoints/base_speakers/ZH/checkpoint.pth +3 -0
- OpenVoice/checkpoints/base_speakers/ZH/config.json +137 -0
- OpenVoice/checkpoints/base_speakers/ZH/zh_default_se.pth +3 -0
- OpenVoice/checkpoints/converter/checkpoint.pth +3 -0
- OpenVoice/checkpoints/converter/config.json +57 -0
- OpenVoice/source.txt +1 -0
- OpenVoiceV2/.DS_Store +0 -0
- OpenVoiceV2/.gitattributes +35 -0
- OpenVoiceV2/README.md +116 -0
- OpenVoiceV2/base_speakers/.DS_Store +0 -0
- OpenVoiceV2/base_speakers/ses/en-au.pth +3 -0
- OpenVoiceV2/base_speakers/ses/en-br.pth +3 -0
- OpenVoiceV2/base_speakers/ses/en-default.pth +3 -0
- OpenVoiceV2/base_speakers/ses/en-india.pth +3 -0
- OpenVoiceV2/base_speakers/ses/en-newest.pth +3 -0
- OpenVoiceV2/base_speakers/ses/en-us.pth +3 -0
- OpenVoiceV2/base_speakers/ses/es.pth +3 -0
- OpenVoiceV2/base_speakers/ses/fr.pth +3 -0
- OpenVoiceV2/base_speakers/ses/jp.pth +3 -0
- OpenVoiceV2/base_speakers/ses/kr.pth +3 -0
- OpenVoiceV2/base_speakers/ses/zh.pth +3 -0
- OpenVoiceV2/converter/checkpoint.pth +3 -0
- OpenVoiceV2/converter/config.json +57 -0
- OpenVoiceV2/languages.txt +6 -0
- OpenVoiceV2/source.txt +1 -0
- openvoice-tunner-v2/.gitattributes +35 -0
- openvoice-tunner-v2/README.md +7 -0
- openvoice-tunner-v2/checkpoint.pth +3 -0
- openvoice-tunner-v2/config.json +57 -0
- openvoice-tunner-v2/source.txt +1 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
OpenVoice-RKNN2/result.wav filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
OpenVoice-RKNN2/src2.wav filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
OpenVoice-RKNN2/target.wav filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
OpenVoice-RKNN2/tone_clone_model.rknn filter=lfs diff=lfs merge=lfs -text
|
OpenVoice-RKNN2/.gitattributes
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
result.wav filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
src2.wav filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
target.wav filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
tone_clone_model.rknn filter=lfs diff=lfs merge=lfs -text
|
OpenVoice-RKNN2/README.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: agpl-3.0
|
| 3 |
+
---
|
OpenVoice-RKNN2/configuration.json
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_version_": "v2",
|
| 3 |
+
"data": {
|
| 4 |
+
"sampling_rate": 22050,
|
| 5 |
+
"filter_length": 1024,
|
| 6 |
+
"hop_length": 256,
|
| 7 |
+
"win_length": 1024,
|
| 8 |
+
"n_speakers": 0
|
| 9 |
+
},
|
| 10 |
+
"model": {
|
| 11 |
+
"zero_g": true,
|
| 12 |
+
"inter_channels": 192,
|
| 13 |
+
"hidden_channels": 192,
|
| 14 |
+
"filter_channels": 768,
|
| 15 |
+
"n_heads": 2,
|
| 16 |
+
"n_layers": 6,
|
| 17 |
+
"kernel_size": 3,
|
| 18 |
+
"p_dropout": 0.1,
|
| 19 |
+
"resblock": "1",
|
| 20 |
+
"resblock_kernel_sizes": [
|
| 21 |
+
3,
|
| 22 |
+
7,
|
| 23 |
+
11
|
| 24 |
+
],
|
| 25 |
+
"resblock_dilation_sizes": [
|
| 26 |
+
[
|
| 27 |
+
1,
|
| 28 |
+
3,
|
| 29 |
+
5
|
| 30 |
+
],
|
| 31 |
+
[
|
| 32 |
+
1,
|
| 33 |
+
3,
|
| 34 |
+
5
|
| 35 |
+
],
|
| 36 |
+
[
|
| 37 |
+
1,
|
| 38 |
+
3,
|
| 39 |
+
5
|
| 40 |
+
]
|
| 41 |
+
],
|
| 42 |
+
"upsample_rates": [
|
| 43 |
+
8,
|
| 44 |
+
8,
|
| 45 |
+
2,
|
| 46 |
+
2
|
| 47 |
+
],
|
| 48 |
+
"upsample_initial_channel": 512,
|
| 49 |
+
"upsample_kernel_sizes": [
|
| 50 |
+
16,
|
| 51 |
+
16,
|
| 52 |
+
4,
|
| 53 |
+
4
|
| 54 |
+
],
|
| 55 |
+
"gin_channels": 256
|
| 56 |
+
}
|
| 57 |
+
}
|
OpenVoice-RKNN2/convert_rknn.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
# coding: utf-8
|
| 3 |
+
|
| 4 |
+
import datetime
|
| 5 |
+
import argparse
|
| 6 |
+
from rknn.api import RKNN
|
| 7 |
+
from sys import exit
|
| 8 |
+
|
| 9 |
+
# 模型配置
|
| 10 |
+
MODELS = {
|
| 11 |
+
'tone_clone': 'tone_clone_model.onnx',
|
| 12 |
+
'tone_color_extract': 'tone_color_extract_model.onnx',
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
TARGET_AUDIO_LENS = [1024]
|
| 16 |
+
|
| 17 |
+
SOURCE_AUDIO_LENS = [1024]
|
| 18 |
+
|
| 19 |
+
AUDIO_DIM = 513
|
| 20 |
+
|
| 21 |
+
QUANTIZE=False
|
| 22 |
+
detailed_performance_log = True
|
| 23 |
+
|
| 24 |
+
def convert_model(model_type):
    """Convert one exported ONNX model to RKNN format for the RK3588 NPU.

    Args:
        model_type (str): Key into ``MODELS`` — ``'tone_clone'`` or
            ``'tone_color_extract'``.

    Returns:
        bool: True on success; False for an unknown type or any
        load/build/export failure.
    """
    if model_type not in MODELS:
        print(f"错误: 不支持的模型类型 {model_type}")
        return False

    onnx_model = MODELS[model_type]
    rknn_model = onnx_model.replace(".onnx", ".rknn")

    # Dynamic input-shape candidates for the RKNN runtime — one entry per
    # supported audio length. `model_type` is guaranteed to be one of the two
    # keys at this point, so `shapes` is always assigned.
    if model_type == 'tone_clone':
        shapes = [
            [
                [1, 513, target_audio_len],  # audio (spectrogram)
                [1],                         # audio_length
                [1, 256, 1],                 # src_tone
                [1, 256, 1],                 # dest_tone
                [1],                         # tau
            ] for target_audio_len in TARGET_AUDIO_LENS
        ]
    elif model_type == 'tone_color_extract':
        shapes = [
            [
                [1, source_audio_len, 513],  # audio (spectrogram)
            ] for source_audio_len in SOURCE_AUDIO_LENS
        ]

    timedate_iso = datetime.datetime.now().isoformat()

    rknn = RKNN(verbose=True)
    try:
        rknn.config(
            quantized_dtype='w8a8',
            quantized_algorithm='normal',
            quantized_method='channel',
            quantized_hybrid_level=0,
            target_platform='rk3588',
            quant_img_RGB2BGR = False,
            float_dtype='float16',
            optimization_level=3,
            custom_string=f"converted by: qq: 232004040, email: 2302004040@qq.com at {timedate_iso}",
            remove_weight=False,
            compress_weight=False,
            inputs_yuv_fmt=None,
            single_core_mode=False,
            dynamic_input=shapes,
            model_pruning=False,
            op_target=None,
            quantize_weight=False,
            remove_reshape=False,
            sparse_infer=False,
            enable_flash_attention=False,
            # disable_rules=['convert_gemm_by_exmatmul']
        )

        print(f"开始转换 {model_type} 模型...")
        ret = rknn.load_onnx(model=onnx_model)
        if ret != 0:
            print("加载ONNX模型失败")
            return False

        # Honour the module-level QUANTIZE switch instead of hard-coding False
        # (QUANTIZE is False today, so current behavior is unchanged).
        ret = rknn.build(do_quantization=QUANTIZE, rknn_batch_size=None)
        if ret != 0:
            print("构建RKNN模型失败")
            return False

        ret = rknn.export_rknn(rknn_model)
        if ret != 0:
            print("导出RKNN模型失败")
            return False

        print(f"成功转换模型: {rknn_model}")
        return True
    finally:
        # Always free the converter's native resources, even on early returns.
        rknn.release()
|
| 96 |
+
|
| 97 |
+
def main():
    """CLI entry point: convert a single model, or every model in MODELS."""
    parser = argparse.ArgumentParser(description='转换ONNX模型到RKNN格式')
    parser.add_argument('model_type', nargs='?', default='all',
                        choices=['all', 'tone_clone', 'tone_color_extract'],
                        help='要转换的模型类型 (默认: all)')
    args = parser.parse_args()

    # 'all' expands to every configured model; otherwise convert just the one.
    targets = list(MODELS) if args.model_type == 'all' else [args.model_type]
    for name in targets:
        if not convert_model(name):
            print(f"转换 {name} 失败")

if __name__ == '__main__':
    main()
|
| 117 |
+
|
| 118 |
+
|
OpenVoice-RKNN2/export_onnx.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from openvoice.api import ToneColorConverter
|
| 4 |
+
from openvoice.models import SynthesizerTrn
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
os.chdir(os.path.dirname(os.path.abspath(__file__)))
|
| 8 |
+
|
| 9 |
+
class ToneColorExtractWrapper(nn.Module):
    """Export wrapper mapping a spectrogram to a tone-color embedding.

    Wraps the converter's reference encoder (`ref_enc`) so it can be traced
    as a standalone ONNX graph.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, audio):
        """audio: [1, source_audio_len, 513] magnitude spectrogram.

        Returns the speaker/tone embedding produced by the reference encoder
        (the original code's trailing unsqueeze to [1, 256, 1] is done by the
        caller instead).
        """
        return self.model.ref_enc(audio.contiguous())
|
| 23 |
+
|
| 24 |
+
class ToneCloneWrapper(nn.Module):
    """Export wrapper around the converter's voice-conversion pass."""

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, audio, audio_lengths, src_tone, dest_tone, tau):
        """Convert `audio` from the source tone to the destination tone.

        audio: [1, 513, target_audio_len]; src_tone/dest_tone: [1, 256, 1];
        tau: 1-element tensor (scalar temperature). Returns only the converted
        waveform, discarding the model's auxiliary outputs.
        """
        converted, _, _ = self.model.voice_conversion(
            audio.contiguous(),
            audio_lengths,
            sid_src=src_tone.contiguous(),
            sid_tgt=dest_tone.contiguous(),
            tau=tau[0]
        )
        return converted
|
| 44 |
+
|
| 45 |
+
def export_models(ckpt_path, output_dir, target_audio_lens, source_audio_lens):
    """
    Export the tone-color extractor and the tone-clone model to ONNX.

    Args:
        ckpt_path: directory holding config.json and checkpoint.pth
        output_dir: directory that receives the .onnx files
        target_audio_lens: spectrogram lengths to trace for the clone model
        source_audio_lens: spectrogram lengths to trace for the extractor
    """
    # Load the converter checkpoint on CPU.
    device = "cpu"
    converter = ToneColorConverter(f'{ckpt_path}/config.json', device=device)
    converter.load_ckpt(f'{ckpt_path}/checkpoint.pth')

    os.makedirs(output_dir, exist_ok=True)

    # --- tone-color extractor ---
    extract_wrapper = ToneColorExtractWrapper(converter.model)
    extract_wrapper.eval()

    # NOTE(review): the output file name is fixed, so with more than one
    # length each iteration overwrites the previous export — matches the
    # original behavior.
    for src_len in source_audio_lens:
        example = torch.randn(1, src_len, 513).contiguous()
        output_path = f"{output_dir}/tone_color_extract_model.onnx"
        torch.onnx.export(
            extract_wrapper,
            example,
            output_path,
            input_names=['input'],
            output_names=['tone_embedding'],
            dynamic_axes={'input': {1: 'source_audio_len'}},
            opset_version=11,
            do_constant_folding=True,
            verbose=True
        )
        print(f"Exported tone extract model to {output_path}")

    # --- tone-clone (voice conversion) model ---
    clone_wrapper = ToneCloneWrapper(converter.model)
    clone_wrapper.eval()

    for tgt_len in target_audio_lens:
        examples = (
            torch.randn(1, 513, tgt_len).contiguous(),  # audio
            torch.LongTensor([tgt_len]),                # audio_lengths
            torch.randn(1, 256, 1).contiguous(),        # src_tone
            torch.randn(1, 256, 1).contiguous(),        # dest_tone
            torch.FloatTensor([0.3])                    # tau
        )
        output_path = f"{output_dir}/tone_clone_model.onnx"
        torch.onnx.export(
            clone_wrapper,
            examples,
            output_path,
            input_names=['audio', 'audio_length', 'src_tone', 'dest_tone', 'tau'],
            output_names=['converted_audio'],
            dynamic_axes={'audio': {2: 'target_audio_len'}},
            opset_version=17,
            do_constant_folding=True,
            verbose=True
        )
        print(f"Exported tone clone model to {output_path}")
|
| 116 |
+
|
| 117 |
+
if __name__ == "__main__":
    # Example usage: trace both models at a single fixed length of 1024.
    export_models(
        ckpt_path="checkpoints_v2/converter",
        output_dir="onnx_models",
        target_audio_lens=[1024],
        source_audio_lens=[1024]
    )
|
OpenVoice-RKNN2/result.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d11ad289cc5014994086548874fd145ac67c41eb9b91fdd822ad6bd05a40c90f
|
| 3 |
+
size 393260
|
OpenVoice-RKNN2/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/happyme531/OpenVoice-RKNN2
|
OpenVoice-RKNN2/src2.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:baf4ce666c5fa88e052381e0c33543be3015bf2f47154ac3925ee67c963c0a12
|
| 3 |
+
size 1712078
|
OpenVoice-RKNN2/target.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c63d1b5cb444f3611a271d1c24d04363f5bdd73fb5745bc6b61e1c925a8f6084
|
| 3 |
+
size 2165838
|
OpenVoice-RKNN2/test_rknn.py
ADDED
|
@@ -0,0 +1,327 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Callable
|
| 2 |
+
import numpy as np
|
| 3 |
+
import onnxruntime as ort
|
| 4 |
+
import os
|
| 5 |
+
from rknnlite.api import RKNNLite
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
import time
|
| 9 |
+
|
| 10 |
+
class HParams:
    """Attribute-accessible hyper-parameter container.

    Nested plain dicts are converted recursively, so a JSON configuration
    can be navigated as ``hps.data.sampling_rate`` or ``hps["data"]``.
    """

    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            # Recurse only into plain dicts, mirroring json.load's output.
            self[key] = HParams(**value) if type(value) is dict else value

    def keys(self):
        return vars(self).keys()

    def items(self):
        return vars(self).items()

    def values(self):
        return vars(self).values()

    def __len__(self):
        return len(vars(self))

    def __getitem__(self, key):
        return getattr(self, key)

    def __setitem__(self, key, value):
        return setattr(self, key, value)

    def __contains__(self, key):
        return key in vars(self)

    def __repr__(self):
        return repr(vars(self))

    @staticmethod
    def load_from_file(file_path: str):
        """Parse a JSON file into an HParams tree; raise if it is missing."""
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Can not found the configuration file \"{file_path}\"")
        with open(file_path, "r", encoding="utf-8") as f:
            return HParams(**json.load(f))
|
| 48 |
+
|
| 49 |
+
class BaseClassForOnnxInfer():
    """Shared helpers for building onnxruntime-backed inference callables."""

    @staticmethod
    def create_onnx_infer(infer_factor:Callable, onnx_model_path:str, providers:list, session_options:ort.SessionOptions = None, onnx_params:dict = None):
        """Create an inference callable from an ONNX model on disk.

        `infer_factor` receives the InferenceSession and returns the callable;
        the session is pinned onto the callable to keep it alive.
        """
        if not os.path.exists(onnx_model_path):
            raise FileNotFoundError(f"Can not found the onnx model file \"{onnx_model_path}\"")
        opts = BaseClassForOnnxInfer.adjust_onnx_session_options(session_options)
        extra = onnx_params or {}
        session = ort.InferenceSession(onnx_model_path, sess_options=opts, providers=providers, **extra)
        fn = infer_factor(session)
        fn.__session = session
        return fn

    @staticmethod
    def get_def_onnx_session_options():
        """Default session options: full graph optimization enabled."""
        opts = ort.SessionOptions()
        opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        return opts

    @staticmethod
    def adjust_onnx_session_options(session_options:ort.SessionOptions = None):
        """Return the given options, or the defaults when none are supplied."""
        return session_options or BaseClassForOnnxInfer.get_def_onnx_session_options()
|
| 68 |
+
|
| 69 |
+
class OpenVoiceToneClone_ONNXRKNN(BaseClassForOnnxInfer):
|
| 70 |
+
|
| 71 |
+
PreferredProviders = ['CPUExecutionProvider']
|
| 72 |
+
|
| 73 |
+
def __init__(self, model_path, execution_provider:str = None, verbose:bool = False, onnx_session_options:ort.SessionOptions = None, onnx_params:dict = None, target_length:int = 1024):
|
| 74 |
+
'''
|
| 75 |
+
Create the instance of the tone cloner
|
| 76 |
+
|
| 77 |
+
Args:
|
| 78 |
+
model_path (str): The path of the folder which contains the model
|
| 79 |
+
execution_provider (str): The provider that onnxruntime used. Such as CPUExecutionProvider, CUDAExecutionProvider, etc. Or you can use CPU, CUDA as short one. If it is None, the constructor will choose a best one automaticlly
|
| 80 |
+
verbose (bool): Set True to show more detail informations when working
|
| 81 |
+
onnx_session_options (onnxruntime.SessionOptions): The custom options for onnx session
|
| 82 |
+
onnx_params (dict): Other parameters you want to pass to the onnxruntime.InferenceSession constructor
|
| 83 |
+
target_length (int): The target length for padding/truncating spectrogram, defaults to 1024
|
| 84 |
+
|
| 85 |
+
Returns:
|
| 86 |
+
OpenVoiceToneClone_ONNX: The instance of the tone cloner
|
| 87 |
+
'''
|
| 88 |
+
self.__verbose = verbose
|
| 89 |
+
self.__target_length = target_length
|
| 90 |
+
|
| 91 |
+
if verbose:
|
| 92 |
+
print("Loading the configuration...")
|
| 93 |
+
config_path = os.path.join(model_path, "configuration.json")
|
| 94 |
+
self.__hparams = HParams.load_from_file(config_path)
|
| 95 |
+
|
| 96 |
+
execution_provider = f"{execution_provider}ExecutionProvider" if (execution_provider is not None) and (not execution_provider.endswith("ExecutionProvider")) else execution_provider
|
| 97 |
+
available_providers = ort.get_available_providers()
|
| 98 |
+
# self.__execution_providers = [execution_provider if execution_provider in available_providers else next((provider for provider in MeloTTS_ONNX.PreferredProviders if provider in available_providers), 'CPUExecutionProvider')]
|
| 99 |
+
self.__execution_providers = ['CPUExecutionProvider']
|
| 100 |
+
if verbose:
|
| 101 |
+
print("Creating onnx session for tone color extractor...")
|
| 102 |
+
def se_infer_factor(session):
|
| 103 |
+
return lambda **kwargs: session.run(None, kwargs)[0]
|
| 104 |
+
self.__se_infer = self.create_onnx_infer(se_infer_factor, os.path.join(model_path, "tone_color_extract_model.onnx"), self.__execution_providers, onnx_session_options, onnx_params)
|
| 105 |
+
|
| 106 |
+
if verbose:
|
| 107 |
+
print("Creating RKNNLite session for tone clone ...")
|
| 108 |
+
# 初始化RKNNLite
|
| 109 |
+
self.__tc_rknn = RKNNLite(verbose=verbose)
|
| 110 |
+
# 加载RKNN模型
|
| 111 |
+
ret = self.__tc_rknn.load_rknn(os.path.join(model_path, "tone_clone_model.rknn"))
|
| 112 |
+
if ret != 0:
|
| 113 |
+
raise RuntimeError("Failed to load RKNN model")
|
| 114 |
+
# 初始化运行时
|
| 115 |
+
ret = self.__tc_rknn.init_runtime()
|
| 116 |
+
if ret != 0:
|
| 117 |
+
raise RuntimeError("Failed to init RKNN runtime")
|
| 118 |
+
|
| 119 |
+
def __del__(self):
|
| 120 |
+
"""释放RKNN资源"""
|
| 121 |
+
if hasattr(self, '_OpenVoiceToneClone_ONNXRKNN__tc_rknn'):
|
| 122 |
+
self.__tc_rknn.release()
|
| 123 |
+
|
| 124 |
+
hann_window = {}
|
| 125 |
+
|
| 126 |
+
def __spectrogram_numpy(self, y, n_fft, sampling_rate, hop_size, win_size, onesided=True):
|
| 127 |
+
if self.__verbose:
|
| 128 |
+
if np.min(y) < -1.1:
|
| 129 |
+
print("min value is ", np.min(y))
|
| 130 |
+
if np.max(y) > 1.1:
|
| 131 |
+
print("max value is ", np.max(y))
|
| 132 |
+
|
| 133 |
+
# 填充
|
| 134 |
+
y = np.pad(
|
| 135 |
+
y,
|
| 136 |
+
int((n_fft - hop_size) / 2),
|
| 137 |
+
mode="reflect",
|
| 138 |
+
)
|
| 139 |
+
|
| 140 |
+
# 生成汉宁窗
|
| 141 |
+
win_key = f"{str(y.dtype)}-{win_size}"
|
| 142 |
+
if True or win_key not in hann_window:
|
| 143 |
+
OpenVoiceToneClone_ONNXRKNN.hann_window[win_key] = np.hanning(win_size + 1)[:-1].astype(y.dtype)
|
| 144 |
+
window = OpenVoiceToneClone_ONNXRKNN.hann_window[win_key]
|
| 145 |
+
|
| 146 |
+
# 短时傅里叶变换
|
| 147 |
+
y_len = y.shape[0]
|
| 148 |
+
win_len = window.shape[0]
|
| 149 |
+
count = int((y_len - win_len) // hop_size) + 1
|
| 150 |
+
spec = np.empty((count, int(win_len / 2) + 1 if onesided else (int(win_len / 2) + 1) * 2, 2))
|
| 151 |
+
start = 0
|
| 152 |
+
end = start + win_len
|
| 153 |
+
idx = 0
|
| 154 |
+
while end <= y_len:
|
| 155 |
+
segment = y[start:end]
|
| 156 |
+
frame = segment * window
|
| 157 |
+
step_result = np.fft.rfft(frame) if onesided else np.fft.fft(frame)
|
| 158 |
+
spec[idx] = np.column_stack((step_result.real, step_result.imag))
|
| 159 |
+
start = start + hop_size
|
| 160 |
+
end = start + win_len
|
| 161 |
+
idx += 1
|
| 162 |
+
|
| 163 |
+
# 合并实部虚部
|
| 164 |
+
spec = np.sqrt(np.sum(np.square(spec), axis=-1) + 1e-6)
|
| 165 |
+
|
| 166 |
+
return np.array([spec], dtype=np.float32)
|
| 167 |
+
|
| 168 |
+
def extract_tone_color(self, audio:np.array):
|
| 169 |
+
'''
|
| 170 |
+
Extract the tone color from an audio
|
| 171 |
+
|
| 172 |
+
Args:
|
| 173 |
+
audio (numpy.array): The data of the audio
|
| 174 |
+
|
| 175 |
+
Returns:
|
| 176 |
+
numpy.array: The tone color vector
|
| 177 |
+
'''
|
| 178 |
+
hps = self.__hparams
|
| 179 |
+
y = self.to_mono(audio.astype(np.float32))
|
| 180 |
+
spec = self.__spectrogram_numpy(y, hps.data.filter_length,
|
| 181 |
+
hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
|
| 182 |
+
)
|
| 183 |
+
|
| 184 |
+
if self.__verbose:
|
| 185 |
+
print("spec shape", spec.shape)
|
| 186 |
+
return self.__se_infer(input=spec).reshape(1,256,1)
|
| 187 |
+
|
| 188 |
+
def mix_tone_color(self, colors:list):
|
| 189 |
+
'''
|
| 190 |
+
Mix multi tone colors to a single one
|
| 191 |
+
|
| 192 |
+
Args:
|
| 193 |
+
color (list[numpy.array]): The list of the tone colors you want to mix. Each element should be the result of extract_tone_color.
|
| 194 |
+
|
| 195 |
+
Returns:
|
| 196 |
+
numpy.array: The tone color vector
|
| 197 |
+
'''
|
| 198 |
+
return np.stack(colors).mean(axis=0)
|
| 199 |
+
|
| 200 |
+
def tone_clone(self, audio:np.array, target_tone_color:np.array, tau=0.3):
|
| 201 |
+
'''
|
| 202 |
+
Clone the tone
|
| 203 |
+
|
| 204 |
+
Args:
|
| 205 |
+
audio (numpy.array): The data of the audio that will be changed the tone
|
| 206 |
+
target_tone_color (numpy.array): The tone color that you want to clone. It should be the result of the extract_tone_color or mix_tone_color.
|
| 207 |
+
tau (float):
|
| 208 |
+
|
| 209 |
+
Returns:
|
| 210 |
+
numpy.array: The dest audio
|
| 211 |
+
'''
|
| 212 |
+
assert (target_tone_color.shape == (1,256,1)), "The target tone color must be an array with shape (1,256,1)"
|
| 213 |
+
hps = self.__hparams
|
| 214 |
+
src = self.to_mono(audio.astype(np.float32))
|
| 215 |
+
src = self.__spectrogram_numpy(src, hps.data.filter_length,
|
| 216 |
+
hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
|
| 217 |
+
)
|
| 218 |
+
src_tone = self.__se_infer(input=src).reshape(1,256,1)
|
| 219 |
+
|
| 220 |
+
src = np.transpose(src, (0, 2, 1))
|
| 221 |
+
# 记录原始长度
|
| 222 |
+
original_length = src.shape[2]
|
| 223 |
+
|
| 224 |
+
# Pad或截断到固定长度
|
| 225 |
+
if original_length > self.__target_length:
|
| 226 |
+
if self.__verbose:
|
| 227 |
+
print(f"Input length {original_length} exceeds target length {self.__target_length}, truncating...")
|
| 228 |
+
src = src[:, :, :self.__target_length]
|
| 229 |
+
elif original_length < self.__target_length:
|
| 230 |
+
if self.__verbose:
|
| 231 |
+
print(f"Input length {original_length} is less than target length {self.__target_length}, padding...")
|
| 232 |
+
pad_width = ((0, 0), (0, 0), (0, self.__target_length - original_length))
|
| 233 |
+
src = np.pad(src, pad_width, mode='constant', constant_values=0)
|
| 234 |
+
|
| 235 |
+
src_length = np.array([self.__target_length], dtype=np.int64) # 使用固定长度
|
| 236 |
+
|
| 237 |
+
if self.__verbose:
|
| 238 |
+
print("src shape", src.shape)
|
| 239 |
+
print("src_length shape", src_length.shape)
|
| 240 |
+
print("src_tone shape", src_tone.shape)
|
| 241 |
+
print("target_tone_color shape", target_tone_color.shape)
|
| 242 |
+
print("tau", tau)
|
| 243 |
+
|
| 244 |
+
# 准备RKNNLite的输入
|
| 245 |
+
inputs = [
|
| 246 |
+
src,
|
| 247 |
+
src_length,
|
| 248 |
+
src_tone,
|
| 249 |
+
target_tone_color,
|
| 250 |
+
np.array([tau], dtype=np.float32)
|
| 251 |
+
]
|
| 252 |
+
|
| 253 |
+
# 使用RKNNLite进行推理
|
| 254 |
+
outputs = self.__tc_rknn.inference(inputs=inputs)
|
| 255 |
+
res = outputs[0][0, 0] # 获取第一个输出的第一个样本
|
| 256 |
+
|
| 257 |
+
generated_multiplier = 262144 / 1024
|
| 258 |
+
# 如果原始输入较短,则截取掉padding部分
|
| 259 |
+
if original_length < self.__target_length:
|
| 260 |
+
res = res[:int(original_length * generated_multiplier)]
|
| 261 |
+
|
| 262 |
+
if self.__verbose:
|
| 263 |
+
print("res shape", res.shape)
|
| 264 |
+
return res
|
| 265 |
+
|
| 266 |
+
def to_mono(self, audio:np.array):
|
| 267 |
+
'''
|
| 268 |
+
Change the audio to be a mono audio
|
| 269 |
+
|
| 270 |
+
Args:
|
| 271 |
+
audio (numpy.array): The source audio
|
| 272 |
+
|
| 273 |
+
Returns:
|
| 274 |
+
numpy.array: The mono audio data
|
| 275 |
+
'''
|
| 276 |
+
return np.mean(audio, axis=1) if len(audio.shape) > 1 else audio
|
| 277 |
+
|
| 278 |
+
def resample(self, audio: np.array, original_rate: int):
    '''
    Resample the audio to match the model's sampling rate.

    The input is first mixed down to mono, then linearly interpolated
    onto a uniform time grid at the model's sampling rate
    (``self.__hparams.data.sampling_rate``).

    Args:
        audio (numpy.array): The source audio you want to resample.
        original_rate (int): The original sample rate of the source audio, in Hz.

    Returns:
        numpy.array: Mono audio resampled to the model's sampling rate.
    '''
    audio = self.to_mono(audio)
    target_rate = self.__hparams.data.sampling_rate
    duration = audio.shape[0] / original_rate
    # Round instead of truncating: with plain int(), floating-point error in
    # duration * target_rate can land just below an integer and silently drop
    # the final sample (even when original_rate == target_rate).
    target_length = int(round(duration * target_rate))
    time_original = np.linspace(0, duration, num=audio.shape[0])
    time_target = np.linspace(0, duration, num=target_length)
    resampled_data = np.interp(time_target, time_original, audio)
    return resampled_data
|
| 297 |
+
|
| 298 |
+
@property
def sample_rate(self):
    '''
    Sample rate (Hz) of the audio produced by tone cloning, as declared
    in the loaded model hyperparameters.
    '''
    return self.__hparams.data.sampling_rate
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
tc = OpenVoiceToneClone_ONNXRKNN(".", verbose=True)
import soundfile

# Load the reference (target-voice) clip and bring it to the model's rate.
tgt_data, tgt_rate = soundfile.read("target.wav", dtype='float32')
tgt = tc.resample(tgt_data, tgt_rate)

# Time the tone-color extraction step.
start_time = time.time()
tgt_tone_color = tc.extract_tone_color(tgt)
extract_time = time.time() - start_time
print(f"提取音色特征耗时: {extract_time:.2f}秒")

# Load the source speech and bring it to the model's rate.
src_data, src_rate = soundfile.read("src2.wav", dtype='float32')
src = tc.resample(src_data, src_rate)

# Time the tone-cloning step.
start_time = time.time()
result = tc.tone_clone(src, tgt_tone_color)
clone_time = time.time() - start_time
print(f"克隆音色耗时: {clone_time:.2f}秒")

soundfile.write("result.wav", result, tc.sample_rate)
|
OpenVoice-RKNN2/tone_clone_model.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:896195b84b0cb87a828bb8cab06577e9c024356bc9727b1a8f4174154bc0affa
|
| 3 |
+
size 157196170
|
OpenVoice-RKNN2/tone_clone_model.rknn
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7cd7dc3385c55ca610580edaba263510091314be35ae4688a1c076afe9e5d84a
|
| 3 |
+
size 108102277
|
OpenVoice-RKNN2/tone_color_extract_model.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e91c2cb696e199d2519ed8b62ca6e3c8e42cb99ca13955dd6e188051486e681c
|
| 3 |
+
size 3364792
|
OpenVoice/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
OpenVoice/README.md
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
tags:
|
| 4 |
+
- audio
|
| 5 |
+
- text-to-speech
|
| 6 |
+
- instant-voice-cloning
|
| 7 |
+
language:
|
| 8 |
+
- en
|
| 9 |
+
- zh
|
| 10 |
+
inference: false
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# OpenVoice
|
| 14 |
+
|
| 15 |
+
<a href="https://trendshift.io/repositories/6161" target="_blank"><img src="https://trendshift.io/api/badge/repositories/6161" alt="myshell-ai%2FOpenVoice | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
| 16 |
+
|
| 17 |
+
OpenVoice, a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
|
| 18 |
+
|
| 19 |
+
<video controls autoplay src="https://cdn-uploads.huggingface.co/production/uploads/641de0213239b631552713e4/uCHTHD9OUotgOflqDu3QK.mp4"></video>
|
| 20 |
+
|
| 21 |
+
### Features
|
| 22 |
+
- **Accurate Tone Color Cloning.** OpenVoice can accurately clone the reference tone color and generate speech in multiple languages and accents.
|
| 23 |
+
- **Flexible Voice Style Control.** OpenVoice enables granular control over voice styles, such as emotion and accent, as well as other style parameters including rhythm, pauses, and intonation.
|
| 24 |
+
- **Zero-shot Cross-lingual Voice Cloning.** Neither the language of the generated speech nor the language of the reference speech needs to be present in the massive-speaker multi-lingual training dataset.
|
| 25 |
+
|
| 26 |
+
### How to Use
|
| 27 |
+
Please see [usage](https://github.com/myshell-ai/OpenVoice/blob/main/docs/USAGE.md) for detailed instructions.
|
| 28 |
+
|
| 29 |
+
### Links
|
| 30 |
+
- [Github](https://github.com/myshell-ai/OpenVoice)
|
| 31 |
+
- [HFDemo](https://huggingface.co/spaces/myshell-ai/OpenVoice)
|
| 32 |
+
- [Discord](https://discord.gg/myshell)
|
| 33 |
+
|
OpenVoice/checkpoints/base_speakers/EN/checkpoint.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1db1ae1a5c8ded049bd1536051489aefbfad4a5077c01c2257e9e88fa1bb8422
|
| 3 |
+
size 160467309
|
OpenVoice/checkpoints/base_speakers/EN/config.json
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"data": {
|
| 3 |
+
"text_cleaners": [
|
| 4 |
+
"cjke_cleaners2"
|
| 5 |
+
],
|
| 6 |
+
"sampling_rate": 22050,
|
| 7 |
+
"filter_length": 1024,
|
| 8 |
+
"hop_length": 256,
|
| 9 |
+
"win_length": 1024,
|
| 10 |
+
"n_mel_channels": 80,
|
| 11 |
+
"add_blank": true,
|
| 12 |
+
"cleaned_text": true,
|
| 13 |
+
"n_speakers": 10
|
| 14 |
+
},
|
| 15 |
+
"model": {
|
| 16 |
+
"inter_channels": 192,
|
| 17 |
+
"hidden_channels": 192,
|
| 18 |
+
"filter_channels": 768,
|
| 19 |
+
"n_heads": 2,
|
| 20 |
+
"n_layers": 6,
|
| 21 |
+
"n_layers_trans_flow": 3,
|
| 22 |
+
"kernel_size": 3,
|
| 23 |
+
"p_dropout": 0.1,
|
| 24 |
+
"resblock": "1",
|
| 25 |
+
"resblock_kernel_sizes": [
|
| 26 |
+
3,
|
| 27 |
+
7,
|
| 28 |
+
11
|
| 29 |
+
],
|
| 30 |
+
"resblock_dilation_sizes": [
|
| 31 |
+
[
|
| 32 |
+
1,
|
| 33 |
+
3,
|
| 34 |
+
5
|
| 35 |
+
],
|
| 36 |
+
[
|
| 37 |
+
1,
|
| 38 |
+
3,
|
| 39 |
+
5
|
| 40 |
+
],
|
| 41 |
+
[
|
| 42 |
+
1,
|
| 43 |
+
3,
|
| 44 |
+
5
|
| 45 |
+
]
|
| 46 |
+
],
|
| 47 |
+
"upsample_rates": [
|
| 48 |
+
8,
|
| 49 |
+
8,
|
| 50 |
+
2,
|
| 51 |
+
2
|
| 52 |
+
],
|
| 53 |
+
"upsample_initial_channel": 512,
|
| 54 |
+
"upsample_kernel_sizes": [
|
| 55 |
+
16,
|
| 56 |
+
16,
|
| 57 |
+
4,
|
| 58 |
+
4
|
| 59 |
+
],
|
| 60 |
+
"n_layers_q": 3,
|
| 61 |
+
"use_spectral_norm": false,
|
| 62 |
+
"gin_channels": 256
|
| 63 |
+
},
|
| 64 |
+
"symbols": [
|
| 65 |
+
"_",
|
| 66 |
+
",",
|
| 67 |
+
".",
|
| 68 |
+
"!",
|
| 69 |
+
"?",
|
| 70 |
+
"-",
|
| 71 |
+
"~",
|
| 72 |
+
"\u2026",
|
| 73 |
+
"N",
|
| 74 |
+
"Q",
|
| 75 |
+
"a",
|
| 76 |
+
"b",
|
| 77 |
+
"d",
|
| 78 |
+
"e",
|
| 79 |
+
"f",
|
| 80 |
+
"g",
|
| 81 |
+
"h",
|
| 82 |
+
"i",
|
| 83 |
+
"j",
|
| 84 |
+
"k",
|
| 85 |
+
"l",
|
| 86 |
+
"m",
|
| 87 |
+
"n",
|
| 88 |
+
"o",
|
| 89 |
+
"p",
|
| 90 |
+
"s",
|
| 91 |
+
"t",
|
| 92 |
+
"u",
|
| 93 |
+
"v",
|
| 94 |
+
"w",
|
| 95 |
+
"x",
|
| 96 |
+
"y",
|
| 97 |
+
"z",
|
| 98 |
+
"\u0251",
|
| 99 |
+
"\u00e6",
|
| 100 |
+
"\u0283",
|
| 101 |
+
"\u0291",
|
| 102 |
+
"\u00e7",
|
| 103 |
+
"\u026f",
|
| 104 |
+
"\u026a",
|
| 105 |
+
"\u0254",
|
| 106 |
+
"\u025b",
|
| 107 |
+
"\u0279",
|
| 108 |
+
"\u00f0",
|
| 109 |
+
"\u0259",
|
| 110 |
+
"\u026b",
|
| 111 |
+
"\u0265",
|
| 112 |
+
"\u0278",
|
| 113 |
+
"\u028a",
|
| 114 |
+
"\u027e",
|
| 115 |
+
"\u0292",
|
| 116 |
+
"\u03b8",
|
| 117 |
+
"\u03b2",
|
| 118 |
+
"\u014b",
|
| 119 |
+
"\u0266",
|
| 120 |
+
"\u207c",
|
| 121 |
+
"\u02b0",
|
| 122 |
+
"`",
|
| 123 |
+
"^",
|
| 124 |
+
"#",
|
| 125 |
+
"*",
|
| 126 |
+
"=",
|
| 127 |
+
"\u02c8",
|
| 128 |
+
"\u02cc",
|
| 129 |
+
"\u2192",
|
| 130 |
+
"\u2193",
|
| 131 |
+
"\u2191",
|
| 132 |
+
" "
|
| 133 |
+
],
|
| 134 |
+
"speakers": {
|
| 135 |
+
"default": 1,
|
| 136 |
+
"whispering": 2,
|
| 137 |
+
"shouting": 3,
|
| 138 |
+
"excited": 4,
|
| 139 |
+
"cheerful": 5,
|
| 140 |
+
"terrified": 6,
|
| 141 |
+
"angry": 7,
|
| 142 |
+
"sad": 8,
|
| 143 |
+
"friendly": 9
|
| 144 |
+
}
|
| 145 |
+
}
|
OpenVoice/checkpoints/base_speakers/EN/en_default_se.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9cab24002eec738d0fe72cb73a34e57fbc3999c1bd4a1670a7b56ee4e3590ac9
|
| 3 |
+
size 1789
|
OpenVoice/checkpoints/base_speakers/EN/en_style_se.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6f698153be5004b90a8642d1157c89cae7dd296752a3276450ced6a17b8b98a9
|
| 3 |
+
size 1783
|
OpenVoice/checkpoints/base_speakers/ZH/checkpoint.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:de9fb0eb749f3254130fe0172fcbb20e75f88a9b16b54dd0b73cac0dc40da7d9
|
| 3 |
+
size 160467309
|
OpenVoice/checkpoints/base_speakers/ZH/config.json
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"data": {
|
| 3 |
+
"text_cleaners": [
|
| 4 |
+
"cjke_cleaners2"
|
| 5 |
+
],
|
| 6 |
+
"sampling_rate": 22050,
|
| 7 |
+
"filter_length": 1024,
|
| 8 |
+
"hop_length": 256,
|
| 9 |
+
"win_length": 1024,
|
| 10 |
+
"n_mel_channels": 80,
|
| 11 |
+
"add_blank": true,
|
| 12 |
+
"cleaned_text": true,
|
| 13 |
+
"n_speakers": 10
|
| 14 |
+
},
|
| 15 |
+
"model": {
|
| 16 |
+
"inter_channels": 192,
|
| 17 |
+
"hidden_channels": 192,
|
| 18 |
+
"filter_channels": 768,
|
| 19 |
+
"n_heads": 2,
|
| 20 |
+
"n_layers": 6,
|
| 21 |
+
"n_layers_trans_flow": 3,
|
| 22 |
+
"kernel_size": 3,
|
| 23 |
+
"p_dropout": 0.1,
|
| 24 |
+
"resblock": "1",
|
| 25 |
+
"resblock_kernel_sizes": [
|
| 26 |
+
3,
|
| 27 |
+
7,
|
| 28 |
+
11
|
| 29 |
+
],
|
| 30 |
+
"resblock_dilation_sizes": [
|
| 31 |
+
[
|
| 32 |
+
1,
|
| 33 |
+
3,
|
| 34 |
+
5
|
| 35 |
+
],
|
| 36 |
+
[
|
| 37 |
+
1,
|
| 38 |
+
3,
|
| 39 |
+
5
|
| 40 |
+
],
|
| 41 |
+
[
|
| 42 |
+
1,
|
| 43 |
+
3,
|
| 44 |
+
5
|
| 45 |
+
]
|
| 46 |
+
],
|
| 47 |
+
"upsample_rates": [
|
| 48 |
+
8,
|
| 49 |
+
8,
|
| 50 |
+
2,
|
| 51 |
+
2
|
| 52 |
+
],
|
| 53 |
+
"upsample_initial_channel": 512,
|
| 54 |
+
"upsample_kernel_sizes": [
|
| 55 |
+
16,
|
| 56 |
+
16,
|
| 57 |
+
4,
|
| 58 |
+
4
|
| 59 |
+
],
|
| 60 |
+
"n_layers_q": 3,
|
| 61 |
+
"use_spectral_norm": false,
|
| 62 |
+
"gin_channels": 256
|
| 63 |
+
},
|
| 64 |
+
"symbols": [
|
| 65 |
+
"_",
|
| 66 |
+
",",
|
| 67 |
+
".",
|
| 68 |
+
"!",
|
| 69 |
+
"?",
|
| 70 |
+
"-",
|
| 71 |
+
"~",
|
| 72 |
+
"\u2026",
|
| 73 |
+
"N",
|
| 74 |
+
"Q",
|
| 75 |
+
"a",
|
| 76 |
+
"b",
|
| 77 |
+
"d",
|
| 78 |
+
"e",
|
| 79 |
+
"f",
|
| 80 |
+
"g",
|
| 81 |
+
"h",
|
| 82 |
+
"i",
|
| 83 |
+
"j",
|
| 84 |
+
"k",
|
| 85 |
+
"l",
|
| 86 |
+
"m",
|
| 87 |
+
"n",
|
| 88 |
+
"o",
|
| 89 |
+
"p",
|
| 90 |
+
"s",
|
| 91 |
+
"t",
|
| 92 |
+
"u",
|
| 93 |
+
"v",
|
| 94 |
+
"w",
|
| 95 |
+
"x",
|
| 96 |
+
"y",
|
| 97 |
+
"z",
|
| 98 |
+
"\u0251",
|
| 99 |
+
"\u00e6",
|
| 100 |
+
"\u0283",
|
| 101 |
+
"\u0291",
|
| 102 |
+
"\u00e7",
|
| 103 |
+
"\u026f",
|
| 104 |
+
"\u026a",
|
| 105 |
+
"\u0254",
|
| 106 |
+
"\u025b",
|
| 107 |
+
"\u0279",
|
| 108 |
+
"\u00f0",
|
| 109 |
+
"\u0259",
|
| 110 |
+
"\u026b",
|
| 111 |
+
"\u0265",
|
| 112 |
+
"\u0278",
|
| 113 |
+
"\u028a",
|
| 114 |
+
"\u027e",
|
| 115 |
+
"\u0292",
|
| 116 |
+
"\u03b8",
|
| 117 |
+
"\u03b2",
|
| 118 |
+
"\u014b",
|
| 119 |
+
"\u0266",
|
| 120 |
+
"\u207c",
|
| 121 |
+
"\u02b0",
|
| 122 |
+
"`",
|
| 123 |
+
"^",
|
| 124 |
+
"#",
|
| 125 |
+
"*",
|
| 126 |
+
"=",
|
| 127 |
+
"\u02c8",
|
| 128 |
+
"\u02cc",
|
| 129 |
+
"\u2192",
|
| 130 |
+
"\u2193",
|
| 131 |
+
"\u2191",
|
| 132 |
+
" "
|
| 133 |
+
],
|
| 134 |
+
"speakers": {
|
| 135 |
+
"default": 0
|
| 136 |
+
}
|
| 137 |
+
}
|
OpenVoice/checkpoints/base_speakers/ZH/zh_default_se.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3b62e8264962059b8a84dd00b29e2fcccc92f5d3be90eec67dfa082c0cf58ccf
|
| 3 |
+
size 1789
|
OpenVoice/checkpoints/converter/checkpoint.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:89ae83aa4e3668fef64b388b789ff7b0ce0def9f801069edfc18a00ea420748d
|
| 3 |
+
size 131327338
|
OpenVoice/checkpoints/converter/config.json
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"data": {
|
| 3 |
+
"sampling_rate": 22050,
|
| 4 |
+
"filter_length": 1024,
|
| 5 |
+
"hop_length": 256,
|
| 6 |
+
"win_length": 1024,
|
| 7 |
+
"n_speakers": 0
|
| 8 |
+
},
|
| 9 |
+
"model": {
|
| 10 |
+
"inter_channels": 192,
|
| 11 |
+
"hidden_channels": 192,
|
| 12 |
+
"filter_channels": 768,
|
| 13 |
+
"n_heads": 2,
|
| 14 |
+
"n_layers": 6,
|
| 15 |
+
"kernel_size": 3,
|
| 16 |
+
"p_dropout": 0.1,
|
| 17 |
+
"resblock": "1",
|
| 18 |
+
"resblock_kernel_sizes": [
|
| 19 |
+
3,
|
| 20 |
+
7,
|
| 21 |
+
11
|
| 22 |
+
],
|
| 23 |
+
"resblock_dilation_sizes": [
|
| 24 |
+
[
|
| 25 |
+
1,
|
| 26 |
+
3,
|
| 27 |
+
5
|
| 28 |
+
],
|
| 29 |
+
[
|
| 30 |
+
1,
|
| 31 |
+
3,
|
| 32 |
+
5
|
| 33 |
+
],
|
| 34 |
+
[
|
| 35 |
+
1,
|
| 36 |
+
3,
|
| 37 |
+
5
|
| 38 |
+
]
|
| 39 |
+
],
|
| 40 |
+
"upsample_rates": [
|
| 41 |
+
8,
|
| 42 |
+
8,
|
| 43 |
+
2,
|
| 44 |
+
2
|
| 45 |
+
],
|
| 46 |
+
"upsample_initial_channel": 512,
|
| 47 |
+
"upsample_kernel_sizes": [
|
| 48 |
+
16,
|
| 49 |
+
16,
|
| 50 |
+
4,
|
| 51 |
+
4
|
| 52 |
+
],
|
| 53 |
+
"n_layers_q": 3,
|
| 54 |
+
"use_spectral_norm": false,
|
| 55 |
+
"gin_channels": 256
|
| 56 |
+
}
|
| 57 |
+
}
|
OpenVoice/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/myshell-ai/OpenVoice
|
OpenVoiceV2/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
OpenVoiceV2/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
OpenVoiceV2/README.md
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
tags:
|
| 4 |
+
- audio
|
| 5 |
+
- text-to-speech
|
| 6 |
+
- instant-voice-cloning
|
| 7 |
+
language:
|
| 8 |
+
- en
|
| 9 |
+
- zh
|
| 10 |
+
inference: false
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# OpenVoice V2
|
| 14 |
+
|
| 15 |
+
<a href="https://trendshift.io/repositories/6161" target="_blank"><img src="https://trendshift.io/api/badge/repositories/6161" alt="myshell-ai%2FOpenVoice | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
In April 2024, we released OpenVoice V2, which includes all features in V1 and has:
|
| 19 |
+
|
| 20 |
+
1. Better Audio Quality. OpenVoice V2 adopts a different training strategy that delivers better audio quality.
|
| 21 |
+
|
| 22 |
+
2. Native Multi-lingual Support. English, Spanish, French, Chinese, Japanese and Korean are natively supported in OpenVoice V2.
|
| 23 |
+
|
| 24 |
+
3. Free Commercial Use. Starting from April 2024, both V2 and V1 are released under MIT License. Free for commercial use.
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
<video controls autoplay src="https://cdn-uploads.huggingface.co/production/uploads/641de0213239b631552713e4/uCHTHD9OUotgOflqDu3QK.mp4"></video>
|
| 28 |
+
|
| 29 |
+
### Features
|
| 30 |
+
- **Accurate Tone Color Cloning.** OpenVoice can accurately clone the reference tone color and generate speech in multiple languages and accents.
|
| 31 |
+
- **Flexible Voice Style Control.** OpenVoice enables granular control over voice styles, such as emotion and accent, as well as other style parameters including rhythm, pauses, and intonation.
|
| 32 |
+
- **Zero-shot Cross-lingual Voice Cloning.** Neither the language of the generated speech nor the language of the reference speech needs to be present in the massive-speaker multi-lingual training dataset.
|
| 33 |
+
|
| 34 |
+
### How to Use
|
| 35 |
+
Please see [usage](https://github.com/myshell-ai/OpenVoice/blob/main/docs/USAGE.md) for detailed instructions.
|
| 36 |
+
|
| 37 |
+
# Usage
|
| 38 |
+
|
| 39 |
+
## Table of Content
|
| 40 |
+
|
| 41 |
+
- [Quick Use](#quick-use): directly use OpenVoice without installation.
|
| 42 |
+
- [Linux Install](#linux-install): for researchers and developers only.
|
| 43 |
+
- [V1](#openvoice-v1)
|
| 44 |
+
- [V2](#openvoice-v2)
|
| 45 |
+
- [Install on Other Platforms](#install-on-other-platforms): unofficial installation guide contributed by the community
|
| 46 |
+
|
| 47 |
+
## Quick Use
|
| 48 |
+
|
| 49 |
+
The input speech audio of OpenVoice can be in **Any Language**. OpenVoice can clone the voice in that speech audio, and use the voice to speak in multiple languages. For quick use, we recommend you to try the already deployed services:
|
| 50 |
+
|
| 51 |
+
- [British English](https://app.myshell.ai/widget/vYjqae)
|
| 52 |
+
- [American English](https://app.myshell.ai/widget/nEFFJf)
|
| 53 |
+
- [Indian English](https://app.myshell.ai/widget/V3iYze)
|
| 54 |
+
- [Australian English](https://app.myshell.ai/widget/fM7JVf)
|
| 55 |
+
- [Spanish](https://app.myshell.ai/widget/NNFFVz)
|
| 56 |
+
- [French](https://app.myshell.ai/widget/z2uyUz)
|
| 57 |
+
- [Chinese](https://app.myshell.ai/widget/fU7nUz)
|
| 58 |
+
- [Japanese](https://app.myshell.ai/widget/IfIB3u)
|
| 59 |
+
- [Korean](https://app.myshell.ai/widget/q6ZjIn)
|
| 60 |
+
|
| 61 |
+
## Linux Install
|
| 62 |
+
|
| 63 |
+
This section is only for developers and researchers who are familiar with Linux, Python and PyTorch. Clone this repo, and run
|
| 64 |
+
|
| 65 |
+
```
|
| 66 |
+
conda create -n openvoice python=3.9
|
| 67 |
+
conda activate openvoice
|
| 68 |
+
git clone git@github.com:myshell-ai/OpenVoice.git
|
| 69 |
+
cd OpenVoice
|
| 70 |
+
pip install -e .
|
| 71 |
+
```
|
| 72 |
+
|
| 73 |
+
No matter if you are using V1 or V2, the above installation is the same.
|
| 74 |
+
|
| 75 |
+
### OpenVoice V1
|
| 76 |
+
|
| 77 |
+
Download the checkpoint from [here](https://myshell-public-repo-host.s3.amazonaws.com/openvoice/checkpoints_1226.zip) and extract it to the `checkpoints` folder.
|
| 78 |
+
|
| 79 |
+
**1. Flexible Voice Style Control.**
|
| 80 |
+
Please see [`demo_part1.ipynb`](https://github.com/myshell-ai/OpenVoice/blob/main/demo_part1.ipynb) for an example usage of how OpenVoice enables flexible style control over the cloned voice.
|
| 81 |
+
|
| 82 |
+
**2. Cross-Lingual Voice Cloning.**
|
| 83 |
+
Please see [`demo_part2.ipynb`](https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb) for an example for languages seen or unseen in the MSML training set.
|
| 84 |
+
|
| 85 |
+
**3. Gradio Demo.** We provide a minimalist local gradio demo here. We strongly suggest that users look into `demo_part1.ipynb`, `demo_part2.ipynb` and the [QnA](QA.md) if they run into issues with the gradio demo. Launch a local gradio demo with `python -m openvoice_app --share`.
|
| 86 |
+
|
| 87 |
+
### OpenVoice V2
|
| 88 |
+
|
| 89 |
+
Download the checkpoint from [here](https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/checkpoints_v2_0417.zip) and extract it to the `checkpoints_v2` folder.
|
| 90 |
+
|
| 91 |
+
Install [MeloTTS](https://github.com/myshell-ai/MeloTTS):
|
| 92 |
+
```
|
| 93 |
+
pip install git+https://github.com/myshell-ai/MeloTTS.git
|
| 94 |
+
python -m unidic download
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
**Demo Usage.** Please see [`demo_part3.ipynb`](https://github.com/myshell-ai/OpenVoice/blob/main/demo_part3.ipynb) for example usage of OpenVoice V2. Now it natively supports English, Spanish, French, Chinese, Japanese and Korean.
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
## Install on Other Platforms
|
| 101 |
+
|
| 102 |
+
This section provides the unofficial installation guides by open-source contributors in the community:
|
| 103 |
+
|
| 104 |
+
- Windows
|
| 105 |
+
- [Guide](https://github.com/Alienpups/OpenVoice/blob/main/docs/USAGE_WINDOWS.md) by [@Alienpups](https://github.com/Alienpups)
|
| 106 |
+
- You are welcome to contribute if you have a better installation guide. We will list you here.
|
| 107 |
+
- Docker
|
| 108 |
+
- [Guide](https://github.com/StevenJSCF/OpenVoice/blob/update-docs/docs/DF_USAGE.md) by [@StevenJSCF](https://github.com/StevenJSCF)
|
| 109 |
+
- You are welcome to contribute if you have a better installation guide. We will list you here.
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
### Links
|
| 113 |
+
- [Github](https://github.com/myshell-ai/OpenVoice)
|
| 114 |
+
- [HFDemo](https://huggingface.co/spaces/myshell-ai/OpenVoiceV2)
|
| 115 |
+
- [Discord](https://discord.gg/myshell)
|
| 116 |
+
|
OpenVoiceV2/base_speakers/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
OpenVoiceV2/base_speakers/ses/en-au.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5e9782233deef51fc5289d05ad4dd4ce12b196e282eccf6b6db6256bbd02daaa
|
| 3 |
+
size 1701
|
OpenVoiceV2/base_speakers/ses/en-br.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2bf5a88025cfd10473b25d65d5c0e608338ce4533059c5f9a3383e69c812d389
|
| 3 |
+
size 1701
|
OpenVoiceV2/base_speakers/ses/en-default.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e4139de3bc2ea162f45a5a5f9559b710686c9689749b5ab8945ee5e2a082d154
|
| 3 |
+
size 1783
|
OpenVoiceV2/base_speakers/ses/en-india.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ad03d946757e95fe9e13239aa4b11071d98f22316f604f34b1a0b4bdf41cda48
|
| 3 |
+
size 1701
|
OpenVoiceV2/base_speakers/ses/en-newest.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6a3798229b1114f0e9cc137b33211809def7dda5a8a9398d5a112c0b42699177
|
| 3 |
+
size 1692
|
OpenVoiceV2/base_speakers/ses/en-us.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0d092d4af0815a4bfbc6105b65621ab68dc4c61b2f55044d8a66968a34947c32
|
| 3 |
+
size 1701
|
OpenVoiceV2/base_speakers/ses/es.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b8cece8853fb75b9f5217a1f5cda9807bac92a3e4c4547fc651e404d05deff63
|
| 3 |
+
size 1692
|
OpenVoiceV2/base_speakers/ses/fr.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8a01f6d30a73efa368c288a542a522a2bcdd4e2ec5589d8646b307cf8e2ad9ae
|
| 3 |
+
size 1692
|
OpenVoiceV2/base_speakers/ses/jp.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7b645ff428de4a57a22122318968f1e6127ac81fda2e2aa66062deccd3864416
|
| 3 |
+
size 1692
|
OpenVoiceV2/base_speakers/ses/kr.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f501479d6072741a396725bec79144653e9f4a5381b85901e29683aa169795df
|
| 3 |
+
size 1692
|
OpenVoiceV2/base_speakers/ses/zh.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2b353de562700c13faacf096ecfc0adcafd26e6704a9feef572be1279714e031
|
| 3 |
+
size 1692
|
OpenVoiceV2/converter/checkpoint.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9652c27e92b6b2a91632590ac9962ef7ae2b712e5c5b7f4c34ec55ee2b37ab9e
|
| 3 |
+
size 131320490
|
OpenVoiceV2/converter/config.json
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_version_": "v2",
|
| 3 |
+
"data": {
|
| 4 |
+
"sampling_rate": 22050,
|
| 5 |
+
"filter_length": 1024,
|
| 6 |
+
"hop_length": 256,
|
| 7 |
+
"win_length": 1024,
|
| 8 |
+
"n_speakers": 0
|
| 9 |
+
},
|
| 10 |
+
"model": {
|
| 11 |
+
"zero_g": true,
|
| 12 |
+
"inter_channels": 192,
|
| 13 |
+
"hidden_channels": 192,
|
| 14 |
+
"filter_channels": 768,
|
| 15 |
+
"n_heads": 2,
|
| 16 |
+
"n_layers": 6,
|
| 17 |
+
"kernel_size": 3,
|
| 18 |
+
"p_dropout": 0.1,
|
| 19 |
+
"resblock": "1",
|
| 20 |
+
"resblock_kernel_sizes": [
|
| 21 |
+
3,
|
| 22 |
+
7,
|
| 23 |
+
11
|
| 24 |
+
],
|
| 25 |
+
"resblock_dilation_sizes": [
|
| 26 |
+
[
|
| 27 |
+
1,
|
| 28 |
+
3,
|
| 29 |
+
5
|
| 30 |
+
],
|
| 31 |
+
[
|
| 32 |
+
1,
|
| 33 |
+
3,
|
| 34 |
+
5
|
| 35 |
+
],
|
| 36 |
+
[
|
| 37 |
+
1,
|
| 38 |
+
3,
|
| 39 |
+
5
|
| 40 |
+
]
|
| 41 |
+
],
|
| 42 |
+
"upsample_rates": [
|
| 43 |
+
8,
|
| 44 |
+
8,
|
| 45 |
+
2,
|
| 46 |
+
2
|
| 47 |
+
],
|
| 48 |
+
"upsample_initial_channel": 512,
|
| 49 |
+
"upsample_kernel_sizes": [
|
| 50 |
+
16,
|
| 51 |
+
16,
|
| 52 |
+
4,
|
| 53 |
+
4
|
| 54 |
+
],
|
| 55 |
+
"gin_channels": 256
|
| 56 |
+
}
|
| 57 |
+
}
|
OpenVoiceV2/languages.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
English
|
| 2 |
+
Spanish
|
| 3 |
+
French
|
| 4 |
+
Chinese
|
| 5 |
+
Japanese
|
| 6 |
+
Korean
|
OpenVoiceV2/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/myshell-ai/OpenVoiceV2
|
openvoice-tunner-v2/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
openvoice-tunner-v2/README.md
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
This is a simple copy of the tuner for openvoice v2
|
| 3 |
+
|
| 4 |
+
https://github.com/myshell-ai/OpenVoice
|
| 5 |
+
---
|
| 6 |
+
license: mit
|
| 7 |
+
---
|
openvoice-tunner-v2/checkpoint.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9652c27e92b6b2a91632590ac9962ef7ae2b712e5c5b7f4c34ec55ee2b37ab9e
|
| 3 |
+
size 131320490
|
openvoice-tunner-v2/config.json
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_version_": "v2",
|
| 3 |
+
"data": {
|
| 4 |
+
"sampling_rate": 22050,
|
| 5 |
+
"filter_length": 1024,
|
| 6 |
+
"hop_length": 256,
|
| 7 |
+
"win_length": 1024,
|
| 8 |
+
"n_speakers": 0
|
| 9 |
+
},
|
| 10 |
+
"model": {
|
| 11 |
+
"zero_g": true,
|
| 12 |
+
"inter_channels": 192,
|
| 13 |
+
"hidden_channels": 192,
|
| 14 |
+
"filter_channels": 768,
|
| 15 |
+
"n_heads": 2,
|
| 16 |
+
"n_layers": 6,
|
| 17 |
+
"kernel_size": 3,
|
| 18 |
+
"p_dropout": 0.1,
|
| 19 |
+
"resblock": "1",
|
| 20 |
+
"resblock_kernel_sizes": [
|
| 21 |
+
3,
|
| 22 |
+
7,
|
| 23 |
+
11
|
| 24 |
+
],
|
| 25 |
+
"resblock_dilation_sizes": [
|
| 26 |
+
[
|
| 27 |
+
1,
|
| 28 |
+
3,
|
| 29 |
+
5
|
| 30 |
+
],
|
| 31 |
+
[
|
| 32 |
+
1,
|
| 33 |
+
3,
|
| 34 |
+
5
|
| 35 |
+
],
|
| 36 |
+
[
|
| 37 |
+
1,
|
| 38 |
+
3,
|
| 39 |
+
5
|
| 40 |
+
]
|
| 41 |
+
],
|
| 42 |
+
"upsample_rates": [
|
| 43 |
+
8,
|
| 44 |
+
8,
|
| 45 |
+
2,
|
| 46 |
+
2
|
| 47 |
+
],
|
| 48 |
+
"upsample_initial_channel": 512,
|
| 49 |
+
"upsample_kernel_sizes": [
|
| 50 |
+
16,
|
| 51 |
+
16,
|
| 52 |
+
4,
|
| 53 |
+
4
|
| 54 |
+
],
|
| 55 |
+
"gin_channels": 256
|
| 56 |
+
}
|
| 57 |
+
}
|
openvoice-tunner-v2/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/daswer123/openvoice-tunner-v2
|