File size: 1,995 Bytes
e4406a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import os
from pathlib import Path
from time import perf_counter
from time import time

import onnxruntime
import soundfile as sf
from kokoro_onnx import Kokoro
from misaki import zh

# providers = onnxruntime.get_available_providers()
# print(f"Available onnx runtime providers: {providers}")

def create_session(model_path, num_threads=None):
    """Build an onnxruntime ``InferenceSession`` for the model at *model_path*.

    Provider selection keeps the existing choice of using only the second
    entry reported by ``onnxruntime.get_available_providers()``. When the
    runtime reports fewer than two providers, the full available list is used
    so that an empty provider list is never passed to the session.

    Args:
        model_path: Path of the ONNX model file, passed through to
            ``onnxruntime.InferenceSession``.
        num_threads: Optional value for
            ``SessionOptions.intra_op_num_threads``. ``None`` (the default)
            leaves the session options at onnxruntime's own defaults.

    Returns:
        The created ``onnxruntime.InferenceSession``.
    """
    # See list of providers https://github.com/microsoft/onnxruntime/issues/22101#issuecomment-2357667377
    available = onnxruntime.get_available_providers()
    print(f"Available onnx runtime providers: {available}")

    # Only the second provider is used; fall back to everything available
    # when there is no second entry instead of passing an empty list.
    providers = available[1:2] or available
    print(f"Using onnx runtime providers: {providers}")

    # See session options https://onnxruntime.ai/docs/performance/tune-performance/threading.html#thread-management
    sess_options = onnxruntime.SessionOptions()
    if num_threads is not None:
        # Only report a thread count when it is actually applied.
        print(f"Setting intra-op threads to: {num_threads}")
        sess_options.intra_op_num_threads = num_threads

    return onnxruntime.InferenceSession(
        model_path, providers=providers, sess_options=sess_options
    )

# Directory holding the Kokoro model files. It can be overridden through the
# KOKORO_MODEL_DIR environment variable; the default is the original location.
model_folder = Path(
    os.environ.get(
        "KOKORO_MODEL_DIR",
        "/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models/kokoro",
    )
)
model_path = str(model_folder / "kokoro-quant.onnx")
voice_model_path = str(model_folder / "voices-v1.0.bin")
vocab_config = str(model_folder / "zh_config.json")

# Sentences synthesized on every benchmark round.
texts = [
    "千里之行,始于足下。",
    "我想听你唱首歌",
    "窗前明月光,疑是地上霜。举头望明月,低头思故乡。",
]
voice = "zf_xiaoyi"

# Number of times the whole text list is synthesized.
NUM_RUNS = 5

session = create_session(model_path)
model = Kokoro.from_session(session, voice_model_path, vocab_config=vocab_config)
g2p = zh.ZHG2P()

for _run in range(NUM_RUNS):
    for index, text in enumerate(texts):
        # Grapheme-to-phoneme conversion stays outside the timed region.
        phonemes, _ = g2p(text)

        # perf_counter is monotonic, so the measured interval is not affected
        # by system clock adjustments.
        start = perf_counter()
        samples, sample_rate = model.create(
            phonemes, voice=voice, speed=1.0, is_phonemes=True
        )
        time_cost = perf_counter() - start
        print(f"time cost: {time_cost} for text: {text}")

        # The file name depends only on the text index, so each round
        # overwrites the files written by the previous one.
        sf.write(f"audio_{index}.wav", samples, sample_rate)
        print(f"Created audio_{index}.wav")