File size: 2,407 Bytes
e4406a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from time import time
import soundfile as sf
from misaki import en, espeak, zh
from kokoro_onnx import Kokoro



def run_en():
    """Synthesize the English demo sentences with Kokoro and time each one.

    Every text is phonemized with the Misaki English G2P (constructed with
    an espeak fallback object), rendered with the "af_heart" voice and
    written to ``audio<index>.wav`` relative to the current working
    directory.  The elapsed time of each ``kokoro.create`` call is printed.

    The model directory is a hard-coded absolute path, so the function only
    works on a machine that has the model files at that location.
    """
    # perf_counter is the clock meant for measuring short durations; unlike
    # time() it cannot jump when the system clock is adjusted.  Imported
    # locally so the function does not depend on a new module-level import.
    from time import perf_counter

    # Misaki G2P with espeak-ng fallback
    fallback = espeak.EspeakFallback(british=False)
    g2p = en.G2P(trf=False, british=False, fallback=fallback)

    # The trailing slash matters: file names are appended directly to it.
    models = "/Users/jeqin/work/code/TestTranslator/scripts/kokoro_models/"
    # Kokoro
    kokoro = Kokoro(f"{models}kokoro-v1.0.onnx", f"{models}voices-v1.0.bin")

    texts = [
        "[Misaki](/misˈɑki/) is a G2P engine designed for [Kokoro](/kˈOkəɹO/) models.",
        "For example, the geology and terrain along the railway line.",
        " When choosing solid-state drives, we sometimes see reviews or videos discussing whether a particular solid-state drive has a caching scheme or an uncaching scheme in the performance testing section."
    ]
    for index, text in enumerate(texts):
        # Phonemize; the second element returned by the G2P is not used.
        phonemes, _ = g2p(text)

        # Create -- only the synthesis call is timed, not G2P or file output.
        start = perf_counter()
        samples, sample_rate = kokoro.create(phonemes, "af_heart", is_phonemes=True)
        time_cost = perf_counter() - start
        print(f"time cost: {time_cost} for text: {text}")

        # Save
        sf.write(f"audio{index}.wav", samples, sample_rate)
        print(f"Created audio{index}.wav")

def run_zh():
    """Synthesize the Chinese demo sentences with Kokoro and time each one.

    Every text is phonemized with the Misaki Chinese G2P, rendered with the
    "zf_xiaoyi" voice at ``speed=1.0`` and written to ``audio<index>.wav``
    relative to the current working directory.  The elapsed time of each
    ``kokoro.create`` call is printed.

    The model directory is a hard-coded absolute path, so the function only
    works on a machine that has the model files at that location.
    """
    # perf_counter is the clock meant for measuring short durations; unlike
    # time() it cannot jump when the system clock is adjusted.  Imported
    # locally so the function does not depend on a new module-level import.
    from time import perf_counter

    # Misaki Chinese G2P; no espeak fallback is configured here.
    g2p = zh.ZHG2P()

    models = "/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models/kokoro"
    # Kokoro: quantized model plus the vocab config passed for Chinese.
    kokoro = Kokoro(
        f"{models}/kokoro-quant.onnx",
        f"{models}/voices-v1.0.bin",
        vocab_config=f"{models}/zh_config.json",
    )

    texts = [
        "千里之行,始于足下。",
        "我想听你唱首歌",
        "窗前明月光,疑是地上霜。举头望明月,低头思故乡。"
    ]
    for index, text in enumerate(texts):
        # Phonemize; the second element returned by the G2P is not used.
        phonemes, _ = g2p(text)

        # Create -- only the synthesis call is timed, not G2P or file output.
        start = perf_counter()
        samples, sample_rate = kokoro.create(phonemes, "zf_xiaoyi", is_phonemes=True, speed=1.0)
        time_cost = perf_counter() - start
        print(f"time cost: {time_cost} for text: {text}")

        # Save
        sf.write(f"audio{index}.wav", samples, sample_rate)
        print(f"Created audio{index}.wav")

# Script entry point: only the Chinese demo is run when executed directly.
if __name__ == "__main__":
    run_zh()