import json
from typing import Optional

from ..base import BaseTTS


class KokoroTTSProcessor(BaseTTS):
    """Text-to-Speech processor using KokoroTTS."""

    def __init__(self, stream_audio=False, setup_signals=True):
        super().__init__("Kokoro", stream_audio=stream_audio, setup_signals=setup_signals)
        self.default_voice_index = 8  # voices[8] == 'af_nova'
        self.default_speed = 1
        self.voices = [
            # Default / mixed voice
            'af',
            # af_*: American English, female
            'af_alloy',
            'af_aoede',
            'af_bella',
            'af_heart',
            'af_jessica',
            'af_kore',
            'af_nicole',
            'af_nova',
            'af_river',
            'af_sarah',
            'af_sky',
            # am_*: American English, male
            'am_adam',
            'am_echo',
            'am_eric',
            'am_fenrir',
            'am_liam',
            'am_michael',
            'am_onyx',
            'am_puck',
            'am_santa',
            # bf_*: British English, female
            'bf_alice',
            'bf_emma',
            'bf_isabella',
            'bf_lily',
            # bm_*: British English, male
            'bm_daniel',
            'bm_fable',
            'bm_george',
            'bm_lewis',
            # ef_* / em_*: Spanish
            'ef_dora',
            'em_alex',
            'em_santa',
            # ff_*: French
            'ff_siwis',
            # hf_* / hm_*: Hindi
            'hf_alpha',
            'hf_beta',
            'hm_omega',
            'hm_psi',
            # if_* / im_*: Italian
            'if_sara',
            'im_nicola',
            # jf_* / jm_*: Japanese
            'jf_alpha',
            'jf_gongitsune',
            'jf_nezumi',
            'jf_tebukuro',
            'jm_kumo',
            # pf_* / pm_*: Brazilian Portuguese
            'pf_dora',
            'pm_alex',
            'pm_santa',
            # zf_*: Mandarin Chinese, female
            'zf_xiaobei',
            'zf_xiaoni',
            'zf_xiaoxiao',
            'zf_xiaoyi',
            # zm_*: Mandarin Chinese, male
            'zm_yunjian',
            'zm_yunxi',
            'zm_yunxia',
            'zm_yunyang',
        ]
print("Initialising Kokoro...")
from kokoro import KPipeline
print("Loading Modal...")
self.pipeline = KPipeline(lang_code='a', device=self.device)
print("Model loaded successfully")

    def generate_audio_files(self, text: str, voice: str, speed: float, chunk_id: Optional[int] = None):
        """Synthesise `text` sentence by sentence, saving audio chunks and word timestamps."""
        generator = self.pipeline(
            text,
            voice=voice,
            speed=speed,
            split_pattern=r'\n+'
        )
        audio_files = []
        word_timestamps = []
        print("Processing text sentences...")
        for i, result in enumerate(generator):
            tokens = result.tokens
            audio = result.audio
            callback_words = []
            sentence = ""
            for word in tokens:
                sentence += word.text
                word_data = {
                    "word": word.text,
                    "phonemes": word.phonemes,
                    "start_time": word.start_ts,
                    "end_time": word.end_ts
                }
                word_timestamps.append(word_data)
                callback_words.append(word_data)
            if self.stream_audio:
                audio_duration = self.queue_audio_for_streaming(audio)
                # Call the callback if set (for UI highlighting)
                if hasattr(self, 'word_callback') and self.word_callback:
                    self.word_callback(callback_words, audio_duration)
            if self.save_audio_file:
                # `is not None` so an explicit chunk_id of 0 is not silently replaced by i
                chunk_file = self.generate_chunk_audio_file(audio, chunk_id if chunk_id is not None else i)
                audio_files.append(chunk_file)
                print(f"Sentence {i + 1} processed -> {chunk_file.name} -> {sentence}")
        # Save timestamps to a JSON file
        with open(self.final_output_timestamps, 'w') as f:
            json.dump(word_timestamps, f, indent=4)
        print(f"Timestamps saved as {self.final_output_timestamps}")
        return audio_files
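

# Usage sketch (assumption: BaseTTS, defined in the sibling `base` module,
# supplies the `device`, `save_audio_file`, `final_output_timestamps`,
# `queue_audio_for_streaming`, and `generate_chunk_audio_file` members used
# above; the import path below is hypothetical):
#
#     from tts.kokoro import KokoroTTSProcessor
#
#     tts = KokoroTTSProcessor(stream_audio=False)
#     files = tts.generate_audio_files(
#         "Hello there.\nThis is a second sentence.",
#         voice=tts.voices[tts.default_voice_index],  # voices[8] == 'af_nova'
#         speed=tts.default_speed,
#     )
#
# Each entry in the saved timestamps JSON has this shape (times in seconds;
# the values shown are illustrative only):
#
#     {"word": "Hello", "phonemes": "...", "start_time": 0.0, "end_time": 0.35}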