simler committed on
Commit
d479a15
·
verified ·
1 Parent(s): 30ab98f

Upload 68 files

Browse files
genie_tts/Core/Inference.py CHANGED
@@ -1,112 +1,115 @@
1
- import onnxruntime as ort
2
- import numpy as np
3
- from typing import List, Optional
4
- import threading
5
-
6
- from ..Audio.ReferenceAudio import ReferenceAudio
7
- from ..GetPhonesAndBert import get_phones_and_bert
8
-
9
- MAX_T2S_LEN = 1000
10
-
11
-
12
- class GENIE:
13
- def __init__(self):
14
- self.stop_event: threading.Event = threading.Event()
15
-
16
- def tts(
17
- self,
18
- text: str,
19
- prompt_audio: ReferenceAudio,
20
- encoder: ort.InferenceSession,
21
- first_stage_decoder: ort.InferenceSession,
22
- stage_decoder: ort.InferenceSession,
23
- vocoder: ort.InferenceSession,
24
- prompt_encoder: Optional[ort.InferenceSession],
25
- language: str = 'japanese',
26
- ) -> Optional[np.ndarray]:
27
- text = '。' + text # 防止漏第一句。
28
- text_seq, text_bert = get_phones_and_bert(text, language=language)
29
-
30
- semantic_tokens: np.ndarray = self.t2s_cpu(
31
- ref_seq=prompt_audio.phonemes_seq,
32
- ref_bert=prompt_audio.text_bert,
33
- text_seq=text_seq,
34
- text_bert=text_bert,
35
- ssl_content=prompt_audio.ssl_content,
36
- encoder=encoder,
37
- first_stage_decoder=first_stage_decoder,
38
- stage_decoder=stage_decoder,
39
- )
40
-
41
- eos_indices = np.where(semantic_tokens >= 1024) # 剔除不合法的元素,例如 EOS Token。
42
- if len(eos_indices[0]) > 0:
43
- first_eos_index = eos_indices[-1][0]
44
- semantic_tokens = semantic_tokens[..., :first_eos_index]
45
-
46
- if prompt_encoder is None:
47
- return vocoder.run(None, {
48
- "text_seq": text_seq,
49
- "pred_semantic": semantic_tokens,
50
- "ref_audio": prompt_audio.audio_32k
51
- })[0]
52
- else:
53
- # V2ProPlus 新增。
54
- prompt_audio.update_global_emb(prompt_encoder=prompt_encoder)
55
- audio_chunk = vocoder.run(None, {
56
- "text_seq": text_seq,
57
- "pred_semantic": semantic_tokens,
58
- "ge": prompt_audio.global_emb,
59
- "ge_advanced": prompt_audio.global_emb_advanced,
60
- })[0]
61
- return audio_chunk
62
-
63
- def t2s_cpu(
64
- self,
65
- ref_seq: np.ndarray,
66
- ref_bert: np.ndarray,
67
- text_seq: np.ndarray,
68
- text_bert: np.ndarray,
69
- ssl_content: np.ndarray,
70
- encoder: ort.InferenceSession,
71
- first_stage_decoder: ort.InferenceSession,
72
- stage_decoder: ort.InferenceSession,
73
- ) -> Optional[np.ndarray]:
74
- """在CPU上运行T2S模型"""
75
- # Encoder
76
- x, prompts = encoder.run(
77
- None,
78
- {
79
- "ref_seq": ref_seq,
80
- "text_seq": text_seq,
81
- "ref_bert": ref_bert,
82
- "text_bert": text_bert,
83
- "ssl_content": ssl_content,
84
- },
85
- )
86
-
87
- # First Stage Decoder
88
- y, y_emb, *present_key_values = first_stage_decoder.run(
89
- None, {"x": x, "prompts": prompts}
90
- )
91
-
92
- # Stage Decoder
93
- input_names: List[str] = [inp.name for inp in stage_decoder.get_inputs()]
94
- idx: int = 0
95
- for idx in range(0, 500):
96
- if self.stop_event.is_set():
97
- return None
98
- input_feed = {
99
- name: data
100
- for name, data in zip(input_names, [y, y_emb, *present_key_values])
101
- }
102
- outputs = stage_decoder.run(None, input_feed)
103
- y, y_emb, stop_condition_tensor, *present_key_values = outputs
104
-
105
- if stop_condition_tensor:
106
- break
107
-
108
- y[0, -1] = 0
109
- return np.expand_dims(y[:, -idx:], axis=0)
110
-
111
-
112
- tts_client: GENIE = GENIE()
 
 
 
 
1
+ import onnxruntime as ort
2
+ import numpy as np
3
+ from typing import List, Optional
4
+ import threading
5
+
6
+ from ..Audio.ReferenceAudio import ReferenceAudio
7
+ from ..GetPhonesAndBert import get_phones_and_bert
8
+
9
+ MAX_T2S_LEN = 1000
10
+
11
+
12
class GENIE:
    """Thin wrapper around the GPT-SoVITS ONNX inference pipeline.

    Exposes a ``stop_event`` that other threads may set to abort the
    autoregressive decoding loop in :meth:`t2s_cpu` cooperatively.
    """

    def __init__(self):
        # Cooperative cancellation flag, checked each step of the decode loop.
        self.stop_event: threading.Event = threading.Event()

    def tts(
        self,
        text: str,
        prompt_audio: ReferenceAudio,
        encoder: ort.InferenceSession,
        first_stage_decoder: ort.InferenceSession,
        stage_decoder: ort.InferenceSession,
        vocoder: ort.InferenceSession,
        prompt_encoder: Optional[ort.InferenceSession],
        language: str = 'japanese',
        text_language: Optional[str] = None,  # Target-text language; defaults to the reference language.
    ) -> Optional[np.ndarray]:
        """Synthesize ``text`` into a waveform with the given ONNX sessions.

        Args:
            text: Text to synthesize.
            prompt_audio: Reference audio providing the voice prompt
                (phoneme sequence, BERT features, SSL content, 32 kHz audio).
            encoder: T2S encoder session.
            first_stage_decoder: T2S first-stage decoder session.
            stage_decoder: T2S autoregressive stage decoder session.
            vocoder: VITS vocoder session.
            prompt_encoder: Optional V2ProPlus prompt encoder; when given,
                the vocoder is fed global embeddings instead of raw audio.
            language: Language of the reference audio.
            text_language: Language of ``text``; falls back to ``language``
                when omitted (enables cross-lingual synthesis when set).

        Returns:
            The synthesized waveform, or ``None`` if decoding was aborted
            via ``stop_event``.
        """
        # Use the reference-audio language when no target language is given.
        actual_text_language = text_language if text_language else language
        text = '。' + text  # Leading pause guard: prevents the first sentence from being dropped.
        text_seq, text_bert = get_phones_and_bert(text, language=actual_text_language)

        semantic_tokens: np.ndarray = self.t2s_cpu(
            ref_seq=prompt_audio.phonemes_seq,
            ref_bert=prompt_audio.text_bert,
            text_seq=text_seq,
            text_bert=text_bert,
            ssl_content=prompt_audio.ssl_content,
            encoder=encoder,
            first_stage_decoder=first_stage_decoder,
            stage_decoder=stage_decoder,
        )

        # Strip invalid ids (e.g. the EOS token, id >= 1024) from the tail.
        eos_indices = np.where(semantic_tokens >= 1024)
        if len(eos_indices[0]) > 0:
            # eos_indices[-1] indexes the last (time) axis; [0] is the first EOS hit.
            first_eos_index = eos_indices[-1][0]
            semantic_tokens = semantic_tokens[..., :first_eos_index]

        if prompt_encoder is None:
            return vocoder.run(None, {
                "text_seq": text_seq,
                "pred_semantic": semantic_tokens,
                "ref_audio": prompt_audio.audio_32k
            })[0]
        else:
            # V2ProPlus path: vocoder conditions on global embeddings.
            prompt_audio.update_global_emb(prompt_encoder=prompt_encoder)
            audio_chunk = vocoder.run(None, {
                "text_seq": text_seq,
                "pred_semantic": semantic_tokens,
                "ge": prompt_audio.global_emb,
                "ge_advanced": prompt_audio.global_emb_advanced,
            })[0]
            return audio_chunk

    def t2s_cpu(
        self,
        ref_seq: np.ndarray,
        ref_bert: np.ndarray,
        text_seq: np.ndarray,
        text_bert: np.ndarray,
        ssl_content: np.ndarray,
        encoder: ort.InferenceSession,
        first_stage_decoder: ort.InferenceSession,
        stage_decoder: ort.InferenceSession,
    ) -> Optional[np.ndarray]:
        """Run the T2S (text-to-semantic) model on CPU.

        Encodes the reference/target sequences, runs one first-stage decode
        step, then iterates the stage decoder autoregressively (capped at
        500 steps) until the model's stop condition fires.

        Returns:
            Predicted semantic tokens with an added batch axis, or ``None``
            if ``stop_event`` was set mid-decode.
        """
        # Encoder
        x, prompts = encoder.run(
            None,
            {
                "ref_seq": ref_seq,
                "text_seq": text_seq,
                "ref_bert": ref_bert,
                "text_bert": text_bert,
                "ssl_content": ssl_content,
            },
        )

        # First Stage Decoder: seeds y, y_emb and the KV cache.
        y, y_emb, *present_key_values = first_stage_decoder.run(
            None, {"x": x, "prompts": prompts}
        )

        # Stage Decoder: feed outputs back in by positional input-name order.
        input_names: List[str] = [inp.name for inp in stage_decoder.get_inputs()]
        idx: int = 0
        for idx in range(0, 500):
            if self.stop_event.is_set():
                return None  # Aborted externally.
            input_feed = {
                name: data
                for name, data in zip(input_names, [y, y_emb, *present_key_values])
            }
            outputs = stage_decoder.run(None, input_feed)
            y, y_emb, stop_condition_tensor, *present_key_values = outputs

            if stop_condition_tensor:
                break  # Model signalled end-of-sequence.

        # NOTE(review): overwrites the final sampled token with 0, presumably
        # to neutralize the EOS step before vocoding — confirm against model spec.
        y[0, -1] = 0
        # Keep only the newly generated tokens (last `idx` positions), add batch axis.
        return np.expand_dims(y[:, -idx:], axis=0)


# Module-level singleton shared by the TTS worker thread.
tts_client: GENIE = GENIE()
genie_tts/Core/TTSPlayer.py CHANGED
@@ -1,241 +1,242 @@
1
- # 文件: .../Core/TTSPlayer.py
2
-
3
- import queue
4
- import os
5
- import threading
6
-
7
- import numpy as np
8
- import wave
9
- from typing import Optional, List, Callable
10
- import logging
11
-
12
- from ..Utils.TextSplitter import TextSplitter
13
- from ..Core.Inference import tts_client
14
- from ..ModelManager import model_manager
15
- from ..Utils.Shared import context
16
- from ..Utils.Utils import clear_queue
17
-
18
- logger = logging.getLogger(__name__)
19
-
20
- STREAM_END = 'STREAM_END' # 这是一个特殊的标记,表示文本流结束
21
- AUDIO_STREAM_END = 'AUDIO_STREAM_END' # 新增:特殊的标记,表示音频流播放结束
22
-
23
-
24
- class TTSPlayer:
25
- def __init__(self, sample_rate: int = 32000):
26
- self._text_splitter = TextSplitter()
27
-
28
- self.sample_rate: int = sample_rate
29
- self.channels: int = 1
30
- self.bytes_per_sample: int = 2 # 16-bit audio
31
-
32
- self._text_queue: queue.Queue = queue.Queue()
33
- self._audio_queue: queue.Queue = queue.Queue()
34
-
35
- self._stop_event: threading.Event = threading.Event()
36
- self._tts_done_event: threading.Event = threading.Event()
37
- self._playback_done_event: threading.Event = threading.Event() # 新增:用于标记播放完成
38
- self._api_lock: threading.Lock = threading.Lock()
39
-
40
- self._tts_worker: Optional[threading.Thread] = None
41
- self._playback_worker: Optional[threading.Thread] = None
42
-
43
- self._play: bool = False
44
- self._current_save_path: Optional[str] = None
45
- self._session_audio_chunks: List[np.ndarray] = []
46
- self._split: bool = False
47
-
48
- self._chunk_callback: Optional[Callable[[Optional[bytes]], None]] = None
49
-
50
- @staticmethod
51
- def _preprocess_for_playback(audio_float: np.ndarray) -> bytes:
52
- audio_int16 = (audio_float.squeeze() * 32767).astype(np.int16)
53
- return audio_int16.tobytes()
54
-
55
- def _tts_worker_loop(self):
56
- """从文本队列取句子,生成音频,并通过回调函数或音频队列分发。"""
57
- while not self._stop_event.is_set():
58
- try:
59
- sentence = self._text_queue.get(timeout=1)
60
- if sentence is None or self._stop_event.is_set():
61
- break
62
- except queue.Empty:
63
- continue
64
-
65
- try:
66
- if sentence is STREAM_END:
67
- if self._current_save_path and self._session_audio_chunks:
68
- self._save_session_audio()
69
-
70
- # 在TTS工作线程完成时,通过回调发送结束信号
71
- if self._chunk_callback:
72
- self._chunk_callback(None)
73
-
74
- # 新增:如果开启了播放,通知音频队列流已结束
75
- if self._play:
76
- self._audio_queue.put(AUDIO_STREAM_END)
77
-
78
- self._tts_done_event.set()
79
- continue
80
-
81
- gsv_model = model_manager.get(context.current_speaker)
82
- if not gsv_model or not context.current_prompt_audio:
83
- logger.error("Missing model or reference audio.")
84
- continue
85
-
86
- tts_client.stop_event.clear()
87
- audio_chunk = tts_client.tts(
88
- text=sentence,
89
- prompt_audio=context.current_prompt_audio,
90
- encoder=gsv_model.T2S_ENCODER,
91
- first_stage_decoder=gsv_model.T2S_FIRST_STAGE_DECODER,
92
- stage_decoder=gsv_model.T2S_STAGE_DECODER,
93
- vocoder=gsv_model.VITS,
94
- prompt_encoder=gsv_model.PROMPT_ENCODER,
95
- language=gsv_model.LANGUAGE,
96
- )
97
-
98
- if audio_chunk is not None:
99
- if self._play:
100
- self._audio_queue.put(audio_chunk)
101
- if self._current_save_path:
102
- self._session_audio_chunks.append(audio_chunk)
103
-
104
- # 使用回调函数处理流式数据
105
- if self._chunk_callback:
106
- audio_data = self._preprocess_for_playback(audio_chunk)
107
- self._chunk_callback(audio_data)
108
-
109
- except Exception as e:
110
- logger.error(f"A critical error occurred while processing the TTS task: {e}", exc_info=True)
111
- # 发生错误时,也要确保发送结束信号
112
- if self._chunk_callback:
113
- self._chunk_callback(None)
114
- self._tts_done_event.set()
115
-
116
- def _playback_worker_loop(self):
117
- try:
118
- import sounddevice as sd
119
- with sd.OutputStream(samplerate=self.sample_rate,
120
- channels=self.channels,
121
- dtype='float32') as stream:
122
- while not self._stop_event.is_set():
123
- try:
124
- audio_chunk = self._audio_queue.get(timeout=1)
125
- if audio_chunk is None:
126
- break
127
- if audio_chunk is AUDIO_STREAM_END:
128
- self._playback_done_event.set()
129
- continue
130
- stream.write(audio_chunk.squeeze())
131
- except queue.Empty:
132
- continue
133
- except Exception as e:
134
- logger.error(f"Error during audio playback: {e}", exc_info=True)
135
-
136
- except Exception as e:
137
- logger.warning(f"Failed to initialize sounddevice: {e}. Audio playback will be skipped.")
138
- # 如果音频设备初始化失败,即使不播放,也要消费队列中的结束信号,防止主线程死锁
139
- while not self._stop_event.is_set():
140
- try:
141
- item = self._audio_queue.get(timeout=0.5)
142
- if item is None:
143
- break
144
- if item is AUDIO_STREAM_END:
145
- self._playback_done_event.set()
146
- except queue.Empty:
147
- continue
148
-
149
- def _save_session_audio(self):
150
- try:
151
- full_audio = np.concatenate(self._session_audio_chunks, axis=0)
152
- with wave.open(self._current_save_path, 'wb') as wf:
153
- wf.setnchannels(self.channels)
154
- wf.setsampwidth(self.bytes_per_sample)
155
- wf.setframerate(self.sample_rate)
156
- wf.writeframes(self._preprocess_for_playback(full_audio))
157
- logger.info(f"Audio successfully saved to {os.path.abspath(self._current_save_path)}")
158
- except Exception as e:
159
- logger.error(f"Failed to save audio: {e}")
160
- finally:
161
- self._session_audio_chunks = []
162
- self._current_save_path = None
163
-
164
- def start_session(
165
- self,
166
- play: bool = False,
167
- split: bool = False,
168
- save_path: Optional[str] = None,
169
- chunk_callback: Optional[Callable[[Optional[bytes]], None]] = None
170
- ):
171
- with self._api_lock:
172
- self._tts_done_event.clear()
173
- self._playback_done_event.clear() # 新增:重置播放完成事件
174
- self._chunk_callback = chunk_callback
175
- self._stop_event.clear()
176
-
177
- if self._tts_worker is None or not self._tts_worker.is_alive():
178
- self._tts_worker = threading.Thread(target=self._tts_worker_loop, daemon=True)
179
- self._tts_worker.start()
180
-
181
- if self._playback_worker is None or not self._playback_worker.is_alive():
182
- self._playback_worker = threading.Thread(target=self._playback_worker_loop, daemon=True)
183
- self._playback_worker.start()
184
-
185
- clear_queue(self._text_queue)
186
- clear_queue(self._audio_queue)
187
-
188
- self._play = play
189
- self._split = split
190
- self._current_save_path = save_path
191
- self._session_audio_chunks = []
192
-
193
- def feed(self, text_chunk: str):
194
- with self._api_lock:
195
- if not text_chunk:
196
- return
197
- if self._split:
198
- sentences = self._text_splitter.split(text_chunk.strip())
199
- for sentence in sentences:
200
- self._text_queue.put(sentence)
201
- else:
202
- self._text_queue.put(text_chunk)
203
-
204
- def end_session(self):
205
- with self._api_lock:
206
- self._text_queue.put(STREAM_END)
207
-
208
- def stop(self):
209
- with self._api_lock:
210
- if self._tts_worker is None and self._playback_worker is None:
211
- return
212
- if self._stop_event.is_set():
213
- return
214
- tts_client.stop_event.set()
215
- self._stop_event.set()
216
- self._tts_done_event.set()
217
- self._text_queue.put(None)
218
- self._audio_queue.put(None)
219
- if self._tts_worker and self._tts_worker.is_alive():
220
- self._tts_worker.join()
221
- if self._playback_worker and self._playback_worker.is_alive():
222
- self._playback_worker.join()
223
- self._tts_worker = None
224
- self._playback_worker = None
225
-
226
- def wait_for_tts_completion(self):
227
- if self._tts_done_event.is_set():
228
- return
229
- self._tts_done_event.wait()
230
-
231
- def wait_for_playback_done(self):
232
- # 1. 首先等待TTS生成全部完成
233
- self.wait_for_tts_completion()
234
-
235
- # 2. 如果开启了播放且没有被强制停止,则等待播放结束
236
- if self._play and not self._stop_event.is_set():
237
- if not self._playback_done_event.is_set():
238
- self._playback_done_event.wait()
239
-
240
-
241
- tts_player: TTSPlayer = TTSPlayer()
 
 
1
+ # 文件: .../Core/TTSPlayer.py
2
+
3
+ import queue
4
+ import os
5
+ import threading
6
+
7
+ import numpy as np
8
+ import wave
9
+ from typing import Optional, List, Callable
10
+ import logging
11
+
12
+ from ..Utils.TextSplitter import TextSplitter
13
+ from ..Core.Inference import tts_client
14
+ from ..ModelManager import model_manager
15
+ from ..Utils.Shared import context
16
+ from ..Utils.Utils import clear_queue
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
STREAM_END = 'STREAM_END'  # Sentinel: marks the end of the input text stream.
AUDIO_STREAM_END = 'AUDIO_STREAM_END'  # Sentinel: marks the end of audio playback.
22
+
23
+
24
class TTSPlayer:
    """Streaming TTS pipeline.

    Text fed via :meth:`feed` is queued to a TTS worker thread which
    synthesizes audio chunks; chunks are optionally played through a
    playback worker thread, accumulated for saving to a WAV file, and/or
    streamed to a caller-supplied callback as 16-bit PCM bytes.
    """

    def __init__(self, sample_rate: int = 32000):
        self._text_splitter = TextSplitter()

        # Output audio format: mono, 16-bit PCM, `sample_rate` Hz.
        self.sample_rate: int = sample_rate
        self.channels: int = 1
        self.bytes_per_sample: int = 2  # 16-bit audio

        # Text flows in through _text_queue; synthesized chunks flow out
        # to the playback worker through _audio_queue.
        self._text_queue: queue.Queue = queue.Queue()
        self._audio_queue: queue.Queue = queue.Queue()

        self._stop_event: threading.Event = threading.Event()
        self._tts_done_event: threading.Event = threading.Event()
        self._playback_done_event: threading.Event = threading.Event()  # Set once playback has drained.
        self._api_lock: threading.Lock = threading.Lock()

        self._tts_worker: Optional[threading.Thread] = None
        self._playback_worker: Optional[threading.Thread] = None

        self._play: bool = False
        self._current_save_path: Optional[str] = None
        self._session_audio_chunks: List[np.ndarray] = []
        self._split: bool = False

        # Called with each PCM chunk (bytes), then with None at end-of-stream.
        self._chunk_callback: Optional[Callable[[Optional[bytes]], None]] = None

    @staticmethod
    def _preprocess_for_playback(audio_float: np.ndarray) -> bytes:
        """Convert a float waveform (assumed in [-1, 1]) to 16-bit PCM bytes."""
        audio_int16 = (audio_float.squeeze() * 32767).astype(np.int16)
        return audio_int16.tobytes()

    def _tts_worker_loop(self):
        """Pull sentences from the text queue, synthesize audio, and dispatch it via the callback and/or the audio queue."""
        while not self._stop_event.is_set():
            try:
                sentence = self._text_queue.get(timeout=1)
                if sentence is None or self._stop_event.is_set():
                    break  # None is the shutdown sentinel posted by stop().
            except queue.Empty:
                continue

            try:
                if sentence is STREAM_END:
                    # End of session: flush the saved audio if requested.
                    if self._current_save_path and self._session_audio_chunks:
                        self._save_session_audio()

                    # Signal end-of-stream through the callback.
                    if self._chunk_callback:
                        self._chunk_callback(None)

                    # If playback is enabled, tell the audio queue the stream ended.
                    if self._play:
                        self._audio_queue.put(AUDIO_STREAM_END)

                    self._tts_done_event.set()
                    continue

                gsv_model = model_manager.get(context.current_speaker)
                if not gsv_model or not context.current_prompt_audio:
                    logger.error("Missing model or reference audio.")
                    continue

                tts_client.stop_event.clear()
                audio_chunk = tts_client.tts(
                    text=sentence,
                    prompt_audio=context.current_prompt_audio,
                    encoder=gsv_model.T2S_ENCODER,
                    first_stage_decoder=gsv_model.T2S_FIRST_STAGE_DECODER,
                    stage_decoder=gsv_model.T2S_STAGE_DECODER,
                    vocoder=gsv_model.VITS,
                    prompt_encoder=gsv_model.PROMPT_ENCODER,
                    language=gsv_model.LANGUAGE,
                    text_language=context.current_text_language,  # Cross-lingual TTS support.
                )

                if audio_chunk is not None:
                    if self._play:
                        self._audio_queue.put(audio_chunk)
                    if self._current_save_path:
                        self._session_audio_chunks.append(audio_chunk)

                    # Stream the chunk to the registered callback.
                    if self._chunk_callback:
                        audio_data = self._preprocess_for_playback(audio_chunk)
                        self._chunk_callback(audio_data)

            except Exception as e:
                logger.error(f"A critical error occurred while processing the TTS task: {e}", exc_info=True)
                # On error, still emit the end-of-stream signals so consumers don't hang.
                if self._chunk_callback:
                    self._chunk_callback(None)
                self._tts_done_event.set()

    def _playback_worker_loop(self):
        """Drain the audio queue and play chunks through sounddevice.

        Falls back to a queue-draining loop (no audio) if the output
        device cannot be initialized, so waiters are never deadlocked.
        """
        try:
            import sounddevice as sd
            with sd.OutputStream(samplerate=self.sample_rate,
                                 channels=self.channels,
                                 dtype='float32') as stream:
                while not self._stop_event.is_set():
                    try:
                        audio_chunk = self._audio_queue.get(timeout=1)
                        if audio_chunk is None:
                            break  # Shutdown sentinel from stop().
                        if audio_chunk is AUDIO_STREAM_END:
                            self._playback_done_event.set()
                            continue
                        stream.write(audio_chunk.squeeze())
                    except queue.Empty:
                        continue
                    except Exception as e:
                        logger.error(f"Error during audio playback: {e}", exc_info=True)

        except Exception as e:
            logger.warning(f"Failed to initialize sounddevice: {e}. Audio playback will be skipped.")
            # Even without a device, keep consuming the queue's end-of-stream
            # signals to avoid deadlocking the main thread.
            while not self._stop_event.is_set():
                try:
                    item = self._audio_queue.get(timeout=0.5)
                    if item is None:
                        break
                    if item is AUDIO_STREAM_END:
                        self._playback_done_event.set()
                except queue.Empty:
                    continue

    def _save_session_audio(self):
        """Concatenate the session's chunks and write them as a 16-bit WAV file."""
        try:
            full_audio = np.concatenate(self._session_audio_chunks, axis=0)
            with wave.open(self._current_save_path, 'wb') as wf:
                wf.setnchannels(self.channels)
                wf.setsampwidth(self.bytes_per_sample)
                wf.setframerate(self.sample_rate)
                wf.writeframes(self._preprocess_for_playback(full_audio))
            logger.info(f"Audio successfully saved to {os.path.abspath(self._current_save_path)}")
        except Exception as e:
            logger.error(f"Failed to save audio: {e}")
        finally:
            # Reset session state regardless of save outcome.
            self._session_audio_chunks = []
            self._current_save_path = None

    def start_session(
        self,
        play: bool = False,
        split: bool = False,
        save_path: Optional[str] = None,
        chunk_callback: Optional[Callable[[Optional[bytes]], None]] = None
    ):
        """Begin a new TTS session.

        Args:
            play: Play audio through the local output device as it is generated.
            split: Split fed text into sentences before synthesis.
            save_path: If given, save the full session audio to this WAV path.
            chunk_callback: If given, receives each PCM chunk, then None at the end.
        """
        with self._api_lock:
            self._tts_done_event.clear()
            self._playback_done_event.clear()  # Reset the playback-done event for the new session.
            self._chunk_callback = chunk_callback
            self._stop_event.clear()

            # (Re)start worker threads if they are not already running.
            if self._tts_worker is None or not self._tts_worker.is_alive():
                self._tts_worker = threading.Thread(target=self._tts_worker_loop, daemon=True)
                self._tts_worker.start()

            if self._playback_worker is None or not self._playback_worker.is_alive():
                self._playback_worker = threading.Thread(target=self._playback_worker_loop, daemon=True)
                self._playback_worker.start()

            # Drop any leftovers from a previous session.
            clear_queue(self._text_queue)
            clear_queue(self._audio_queue)

            self._play = play
            self._split = split
            self._current_save_path = save_path
            self._session_audio_chunks = []

    def feed(self, text_chunk: str):
        """Queue a chunk of text for synthesis (split into sentences if enabled)."""
        with self._api_lock:
            if not text_chunk:
                return
            if self._split:
                sentences = self._text_splitter.split(text_chunk.strip())
                for sentence in sentences:
                    self._text_queue.put(sentence)
            else:
                self._text_queue.put(text_chunk)

    def end_session(self):
        """Mark the end of the fed text; the worker finalizes the session on STREAM_END."""
        with self._api_lock:
            self._text_queue.put(STREAM_END)

    def stop(self):
        """Abort synthesis and playback, join both workers, and reset them."""
        with self._api_lock:
            if self._tts_worker is None and self._playback_worker is None:
                return
            if self._stop_event.is_set():
                return
            tts_client.stop_event.set()
            self._stop_event.set()
            self._tts_done_event.set()
            # Post shutdown sentinels so blocked queue reads wake up.
            self._text_queue.put(None)
            self._audio_queue.put(None)
            if self._tts_worker and self._tts_worker.is_alive():
                self._tts_worker.join()
            if self._playback_worker and self._playback_worker.is_alive():
                self._playback_worker.join()
            self._tts_worker = None
            self._playback_worker = None

    def wait_for_tts_completion(self):
        """Block until all queued text has been synthesized."""
        if self._tts_done_event.is_set():
            return
        self._tts_done_event.wait()

    def wait_for_playback_done(self):
        """Block until synthesis is finished and (if enabled) playback has drained."""
        # 1. First wait for all TTS generation to complete.
        self.wait_for_tts_completion()

        # 2. If playback is enabled and not force-stopped, wait for it to finish.
        if self._play and not self._stop_event.is_set():
            if not self._playback_done_event.is_set():
                self._playback_done_event.wait()


# Module-level singleton used by the package's public API.
tts_player: TTSPlayer = TTSPlayer()
genie_tts/Internal.py CHANGED
@@ -1,395 +1,403 @@
1
- # 请严格遵循导入顺序。
2
- # 1、环境变量。
3
- import os
4
- from os import PathLike
5
-
6
- os.environ["HF_HUB_ENABLE_PROGRESS_BAR"] = "1"
7
-
8
- # 2、Logging & Warnings。
9
- import logging
10
- import warnings
11
-
12
- warnings.filterwarnings("ignore", category=UserWarning, module="jieba_fast._compat")
13
- logging.basicConfig(level=logging.INFO, format="%(message)s", datefmt="[%X]")
14
- logger = logging.getLogger(__name__)
15
-
16
- # 3、ONNX。
17
- import onnxruntime
18
-
19
- onnxruntime.set_default_logger_severity(3)
20
-
21
- # 导入剩余库。
22
-
23
- from pathlib import Path
24
- import json
25
- import asyncio
26
- from typing import AsyncIterator, Optional, Union, Dict
27
-
28
- from .Audio.ReferenceAudio import ReferenceAudio
29
- from .Core.Resources import ensure_exists, Chinese_G2P_DIR, English_G2P_DIR
30
- from .Core.TTSPlayer import tts_player
31
- from .ModelManager import model_manager
32
- from .Utils.Shared import context
33
- from .Utils.Language import normalize_language
34
- from .PredefinedCharacter import download_chara, CHARA_LANG, CHARA_ALIAS_MAP
35
-
36
- # A module-level private dictionary to store reference audio configurations.
37
- _reference_audios: Dict[str, dict] = {}
38
- SUPPORTED_AUDIO_EXTS = {'.wav', '.flac', '.ogg', '.aiff', '.aif'}
39
-
40
-
41
- def check_onnx_model_dir(onnx_model_dir: Union[str, os.PathLike]) -> None:
42
- """
43
- Checks if the directory contains the necessary ONNX model files for Genie TTS (v2 or v2ProPlus).
44
- Raises a FileNotFoundError with detailed instructions if validation fails.
45
- """
46
- model_path = Path(onnx_model_dir)
47
-
48
- # 1. Check if directory exists
49
- if not model_path.exists() or not model_path.is_dir():
50
- raise FileNotFoundError(f"The model directory '{onnx_model_dir}' does not exist or is not a directory.")
51
-
52
- # 2. Define required files
53
- # Base files required by both v2 and v2ProPlus
54
- required_base_files = {
55
- "t2s_encoder_fp32.bin",
56
- "t2s_encoder_fp32.onnx",
57
- "t2s_first_stage_decoder_fp32.onnx",
58
- "t2s_shared_fp16.bin",
59
- "t2s_stage_decoder_fp32.onnx",
60
- "vits_fp16.bin",
61
- "vits_fp32.onnx"
62
- }
63
-
64
- # 3. Get current files in directory
65
- existing_files = set(f.name for f in model_path.iterdir() if f.is_file())
66
-
67
- # 4. Validate
68
- # We check if the base files exist. If base files are missing, the model is definitely unusable.
69
- if not required_base_files.issubset(existing_files):
70
- missing = required_base_files - existing_files
71
-
72
- # Construct detailed error message
73
- error_msg = (
74
- f"\n\n[Genie Error] Invalid ONNX model directory: '{model_path}'\n"
75
- "===============================================================\n"
76
- f"Missing base files: {', '.join(missing)}\n"
77
- "A valid model folder must contain at least the following files.\n"
78
- "1. [v2 Base] (Required for all models):\n"
79
- " - t2s_encoder_fp32.bin\n"
80
- " - t2s_encoder_fp32.onnx\n"
81
- " - t2s_first_stage_decoder_fp32.onnx\n"
82
- " - t2s_shared_fp16.bin\n"
83
- " - t2s_stage_decoder_fp32.onnx\n"
84
- " - vits_fp16.bin\n"
85
- " - vits_fp32.onnx\n"
86
- "2. [v2ProPlus Additions] (Required for v2pp features):\n"
87
- " - prompt_encoder_fp16.bin\n"
88
- " - prompt_encoder_fp32.onnx\n"
89
- "===============================================================\n"
90
- )
91
- raise FileNotFoundError(error_msg)
92
-
93
-
94
- def load_character(
95
- character_name: str,
96
- onnx_model_dir: Union[str, PathLike],
97
- language: str,
98
- ) -> None:
99
- """
100
- Loads a character model from an ONNX model directory.
101
-
102
- Args:
103
- character_name (str): The name to assign to the loaded character.
104
- onnx_model_dir (str | PathLike): The directory path containing the ONNX model files.
105
- language (str): The language of the character model.
106
- """
107
- check_onnx_model_dir(onnx_model_dir)
108
-
109
- language = normalize_language(language)
110
- if language not in ['Japanese', 'English', 'Chinese']:
111
- raise ValueError('Unknown language')
112
-
113
- if language == 'Chinese':
114
- ensure_exists(Chinese_G2P_DIR, "Chinese_G2P_DIR")
115
- elif language == 'English':
116
- ensure_exists(English_G2P_DIR, "English_G2P_DIR")
117
-
118
- model_path: str = os.fspath(onnx_model_dir)
119
- model_manager.load_character(
120
- character_name=character_name,
121
- model_dir=model_path,
122
- language=language,
123
- )
124
-
125
-
126
- def unload_character(
127
- character_name: str,
128
- ) -> None:
129
- """
130
- Unloads a previously loaded character model to free up resources.
131
-
132
- Args:
133
- character_name (str): The name of the character to unload.
134
- """
135
- model_manager.remove_character(
136
- character_name=character_name,
137
- )
138
-
139
-
140
- def set_reference_audio(
141
- character_name: str,
142
- audio_path: Union[str, PathLike],
143
- audio_text: str,
144
- language: str = None,
145
- ) -> None:
146
- """
147
- Sets the reference audio for a character to be used for voice cloning.
148
-
149
- This must be called for a character before using 'tts' or 'tts_async'.
150
-
151
- Args:
152
- character_name (str): The name of the character.
153
- audio_path (str | PathLike): The file path to the reference audio (e.g., a WAV file).
154
- audio_text (str): The transcript of the reference audio.
155
- language (str): The language of the reference audio.
156
- """
157
- audio_path: str = os.fspath(audio_path)
158
-
159
- # 检查文件后缀是否支持
160
- ext = os.path.splitext(audio_path)[1].lower()
161
- if ext not in SUPPORTED_AUDIO_EXTS:
162
- logger.error(
163
- f"Audio format '{ext}' is not supported. Only the following formats are supported: {SUPPORTED_AUDIO_EXTS}"
164
- )
165
- return
166
-
167
- if language is None:
168
- gsv_model = model_manager.get(character_name)
169
- if gsv_model:
170
- language = gsv_model.LANGUAGE
171
- else:
172
- raise ValueError('No language specified')
173
- language = normalize_language(language)
174
- if language not in ['Japanese', 'English', 'Chinese']:
175
- raise ValueError('Unknown language')
176
-
177
- _reference_audios[character_name] = {
178
- 'audio_path': audio_path,
179
- 'audio_text': audio_text,
180
- 'language': language,
181
- }
182
- # print(_reference_audios[character_name])
183
- context.current_prompt_audio = ReferenceAudio(
184
- prompt_wav=audio_path,
185
- prompt_text=audio_text,
186
- language=language,
187
- )
188
-
189
-
190
- async def tts_async(
191
- character_name: str,
192
- text: str,
193
- play: bool = False,
194
- split_sentence: bool = False,
195
- save_path: Union[str, PathLike, None] = None,
196
- ) -> AsyncIterator[bytes]:
197
- """
198
- Asynchronously generates speech from text and yields audio chunks.
199
-
200
- This function returns an async iterator that provides the audio data in
201
- real-time as it's being generated.
202
-
203
- Args:
204
- character_name (str): The name of the character to use for synthesis.
205
- text (str): The text to be synthesized into speech.
206
- play (bool, optional): If True, plays the audio as it's generated. Defaults to False.
207
- split_sentence (bool, optional): If True, splits the text into sentences for synthesis. Defaults to False.
208
- save_path (str | PathLike | None, optional): If provided, saves the generated audio to this file path. Defaults to None.
209
-
210
- Yields:
211
- bytes: A chunk of the generated audio data.
212
-
213
- Raises:
214
- ValueError: If 'set_reference_audio' has not been called for the character.
215
- """
216
- if character_name not in _reference_audios:
217
- raise ValueError("Please call 'set_reference_audio' first to set the reference audio.")
218
-
219
- if save_path:
220
- save_path = os.fspath(save_path)
221
- parent_dir = os.path.dirname(save_path)
222
- if parent_dir:
223
- os.makedirs(parent_dir, exist_ok=True)
224
-
225
- # 1. 创建 asyncio 队列和获取当前事件循环
226
- stream_queue: asyncio.Queue[Union[bytes, None]] = asyncio.Queue()
227
- loop = asyncio.get_running_loop()
228
-
229
- # 2. 定义回调函数,用于在线程和 asyncio 之间安全地传递数据
230
- def tts_chunk_callback(c: Optional[bytes]):
231
- """This callback is called from the TTS worker thread."""
232
- loop.call_soon_threadsafe(stream_queue.put_nowait, c)
233
-
234
- # 设置 TTS 上下文
235
- context.current_speaker = character_name
236
- context.current_prompt_audio = ReferenceAudio(
237
- prompt_wav=_reference_audios[character_name]['audio_path'],
238
- prompt_text=_reference_audios[character_name]['audio_text'],
239
- language=_reference_audios[character_name]['language'],
240
- )
241
-
242
- # 3. 使用新的回调接口启动 TTS 会话
243
- tts_player.start_session(
244
- play=play,
245
- split=split_sentence,
246
- save_path=save_path,
247
- chunk_callback=tts_chunk_callback,
248
- )
249
-
250
- # 馈送文本并通知会话结束
251
- tts_player.feed(text)
252
- tts_player.end_session()
253
-
254
- # 4. 从队列中异步读取数据产生
255
- while True:
256
- chunk = await stream_queue.get()
257
- if chunk is None:
258
- break
259
- yield chunk
260
-
261
-
262
- def tts(
263
- character_name: str,
264
- text: str,
265
- play: bool = False,
266
- split_sentence: bool = True,
267
- save_path: Union[str, PathLike, None] = None,
268
- ) -> None:
269
- """
270
- Synchronously generates speech from text.
271
-
272
- This is a blocking function that will not return until the entire TTS
273
- process is complete.
274
-
275
- Args:
276
- character_name (str): The name of the character to use for synthesis.
277
- text (str): The text to be synthesized into speech.
278
- play (bool, optional): If True, plays the audio.
279
- split_sentence (bool, optional): If True, splits the text into sentences for synthesis.
280
- save_path (str | PathLike | None, optional): If provided, saves the generated audio to this file path. Defaults to None.
281
- """
282
- if character_name not in _reference_audios:
283
- logger.error("Please call 'set_reference_audio' first to set the reference audio.")
284
- return
285
-
286
- if save_path:
287
- save_path = os.fspath(save_path)
288
- parent_dir = os.path.dirname(save_path)
289
- if parent_dir:
290
- os.makedirs(parent_dir, exist_ok=True)
291
-
292
- context.current_speaker = character_name
293
- context.current_prompt_audio = ReferenceAudio(
294
- prompt_wav=_reference_audios[character_name]['audio_path'],
295
- prompt_text=_reference_audios[character_name]['audio_text'],
296
- language=_reference_audios[character_name]['language'],
297
- )
298
-
299
- tts_player.start_session(
300
- play=play,
301
- split=split_sentence,
302
- save_path=save_path,
303
- )
304
- tts_player.feed(text)
305
- tts_player.end_session()
306
- tts_player.wait_for_tts_completion()
307
-
308
-
309
- def wait_for_playback_done() -> None:
310
- """
311
- Wait until all TTS tasks have finished processing and playback has fully completed.
312
- """
313
- tts_player.wait_for_playback_done()
314
-
315
-
316
- def stop() -> None:
317
- """
318
- Stops the currently playing text-to-speech audio.
319
- """
320
- tts_player.stop()
321
-
322
-
323
- def convert_to_onnx(
324
- torch_ckpt_path: Union[str, PathLike],
325
- torch_pth_path: Union[str, PathLike],
326
- output_dir: Union[str, PathLike],
327
- ) -> None:
328
- """
329
- Converts PyTorch model checkpoints to the ONNX format.
330
-
331
- This function requires PyTorch to be installed.
332
-
333
- Args:
334
- torch_ckpt_path (str | PathLike): The path to the T2S model (.ckpt) file.
335
- torch_pth_path (str | PathLike): The path to the VITS model (.pth) file.
336
- output_dir (str | PathLike): The directory where the ONNX models will be saved.
337
- """
338
- try:
339
- import torch
340
- except ImportError:
341
- logger.error("❌ PyTorch is not installed. Please run `pip install torch` first.")
342
- return
343
-
344
- from .Converter.Converter import convert
345
-
346
- torch_ckpt_path = os.fspath(torch_ckpt_path)
347
- torch_pth_path = os.fspath(torch_pth_path)
348
- output_dir = os.fspath(output_dir)
349
-
350
- convert(
351
- torch_pth_path=torch_pth_path,
352
- torch_ckpt_path=torch_ckpt_path,
353
- output_dir=output_dir,
354
- )
355
-
356
-
357
- def clear_reference_audio_cache() -> None:
358
- """
359
- Clears the cache of reference audio data.
360
- """
361
- ReferenceAudio.clear_cache()
362
-
363
-
364
- def load_predefined_character(character_name: str) -> None:
365
- """
366
- Download and load a predefined character model for TTS inference.
367
- """
368
- character_name = character_name.lower().strip()
369
- if character_name not in CHARA_ALIAS_MAP:
370
- logger.error(f"No predefined character model found for {character_name}")
371
- return
372
- character_name = CHARA_ALIAS_MAP[character_name]
373
-
374
- save_path = download_chara(character_name)
375
- model_manager.load_character(
376
- character_name=character_name,
377
- model_dir=os.path.join(save_path, 'tts_models'),
378
- language=CHARA_LANG[character_name],
379
- )
380
-
381
- with open(os.path.join(save_path, "prompt_wav.json"), "r", encoding="utf-8") as f:
382
- prompt_wav_dict: Dict[str, Dict[str, str]] = json.load(f)
383
-
384
- audio_text = prompt_wav_dict["Normal"]["text"]
385
- audio_path = os.path.join(save_path, "prompt_wav", prompt_wav_dict["Normal"]["wav"])
386
- _reference_audios[character_name] = {
387
- 'audio_path': audio_path,
388
- 'audio_text': audio_text,
389
- 'language': CHARA_LANG[character_name],
390
- }
391
- context.current_prompt_audio = ReferenceAudio(
392
- prompt_wav=audio_path,
393
- prompt_text=audio_text,
394
- language=CHARA_LANG[character_name],
395
- )
 
 
 
 
 
 
 
 
 
1
+ # 请严格遵循导入顺序。
2
+ # 1、环境变量。
3
+ import os
4
+ from os import PathLike
5
+
6
+ os.environ["HF_HUB_ENABLE_PROGRESS_BAR"] = "1"
7
+
8
+ # 2、Logging & Warnings。
9
+ import logging
10
+ import warnings
11
+
12
+ warnings.filterwarnings("ignore", category=UserWarning, module="jieba_fast._compat")
13
+ logging.basicConfig(level=logging.INFO, format="%(message)s", datefmt="[%X]")
14
+ logger = logging.getLogger(__name__)
15
+
16
+ # 3、ONNX。
17
+ import onnxruntime
18
+
19
+ onnxruntime.set_default_logger_severity(3)
20
+
21
+ # 导入剩余库。
22
+
23
+ from pathlib import Path
24
+ import json
25
+ import asyncio
26
+ from typing import AsyncIterator, Optional, Union, Dict
27
+
28
+ from .Audio.ReferenceAudio import ReferenceAudio
29
+ from .Core.Resources import ensure_exists, Chinese_G2P_DIR, English_G2P_DIR
30
+ from .Core.TTSPlayer import tts_player
31
+ from .ModelManager import model_manager
32
+ from .Utils.Shared import context
33
+ from .Utils.Language import normalize_language
34
+ from .PredefinedCharacter import download_chara, CHARA_LANG, CHARA_ALIAS_MAP
35
+
36
+ # A module-level private dictionary to store reference audio configurations.
37
+ _reference_audios: Dict[str, dict] = {}
38
+ SUPPORTED_AUDIO_EXTS = {'.wav', '.flac', '.ogg', '.aiff', '.aif'}
39
+
40
+
41
def check_onnx_model_dir(onnx_model_dir: Union[str, os.PathLike]) -> None:
    """
    Validate that *onnx_model_dir* holds the ONNX files Genie TTS needs (v2 or v2ProPlus).

    Raises:
        FileNotFoundError: If the directory is missing, or if any required base
            file is absent; the message lists the missing files and the full
            expected layout.
    """
    dir_path = Path(onnx_model_dir)

    # The directory itself must exist before its contents can be inspected.
    if not (dir_path.exists() and dir_path.is_dir()):
        raise FileNotFoundError(f"The model directory '{onnx_model_dir}' does not exist or is not a directory.")

    # Files shared by every supported model variant (v2 and v2ProPlus alike).
    required = {
        "t2s_encoder_fp32.bin",
        "t2s_encoder_fp32.onnx",
        "t2s_first_stage_decoder_fp32.onnx",
        "t2s_shared_fp16.bin",
        "t2s_stage_decoder_fp32.onnx",
        "vits_fp16.bin",
        "vits_fp32.onnx",
    }

    present = {entry.name for entry in dir_path.iterdir() if entry.is_file()}

    # Without the base files the model is unusable regardless of variant, so
    # only the base set is enforced here; v2ProPlus extras are merely listed.
    missing = required - present
    if missing:
        error_msg = (
            f"\n\n[Genie Error] Invalid ONNX model directory: '{dir_path}'\n"
            "===============================================================\n"
            f"Missing base files: {', '.join(missing)}\n"
            "A valid model folder must contain at least the following files.\n"
            "1. [v2 Base] (Required for all models):\n"
            "   - t2s_encoder_fp32.bin\n"
            "   - t2s_encoder_fp32.onnx\n"
            "   - t2s_first_stage_decoder_fp32.onnx\n"
            "   - t2s_shared_fp16.bin\n"
            "   - t2s_stage_decoder_fp32.onnx\n"
            "   - vits_fp16.bin\n"
            "   - vits_fp32.onnx\n"
            "2. [v2ProPlus Additions] (Required for v2pp features):\n"
            "   - prompt_encoder_fp16.bin\n"
            "   - prompt_encoder_fp32.onnx\n"
            "===============================================================\n"
        )
        raise FileNotFoundError(error_msg)
92
+
93
+
94
def load_character(
    character_name: str,
    onnx_model_dir: Union[str, PathLike],
    language: str,
) -> None:
    """
    Loads a character model from an ONNX model directory.

    Args:
        character_name (str): The name to assign to the loaded character.
        onnx_model_dir (str | PathLike): The directory path containing the ONNX model files.
        language (str): The language of the character model.

    Raises:
        FileNotFoundError: If the model directory fails validation.
        ValueError: If the language does not normalize to Japanese/English/Chinese.
    """
    check_onnx_model_dir(onnx_model_dir)

    normalized = normalize_language(language)
    if normalized not in ('Japanese', 'English', 'Chinese'):
        raise ValueError('Unknown language')

    # Chinese and English require their on-disk G2P resources to be present.
    g2p_resources = {
        'Chinese': (Chinese_G2P_DIR, "Chinese_G2P_DIR"),
        'English': (English_G2P_DIR, "English_G2P_DIR"),
    }
    if normalized in g2p_resources:
        ensure_exists(*g2p_resources[normalized])

    model_manager.load_character(
        character_name=character_name,
        model_dir=os.fspath(onnx_model_dir),
        language=normalized,
    )
124
+
125
+
126
def unload_character(
    character_name: str,
) -> None:
    """
    Release a previously loaded character model and free its resources.

    Args:
        character_name (str): The name under which the character was loaded.
    """
    model_manager.remove_character(character_name=character_name)
138
+
139
+
140
def set_reference_audio(
    character_name: str,
    audio_path: Union[str, PathLike],
    audio_text: str,
    language: Optional[str] = None,
) -> None:
    """
    Sets the reference audio for a character to be used for voice cloning.

    This must be called for a character before using 'tts' or 'tts_async'.

    Args:
        character_name (str): The name of the character.
        audio_path (str | PathLike): The file path to the reference audio (e.g., a WAV file).
        audio_text (str): The transcript of the reference audio.
        language (str, optional): The language of the reference audio. If None,
            falls back to the language of the character's loaded model.

    Raises:
        ValueError: If no language is given and the character's model is not
            loaded, or if the language is not Japanese/English/Chinese.
    """
    audio_path: str = os.fspath(audio_path)

    # Reject file formats the audio backend does not support.
    ext = os.path.splitext(audio_path)[1].lower()
    if ext not in SUPPORTED_AUDIO_EXTS:
        logger.error(
            f"Audio format '{ext}' is not supported. Only the following formats are supported: {SUPPORTED_AUDIO_EXTS}"
        )
        return

    # Fall back to the loaded model's language when none is given explicitly.
    if language is None:
        gsv_model = model_manager.get(character_name)
        if gsv_model:
            language = gsv_model.LANGUAGE
        else:
            raise ValueError('No language specified')
    language = normalize_language(language)
    if language not in ['Japanese', 'English', 'Chinese']:
        raise ValueError('Unknown language')

    _reference_audios[character_name] = {
        'audio_path': audio_path,
        'audio_text': audio_text,
        'language': language,
    }
    # Make this reference the active prompt for subsequent synthesis calls.
    context.current_prompt_audio = ReferenceAudio(
        prompt_wav=audio_path,
        prompt_text=audio_text,
        language=language,
    )
188
+
189
+
190
async def tts_async(
    character_name: str,
    text: str,
    play: bool = False,
    split_sentence: bool = False,
    save_path: Union[str, PathLike, None] = None,
    text_language: Optional[str] = None,  # Target text language, for cross-lingual TTS.
) -> AsyncIterator[bytes]:
    """
    Asynchronously generates speech from text and yields audio chunks.

    This function returns an async iterator that provides the audio data in
    real-time as it's being generated.

    Args:
        character_name (str): The name of the character to use for synthesis.
        text (str): The text to be synthesized into speech.
        play (bool, optional): If True, plays the audio as it's generated. Defaults to False.
        split_sentence (bool, optional): If True, splits the text into sentences for synthesis. Defaults to False.
        save_path (str | PathLike | None, optional): If provided, saves the generated audio to this file path. Defaults to None.
        text_language (str, optional): Language of the target text. If None, uses the reference audio language.

    Yields:
        bytes: A chunk of the generated audio data.

    Raises:
        ValueError: If 'set_reference_audio' has not been called for the character.
    """
    if character_name not in _reference_audios:
        raise ValueError("Please call 'set_reference_audio' first to set the reference audio.")

    if save_path:
        save_path = os.fspath(save_path)
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # 1. Create the asyncio queue and capture the running event loop; a None
    #    item on the queue acts as the end-of-stream sentinel.
    stream_queue: asyncio.Queue[Union[bytes, None]] = asyncio.Queue()
    loop = asyncio.get_running_loop()

    # 2. Callback that safely hands data from the TTS worker thread to asyncio.
    def tts_chunk_callback(c: Optional[bytes]):
        """This callback is called from the TTS worker thread."""
        # call_soon_threadsafe is required: this runs off the event-loop thread.
        loop.call_soon_threadsafe(stream_queue.put_nowait, c)

    # Set up the TTS context for this request.
    context.current_speaker = character_name
    context.current_prompt_audio = ReferenceAudio(
        prompt_wav=_reference_audios[character_name]['audio_path'],
        prompt_text=_reference_audios[character_name]['audio_text'],
        language=_reference_audios[character_name]['language'],
    )
    # Target text language for cross-lingual TTS (None = use the reference language).
    context.current_text_language = normalize_language(text_language) if text_language else None

    # 3. Start the TTS session through the chunk-callback interface.
    tts_player.start_session(
        play=play,
        split=split_sentence,
        save_path=save_path,
        chunk_callback=tts_chunk_callback,
    )

    # Feed the text and signal the end of the session.
    tts_player.feed(text)
    tts_player.end_session()

    # 4. Asynchronously drain the queue, yielding chunks until the sentinel.
    while True:
        chunk = await stream_queue.get()
        if chunk is None:
            break
        yield chunk
264
+
265
+
266
def tts(
    character_name: str,
    text: str,
    play: bool = False,
    split_sentence: bool = True,
    save_path: Union[str, PathLike, None] = None,
    text_language: Optional[str] = None,  # Target text language, for cross-lingual TTS.
) -> None:
    """
    Synchronously generates speech from text.

    This is a blocking function that will not return until the entire TTS
    process is complete.

    Args:
        character_name (str): The name of the character to use for synthesis.
        text (str): The text to be synthesized into speech.
        play (bool, optional): If True, plays the audio.
        split_sentence (bool, optional): If True, splits the text into sentences for synthesis.
        save_path (str | PathLike | None, optional): If provided, saves the generated audio to this file path. Defaults to None.
        text_language (str, optional): Language of the target text. If None, uses the reference audio language.
    """
    if character_name not in _reference_audios:
        logger.error("Please call 'set_reference_audio' first to set the reference audio.")
        return

    if save_path:
        save_path = os.fspath(save_path)
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Set up the TTS context for this request.
    context.current_speaker = character_name
    context.current_prompt_audio = ReferenceAudio(
        prompt_wav=_reference_audios[character_name]['audio_path'],
        prompt_text=_reference_audios[character_name]['audio_text'],
        language=_reference_audios[character_name]['language'],
    )
    # Target text language for cross-lingual TTS (None = use the reference language).
    context.current_text_language = normalize_language(text_language) if text_language else None

    tts_player.start_session(
        play=play,
        split=split_sentence,
        save_path=save_path,
    )
    tts_player.feed(text)
    tts_player.end_session()
    # Block until all queued synthesis work has finished.
    tts_player.wait_for_tts_completion()
315
+
316
+
317
def wait_for_playback_done() -> None:
    """Block until every pending TTS task has finished and playback is fully complete."""
    tts_player.wait_for_playback_done()
322
+
323
+
324
def stop() -> None:
    """Interrupt and stop any text-to-speech audio that is currently playing."""
    tts_player.stop()
329
+
330
+
331
def convert_to_onnx(
    torch_ckpt_path: Union[str, PathLike],
    torch_pth_path: Union[str, PathLike],
    output_dir: Union[str, PathLike],
) -> None:
    """
    Converts PyTorch model checkpoints to the ONNX format.

    This function requires PyTorch to be installed.

    Args:
        torch_ckpt_path (str | PathLike): The path to the T2S model (.ckpt) file.
        torch_pth_path (str | PathLike): The path to the VITS model (.pth) file.
        output_dir (str | PathLike): The directory where the ONNX models will be saved.
    """
    # torch is an optional dependency; bail out with a clear message if absent.
    try:
        import torch  # noqa: F401
    except ImportError:
        logger.error("❌ PyTorch is not installed. Please run `pip install torch` first.")
        return

    # Imported lazily so the package works without the converter's dependencies.
    from .Converter.Converter import convert

    convert(
        torch_pth_path=os.fspath(torch_pth_path),
        torch_ckpt_path=os.fspath(torch_ckpt_path),
        output_dir=os.fspath(output_dir),
    )
363
+
364
+
365
def clear_reference_audio_cache() -> None:
    """Drop all cached reference-audio data held by ReferenceAudio."""
    ReferenceAudio.clear_cache()
370
+
371
+
372
def load_predefined_character(character_name: str) -> None:
    """
    Download and load a predefined character model for TTS inference.

    Resolves *character_name* through the alias table, downloads the model
    bundle, loads it, and registers the bundled "Normal" prompt recording as
    the character's reference audio.
    """
    key = character_name.lower().strip()
    if key not in CHARA_ALIAS_MAP:
        logger.error(f"No predefined character model found for {key}")
        return
    canonical = CHARA_ALIAS_MAP[key]
    lang = CHARA_LANG[canonical]

    save_path = download_chara(canonical)
    model_manager.load_character(
        character_name=canonical,
        model_dir=os.path.join(save_path, 'tts_models'),
        language=lang,
    )

    # The bundle ships a manifest mapping prompt styles to wav files and transcripts.
    with open(os.path.join(save_path, "prompt_wav.json"), "r", encoding="utf-8") as f:
        prompt_wav_dict: Dict[str, Dict[str, str]] = json.load(f)

    normal_entry = prompt_wav_dict["Normal"]
    audio_path = os.path.join(save_path, "prompt_wav", normal_entry["wav"])
    _reference_audios[canonical] = {
        'audio_path': audio_path,
        'audio_text': normal_entry["text"],
        'language': lang,
    }
    # Make this character's prompt the active one for subsequent synthesis.
    context.current_prompt_audio = ReferenceAudio(
        prompt_wav=audio_path,
        prompt_text=normal_entry["text"],
        language=lang,
    )
genie_tts/Utils/Shared.py CHANGED
@@ -1,13 +1,14 @@
1
- from typing import TYPE_CHECKING, Optional
2
-
3
- if TYPE_CHECKING:
4
- from ..Audio.ReferenceAudio import ReferenceAudio
5
-
6
-
7
- class Context:
8
- def __init__(self):
9
- self.current_speaker: str = ''
10
- self.current_prompt_audio: Optional['ReferenceAudio'] = None
11
-
12
-
13
- context: Context = Context()
 
 
1
+ from typing import TYPE_CHECKING, Optional
2
+
3
+ if TYPE_CHECKING:
4
+ from ..Audio.ReferenceAudio import ReferenceAudio
5
+
6
+
7
class Context:
    """Mutable per-process TTS state shared across the public API functions."""

    def __init__(self):
        # Name of the character currently selected for synthesis.
        self.current_speaker: str = ''
        # Reference audio driving voice cloning; None until one is set.
        self.current_prompt_audio: Optional['ReferenceAudio'] = None
        # Target text language for cross-lingual TTS; None means "use the reference language".
        self.current_text_language: Optional[str] = None


# Module-level singleton shared by the whole package.
context: Context = Context()