""" ONNX-based TTS Gradio Application for Japanese PyTorch-free implementation using ONNX Runtime """ import glob import os import tempfile from time import perf_counter from typing import Optional import gradio as gr import numpy as np import onnxruntime as ort import pyopenjtalk import soundfile as sf try: import spaces except ImportError: class spaces: @staticmethod def GPU(func): return func # ============================================================================ # Configuration # ============================================================================ # Get script directory SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) MODELS_DIR = os.path.join(SCRIPT_DIR, "models") DEFAULT_MODEL = "g003_ep5709.onnx" MODEL_PATH = os.getenv("MODEL_PATH", os.path.join(MODELS_DIR, DEFAULT_MODEL)) VOCODER_PATH = os.getenv("VOCODER_PATH", None) USE_GPU = os.getenv("USE_GPU", "false").lower() == "true" SAMPLE_RATE = 22050 DEBUG = os.getenv("DEBUG", "false").lower() == "true" def get_available_models(): """Get list of available ONNX models from models directory""" if not os.path.exists(MODELS_DIR): return [DEFAULT_MODEL] models = glob.glob(os.path.join(MODELS_DIR, "*.onnx")) model_names = [os.path.basename(m) for m in models] if not model_names: return [DEFAULT_MODEL] return sorted(model_names) # ============================================================================ # Text Processing (PyTorch-free) # ============================================================================ # Load symbols from matcha _pad = "_" _punctuation = ';:,.!?¡¿—…"«»"" ' _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) _symbol_to_id = {s: i for i, s in enumerate(symbols)} def text_to_sequence(text): """Convert text to sequence of IDs""" sequence = [] for symbol in 
text: if symbol in _symbol_to_id: sequence.append(_symbol_to_id[symbol]) else: sequence.append(0) # Unknown symbol return sequence def intersperse(sequence, token): """Intersperse token between elements of sequence""" result = [token] * (len(sequence) * 2 + 1) result[1::2] = sequence return result def process_japanese_text(text: str): """Process Japanese text to phoneme sequence""" if not text.strip(): raise ValueError("Text cannot be empty") # Phonemize using pyopenjtalk phonemes = pyopenjtalk.g2p(text, kana=False) phonemes = phonemes.replace(" ", "") phonemes = phonemes.replace("pau", " ") if DEBUG: print(f"Input: {text}") print(f"Phonemes: {phonemes}") # Text to sequence sequence = text_to_sequence(phonemes) # Intersperse with padding sequence = intersperse(sequence, 0) # Convert to numpy x = np.array(sequence, dtype=np.int64)[np.newaxis, :] x_lengths = np.array([x.shape[-1]], dtype=np.int64) return x, x_lengths # ============================================================================ # ONNX Model Manager # ============================================================================ class ONNXModelManager: """Manages ONNX model loading and inference""" def __init__(self, model_path: str, vocoder_path: Optional[str] = None, use_gpu: bool = False): self.model_path = model_path self.vocoder_path = vocoder_path self.use_gpu = use_gpu # Select execution providers if use_gpu: self.providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] else: self.providers = ["CPUExecutionProvider"] self.model = None self.vocoder = None self.is_multi_speaker = False self.has_vocoder_embedded = False self._load_model() def _load_model(self): """Load ONNX model(s)""" if DEBUG: print(f"Loading model from {self.model_path} with providers {self.providers}") self.model = ort.InferenceSession(self.model_path, providers=self.providers) model_inputs = self.model.get_inputs() model_outputs = list(self.model.get_outputs()) self.is_multi_speaker = len(model_inputs) == 4 
self.has_vocoder_embedded = model_outputs[0].name == "wav" if DEBUG: print(f"Model loaded: multi_speaker={self.is_multi_speaker}, " f"vocoder_embedded={self.has_vocoder_embedded}") # Load external vocoder if needed if not self.has_vocoder_embedded and self.vocoder_path: if DEBUG: print(f"Loading external vocoder from {self.vocoder_path}") self.vocoder = ort.InferenceSession(self.vocoder_path, providers=self.providers) def synthesize( self, x: np.ndarray, x_lengths: np.ndarray, scales: np.ndarray, spks: Optional[np.ndarray] = None ): """Run ONNX inference""" inputs = { "x": x, "x_lengths": x_lengths, "scales": scales, } if self.is_multi_speaker and spks is not None: inputs["spks"] = spks # Run Matcha inference outputs = self.model.run(None, inputs) if self.has_vocoder_embedded: # End-to-end: model outputs waveform directly return outputs[0], outputs[1] # wav, wav_lengths else: # Model outputs mel spectrogram mels, mel_lengths = outputs[0], outputs[1] if self.vocoder is not None: # Run external vocoder vocoder_inputs = {self.vocoder.get_inputs()[0].name: mels} wavs = self.vocoder.run(None, vocoder_inputs)[0] wavs = wavs.squeeze(1) wav_lengths = mel_lengths * 256 return wavs, wav_lengths else: # No vocoder available, return mel return mels, mel_lengths # Initialize model managers (one per model) model_managers = {} current_model = None def get_model_manager(model_name: str) -> ONNXModelManager: """Get or create model manager for specified model""" global model_managers, current_model model_path = os.path.join(MODELS_DIR, model_name) if model_name not in model_managers: if DEBUG: print(f"Loading new model: {model_name}") model_managers[model_name] = ONNXModelManager( model_path=model_path, vocoder_path=VOCODER_PATH, use_gpu=USE_GPU ) current_model = model_name return model_managers[model_name] # Pre-load all available models if DEBUG: print("Pre-loading all models for ZeroGPU...") for model_name in get_available_models(): get_model_manager(model_name) if DEBUG: 
print("All models loaded.") # ============================================================================ # Gradio Interface Functions # ============================================================================ @spaces.GPU def synthesise( text: str, model_name: str, speaker_id: int, temperature: float, speaking_rate: float, ): """ Synthesize speech from Japanese text Args: text: Japanese text input model_name: Model filename speaker_id: Speaker ID (for multi-speaker models) temperature: Sampling temperature speaking_rate: Speaking rate multiplier Returns: Tuple of (audio_path, phonemes_text) """ t0 = perf_counter() try: # Get model manager manager = get_model_manager(model_name) # Process text x, x_lengths = process_japanese_text(text) # Prepare scales scales = np.array([temperature, speaking_rate], dtype=np.float32) # Prepare speaker ID spks = None if manager.is_multi_speaker and speaker_id >= 0: spks = np.array([speaker_id], dtype=np.int64) # Run inference outputs, output_lengths = manager.synthesize(x, x_lengths, scales, spks) # Extract single result audio = outputs[0][:output_lengths[0]] inference_time = perf_counter() - t0 # Calculate RTF audio_duration_sec = len(audio) / SAMPLE_RATE rtf = inference_time / audio_duration_sec if DEBUG: print(f"Inference time: {inference_time:.3f}s, " f"Audio duration: {audio_duration_sec:.3f}s, " f"RTF: {rtf:.3f}") # Save to temporary file with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp: sf.write(fp.name, audio, SAMPLE_RATE, "PCM_24") audio_path = fp.name # Get phonemes for display phonemes = pyopenjtalk.g2p(text, kana=False) phonemes = phonemes.replace(" ", "") phonemes = phonemes.replace("pau", " ") info = f"Model: {model_name}\n" info += f"Speaker ID: {speaker_id if manager.is_multi_speaker else 'N/A (Single speaker)'}\n" info += f"Phonemes: {phonemes}\n" info += f"RTF: {rtf:.3f}" return audio_path, info except Exception as e: print(f"Error: {e}") raise # 
# ============================================================================
# Gradio Application
# ============================================================================

def create_gradio_interface():
    """Build and return the Gradio Blocks UI.

    Wires the model dropdown, text input, speaker gallery, sliders, and
    buttons to the module-level ``synthesise`` function.
    """
    # Get available models for the dropdown.
    available_models = get_available_models()

    # Load speaker thumbnails (imgs/0.webp, imgs/1.webp, ...) if present.
    imgs_dir = os.path.join(SCRIPT_DIR, "imgs")
    speaker_images = []
    if os.path.exists(imgs_dir):
        # Sort by numerical filename (0.webp, 1.webp, ...)
        image_files = sorted(
            glob.glob(os.path.join(imgs_dir, "*.webp")),
            key=lambda x: int(os.path.splitext(os.path.basename(x))[0])
        )
        speaker_images = [
            (img, f"Speaker {os.path.splitext(os.path.basename(img))[0]}")
            for img in image_files
        ]

    with gr.Blocks(
        title="AI Gaming Voice",
    ) as demo:
        gr.Markdown(
            """
            # AI Gaming Voice - 🍵 Matcha-TTS ONNX (Japanese) / 日本語
            ### 6 Voices - 140MB or 42MB(Qint8 but slow)
            Japanese Text-to-Speech.(Half-width alphanumeric characters are not supported. Please correct/fix it.)
            日本語音声合成です。(半角・英数字は未対応・直してください。)
            """
        )

        with gr.Row():
            with gr.Column():
                # Model Selection
                model_dropdown = gr.Dropdown(
                    label="モデル / Model",
                    choices=available_models,
                    value=DEFAULT_MODEL if DEFAULT_MODEL in available_models else available_models[0],
                    interactive=True
                )

                text_input = gr.Textbox(
                    label="日本語テキスト / Japanese Text",
                    value="こんにちは、世界!",
                    lines=3,
                    placeholder="日本語のテキストを入力してください..."
                )

                # Speaker Selection Gallery (only when thumbnails exist).
                if speaker_images:
                    gr.Markdown("### 話者選択 / Select Speaker")
                    speaker_gallery = gr.Gallery(
                        value=speaker_images,
                        label="話者 / Speakers",
                        show_label=False,
                        columns=6,
                        rows=1,
                        height=160,
                        allow_preview=False,
                        interactive=False,
                        object_fit="cover",
                        elem_id="speaker_gallery"
                    )

                # Speaker ID (manual entry; also set by clicking the gallery).
                speaker_id = gr.Number(
                    label="Speaker ID (スピーカーID)",
                    value=0,
                    minimum=0,
                    maximum=99,
                    precision=0,
                    info="上の画像をタップするか、数値を入力してください"
                )

                with gr.Row():
                    temperature = gr.Slider(
                        label="Temperature (温度)",
                        minimum=0.0,
                        maximum=1.0,
                        step=0.01,
                        value=0.667,
                        info="サンプリングのランダム性"
                    )
                    speaking_rate = gr.Slider(
                        label="Speaking Rate (話速)",
                        minimum=0.1,
                        maximum=5.0,
                        step=0.1,
                        value=1.0,
                        info="1.0 = 標準速度"
                    )

                with gr.Row():
                    synthesise_btn = gr.Button(
                        "🎵 音声生成 / Synthesize",
                        variant="primary",
                        size="lg"
                    )
                    clear_btn = gr.Button(
                        "クリア / Clear",
                        variant="secondary"
                    )

            with gr.Column():
                audio_output = gr.Audio(
                    label="生成音声 / Generated Audio",
                    type="filepath"
                )
                info_output = gr.Textbox(
                    label="情報 / Information",
                    lines=5,
                    interactive=False
                )

        # Examples
        gr.Examples(
            examples=[
                ["こんにちは、世界!", "g003_ep5709.onnx", 0, 0.667, 1.0],
                ["エイアイゲーミングボイス", "g003_ep5709.onnx", 0, 0.667, 0.8],
                ["わたくしの名前はストラよ", "g003_ep5709.onnx", 0, 0.667, 1.0],
                ["わたしの名前はシムですよ", "g003_ep5709.onnx", 1, 0.667, 1.0],
                ["わたしはナラともうします", "g003_ep5709.onnx", 2, 0.667, 1.0],
                ["わたし、ロールプリンよ!", "g003_ep5709.onnx", 3, 0.667, 1.0],
                ["僕の名前はショーンだよ", "g003_ep5709.onnx", 4, 0.667, 1.0],
                ["私の名前はありません", "g003_ep5709.onnx", 5, 0.667, 1.0],
            ],
            inputs=[text_input, model_dropdown, speaker_id, temperature, speaking_rate],
            label="例文 / Examples"
        )

        # Event handlers

        # Gallery click: set the speaker ID, then immediately synthesize.
        if speaker_images:
            def on_gallery_select(evt: gr.SelectData):
                # The gallery index doubles as the speaker ID.
                return evt.index

            speaker_gallery.select(
                fn=on_gallery_select,
                inputs=None,
                outputs=speaker_id
            ).then(
                fn=synthesise,
                inputs=[text_input, model_dropdown, speaker_id, temperature, speaking_rate],
                outputs=[audio_output, info_output]
            )

        synthesise_btn.click(
            fn=synthesise,
            inputs=[text_input, model_dropdown, speaker_id, temperature, speaking_rate],
            outputs=[audio_output, info_output]
        )

        # BUG FIX: the handler must return exactly one value per output
        # component; the original lambda returned 3 values for 2 outputs,
        # which makes Gradio raise on every Clear click.
        clear_btn.click(
            fn=lambda: (None, ""),
            outputs=[audio_output, info_output]
        )

        gr.Markdown(
            """
            ---
            ### ℹ️ Information / 情報
            - **Model / モデル**: Matcha-TTS (ONNX)
            - **Inference / 推論**: ONNX Runtime
            - **Phonemizer / 音素化**: `pyopenjtalk`
            - **ZeroGPU**: Optimized for fast startup & inference / 高速起動・推論に最適化

            ### 🗣️ Speaker Selection / 話者選択
            - **Click Image / 画像クリック**: Selects speaker & generates audio / 話者を選択して音声を生成
            - **Speaker ID**: Manual input also supported / 手動入力も可能

            ### FAQ
            **Why AI Gaming Voice?**
            - I have a plan to support another ONNX models.

            **Model Difference**
            - **qint8**: 1/3 size but slow.

            **How to create my voice**
            - [Github](https://github.com/akjava/Matcha-TTS-Japanese) - I'll update here.

            **Model**
            - [Huggingface:matcha-tts_ja_100speakers_group003f-CL-V2](https://huggingface.co/Akjava/matcha-tts_ja_100speakers_group003f-CL-V2)

            **Who are they?**
            - [Youtube:4 of them are member of AI Gaming Circle](https://www.youtube.com/@ai-gaming-circle)
            """
        )

    return demo


# ============================================================================
# Main
# ============================================================================

if __name__ == "__main__":
    demo = create_gradio_interface()
    # NOTE(review): binds to all interfaces on port 7860 — intended for
    # container/Spaces deployment.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )