Spaces:
Running
on
Zero
Running
on
Zero
| """ | |
| ONNX-based TTS Gradio Application for Japanese | |
| PyTorch-free implementation using ONNX Runtime | |
| """ | |
| import glob | |
| import os | |
| import tempfile | |
| from time import perf_counter | |
| from typing import Optional | |
| import gradio as gr | |
| import numpy as np | |
| import onnxruntime as ort | |
| import pyopenjtalk | |
| import soundfile as sf | |
try:
    import spaces  # real package is available on Hugging Face ZeroGPU Spaces
except ImportError:
    # Local fallback: provide a no-op stand-in so `@spaces.GPU` (bare or
    # parameterized, e.g. `@spaces.GPU(duration=120)`) simply returns the
    # wrapped function unchanged.
    class spaces:
        @staticmethod
        def GPU(func=None, **kwargs):
            """No-op replacement for spaces.GPU when the package is absent."""
            if callable(func):
                # Used as a bare decorator: @spaces.GPU
                return func
            # Used with arguments: @spaces.GPU(duration=...) -> decorator
            return lambda f: f
# ============================================================================
# Configuration
# ============================================================================
# Resolve paths relative to this script so the app works regardless of CWD.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
MODELS_DIR = os.path.join(SCRIPT_DIR, "models")
# Model served when no override is given and/or the models dir is empty.
DEFAULT_MODEL = "g003_ep5709.onnx"
# All of these can be overridden via environment variables.
MODEL_PATH = os.getenv("MODEL_PATH", os.path.join(MODELS_DIR, DEFAULT_MODEL))
# Optional path to an external vocoder ONNX model (used only when the
# acoustic model outputs mel spectrograms rather than waveforms).
VOCODER_PATH = os.getenv("VOCODER_PATH", None)
# USE_GPU=true requests the CUDA execution provider for ONNX Runtime.
USE_GPU = os.getenv("USE_GPU", "false").lower() == "true"
# Output sample rate in Hz — assumed to match the model's training rate.
SAMPLE_RATE = 22050
# DEBUG=true enables verbose logging to stdout.
DEBUG = os.getenv("DEBUG", "false").lower() == "true"
def get_available_models():
    """Return a sorted list of ONNX model filenames in MODELS_DIR.

    Falls back to ``[DEFAULT_MODEL]`` when the directory does not exist
    or contains no ``*.onnx`` files.
    """
    if os.path.exists(MODELS_DIR):
        names = sorted(
            os.path.basename(path)
            for path in glob.glob(os.path.join(MODELS_DIR, "*.onnx"))
        )
        if names:
            return names
    return [DEFAULT_MODEL]
# ============================================================================
# Text Processing (PyTorch-free)
# ============================================================================
# Symbol inventory mirrored from Matcha-TTS: one padding token, punctuation,
# ASCII letters, and IPA phoneme characters. The symbol *order* defines the
# integer IDs the ONNX model was trained with, so it must not change.
_pad = "_"
_punctuation = ';:,.!?¡¿—…"«»"" '
_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
# Reverse lookup: symbol character -> integer ID (pad is ID 0).
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
def text_to_sequence(text):
    """Map each character of *text* to its symbol ID.

    Characters not present in the symbol table map to 0 (the pad index),
    matching the original "unknown symbol" behavior.
    """
    # dict.get with a default replaces the membership-test/append loop.
    return [_symbol_to_id.get(symbol, 0) for symbol in text]
def intersperse(sequence, token):
    """Return a new list with *token* before, between, and after all items.

    For an input of length n the result has length 2n + 1, e.g.
    [a, b] -> [token, a, token, b, token].
    """
    interleaved = [token]
    for item in sequence:
        interleaved.append(item)
        interleaved.append(token)
    return interleaved
def process_japanese_text(text: str):
    """Convert Japanese text into model-ready phoneme ID arrays.

    Returns:
        (x, x_lengths): int64 arrays of shape (1, T) and (1,).

    Raises:
        ValueError: if *text* is empty or whitespace-only.
    """
    if not text.strip():
        raise ValueError("Text cannot be empty")
    # Grapheme-to-phoneme via pyopenjtalk; strip the inter-phoneme spaces,
    # then render pause markers ("pau") as literal spaces for the symbol table.
    phonemes = pyopenjtalk.g2p(text, kana=False).replace(" ", "").replace("pau", " ")
    if DEBUG:
        print(f"Input: {text}")
        print(f"Phonemes: {phonemes}")
    # IDs with the pad token (0) interleaved, as the model expects.
    ids = intersperse(text_to_sequence(phonemes), 0)
    x = np.array(ids, dtype=np.int64)[np.newaxis, :]
    x_lengths = np.array([x.shape[-1]], dtype=np.int64)
    return x, x_lengths
| # ============================================================================ | |
| # ONNX Model Manager | |
| # ============================================================================ | |
class ONNXModelManager:
    """Manages ONNX model loading and inference.

    Wraps an ONNX Runtime session for a Matcha-TTS acoustic model and,
    when that model emits mel spectrograms instead of waveforms, an
    optional second session for an external vocoder.
    """
    def __init__(self, model_path: str, vocoder_path: Optional[str] = None, use_gpu: bool = False):
        self.model_path = model_path
        self.vocoder_path = vocoder_path
        self.use_gpu = use_gpu
        # Select execution providers. Listing CPU after CUDA lets ORT fall
        # back when a GPU provider is not actually available.
        if use_gpu:
            self.providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        else:
            self.providers = ["CPUExecutionProvider"]
        self.model = None              # acoustic-model InferenceSession
        self.vocoder = None            # external vocoder InferenceSession, if any
        self.is_multi_speaker = False  # set in _load_model from input count
        self.has_vocoder_embedded = False  # set in _load_model from output name
        self._load_model()
    def _load_model(self):
        """Load ONNX model(s) and detect the graph's I/O layout."""
        if DEBUG:
            print(f"Loading model from {self.model_path} with providers {self.providers}")
        self.model = ort.InferenceSession(self.model_path, providers=self.providers)
        model_inputs = self.model.get_inputs()
        model_outputs = list(self.model.get_outputs())
        # Heuristics on the exported graph:
        # - 4 inputs -> (x, x_lengths, scales, spks), i.e. multi-speaker.
        # - first output named "wav" -> vocoder is baked into the graph.
        self.is_multi_speaker = len(model_inputs) == 4
        self.has_vocoder_embedded = model_outputs[0].name == "wav"
        if DEBUG:
            print(f"Model loaded: multi_speaker={self.is_multi_speaker}, "
                  f"vocoder_embedded={self.has_vocoder_embedded}")
        # Load external vocoder only when the model itself outputs mels.
        if not self.has_vocoder_embedded and self.vocoder_path:
            if DEBUG:
                print(f"Loading external vocoder from {self.vocoder_path}")
            self.vocoder = ort.InferenceSession(self.vocoder_path, providers=self.providers)
    def synthesize(
        self,
        x: np.ndarray,
        x_lengths: np.ndarray,
        scales: np.ndarray,
        spks: Optional[np.ndarray] = None
    ):
        """Run ONNX inference.

        Args:
            x: phoneme ID array, shape (1, T), int64.
            x_lengths: sequence length array, shape (1,), int64.
            scales: [temperature, speaking_rate], float32.
            spks: optional speaker ID array (multi-speaker models only).

        Returns:
            (wavs, wav_lengths) when a waveform can be produced, otherwise
            (mels, mel_lengths) when no vocoder is available.
        """
        inputs = {
            "x": x,
            "x_lengths": x_lengths,
            "scales": scales,
        }
        if self.is_multi_speaker and spks is not None:
            inputs["spks"] = spks
        # Run Matcha inference
        outputs = self.model.run(None, inputs)
        if self.has_vocoder_embedded:
            # End-to-end: model outputs waveform directly
            return outputs[0], outputs[1]  # wav, wav_lengths
        else:
            # Model outputs mel spectrogram
            mels, mel_lengths = outputs[0], outputs[1]
            if self.vocoder is not None:
                # Run external vocoder on the mels.
                vocoder_inputs = {self.vocoder.get_inputs()[0].name: mels}
                wavs = self.vocoder.run(None, vocoder_inputs)[0]
                # squeeze(1): drop what is presumably the channel axis
                # (N, 1, T) -> (N, T) — TODO confirm against vocoder export.
                wavs = wavs.squeeze(1)
                # 256 looks like the vocoder hop length (mel frames ->
                # samples); verify it matches the exported vocoder config.
                wav_lengths = mel_lengths * 256
                return wavs, wav_lengths
            else:
                # No vocoder available, return mel
                return mels, mel_lengths
# Module-level cache of loaded managers, keyed by model filename, plus the
# name of the most recently requested model.
model_managers = {}
current_model = None
def get_model_manager(model_name: str) -> ONNXModelManager:
    """Return the cached ONNXModelManager for *model_name*, loading it on first use."""
    # Only `current_model` is rebound; the cache dict is merely mutated.
    global current_model
    if model_name not in model_managers:
        if DEBUG:
            print(f"Loading new model: {model_name}")
        model_managers[model_name] = ONNXModelManager(
            model_path=os.path.join(MODELS_DIR, model_name),
            vocoder_path=VOCODER_PATH,
            use_gpu=USE_GPU,
        )
    current_model = model_name
    return model_managers[model_name]
# Pre-load every model found on disk so the first request is fast
# (ZeroGPU cold-start optimization). get_available_models() can return
# DEFAULT_MODEL even when no file exists on disk, so a failed load must
# not abort app startup — skip it with a warning instead.
if DEBUG:
    print("Pre-loading all models for ZeroGPU...")
for model_name in get_available_models():
    try:
        get_model_manager(model_name)
    except Exception as e:  # e.g. missing file or invalid ONNX graph
        print(f"Warning: failed to pre-load model {model_name}: {e}")
if DEBUG:
    print("All models loaded.")
| # ============================================================================ | |
| # Gradio Interface Functions | |
| # ============================================================================ | |
def synthesise(
    text: str,
    model_name: str,
    speaker_id: int,
    temperature: float,
    speaking_rate: float,
):
    """Synthesize speech from Japanese text.

    Args:
        text: Japanese text input.
        model_name: Model filename.
        speaker_id: Speaker ID (for multi-speaker models).
        temperature: Sampling temperature.
        speaking_rate: Speaking rate multiplier.

    Returns:
        Tuple of (audio_path, info_text) for the Gradio outputs.
    """
    start = perf_counter()
    try:
        manager = get_model_manager(model_name)
        x, x_lengths = process_japanese_text(text)
        scales = np.array([temperature, speaking_rate], dtype=np.float32)
        # Speaker conditioning only applies to multi-speaker graphs.
        spks = (
            np.array([speaker_id], dtype=np.int64)
            if manager.is_multi_speaker and speaker_id >= 0
            else None
        )
        waves, wave_lengths = manager.synthesize(x, x_lengths, scales, spks)
        # Single-item batch: trim padding off the first (only) result.
        audio = waves[0][:wave_lengths[0]]
        elapsed = perf_counter() - start
        # Real-time factor: inference seconds per second of audio produced.
        duration_sec = len(audio) / SAMPLE_RATE
        rtf = elapsed / duration_sec
        if DEBUG:
            print(f"Inference time: {elapsed:.3f}s, "
                  f"Audio duration: {duration_sec:.3f}s, "
                  f"RTF: {rtf:.3f}")
        # Persist to a temp file; Gradio serves it via the filepath output.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            sf.write(fp.name, audio, SAMPLE_RATE, "PCM_24")
            audio_path = fp.name
        # Re-derive the phoneme string purely for display.
        phonemes = pyopenjtalk.g2p(text, kana=False).replace(" ", "").replace("pau", " ")
        info = "\n".join([
            f"Model: {model_name}",
            f"Speaker ID: {speaker_id if manager.is_multi_speaker else 'N/A (Single speaker)'}",
            f"Phonemes: {phonemes}",
            f"RTF: {rtf:.3f}",
        ])
        return audio_path, info
    except Exception as e:
        print(f"Error: {e}")
        raise
| # ============================================================================ | |
| # Gradio Application | |
| # ============================================================================ | |
| def create_gradio_interface(): | |
| """Create Gradio interface""" | |
| # Get available models | |
| available_models = get_available_models() | |
| # Load speaker images | |
| imgs_dir = os.path.join(SCRIPT_DIR, "imgs") | |
| speaker_images = [] | |
| if os.path.exists(imgs_dir): | |
| # Sort by numerical filename (0.webp, 1.webp, ...) | |
| image_files = sorted(glob.glob(os.path.join(imgs_dir, "*.webp")), | |
| key=lambda x: int(os.path.splitext(os.path.basename(x))[0])) | |
| speaker_images = [(img, f"Speaker {os.path.splitext(os.path.basename(img))[0]}") for img in image_files] | |
| with gr.Blocks( | |
| title="AI Gaming Voice", | |
| ) as demo: | |
| gr.Markdown( | |
| """ | |
| # AI Gaming Voice - 🍵 Matcha-TTS ONNX (Japanese) / 日本語 | |
| ### 6 Voices - 140MB or 42MB(Qint8 but slow) | |
| Japanese Text-to-Speech.(Half-width alphanumeric characters are not supported. Please correct/fix it.) | |
| 日本語音声合成です。(半角・英数字は未対応・直してください。) | |
| """ | |
| ) | |
| with gr.Row(): | |
| with gr.Column(): | |
| # Model Selection | |
| model_dropdown = gr.Dropdown( | |
| label="モデル / Model", | |
| choices=available_models, | |
| value=DEFAULT_MODEL if DEFAULT_MODEL in available_models else available_models[0], | |
| interactive=True | |
| ) | |
| text_input = gr.Textbox( | |
| label="日本語テキスト / Japanese Text", | |
| value="こんにちは、世界!", | |
| lines=3, | |
| placeholder="日本語のテキストを入力してください..." | |
| ) | |
| # Speaker Selection Gallery | |
| if speaker_images: | |
| gr.Markdown("### 話者選択 / Select Speaker") | |
| speaker_gallery = gr.Gallery( | |
| value=speaker_images, | |
| label="話者 / Speakers", | |
| show_label=False, | |
| columns=6, | |
| rows=1, | |
| height=160, | |
| allow_preview=False, | |
| interactive=False, | |
| object_fit="cover", | |
| elem_id="speaker_gallery" | |
| ) | |
| # Speaker ID | |
| speaker_id = gr.Number( | |
| label="Speaker ID (スピーカーID)", | |
| value=0, | |
| minimum=0, | |
| maximum=99, | |
| precision=0, | |
| info="上の画像をタップするか、数値を入力してください" | |
| ) | |
| with gr.Row(): | |
| temperature = gr.Slider( | |
| label="Temperature (温度)", | |
| minimum=0.0, | |
| maximum=1.0, | |
| step=0.01, | |
| value=0.667, | |
| info="サンプリングのランダム性" | |
| ) | |
| speaking_rate = gr.Slider( | |
| label="Speaking Rate (話速)", | |
| minimum=0.1, | |
| maximum=5.0, | |
| step=0.1, | |
| value=1.0, | |
| info="1.0 = 標準速度" | |
| ) | |
| with gr.Row(): | |
| synthesise_btn = gr.Button( | |
| "🎵 音声生成 / Synthesize", | |
| variant="primary", | |
| size="lg" | |
| ) | |
| clear_btn = gr.Button( | |
| "クリア / Clear", | |
| variant="secondary" | |
| ) | |
| with gr.Column(): | |
| audio_output = gr.Audio( | |
| label="生成音声 / Generated Audio", | |
| type="filepath" | |
| ) | |
| info_output = gr.Textbox( | |
| label="情報 / Information", | |
| lines=5, | |
| interactive=False | |
| ) | |
| # Examples | |
| gr.Examples( | |
| examples=[ | |
| ["こんにちは、世界!", "g003_ep5709.onnx", 0, 0.667, 1.0], | |
| ["エイアイゲーミングボイス", "g003_ep5709.onnx", 0, 0.667, 0.8], | |
| ["わたくしの名前はストラよ", "g003_ep5709.onnx", 0, 0.667, 1.0], | |
| ["わたしの名前はシムですよ", "g003_ep5709.onnx", 1, 0.667, 1.0], | |
| ["わたしはナラともうします", "g003_ep5709.onnx", 2, 0.667, 1.0], | |
| ["わたし、ロールプリンよ!", "g003_ep5709.onnx", 3, 0.667, 1.0], | |
| ["僕の名前はショーンだよ", "g003_ep5709.onnx", 4, 0.667, 1.0], | |
| ["私の名前はありません", "g003_ep5709.onnx", 5, 0.667, 1.0], | |
| ], | |
| inputs=[text_input, model_dropdown, speaker_id, temperature, speaking_rate], | |
| label="例文 / Examples" | |
| ) | |
| # Event handlers | |
| # Gallery click handler | |
| if speaker_images: | |
| def on_gallery_select(evt: gr.SelectData): | |
| return evt.index | |
| speaker_gallery.select( | |
| fn=on_gallery_select, | |
| inputs=None, | |
| outputs=speaker_id | |
| ).then( | |
| fn=synthesise, | |
| inputs=[text_input, model_dropdown, speaker_id, temperature, speaking_rate], | |
| outputs=[audio_output, info_output] | |
| ) | |
| synthesise_btn.click( | |
| fn=synthesise, | |
| inputs=[text_input, model_dropdown, speaker_id, temperature, speaking_rate], | |
| outputs=[audio_output, info_output] | |
| ) | |
| clear_btn.click( | |
| fn=lambda: (None, None, ""), | |
| outputs=[audio_output, info_output] | |
| ) | |
| gr.Markdown( | |
| """ | |
| --- | |
| ### ℹ️ Information / 情報 | |
| - **Model / モデル**: Matcha-TTS (ONNX) | |
| - **Inference / 推論**: ONNX Runtime | |
| - **Phonemizer / 音素化**: `pyopenjtalk` | |
| - **ZeroGPU**: Optimized for fast startup & inference / 高速起動・推論に最適化 | |
| ### 🗣️ Speaker Selection / 話者選択 | |
| - **Click Image / 画像クリック**: Selects speaker & generates audio / 話者を選択して音声を生成 | |
| - **Speaker ID**: Manual input also supported / 手動入力も可能 | |
| ### FAQ | |
| **Why AI Gaming Voice?** | |
| - I have a plan to support another ONNX models. | |
| **Model Difference** | |
| - **qint8**: 1/3 size but slow. | |
| **How to create my voice** | |
| - [Github](https://github.com/akjava/Matcha-TTS-Japanese) - I'll update here. | |
| **Model** | |
| - [Huggingface:matcha-tts_ja_100speakers_group003f-CL-V2](https://huggingface.co/Akjava/matcha-tts_ja_100speakers_group003f-CL-V2) | |
| **Who are they?** | |
| - [Youtube:4 of them are member of AI Gaming Circle](https://www.youtube.com/@ai-gaming-circle) | |
| """ | |
| ) | |
| return demo | |
# ============================================================================
# Main
# ============================================================================
if __name__ == "__main__":
    # Build the UI and serve it on all interfaces at the standard
    # Hugging Face Spaces port (7860).
    demo = create_gradio_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True  # surface Python exceptions in the browser UI
    )