Spaces:
Running
Running
| import gradio as gr | |
| import numpy as np | |
| import time | |
| import re | |
| import os | |
| import soundfile as sf | |
| import warnings | |
| from kokoro_onnx import Kokoro | |
| from kokoro_onnx.tokenizer import Tokenizer | |
# Silence all Python warnings so library noise does not clutter the console.
warnings.filterwarnings("ignore")

# The phonemizer and the ONNX model are loaded once, at import time, and
# shared by every request handler below.
tokenizer = Tokenizer()
kokoro = Kokoro("onnx_deps/kokoro-v1.0.onnx", "onnx_deps/voices-v1.0.bin")

# Language codes offered in the UI dropdown.
SUPPORTED_LANGUAGES = [
    "en-us",
    "en-gb",
    "es",
    "fr-fr",
    "hi",
    "it",
    "ja",
    "pt-br",
    "zh",
]

# Folder that receives the exported .wav files.
AUDIO_DIR = "audio_exports"

# Voice used by default; create() rebinds this when the user picks another one.
CURRENT_VOICE = "af_sky"

# Make sure the export folder exists before anything tries to write to it.
os.makedirs(AUDIO_DIR, exist_ok=True)

# Dropdown label -> regular expression handed to re.split().
SPLIT_PATTERNS = {
    "Paragraphs (one or more newlines)": r"\n+",
    "Sentences (periods, question marks, exclamation points)": r"(?<=[.!?])\s+",
    "Commas and semicolons": r"[,;]\s+",
    # "$^" is the no-split sentinel: it never matches inside ordinary text
    # (it can only match an empty or single-newline string).
    "No splitting (process as one chunk)": r"$^",
    # Placeholder value; the real regex comes from the custom-pattern textbox.
    "Custom": "custom",
}
def preview_text_splitting(text, split_pattern):
    """
    Split text into non-empty, whitespace-trimmed chunks.

    Args:
        text: Text to split. ``None`` or an empty string yields no chunks.
        split_pattern: Regular expression passed to ``re.split``. The
            sentinel ``"$^"`` and an empty pattern both mean "do not split".

    Returns:
        List of chunk strings. When splitting is disabled the text is
        returned unmodified as the only element. When the pattern is not a
        valid regular expression, a single-element list holding an
        ``"Error previewing split: ..."`` message is returned instead of
        raising.
    """
    if not text:
        return []

    # An empty pattern matches between every character, so re.split() would
    # return one chunk per character; treat it like the no-split sentinel.
    if not split_pattern or split_pattern == "$^":
        return [text]

    try:
        pieces = re.split(split_pattern, text)
    except re.error as e:
        return [f"Error previewing split: {e}"]

    # Patterns with capture groups can put None into the result for groups
    # that did not participate in a match; drop those along with blank pieces.
    return [piece.strip() for piece in pieces if piece and piece.strip()]
def run_performance_tests(text, voice, language, split_pattern, speed):
    """
    Benchmark per-chunk processing against whole-text processing.

    Phonemization and audio generation are each timed twice: once chunk by
    chunk (after splitting ``text`` with ``split_pattern``) and once for the
    whole text. The generated audio is discarded; only timings are reported.

    Args:
        text: Text to benchmark with.
        voice: Voice passed to ``kokoro.create``.
        language: Language code passed to ``tokenizer.phonemize``.
        split_pattern: Regular expression used to split ``text``. An empty
            pattern means "do not split".
        speed: Speech rate passed to ``kokoro.create``.

    Returns:
        String with detailed test results. If there is nothing to benchmark
        (blank text, invalid pattern, no non-empty chunks) the string
        explains why and no model call is made.
    """
    results = ["=== KOKORO-ONNX PERFORMANCE TEST RESULTS ===\n"]

    if not text or not text.strip():
        results.append("No text provided; nothing to benchmark.")
        return "\n".join(results)

    # Split text into chunks for comparison
    try:
        pieces = re.split(split_pattern, text) if split_pattern else [text]
    except re.error as e:
        results.append(f"Invalid split pattern '{split_pattern}': {e}")
        return "\n".join(results)

    # Capture groups can yield None entries; drop them together with blanks.
    chunks = [piece.strip() for piece in pieces if piece and piece.strip()]
    if not chunks:
        results.append("Splitting produced no non-empty chunks; nothing to benchmark.")
        return "\n".join(results)
    results.append(f"Text split into {len(chunks)} chunks\n")

    # perf_counter() is a monotonic, high-resolution clock, which makes it a
    # better fit for measuring short intervals than wall-clock time.time().
    clock = time.perf_counter

    # Test 1: Per-chunk vs. Full-text tokenization
    results.append("TEST #1: TOKENIZATION STRATEGIES")

    # Approach 1: Per-chunk tokenization
    start_time = clock()
    all_phonemes = [tokenizer.phonemize(chunk, lang=language) for chunk in chunks]
    per_chunk_time = clock() - start_time
    results.append(f"Per-chunk tokenization: {per_chunk_time:.6f}s")

    # Approach 2: Single tokenization for entire text
    start_time = clock()
    full_phonemes = tokenizer.phonemize(text, lang=language)
    full_tokenization_time = clock() - start_time
    results.append(f"Full text tokenization: {full_tokenization_time:.6f}s")
    if full_tokenization_time > 0:
        results.append(f"Speedup: {per_chunk_time / full_tokenization_time:.2f}x\n")

    # Test 2: Audio generation strategies
    results.append("TEST #2: AUDIO GENERATION STRATEGIES")

    # Approach 1: Generate per chunk
    start_time = clock()
    for phonemes in all_phonemes:
        if phonemes.strip():  # Skip empty phonemes
            kokoro.create(phonemes, voice=voice, speed=speed, is_phonemes=True)
    split_gen_time = clock() - start_time
    results.append(f"Generate per chunk: {split_gen_time:.6f}s")

    # Approach 2: Generate for full text
    start_time = clock()
    kokoro.create(full_phonemes, voice=voice, speed=speed, is_phonemes=True)
    full_gen_time = clock() - start_time
    results.append(f"Generate full text: {full_gen_time:.6f}s")
    if full_gen_time > 0:
        results.append(f"Speedup: {split_gen_time / full_gen_time:.2f}x\n")

    # Test 3: Total processing time comparison
    results.append("TEST #3: TOTAL PROCESSING TIME")
    total_chunked = per_chunk_time + split_gen_time
    total_full = full_tokenization_time + full_gen_time
    results.append(f"Total time (chunked): {total_chunked:.6f}s")
    results.append(f"Total time (full text): {total_full:.6f}s")
    if total_full > 0:
        results.append(f"Overall speedup: {total_chunked / total_full:.2f}x")

    # Recommendations
    results.append("\nRECOMMENDATIONS:")
    if per_chunk_time > full_tokenization_time:
        results.append("- Tokenize entire text at once instead of per-chunk")
    if split_gen_time > full_gen_time:
        results.append("- Generate audio for entire text rather than per-chunk")
    elif split_gen_time < full_gen_time:
        results.append("- Keep generating audio in chunks for better performance")

    return "\n".join(results)
# [OLD] Chunking create func
def _voice_file_label(voice, blend_voice_name=None):
    """Return the short label used in exported file names for a voice selection."""
    if blend_voice_name or not isinstance(voice, str):
        return "blend"
    # Voice ids such as the default "af_sky" carry the name after the first
    # underscore; fall back to the whole id when there is no underscore.
    parts = voice.split("_")
    return parts[1] if len(parts) > 1 else voice


def create(text: str, voice: str, language: str, blend_voice_name: str = None,
           blend_ratio: float = 0.5, split_pattern: str = r"\n+", speed: float = 1.0,
           output_dir: str = AUDIO_DIR):
    """
    Generate audio chunk by chunk with Kokoro-ONNX and export the results.

    The text is split with ``split_pattern``; every chunk is phonemized,
    synthesized and written to ``chunk_<n>_<label>.wav`` inside
    ``output_dir``. The concatenated audio is written to
    ``combined_<label>.wav``. ``<label>`` is ``"blend"`` when a secondary
    voice is given, otherwise the part of the voice id after the first
    underscore.

    Args:
        text: Text to synthesize
        voice: Primary voice to use
        language: Language code
        blend_voice_name: Optional secondary voice for blending
        blend_ratio: Weight of the primary voice (0.0-1.0); the secondary
            voice receives ``1 - blend_ratio``
        split_pattern: Regular expression used to split text into chunks.
            An empty or invalid pattern disables splitting (an invalid one
            is reported in the returned ``split_info``)
        speed: Speech rate
        output_dir: Directory to save audio files

    Returns:
        List ``[(sample_rate, audio_data), phonemes_text, split_info, timing_info]``
    """
    global CURRENT_VOICE

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Update current voice
    if voice != CURRENT_VOICE and not blend_voice_name:
        print(f"Voice changed from {CURRENT_VOICE} to {voice}")
        CURRENT_VOICE = voice

    # Start total timing (perf_counter is monotonic and high resolution)
    start_total_time = time.perf_counter()

    # Split text into chunks. The pattern is validated here so that a broken
    # regex never ends up with an error message being synthesized as speech.
    pattern_note = ""
    if not split_pattern:
        # An empty regex would split between every single character.
        chunks = [text]
    else:
        try:
            re.compile(split_pattern)
        except re.error as e:
            pattern_note = (
                f" (invalid split pattern '{split_pattern}': {e}; "
                "text was processed without splitting)"
            )
            chunks = [text]
        else:
            chunks = preview_text_splitting(text, split_pattern)
    chunks = [chunk for chunk in chunks if chunk and chunk.strip()]

    split_info = f"Text split into {len(chunks)} chunks using pattern: '{split_pattern}'"
    print(split_info)

    # The blended style does not depend on the chunk, so compute it once.
    voice_to_use = voice
    if blend_voice_name and chunks:
        voice_blend_start = time.perf_counter()
        first_voice = kokoro.get_voice_style(voice)
        second_voice = kokoro.get_voice_style(blend_voice_name)
        voice_to_use = np.add(first_voice * blend_ratio, second_voice * (1 - blend_ratio))
        print(f"Voices blended in {time.perf_counter() - voice_blend_start:.6f}s")

    voice_label = _voice_file_label(voice, blend_voice_name)

    # Initialize variables for processing
    all_audio = []
    all_phonemes = []
    sample_rate = 24000  # Kokoro's sample rate; replaced by the rate the model reports

    # Timing metrics
    phoneme_times = []
    generation_times = []
    save_times = []

    # Process each chunk
    for number, chunk in enumerate(chunks, start=1):
        # Time phonemization
        phoneme_start = time.perf_counter()
        phonemes = tokenizer.phonemize(chunk, lang=language)
        phoneme_time = time.perf_counter() - phoneme_start
        phoneme_times.append(phoneme_time)
        print(f"Chunk {number} phonemized in {phoneme_time:.6f}s")

        # Save phonemes
        all_phonemes.append(f"Chunk {number}: {phonemes}")

        # Generate audio
        gen_start = time.perf_counter()
        audio, sr = kokoro.create(phonemes, voice=voice_to_use, speed=speed, is_phonemes=True)
        gen_time = time.perf_counter() - gen_start
        generation_times.append(gen_time)
        print(f"Chunk {number} audio generated in {gen_time:.6f}s")

        # Keep the combined file and the returned tuple consistent with the
        # rate the chunk files are written with.
        sample_rate = sr
        all_audio.append(audio)

        # Save individual chunk to file
        save_start = time.perf_counter()
        chunk_filename = os.path.join(output_dir, f"chunk_{number}_{voice_label}.wav")
        sf.write(chunk_filename, audio, sr)
        save_time = time.perf_counter() - save_start
        save_times.append(save_time)
        print(f"Chunk {number} saved to {chunk_filename} in {save_time:.6f}s")

    # Time to combine chunks
    combine_start = time.perf_counter()
    if len(all_audio) > 1:
        audio_data = np.concatenate(all_audio)
        combine_time = time.perf_counter() - combine_start
        print(f"Combined {len(all_audio)} chunks in {combine_time:.6f}s")
    else:
        audio_data = all_audio[0] if all_audio else np.array([])
        combine_time = 0

    # Time to save combined file
    save_combined_start = time.perf_counter()
    combined_filename = os.path.join(output_dir, f"combined_{voice_label}.wav")
    sf.write(combined_filename, audio_data, sample_rate)
    save_combined_time = time.perf_counter() - save_combined_start
    print(f"Combined audio saved to {combined_filename} in {save_combined_time:.6f}s")

    # Calculate total time
    total_time = time.perf_counter() - start_total_time

    # Create detailed timing info
    chunks_count = len(all_audio)
    timing_lines = [
        f"Phonemization time: {sum(phoneme_times):.6f}s",
        f"Audio generation time: {sum(generation_times):.6f}s",
    ]

    # Per-chunk timing and combine timing only matter with several chunks
    if chunks_count > 1:
        timing_lines.append("\nChunk details:")
        per_chunk = zip(phoneme_times, generation_times, save_times)
        for number, (p_time, g_time, s_time) in enumerate(per_chunk, start=1):
            timing_lines.append(
                f" Chunk {number}: Phoneme {p_time:.6f}s, Gen {g_time:.6f}s, Save {s_time:.6f}s"
            )
        timing_lines.append(f"\nCombine chunks: {combine_time:.6f}s")
    timing_lines.append(f"Save combined: {save_combined_time:.6f}s")

    # Total timing
    timing_lines.append(f"\nTotal processing time: {total_time:.6f}s")

    # Format timing info for display
    timing_info = "\n".join(timing_lines)

    # Combine phonemes
    phonemes_text = "\n\n".join(all_phonemes)

    # Update split info
    if chunks_count > 1:
        split_info = f"Text was split into {chunks_count} chunks and saved to {output_dir}"
    else:
        split_info = f"Text processed as a single chunk and saved to {output_dir}"
    split_info += pattern_note

    return [(sample_rate, audio_data), phonemes_text, split_info, timing_info]
# Optimized variant (currently disabled): overrides the paragraph-splitting
# behavior by phonemizing and synthesizing the entire text in a single pass.
| # def create( | |
| # text: str, | |
| # voice: str, | |
| # language: str, | |
| # blend_voice_name: str = None, | |
| # blend_ratio: float = 0.5, | |
| # split_pattern: str = r"\n+", | |
| # speed: float = 1.0, | |
| # output_dir: str = AUDIO_DIR, | |
| # ): | |
| # """ | |
| # Generate audio using Kokoro-ONNX with optimized processing | |
| # Args: | |
| # text: Text to synthesize | |
| # voice: Primary voice to use | |
| # language: Language code | |
| # blend_voice_name: Optional secondary voice for blending | |
| # blend_ratio: Ratio of primary to secondary voice (0.0-1.0) | |
| # split_pattern: Pattern to split text into chunks | |
| # speed: Speech rate | |
| # output_dir: Directory to save audio files | |
| # Returns: | |
| # Tuple of (audio_tuple, phonemes, split_info, timing_info) | |
| # """ | |
| # global CURRENT_VOICE | |
| # # Create output directory if it doesn't exist | |
| # os.makedirs(output_dir, exist_ok=True) | |
| # # Update current voice | |
| # if voice != CURRENT_VOICE and not blend_voice_name: | |
| # print(f"Voice changed from {CURRENT_VOICE} to {voice}") | |
| # CURRENT_VOICE = voice | |
| # # Start total timing | |
| # start_total_time = time.time() | |
| # # Split text only for display purposes | |
| # chunks = preview_text_splitting(text, split_pattern) | |
| # split_info = ( | |
| # f"Text split into {len(chunks)} chunks using pattern: '{split_pattern}'" | |
| # ) | |
| # print(split_info) | |
| # # Phonemize the entire text at once (optimization #1) | |
| # phoneme_start = time.time() | |
| # phonemes = tokenizer.phonemize(text, lang=language) | |
| # phoneme_time = time.time() - phoneme_start | |
| # print(f"Text phonemized in {phoneme_time:.6f}s") | |
| # # Handle voice blending | |
| # voice_blend_start = time.time() | |
| # voice_to_use = voice | |
| # if blend_voice_name: | |
| # first_voice = kokoro.get_voice_style(voice) | |
| # second_voice = kokoro.get_voice_style(blend_voice_name) | |
| # voice_to_use = np.add( | |
| # first_voice * blend_ratio, second_voice * (1 - blend_ratio) | |
| # ) | |
| # voice_blend_time = time.time() - voice_blend_start | |
| # print(f"Voices blended in {voice_blend_time:.6f}s") | |
| # # Generate audio for entire text at once (optimization #2) | |
| # gen_start = time.time() | |
| # audio, sample_rate = kokoro.create( | |
| # phonemes, voice=voice_to_use, speed=speed, is_phonemes=True | |
| # ) | |
| # gen_time = time.time() - gen_start | |
| # print(f"Audio generated in {gen_time:.6f}s") | |
| # # Save to file | |
| # save_start = time.time() | |
| # voice_label = voice.split("_")[1] if isinstance(voice, str) else "blend" | |
| # filename = os.path.join(output_dir, f"full_{voice_label}.wav") | |
| # sf.write(filename, audio, sample_rate) | |
| # save_time = time.time() - save_start | |
| # print(f"Audio saved to {filename} in {save_time:.6f}s") | |
| # # Calculate total time | |
| # total_time = time.time() - start_total_time | |
| # # Create timing info | |
| # timing_lines = [ | |
| # f"Phonemization time: {phoneme_time:.6f}s", | |
| # f"Audio generation time: {gen_time:.6f}s", | |
| # f"Save time: {save_time:.6f}s", | |
| # f"\nTotal processing time: {total_time:.6f}s", | |
| # f"\nOptimized approach: Processing entire text at once (2.1x faster)", | |
| # ] | |
| # timing_info = "\n".join(timing_lines) | |
| # # For display, still show the text chunks | |
| # chunk_display = [] | |
| # for i, chunk in enumerate(chunks): | |
| # chunk_display.append(f"Chunk {i + 1}: Text: {chunk[:50]}...") | |
| # phonemes_display = ( | |
| # "Full text phonemes (first 100 chars):\n" + phonemes[:100] + "..." | |
| # ) | |
| # return [(sample_rate, audio), phonemes_display, split_info, timing_info] | |
def on_split_pattern_change(pattern_name, custom_pattern):
    """
    React to a new selection in the split-pattern dropdown.

    Returns a pair: the regex to show in the custom-pattern textbox, and a
    Gradio update that reveals the textbox only for the "Custom" choice.
    Presets are looked up in SPLIT_PATTERNS; for "Custom" the text the user
    already typed is kept.
    """
    is_custom = pattern_name == "Custom"
    regex = custom_pattern if is_custom else SPLIT_PATTERNS[pattern_name]
    return regex, gr.update(visible=is_custom)
def preview_splits(text, pattern):
    """
    Build the human-readable preview shown in the "Split Preview" box.

    The text is split with preview_text_splitting(); each resulting chunk is
    listed with its number, cut to 100 characters when longer.
    """
    chunks = preview_text_splitting(text, pattern)

    # The no-split sentinel gets its own short message.
    if pattern == "$^" and len(chunks) == 1:
        return "Text will be processed as a single chunk (no splitting)"

    parts = [f"Text will be split into {len(chunks)} chunks:\n\n"]
    for number, chunk in enumerate(chunks, start=1):
        # Truncate very long chunks in the preview
        shown = chunk if len(chunk) <= 100 else chunk[:100] + "..."
        parts.append(f"Chunk {number}: {shown}\n\n")
    return "".join(parts)
def create_app():
    """
    Build the Gradio Blocks interface for the Kokoro-ONNX demo.

    The layout has three columns: text input and split controls, voice /
    language / speed / test-mode controls, and the outputs. Event handlers
    for the split-pattern dropdown, the preview button and the generate
    button are wired up here.

    Returns:
        The assembled ``gr.Blocks`` instance (not launched).
    """
    # Query and sort the voice list once; both voice dropdowns share it.
    voices = sorted(kokoro.get_voices())

    theme = gr.themes.Soft(
        font=[
            gr.themes.GoogleFont("Lato"),
            gr.themes.GoogleFont("Roboto"),
            "system-ui",
            "sans-serif",
        ]
    )

    with gr.Blocks(theme=theme) as ui:
        # Title
        gr.Markdown("# Kokoro-ONNX TTS Demo")
        gr.Markdown("#### Optimized ONNX implementation with Voice Blending")

        # Input controls
        with gr.Row():
            with gr.Column(scale=1):
                text_input = gr.TextArea(
                    label="Input Text",
                    rtl=False,
                    value="Hello!\n\nThis is a multi-paragraph test.\nWith multiple lines.\n\nKokoro can split on paragraphs, sentences, or other patterns.",
                    lines=8,
                )

                # Information about split patterns
                with gr.Accordion("About Text Splitting", open=False):
                    gr.Markdown("""
                    ### Understanding Text Splitting
                    The splitting pattern controls how Kokoro breaks your text into manageable chunks for processing.
                    **Common patterns:**
                    - `\\n+`: Split on one or more newlines (paragraphs)
                    - `(?<=[.!?])\\s+`: Split after periods, question marks, and exclamation points (sentences)
                    - `[,;]\\s+`: Split after commas and semicolons
                    - `$^`: Special pattern that won't match anything (processes the entire text as one chunk)
                    **Benefits of splitting:**
                    - Better phrasing and natural pauses
                    - Improved handling of longer texts
                    - More consistent pronunciation across chunks
                    """)

                # Split Pattern Selection
                split_pattern_dropdown = gr.Dropdown(
                    label="Split Text Using",
                    value="Paragraphs (one or more newlines)",
                    choices=list(SPLIT_PATTERNS.keys()),
                    info="Select how to split your text into chunks",
                )
                # Always holds the regex that is actually used; it is only
                # shown to the user when "Custom" is selected.
                custom_pattern_input = gr.Textbox(
                    label="Custom Split Pattern (Regular Expression)",
                    value=r"\n+",
                    visible=False,
                    info="Enter a custom regex pattern for splitting text",
                )
                preview_button = gr.Button("Preview Text Splitting")
                split_preview = gr.Textbox(
                    label="Split Preview",
                    value="Click 'Preview Text Splitting' to see how your text will be divided",
                    lines=5,
                )

            with gr.Column(scale=1):
                # Language selection
                language_input = gr.Dropdown(
                    label="Language",
                    value="en-us",
                    choices=SUPPORTED_LANGUAGES,
                    info="Select the language for text processing",
                )

                # Voice selection; the default follows the module-level default voice.
                voice_input = gr.Dropdown(
                    label="Primary Voice",
                    value=CURRENT_VOICE,
                    choices=voices,
                    info="Select primary voice for synthesis",
                )

                # Voice blending
                with gr.Accordion("Voice Blending (Optional)", open=False):
                    blend_voice_input = gr.Dropdown(
                        label="Secondary Voice for Blending",
                        value=None,
                        choices=[None] + voices,
                        info="Select secondary voice to blend with primary voice",
                    )
                    blend_ratio_input = gr.Slider(
                        label="Blend Ratio (Primary:Secondary)",
                        minimum=0.0,
                        maximum=1.0,
                        value=0.5,
                        step=0.05,
                        info="0.0 = 100% Secondary, 1.0 = 100% Primary",
                    )
                    gr.Markdown("""
                    **Voice blending lets you combine characteristics of two voices.**
                    - A 50:50 blend gives equal weight to both voices
                    - Higher values emphasize the primary voice
                    - Lower values emphasize the secondary voice
                    """)

                # Speed slider
                speed_input = gr.Slider(
                    label="Speech Speed",
                    minimum=0.5,
                    maximum=1.5,
                    value=1.0,
                    step=0.1,
                    info="Adjust speaking rate",
                )

                # Add a testing mode toggle
                with gr.Accordion("Performance Testing", open=False):
                    test_mode_input = gr.Checkbox(label="Enable Test Mode", value=False)
                    gr.Markdown("""
                    ### Performance Testing
                    When enabled, clicking "Generate Audio" will run performance tests instead of generating audio.
                    Tests compare different processing approaches to identify the most efficient method.
                    Use this to optimize your implementation based on your specific hardware and text content.
                    """)

            with gr.Column(scale=1):
                # Generate button
                submit_button = gr.Button("Generate Audio", variant="primary")

                # Outputs
                audio_output = gr.Audio(
                    label="Generated Audio", format="wav", show_download_button=True
                )
                audio_gen_timing_output = gr.Textbox(
                    label="Performance Metrics", lines=12
                )
                phonemes_output = gr.Textbox(label="Phoneme Representation", lines=10)
                split_info_output = gr.Textbox(label="Processing Information", lines=5)
                test_results = gr.Textbox(
                    label="Test Results",
                    lines=15,
                    visible=False,  # Hidden until test is run
                )

        # Handle split pattern change. The textbox appears twice on purpose:
        # the handler returns its new value first and its visibility second.
        split_pattern_dropdown.change(
            fn=on_split_pattern_change,
            inputs=[split_pattern_dropdown, custom_pattern_input],
            outputs=[custom_pattern_input, custom_pattern_input],
        )

        # Preview splitting button
        preview_button.click(
            fn=preview_splits,
            inputs=[text_input, custom_pattern_input],
            outputs=[split_preview],
        )

        # Button click handler
        def on_generate(
            text,
            voice,
            language,
            blend_voice,
            blend_ratio,
            split_pattern,
            speed,
            test_mode,
        ):
            """Run the benchmark (test mode) or synthesize audio for the form values."""
            # Nothing useful can be synthesized or benchmarked from blank
            # input, so report it in the UI instead of calling the model.
            if not text or not text.strip():
                raise gr.Error("Please enter some text to synthesize.")

            if test_mode:
                # Run performance tests
                results = run_performance_tests(
                    text, voice, language, split_pattern, speed
                )
                # Clear the regular outputs and make the results visible
                return None, None, None, None, gr.update(visible=True, value=results)

            # Regular generation
            audio_tuple, phonemes, split_info, timing_info = create(
                text,
                voice,
                language,
                blend_voice_name=blend_voice,
                blend_ratio=blend_ratio,
                split_pattern=split_pattern,
                speed=speed,
                output_dir=AUDIO_DIR,
            )
            # Return results and hide test results
            return (
                audio_tuple,
                timing_info,
                phonemes,
                split_info,
                gr.update(visible=False),
            )

        submit_button.click(
            fn=on_generate,
            inputs=[
                text_input,
                voice_input,
                language_input,
                blend_voice_input,
                blend_ratio_input,
                custom_pattern_input,
                speed_input,
                test_mode_input,
            ],
            outputs=[
                audio_output,
                audio_gen_timing_output,
                phonemes_output,
                split_info_output,
                test_results,
            ],
        )

    return ui
# Build the app at import time so `ui` stays available as a module attribute.
ui = create_app()

# Start the server only when this file is executed as a script, so that
# importing the module (for tests or reuse) does not launch a web server.
if __name__ == "__main__":
    ui.launch(
        debug=True,
        server_name="0.0.0.0",  # Make accessible externally
        server_port=7860,  # Choose your port
        share=True,  # Set to True if you want a public link
    )