Spaces:
Running
Running
| """ | |
| Text-to-Speech Application | |
| ================================================ | |
| Created by: Yash Chowdhary | |
| A comprehensive TTS application using Kokoro-82M model with full voice control. | |
| Features: | |
| - 28 built-in voices (American & British English, Male & Female) | |
| - Speed control (0.5x - 2.0x) | |
| - Pitch adjustment via audio post-processing | |
| - Configurable pause insertion | |
| - Style presets for different tones (Neutral, Dramatic, Calm, etc.) | |
| License: Apache 2.0 (same as Kokoro model) | |
| """ | |
| import gradio as gr | |
| import numpy as np | |
| import soundfile as sf | |
| import io | |
| import re | |
| from typing import Optional, Tuple, Generator | |
| from dataclasses import dataclass | |
| from enum import Enum | |
| # Kokoro TTS imports | |
| from kokoro import KPipeline | |
| # ============================================================================ | |
| # CONFIGURATION & CONSTANTS | |
| # ============================================================================ | |
# Sample rate, in Hz, of the audio Kokoro produces (24 kHz).
SAMPLE_RATE = 24_000
# Upper bound on the number of input characters synthesized per generation.
MAX_CHAR_LIMIT = 5_000
# Voice definitions with metadata.
# Format: voice_id -> (display_name, gender, accent, quality_grade, description)
# The first letter of each voice_id ('a' / 'b') selects the American / British
# pipeline elsewhere in this file.
# NOTE: the emoji in the display names were restored from mis-decoded UTF-8
# (mojibake) — confirm against the upstream voice list if in doubt.
VOICE_CATALOG = {
    # American English - Female
    "af_heart": ("Heart ❤️", "Female", "American", "A", "Premium quality, warm and natural"),
    "af_bella": ("Bella 🔥", "Female", "American", "A-", "Clear and expressive"),
    "af_nicole": ("Nicole 🎧", "Female", "American", "B-", "Professional narrator style"),
    "af_aoede": ("Aoede", "Female", "American", "C+", "Melodic and pleasant"),
    "af_kore": ("Kore", "Female", "American", "C+", "Youthful and energetic"),
    "af_sarah": ("Sarah", "Female", "American", "C+", "Friendly and approachable"),
    "af_nova": ("Nova", "Female", "American", "C", "Modern and crisp"),
    "af_sky": ("Sky", "Female", "American", "C-", "Light and airy"),
    "af_alloy": ("Alloy", "Female", "American", "C", "Balanced and versatile"),
    "af_jessica": ("Jessica", "Female", "American", "D", "Casual conversational"),
    "af_river": ("River", "Female", "American", "D", "Gentle and flowing"),
    # American English - Male
    "am_michael": ("Michael", "Male", "American", "C+", "Authoritative and clear"),
    "am_fenrir": ("Fenrir", "Male", "American", "C+", "Deep and resonant"),
    "am_puck": ("Puck", "Male", "American", "C+", "Playful and dynamic"),
    "am_echo": ("Echo", "Male", "American", "D", "Warm and reflective"),
    "am_eric": ("Eric", "Male", "American", "D", "Professional and steady"),
    "am_liam": ("Liam", "Male", "American", "D", "Young and natural"),
    "am_onyx": ("Onyx", "Male", "American", "D", "Rich and smooth"),
    "am_santa": ("Santa 🎅", "Male", "American", "D-", "Jolly and festive"),
    "am_adam": ("Adam", "Male", "American", "F+", "Basic male voice"),
    # British English - Female
    "bf_emma": ("Emma", "Female", "British", "B-", "Elegant British accent"),
    "bf_isabella": ("Isabella", "Female", "British", "C", "Sophisticated and refined"),
    "bf_alice": ("Alice", "Female", "British", "D", "Classic British tone"),
    "bf_lily": ("Lily", "Female", "British", "D", "Soft and gentle"),
    # British English - Male
    "bm_george": ("George", "Male", "British", "C", "Distinguished gentleman"),
    "bm_fable": ("Fable", "Male", "British", "C", "Storyteller quality"),
    "bm_lewis": ("Lewis", "Male", "British", "D+", "Conversational British"),
    "bm_daniel": ("Daniel", "Male", "British", "D", "Standard British male"),
}
@dataclass
class StylePreset:
    """Defines a style preset with associated audio parameters.

    The ``@dataclass`` decorator generates the keyword-accepting ``__init__``
    that the ``STYLE_PRESETS`` table relies on; without it every
    ``StylePreset(name=..., ...)`` call raises ``TypeError``.
    """

    # Human-readable preset name shown in the UI.
    name: str
    # Short description of the preset's intended use.
    description: str
    # Speaking-rate multiplier passed to the TTS engine.
    speed: float
    # Pitch adjustment, in semitones.
    pitch_shift: float
    # Factor applied to the base pause between sentences.
    pause_multiplier: float
    # Voice IDs (keys of VOICE_CATALOG) suggested for this preset.
    recommended_voices: list
# Style presets for different tones, keyed by the identifier the UI dropdown
# passes back. Each entry bundles speed, pitch shift (semitones), a pause
# multiplier and a list of suggested voice IDs.
STYLE_PRESETS = dict(
    neutral=StylePreset(
        name="Neutral Narrator",
        description="Clear, balanced narration suitable for most content",
        speed=1.0, pitch_shift=0, pause_multiplier=1.0,
        recommended_voices=["af_heart", "af_bella", "am_michael", "bf_emma"],
    ),
    dramatic=StylePreset(
        name="Dramatic / Horror",
        description="Slower, deeper voice for suspenseful or dramatic content",
        speed=0.85, pitch_shift=-2, pause_multiplier=1.5,
        recommended_voices=["am_fenrir", "am_onyx", "bm_george", "af_nicole"],
    ),
    excited=StylePreset(
        name="Excited / Surprised",
        description="Faster, higher energy delivery",
        speed=1.2, pitch_shift=1, pause_multiplier=0.7,
        recommended_voices=["af_kore", "am_puck", "af_nova", "af_sky"],
    ),
    calm=StylePreset(
        name="Calm / Meditative",
        description="Slow, soothing voice for relaxation content",
        speed=0.8, pitch_shift=-1, pause_multiplier=1.8,
        recommended_voices=["af_heart", "bf_lily", "am_echo", "bf_emma"],
    ),
    storyteller=StylePreset(
        name="Storyteller",
        description="Engaging pace for audiobooks and stories",
        speed=0.95, pitch_shift=0, pause_multiplier=1.2,
        recommended_voices=["bm_fable", "af_bella", "am_michael", "bf_isabella"],
    ),
    professional=StylePreset(
        name="Professional / Corporate",
        description="Clear, authoritative delivery for business content",
        speed=1.05, pitch_shift=0, pause_multiplier=1.0,
        recommended_voices=["af_nicole", "am_eric", "bf_emma", "bm_george"],
    ),
    cheerful=StylePreset(
        name="Cheerful / Friendly",
        description="Warm, upbeat tone for friendly content",
        speed=1.1, pitch_shift=0.5, pause_multiplier=0.9,
        recommended_voices=["af_sarah", "am_puck", "af_kore", "am_liam"],
    ),
)
| # ============================================================================ | |
| # AUDIO PROCESSING UTILITIES | |
| # ============================================================================ | |
def _phase_vocoder_stretch(signal: np.ndarray, factor: float,
                           n_fft: int = 1024, hop: int = 256) -> np.ndarray:
    """Time-stretch a 1-D signal by ``factor`` while keeping its pitch.

    A short-time Fourier analysis is read at ``1 / factor`` frames per output
    frame; magnitudes are interpolated between neighbouring analysis frames
    and phases are accumulated from the measured per-hop phase advance, then
    the frames are overlap-added with a Hann window.

    Frames are transformed one at a time, so memory use stays proportional to
    the signal length rather than to ``frames * n_fft``.

    Args:
        signal: 1-D float array with at least one sample.
        factor: Length multiplier (> 1 lengthens, < 1 shortens).
        n_fft: Analysis/synthesis frame size in samples (must be even).
        hop: Hop between consecutive frames in samples.

    Returns:
        Float64 array of ``max(1, round(len(signal) * factor))`` samples.
    """
    length = len(signal)
    half = n_fft // 2

    # Centre the signal with half a frame of zeros in front, and extend the
    # tail so that at least two complete analysis frames always exist.
    n_frames = max(2, 1 + int(np.ceil(length / hop)))
    padded = np.zeros(n_fft + hop * (n_frames - 1), dtype=np.float64)
    padded[half:half + length] = signal

    # Periodic Hann window, used for both analysis and synthesis.
    window = 0.5 - 0.5 * np.cos(2.0 * np.pi * np.arange(n_fft) / n_fft)
    window_sq = window ** 2
    # Phase advance per hop expected at each bin's centre frequency.
    expected = 2.0 * np.pi * hop * np.arange(half + 1) / n_fft

    n_out = max(1, int(np.ceil((n_frames - 1) * factor)))
    output = np.zeros(n_fft + hop * (n_out - 1), dtype=np.float64)
    weight = np.zeros_like(output)

    def spectrum(index: int) -> np.ndarray:
        start = index * hop
        return np.fft.rfft(padded[start:start + n_fft] * window)

    phase = None
    current = -1
    for k in range(n_out):
        position = k / factor
        # Clamp so that frame ``index + 1`` always exists.
        index = min(int(position), n_frames - 2)
        if index != current:
            left, right = spectrum(index), spectrum(index + 1)
            left_mag, right_mag = np.abs(left), np.abs(right)
            # Deviation of the measured phase advance from the bin-centre
            # advance, wrapped into [-pi, pi].
            deviation = np.angle(right) - np.angle(left) - expected
            deviation -= 2.0 * np.pi * np.round(deviation / (2.0 * np.pi))
            advance = expected + deviation
            if phase is None:
                phase = np.angle(left)
            current = index
        blend = position - index
        magnitude = (1.0 - blend) * left_mag + blend * right_mag
        frame = np.fft.irfft(magnitude * np.exp(1j * phase), n=n_fft) * window
        start = k * hop
        output[start:start + n_fft] += frame
        weight[start:start + n_fft] += window_sq
        phase = phase + advance

    # Undo the window overlap gain wherever any frame contributed.
    covered = weight > 1e-8
    output[covered] /= weight[covered]

    target = max(1, int(round(length * factor)))
    return output[half:half + target]


def pitch_shift_audio(audio: np.ndarray, sample_rate: int, semitones: float) -> np.ndarray:
    """
    Shift the pitch of audio by a given number of semitones, keeping duration.

    The previous implementation resampled the signal to a new length and then
    straight back to the original length, which cancels out and leaves the
    pitch unchanged. This version first time-stretches the signal by the
    pitch factor with a phase vocoder (pitch preserved, duration changed) and
    then resamples it back to the original length, which is what actually
    moves the pitch. Only NumPy is used.

    Args:
        audio: Input audio array (expected to be 1-D mono samples)
        sample_rate: Sample rate of the audio (currently unused; the frame
            size is fixed in samples. Kept for interface compatibility.)
        semitones: Number of semitones to shift (positive = higher, negative = lower)
    Returns:
        Pitch-shifted float32 audio array with the same number of samples as
        the input. The input is returned unchanged when ``semitones`` is 0,
        and inputs shorter than two samples are returned without shifting.
    """
    if semitones == 0:
        return audio
    original_length = len(audio)
    if original_length < 2:
        # Nothing meaningful to shift; also avoids interpolating empty arrays.
        return np.asarray(audio, dtype=np.float32)

    # Each semitone is a factor of 2^(1/12)
    factor = 2.0 ** (semitones / 12.0)

    # Stretch to ``factor`` times the duration at the original pitch ...
    stretched = _phase_vocoder_stretch(np.asarray(audio, dtype=np.float64), factor)

    # ... then squeeze back to the original length with linear interpolation,
    # which scales every frequency by ``factor``. (Linear interpolation has no
    # anti-alias filter; acceptable for the small shifts used here.)
    positions = np.linspace(0, len(stretched) - 1, original_length)
    result = np.interp(positions, np.arange(len(stretched)), stretched)
    return result.astype(np.float32)
def insert_pauses(audio_segments: list, pause_duration_ms: int, sample_rate: int) -> np.ndarray:
    """
    Join audio segments into one array, separated by stretches of silence.

    Args:
        audio_segments: List of audio arrays
        pause_duration_ms: Length of each silent gap in milliseconds
        sample_rate: Sample rate of the audio, used to size the gap
    Returns:
        The segments concatenated in order with a float32 block of zeros
        between each neighbouring pair (none after the last segment); an
        empty float32 array when there are no segments.
    """
    if not audio_segments:
        return np.array([], dtype=np.float32)

    gap = np.zeros(int(sample_rate * pause_duration_ms / 1000), dtype=np.float32)

    # Start with the first segment, then prepend a gap to every later one.
    pieces = [audio_segments[0]]
    for segment in audio_segments[1:]:
        pieces.extend((gap, segment))
    return np.concatenate(pieces)
def normalize_audio(audio: np.ndarray, target_db: float = -3.0) -> np.ndarray:
    """
    Scale audio so that its peak sits at the requested dB level.

    Args:
        audio: Input audio array
        target_db: Desired peak level in dB relative to full scale (default -3 dB)
    Returns:
        A float32 copy scaled so the largest absolute sample equals
        ``10 ** (target_db / 20)``. Empty or all-zero input is returned as-is.
    """
    if len(audio) == 0:
        return audio

    loudest = np.abs(audio).max()
    if loudest == 0:
        # Pure silence: there is no peak to scale.
        return audio

    scale = (10 ** (target_db / 20)) / loudest
    scaled = audio * scale
    return scaled.astype(np.float32)
def preprocess_text(text: str, add_pauses: bool = True) -> str:
    """
    Clean up text before it is handed to the TTS pipeline.

    Leading/trailing whitespace is removed, every internal run of whitespace
    becomes a single space, and a handful of abbreviations are spelled out
    (matched case-insensitively, trailing period included).

    Args:
        text: Input text
        add_pauses: Accepted for interface compatibility; currently has no
            effect on the result.
    Returns:
        Preprocessed text
    """
    # (pattern, spoken form) pairs, applied in this order.
    expansions = (
        (r'\bDr\.', 'Doctor'),
        (r'\bMr\.', 'Mister'),
        (r'\bMrs\.', 'Missus'),
        (r'\bMs\.', 'Miss'),
        (r'\bProf\.', 'Professor'),
        (r'\betc\.', 'etcetera'),
        (r'\be\.g\.', 'for example'),
        (r'\bi\.e\.', 'that is'),
    )

    result = re.compile(r'\s+').sub(' ', text.strip())
    for abbreviation, spoken in expansions:
        result = re.sub(abbreviation, spoken, result, flags=re.IGNORECASE)
    return result
| # ============================================================================ | |
| # TTS ENGINE | |
| # ============================================================================ | |
class KokoroTTSEngine:
    """
    Wrapper class for Kokoro TTS with additional processing capabilities.

    Holds one ``KPipeline`` per accent (``'a'`` American, ``'b'`` British) and
    layers pause insertion, pitch shifting and peak normalization on top of
    the raw pipeline output.
    """

    # Voice used when the caller supplies no voice at all.
    _FALLBACK_VOICE = "af_heart"

    def __init__(self):
        """Initialize the TTS engine with both American and British English pipelines."""
        print("Initializing Kokoro TTS Engine...")
        # Initialize pipelines for both accents
        self.pipelines = {
            'a': KPipeline(lang_code='a'),  # American English
            'b': KPipeline(lang_code='b'),  # British English
        }
        # Add custom pronunciation for "Kokoro". These are IPA-style phoneme
        # strings; the previous literals were mis-decoded UTF-8 (mojibake) and
        # have been restored to the intended characters.
        self.pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
        self.pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
        # Pre-load voice packs for faster inference
        print("Pre-loading voice packs...")
        for voice_id in VOICE_CATALOG:
            lang_code = voice_id[0]  # 'a' or 'b'
            try:
                self.pipelines[lang_code].load_voice(voice_id)
            except Exception as e:
                # Best effort: a voice that fails to pre-load is only reported.
                print(f"Warning: Could not pre-load voice {voice_id}: {e}")
        print("TTS Engine initialized successfully!")

    def generate(
        self,
        text: str,
        voice: str = "af_heart",
        speed: float = 1.0,
        pitch_shift: float = 0.0,
        pause_between_sentences_ms: int = 300,
    ) -> Tuple[int, np.ndarray]:
        """
        Generate speech from text with full parameter control.

        Args:
            text: Input text to synthesize (truncated to MAX_CHAR_LIMIT characters)
            voice: Voice ID from VOICE_CATALOG; an empty or missing voice
                falls back to "af_heart"
            speed: Speed multiplier, clamped to 0.5 - 2.0
            pitch_shift: Pitch adjustment in semitones, clamped to -5 - +5
            pause_between_sentences_ms: Silence inserted between the audio
                segments the pipeline yields
        Returns:
            Tuple of (sample_rate, audio_array). A single zero sample is
            returned when the text is empty, when the pipeline raises, or
            when it yields no audio.
        """
        # Validate inputs
        text = preprocess_text((text or "").strip()[:MAX_CHAR_LIMIT])
        if not text:
            return SAMPLE_RATE, np.zeros(1, dtype=np.float32)
        speed = max(0.5, min(2.0, speed))
        pitch_shift = max(-5, min(5, pitch_shift))

        # Guard against an empty/None voice (e.g. a cleared dropdown), which
        # previously raised on ``voice[0]``.
        if not voice:
            voice = self._FALLBACK_VOICE

        # Get the appropriate pipeline; unknown prefixes use American English
        lang_code = voice[0] if voice[0] in self.pipelines else 'a'
        pipeline = self.pipelines[lang_code]

        # Generate audio segments
        audio_segments = []
        try:
            for _, _, audio in pipeline(text, voice=voice, speed=speed):
                if audio is not None:
                    audio_segments.append(audio.numpy() if hasattr(audio, 'numpy') else audio)
        except Exception as e:
            # Deliberate best effort: report the problem and return silence.
            print(f"Generation error: {e}")
            return SAMPLE_RATE, np.zeros(1, dtype=np.float32)
        if not audio_segments:
            return SAMPLE_RATE, np.zeros(1, dtype=np.float32)

        # Combine segments with pauses
        combined_audio = insert_pauses(audio_segments, pause_between_sentences_ms, SAMPLE_RATE)
        # Apply pitch shift if requested
        if pitch_shift != 0:
            combined_audio = pitch_shift_audio(combined_audio, SAMPLE_RATE, pitch_shift)
        # Normalize the final audio
        combined_audio = normalize_audio(combined_audio)
        return SAMPLE_RATE, combined_audio

    def generate_with_style(
        self,
        text: str,
        voice: str,
        style_preset: str,
        custom_speed: Optional[float] = None,
        custom_pitch: Optional[float] = None,
        custom_pause: Optional[int] = None,
    ) -> Tuple[int, np.ndarray]:
        """
        Generate speech using a style preset with optional custom overrides.

        Args:
            text: Input text to synthesize
            voice: Voice ID
            style_preset: Style preset name from STYLE_PRESETS; unknown names
                fall back to the "neutral" preset
            custom_speed: Override the preset speed (optional)
            custom_pitch: Override the preset pitch (optional)
            custom_pause: Override the preset pause in milliseconds (optional);
                by default the pause is 300 ms times the preset's multiplier
        Returns:
            Tuple of (sample_rate, audio_array)
        """
        preset = STYLE_PRESETS.get(style_preset, STYLE_PRESETS["neutral"])
        speed = custom_speed if custom_speed is not None else preset.speed
        pitch = custom_pitch if custom_pitch is not None else preset.pitch_shift
        pause = custom_pause if custom_pause is not None else int(300 * preset.pause_multiplier)
        return self.generate(
            text=text,
            voice=voice,
            speed=speed,
            pitch_shift=pitch,
            pause_between_sentences_ms=pause,
        )
| # ============================================================================ | |
| # GRADIO INTERFACE | |
| # ============================================================================ | |
def create_voice_choices():
    """Create organized voice choices for the dropdown.

    Voices are grouped by accent and gender (American female, American male,
    British female, British male) and, inside each group, ordered from the
    best to the worst quality grade.

    Grades are ranked as school-style marks ("A" > "A-" > "B+" ... > "F"),
    not compared as plain strings: a plain string sort would wrongly place
    "C" ahead of "C+" and "D" ahead of "D+".

    Returns:
        List of ``(label, voice_id)`` tuples, where the label reads
        ``"<flag> <gender icon> <name> [<grade>]"``.
    """
    letters = "ABCDEF"
    modifier_rank = {"+": 0, "": 1, "-": 2}

    def grade_rank(grade):
        # Lower rank = better grade; unknown letters sort after known ones.
        base = letters.find(grade[:1].upper()) if grade else -1
        if base < 0:
            base = len(letters)
        return base * 3 + modifier_rank.get(grade[1:], 1)

    # Group by accent and gender
    groups = {
        ("American", "Female"): [],
        ("American", "Male"): [],
        ("British", "Female"): [],
        ("British", "Male"): [],
    }
    for voice_id, (name, gender, accent, grade, _desc) in VOICE_CATALOG.items():
        groups[(accent, gender)].append((voice_id, name, grade))

    # Build choices with group labels. The icons were restored from
    # mis-decoded UTF-8: US/GB flags and the women's/men's symbols.
    choices = []
    for (accent, gender), voices in groups.items():
        flag = "🇺🇸" if accent == "American" else "🇬🇧"
        gender_icon = "🚺" if gender == "Female" else "🚹"
        # sorted() is stable, so equal grades keep their catalog order.
        for voice_id, name, grade in sorted(voices, key=lambda v: grade_rank(v[2])):
            label = f"{flag} {gender_icon} {name} [{grade}]"
            choices.append((label, voice_id))
    return choices
def create_style_choices():
    """Build the (display name, preset key) pairs for the style dropdown."""
    pairs = []
    for preset_key, preset in STYLE_PRESETS.items():
        pairs.append((preset.name, preset_key))
    return pairs
# Initialize the TTS engine globally
# The engine is constructed once, at import time, and bound to the
# module-level name `tts_engine`, which generate_speech() uses for every
# request.
print("Loading Kokoro TTS Engine...")
tts_engine = KokoroTTSEngine()
def generate_speech(
    text: str,
    voice: str,
    style: str,
    speed: float,
    pitch: float,
    pause: int,
    use_style_defaults: bool,
) -> Optional[Tuple[int, np.ndarray]]:
    """
    Main generation function for Gradio interface.

    Args:
        text: Text to synthesize
        voice: Voice ID selected in the dropdown
        style: Style preset key selected in the dropdown
        speed: Manual speed value (used only without style defaults)
        pitch: Manual pitch shift in semitones (used only without style defaults)
        pause: Manual pause in milliseconds (used only without style defaults)
        use_style_defaults: When True the preset's values are used and the
            manual speed/pitch/pause arguments are ignored
    Returns:
        ``(sample_rate, audio)`` for the audio component, or ``None`` when no
        text was entered (a warning is shown instead).
    Raises:
        gr.Error: If generation fails. ``gr.Error`` is an exception and only
            reaches the user when raised; the previous code merely
            constructed it, so failures were silently dropped.
    """
    if not text or not text.strip():
        gr.Warning("Please enter some text to synthesize.")
        return None
    try:
        if use_style_defaults:
            sample_rate, audio = tts_engine.generate_with_style(
                text=text,
                voice=voice,
                style_preset=style,
            )
        else:
            sample_rate, audio = tts_engine.generate(
                text=text,
                voice=voice,
                speed=speed,
                pitch_shift=pitch,
                pause_between_sentences_ms=pause,
            )
    except Exception as e:
        raise gr.Error(f"Generation failed: {str(e)}") from e
    return (sample_rate, audio)
def update_style_info(style: str) -> str:
    """Render the Markdown summary shown for the selected style preset.

    Unknown style keys fall back to the "neutral" preset. Recommended voices
    that are missing from VOICE_CATALOG are left out of the list.
    """
    preset = STYLE_PRESETS.get(style, STYLE_PRESETS["neutral"])
    voice_names = []
    for voice_id in preset.recommended_voices:
        if voice_id in VOICE_CATALOG:
            voice_names.append(VOICE_CATALOG[voice_id][0])
    recommended = ", ".join(voice_names)
    lines = (
        f"**{preset.name}**",
        f"{preset.description}",
        f"- **Speed:** {preset.speed}x",
        f"- **Pitch Shift:** {preset.pitch_shift:+.1f} semitones",
        f"- **Pause Multiplier:** {preset.pause_multiplier}x",
        f"**Recommended Voices:** {recommended}",
    )
    # The summary ends with a newline, one line per entry above.
    return "\n".join(lines) + "\n"
def update_controls_from_style(style: str, use_defaults: bool):
    """Return updates for the speed, pitch and pause sliders, in that order.

    When ``use_defaults`` is set, the sliders are moved to the selected
    preset's values (unknown styles fall back to "neutral"; the pause is
    300 ms times the preset's multiplier). Otherwise three no-op updates are
    returned so the sliders keep their current values.
    """
    if use_defaults:
        preset = STYLE_PRESETS.get(style, STYLE_PRESETS["neutral"])
        slider_values = (
            preset.speed,
            preset.pitch_shift,
            int(300 * preset.pause_multiplier),
        )
        return tuple(gr.update(value=v) for v in slider_values)
    return gr.update(), gr.update(), gr.update()
# Sample texts for demonstration
# Keyed by a short identifier; the sample buttons in the UI and
# load_sample_text() look texts up by these keys. The texts contain literal
# line breaks.
SAMPLE_TEXTS = {
    "welcome": """Welcome to Kokoro Text-to-Speech! This is an open-source model with 82 million parameters,
capable of producing natural-sounding speech. Try different voices and styles to find your perfect combination.""",
    "horror": """The old house creaked as I pushed open the door. Something moved in the shadows.
A whisper echoed through the empty halls... "You shouldn't have come here."
I turned to run, but the door had vanished.""",
    "news": """Breaking news tonight: Scientists have made a groundbreaking discovery that could change
our understanding of the universe. The research team announced their findings at a press conference
held earlier today at the National Science Foundation.""",
    "story": """Once upon a time, in a kingdom far away, there lived a young princess who dreamed of adventure.
One day, she discovered a magical map hidden in the castle library.
Little did she know, this map would lead her to the greatest journey of her life.""",
    "technical": """The system architecture consists of three main components: the frontend user interface,
the backend API server, and the database layer. Each component is designed for scalability and
can be deployed independently using container orchestration.""",
}
def load_sample_text(sample_key: str) -> str:
    """Return the sample text stored under ``sample_key``, or "" if unknown."""
    try:
        return SAMPLE_TEXTS[sample_key]
    except KeyError:
        return ""
# Build the Gradio interface
# Layout: a header, then one row with two columns (inputs on the left,
# sliders / generate button / audio player on the right), then a footer.
# Event handlers are wired at the bottom of the block.
# NOTE(review): several labels below contain mis-decoded emoji (mojibake);
# they are left untouched here because they are runtime strings.
with gr.Blocks(
    title="Text-to-Speech",
    theme=gr.themes.Soft(),
    css="""
.main-title {
text-align: center;
margin-bottom: 1rem;
}
.info-box {
border: 1px solid var(--border-color-primary);
border-radius: 8px;
padding: 1rem;
margin: 0.5rem 0;
}
.info-box strong {
color: var(--body-text-color);
}
"""
) as demo:
    # Header
    gr.Markdown(
        """
# ποΈ Text-to-Speech
**Created by Yash Chowdhary**
An open-source, high-quality TTS system powered by [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)
Features: 28 voices β’ Style presets β’ Speed/Pitch/Pause control β’ CPU-friendly
""",
        elem_classes=["main-title"]
    )
    with gr.Row():
        # Left column - Input controls
        with gr.Column(scale=1):
            # Text input
            text_input = gr.Textbox(
                label="π Text to Synthesize",
                placeholder="Enter your text here...",
                lines=6,
                max_lines=15,
                info=f"Maximum {MAX_CHAR_LIMIT} characters"
            )
            # Sample text buttons: each click writes one SAMPLE_TEXTS entry
            # into the text box.
            with gr.Accordion("π Sample Texts", open=False):
                with gr.Row():
                    gr.Button("Welcome", size="sm").click(
                        lambda: SAMPLE_TEXTS["welcome"], outputs=text_input
                    )
                    gr.Button("Horror π»", size="sm").click(
                        lambda: SAMPLE_TEXTS["horror"], outputs=text_input
                    )
                    gr.Button("News π°", size="sm").click(
                        lambda: SAMPLE_TEXTS["news"], outputs=text_input
                    )
                with gr.Row():
                    gr.Button("Story π", size="sm").click(
                        lambda: SAMPLE_TEXTS["story"], outputs=text_input
                    )
                    gr.Button("Technical π»", size="sm").click(
                        lambda: SAMPLE_TEXTS["technical"], outputs=text_input
                    )
            # Voice selection
            voice_dropdown = gr.Dropdown(
                choices=create_voice_choices(),
                value="af_heart",
                label="π Voice",
                info="Select a voice (sorted by quality grade)"
            )
            # Style preset
            style_dropdown = gr.Dropdown(
                choices=create_style_choices(),
                value="neutral",
                label="π¨ Style Preset",
                info="Choose a style for different content types"
            )
            # Style info display (initially showing the "neutral" preset)
            style_info = gr.Markdown(
                value=update_style_info("neutral"),
                elem_classes=["info-box"]
            )
            # Use style defaults checkbox
            use_style_defaults = gr.Checkbox(
                label="Use Style Preset Defaults",
                value=True,
                info="When checked, style preset values override manual controls"
            )
        # Right column - Advanced controls and output
        with gr.Column(scale=1):
            # Advanced controls
            with gr.Accordion("βοΈ Advanced Controls", open=True):
                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.05,
                    label="π Speed",
                    info="Speaking rate (0.5x = slow, 2.0x = fast)"
                )
                pitch_slider = gr.Slider(
                    minimum=-5.0,
                    maximum=5.0,
                    value=0.0,
                    step=0.5,
                    label="π΅ Pitch Shift (semitones)",
                    info="Adjust voice pitch (-5 = deeper, +5 = higher)"
                )
                pause_slider = gr.Slider(
                    minimum=0,
                    maximum=1000,
                    value=300,
                    step=50,
                    label="βΈοΈ Pause Between Sentences (ms)",
                    info="Silence duration between sentences"
                )
            # Generate button
            generate_btn = gr.Button(
                "ποΈ Generate Speech",
                variant="primary",
                size="lg"
            )
            # Audio output
            audio_output = gr.Audio(
                label="π Generated Audio",
                type="numpy",
                interactive=False,
                autoplay=True
            )
            # Download info
            gr.Markdown(
                """
π‘ **Tips:**
- Click the download button (β¬οΈ) on the audio player to save
- Try different voices with the same text to compare
- Use style presets as starting points, then customize
"""
            )
    # Footer
    gr.Markdown(
        """
---
**Created by Yash Chowdhary** | Powered by [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) (Apache 2.0)
**Resources:** [Model Card](https://huggingface.co/hexgrad/Kokoro-82M) |
[Voice List](https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md) |
[GitHub](https://github.com/hexgrad/kokoro)
"""
    )
    # Event handlers
    # Changing the style refreshes the preset summary ...
    style_dropdown.change(
        fn=update_style_info,
        inputs=[style_dropdown],
        outputs=[style_info]
    )
    # ... and, when preset defaults are enabled, moves the three sliders.
    style_dropdown.change(
        fn=update_controls_from_style,
        inputs=[style_dropdown, use_style_defaults],
        outputs=[speed_slider, pitch_slider, pause_slider]
    )
    # Toggling the checkbox re-applies the same slider update logic.
    use_style_defaults.change(
        fn=update_controls_from_style,
        inputs=[style_dropdown, use_style_defaults],
        outputs=[speed_slider, pitch_slider, pause_slider]
    )
    # The input order below matches generate_speech's parameter order.
    generate_btn.click(
        fn=generate_speech,
        inputs=[
            text_input,
            voice_dropdown,
            style_dropdown,
            speed_slider,
            pitch_slider,
            pause_slider,
            use_style_defaults,
        ],
        outputs=[audio_output]
    )
# Launch configuration
if __name__ == "__main__":
    # Serve on every network interface at port 7860, without a public share
    # link and with the API page enabled; requests go through Gradio's queue.
    launch_options = dict(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_api=True,
    )
    demo.queue().launch(**launch_options)