# Hugging Face Spaces page banner (scrape residue): status "Running"
| """ | |
| Kokoro TTS with Voice Cloning - Gradio 6 Application | |
| A text-to-speech application supporting multiple languages and voice cloning. | |
| """ | |
| import os | |
| import gradio as gr | |
| from kokoro import KModel, KPipeline | |
| import numpy as np | |
| import torch | |
| import torchaudio | |
| from pathlib import Path | |
| import tempfile | |
| from datetime import datetime | |
# ============================================================
# Model and Pipeline Initialization
# ============================================================
# Shared Kokoro objects, populated once by init_kokoro() at import time.
# The default pipeline is built with lang_code='a' (American English).
PIPELINE = None
MODEL = None


def init_kokoro():
    """Load the Kokoro model once and build the default pipeline around it.

    The model weights are loaded a single time and handed to the pipeline,
    instead of letting the pipeline load a private second copy.

    Returns:
        True when both ``MODEL`` and ``PIPELINE`` were created, False otherwise.
        On failure both globals are reset to ``None`` so callers never see a
        half-initialized state.
    """
    global PIPELINE, MODEL
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = KModel().to(device).eval()
        # Reuse the already-loaded model rather than loading the weights twice.
        pipeline = KPipeline(lang_code='a', model=model)  # American English
    except Exception as e:
        PIPELINE = None
        MODEL = None
        print(f"Error initializing Kokoro: {e}")
        return False
    MODEL = model
    PIPELINE = pipeline
    return True


# Initialize on module load
init_success = init_kokoro()
# ============================================================
# Language Configuration
# ============================================================
# Every language entry below uses the same output sample rate.
_DEFAULT_SAMPLE_RATE = 24000

# UI language key -> display name, Kokoro language code and sample rate.
LANGUAGES = {
    key: {'name': label, 'code': kokoro_code, 'sample_rate': _DEFAULT_SAMPLE_RATE}
    for key, label, kokoro_code in (
        ('en', 'English (US)', 'a'),
        ('en-gb', 'English (UK)', 'b'),
        ('es', 'Spanish', 'e'),
        ('fr', 'French', 'f'),
        ('pt', 'Portuguese', 'p'),
        ('jp', 'Japanese', 'j'),
        ('zh', 'Chinese', 'z'),
    )
}
# ============================================================
# Voice Configuration
# ============================================================
# Built-in Kokoro voices (adjust based on available voices in your version).
# Each row is (voice id, first name, gender); the display name is derived
# from the first name and the capitalized gender.
# NOTE(review): confirm that every id listed here actually ships with the
# installed Kokoro voice pack - TODO verify.
BUILTIN_VOICES = {
    voice_id: {'name': f"{first_name} ({gender.capitalize()})", 'gender': gender}
    for voice_id, first_name, gender in (
        ('af_bella', 'Bella', 'female'),
        ('af_sarah', 'Sarah', 'female'),
        ('af_sky', 'Sky', 'female'),
        ('am_adam', 'Adam', 'male'),
        ('am_michael', 'Michael', 'male'),
        ('bf_emma', 'Emma', 'female'),
        ('bm_george', 'George', 'male'),
        ('ef_alice', 'Alice', 'female'),
        ('em_david', 'David', 'male'),
        ('pf_sophia', 'Sophia', 'female'),
        ('pm_liam', 'Liam', 'male'),
    )
}
# ============================================================
# Core TTS Functions
# ============================================================
# Pipelines created on demand, keyed by Kokoro language code.
_PIPELINES_BY_LANG = {}


def _get_pipeline(lang_code):
    """Return the pipeline for ``lang_code``, creating and caching it on first use."""
    if lang_code not in _PIPELINES_BY_LANG:
        if getattr(PIPELINE, "lang_code", None) == lang_code:
            # The module-level pipeline already targets this language.
            _PIPELINES_BY_LANG[lang_code] = PIPELINE
        else:
            # Share the already-loaded model so weights are not loaded again.
            shared_model = getattr(PIPELINE, "model", None)
            _PIPELINES_BY_LANG[lang_code] = KPipeline(
                lang_code=lang_code,
                model=shared_model if shared_model is not None else True,
            )
    return _PIPELINES_BY_LANG[lang_code]


def _synthesize(pipeline, text, voice_name, speed):
    """Run ``pipeline`` over ``text`` and return one 1-D float32 tensor, or None."""
    chunks = []
    for result in pipeline(text, voice=voice_name, speed=speed):
        # Each result unpacks as (graphemes, phonemes, audio).
        _graphemes, _phonemes, audio = result
        if audio is None:
            continue
        chunks.append(
            torch.as_tensor(audio, dtype=torch.float32).detach().flatten().cpu()
        )
    if not chunks:
        return None
    return torch.cat(chunks)


def generate_speech(
    text: str,
    voice: str,
    language: str,
    speed: float = 1.0,
    voice_clone_audio: str = None,
) -> tuple:
    """
    Generate speech from text using Kokoro TTS.

    Args:
        text: The text to convert to speech
        voice: The voice id to use; unknown ids fall back to 'af_bella'
        language: Key into LANGUAGES; unknown keys fall back to 'en'
        speed: Speech speed multiplier
        voice_clone_audio: Optional path to voice sample for cloning

    Returns:
        Tuple of (audio_output_path, sample_rate, status_message). The first
        two items are None whenever no audio file was produced. Errors are
        reported through the status message rather than raised.
    """
    if not text or not text.strip():
        return None, None, "⚠️ Please enter some text to synthesize."
    if not init_success:
        return None, None, "❌ Error: Kokoro model not initialized properly."
    try:
        # Get language configuration
        lang_config = LANGUAGES.get(language, LANGUAGES['en'])

        # Create output directory
        output_dir = Path("outputs")
        output_dir.mkdir(exist_ok=True)

        # Microsecond resolution keeps requests within the same second from
        # overwriting each other's file.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        output_path = output_dir / f"kokoro_tts_{timestamp}.wav"

        # If using voice cloning
        if voice_clone_audio and os.path.exists(voice_clone_audio):
            return generate_with_voice_clone(
                text, voice_clone_audio, speed, output_path, lang_config
            )

        # Standard TTS generation
        if PIPELINE is None:
            # Fallback: use model directly if pipeline fails
            return generate_direct_model(
                text, voice, language, speed, output_path, lang_config
            )

        voice_name = voice if voice in BUILTIN_VOICES else 'af_bella'

        # The language is a property of the pipeline, not of the call, so
        # pick (or lazily build) the pipeline matching the requested language.
        pipeline = _get_pipeline(lang_config['code'])
        audio_tensor = _synthesize(pipeline, text, voice_name, float(speed))
        if audio_tensor is None:
            return None, None, "❌ No audio was generated."

        # torchaudio expects (channels, frames); the audio is mono.
        torchaudio.save(
            str(output_path),
            audio_tensor.unsqueeze(0),
            lang_config['sample_rate'],
        )
        return (
            str(output_path),
            lang_config['sample_rate'],
            "✅ Audio generated successfully!",
        )
    except Exception as e:
        return None, None, f"❌ Error generating speech: {str(e)}"
def generate_direct_model(
    text: str,
    voice: str,
    language: str,
    speed: float,
    output_path: Path,
    lang_config: dict
) -> tuple:
    """
    Fallback generation path used when no pipeline is available.

    With a loaded MODEL this path is not implemented and reports an error.
    Without one it writes a placeholder 440 Hz tone whose length follows the
    text length (50 ms per character, clamped to 0.5-5 seconds).

    Returns:
        Tuple of (audio_output_path, sample_rate, status_message); the first
        two items are None on error.
    """
    try:
        if MODEL is not None:
            # Note: This is a simplified version - actual implementation depends on model version
            raise NotImplementedError("Direct model generation requires specific model setup")

        # No model at all: emit a simple tone as a placeholder.
        import soundfile as sf

        rate = lang_config['sample_rate']
        seconds = max(0.5, min(len(text) * 0.05, 5.0))
        timeline = np.linspace(0, seconds, int(rate * seconds))
        tone = 0.3 * np.sin(2 * np.pi * 440 * timeline * speed)

        target = str(output_path)
        sf.write(target, tone.astype(np.float32), rate)
        return target, rate, "⚠️ Using fallback audio generation."
    except Exception as e:
        return None, None, f"❌ Direct model error: {str(e)}"
def generate_with_voice_clone(
    text: str,
    voice_sample_path: str,
    speed: float,
    output_path: Path,
    lang_config: dict
) -> tuple:
    """
    Validate an uploaded voice sample and report the cloning placeholder status.

    No audio is synthesized here: cloning is not implemented, so the function
    only checks that the sample exists, can be decoded, and lasts between
    0.5 and 30 seconds, then returns an informational message.

    Returns:
        Tuple of (None, None, status_message) in every case.
    """
    try:
        if not os.path.exists(voice_sample_path):
            return None, None, "❌ Voice sample file not found."

        # Decode the sample to measure its duration.
        try:
            samples, rate = torchaudio.load(voice_sample_path)
            seconds = samples.shape[1] / rate
            if seconds < 0.5:
                return None, None, "❌ Voice sample too short (minimum 0.5 seconds)."
            if seconds > 30:
                return None, None, "❌ Voice sample too long (maximum 30 seconds)."
        except Exception as audio_error:
            return None, None, f"❌ Error reading audio file: {str(audio_error)}"

        # Placeholder: real cloning would need voice feature extraction, a
        # speaker encoder and a TTS model with voice conditioning.
        sample_name = os.path.basename(voice_sample_path)
        notice_parts = [
            "🔊 Voice Cloning Mode Activated!\n",
            f"📁 Sample: {sample_name}\n",
            f"⏱️ Duration: {seconds:.1f}s\n\n",
            "ℹ️ Note: Full voice cloning requires additional model setup. ",
            "Please use the standard voice selection for now.",
        ]
        return None, None, "".join(notice_parts)
    except Exception as e:
        return None, None, f"❌ Voice cloning error: {str(e)}"
def load_voice_sample_info(audio_path: str) -> str:
    """Describe an uploaded voice sample (duration, sample rate, channels).

    Returns an empty string when no existing file is given, and an error
    message string when the file cannot be read.
    """
    sample_available = bool(audio_path) and os.path.exists(audio_path)
    if not sample_available:
        return ""
    try:
        samples, rate = torchaudio.load(audio_path)
        seconds = samples.shape[1] / rate
        channel_count = samples.shape[0]
        summary_lines = [
            "📊 Sample Info:",
            f"• Duration: {seconds:.2f}s",
            f"• Sample Rate: {rate}Hz",
            f"• Channels: {channel_count}",
        ]
        return "\n".join(summary_lines)
    except Exception as e:
        return f"Error reading file: {e}"
def get_voice_options():
    """Return the dropdown labels: one per built-in voice, then the clone entry.

    Each built-in label is the voice's display name followed by its gender in
    parentheses; the UI handlers match on this exact format.
    """
    labels = [
        f"{entry['name']} ({entry['gender']})"
        for entry in BUILTIN_VOICES.values()
    ]
    labels.append("🎤 Voice Clone (Upload Sample)")
    return labels
def get_language_options():
    """Return (label, key) pairs for the language dropdown, in LANGUAGES order."""
    options = []
    for key, config in LANGUAGES.items():
        label = f"{config['name']} ({key})"
        options.append((label, key))
    return options
# ============================================================
# Custom CSS Styles
# ============================================================
# Stylesheet handed to the app at launch. Selectors target the class names
# used in the layout's HTML snippets and elem_classes (header-section,
# generate-btn, speed-control, audio-player, status-message, footer-text, ...).
# NOTE(review): .tts-card, .voice-card, .upload-area and .status-message.info
# are not referenced by the layout visible in this file, and
# ".language-selector .gr-radio" targets a radio group while the language
# selector is a Dropdown - confirm whether these rules are still needed.
CUSTOM_CSS = """
/* Custom styling for Kokoro TTS App */
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
/* Base font */
.gradio-container {
    font-family: 'Inter', sans-serif !important;
}
/* Header styling */
.header-section {
    text-align: center;
    padding: 1rem 0;
    margin-bottom: 1rem;
}
.header-section h1 {
    font-size: 2.5rem !important;
    font-weight: 700 !important;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    background-clip: text;
    margin-bottom: 0.5rem !important;
}
.header-section .subtitle {
    font-size: 1.1rem;
    color: #6b7280;
    margin-bottom: 0.5rem;
}
/* Card styling */
.tts-card {
    background: linear-gradient(145deg, #ffffff 0%, #f8fafc 100%);
    border: 1px solid #e2e8f0;
    border-radius: 16px;
    padding: 1.5rem;
    margin: 1rem 0;
    box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
}
.tts-card h3 {
    color: #1f2937;
    font-weight: 600;
    margin-bottom: 1rem;
    display: flex;
    align-items: center;
    gap: 0.5rem;
}
/* Voice card styling */
.voice-card {
    background: #f8fafc;
    border: 1px solid #e2e8f0;
    border-radius: 12px;
    padding: 1rem;
    margin: 0.5rem 0;
    transition: all 0.2s ease;
}
.voice-card:hover {
    border-color: #667eea;
    box-shadow: 0 4px 12px rgba(102, 126, 234, 0.15);
}
.voice-card.selected {
    border-color: #667eea;
    background: linear-gradient(135deg, rgba(102, 126, 234, 0.1) 0%, rgba(118, 75, 162, 0.1) 100%);
}
/* Language selector */
.language-selector .gr-radio {
    gap: 0.5rem;
}
.language-selector .gr-radio label {
    padding: 0.5rem 1rem;
    background: #f1f5f9;
    border-radius: 8px;
    transition: all 0.2s ease;
}
.language-selector .gr-radio input:checked + label {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
}
/* Button styling */
.generate-btn {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    border: none !important;
    color: white !important;
    font-weight: 600 !important;
    padding: 1rem 2rem !important;
    border-radius: 12px !important;
    transition: all 0.2s ease !important;
    box-shadow: 0 4px 15px rgba(102, 126, 234, 0.4) !important;
}
.generate-btn:hover {
    transform: translateY(-2px);
    box-shadow: 0 6px 20px rgba(102, 126, 234, 0.5) !important;
}
/* Upload area */
.upload-area {
    border: 2px dashed #e2e8f0;
    border-radius: 12px;
    padding: 2rem;
    text-align: center;
    transition: all 0.2s ease;
    background: #fafafa;
}
.upload-area:hover {
    border-color: #667eea;
    background: rgba(102, 126, 234, 0.05);
}
/* Status messages */
.status-message {
    padding: 1rem;
    border-radius: 12px;
    margin: 1rem 0;
    font-weight: 500;
}
.status-message.success {
    background: linear-gradient(135deg, #10b981 0%, #059669 100%);
    color: white;
}
.status-message.error {
    background: linear-gradient(135deg, #ef4444 0%, #dc2626 100%);
    color: white;
}
.status-message.info {
    background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%);
    color: white;
}
/* Speed slider */
.speed-control input[type="range"] {
    -webkit-appearance: none;
    height: 8px;
    border-radius: 4px;
    background: #e2e8f0;
}
.speed-control input[type="range"]::-webkit-slider-thumb {
    -webkit-appearance: none;
    width: 20px;
    height: 20px;
    border-radius: 50%;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    cursor: pointer;
    box-shadow: 0 2px 6px rgba(102, 126, 234, 0.4);
}
/* Audio player */
.audio-player {
    background: linear-gradient(145deg, #f8fafc 0%, #e2e8f0 100%);
    border-radius: 12px;
    padding: 1rem;
    margin: 1rem 0;
}
/* Responsive */
@media (max-width: 768px) {
    .header-section h1 {
        font-size: 1.8rem !important;
    }
    .tts-card {
        padding: 1rem;
    }
}
/* Footer */
.footer-text {
    text-align: center;
    padding: 2rem 0;
    color: #6b7280;
    font-size: 0.9rem;
}
.footer-text a {
    color: #667eea;
    text-decoration: none;
}
.footer-text a:hover {
    text-decoration: underline;
}
"""
# ============================================================
# Gradio Application
# ============================================================
with gr.Blocks() as demo:
    # Dropdown labels, computed once so the default value is guaranteed to be
    # one of the offered choices.
    voice_choices = get_voice_options()

    # Built-in voice label -> voice id, using the same "<name> (<gender>)"
    # label format that get_voice_options() produces.
    voice_id_by_label = {
        f"{info['name']} ({info['gender']})": voice_id
        for voice_id, info in BUILTIN_VOICES.items()
    }

    def is_clone_selection(voice_selection):
        """Return True when the dropdown entry is the voice-clone option."""
        label = voice_selection or ""
        return "Clone" in label or "Upload" in label

    # Header
    gr.HTML("""
    <div class="header-section">
        <h1>🎙️ Kokoro TTS Studio</h1>
        <p class="subtitle">Advanced Text-to-Speech with Voice Cloning</p>
        <p style="font-size: 0.9rem; color: #9ca3af;">
            Transform your text into natural-sounding speech in multiple languages
        </p>
    </div>
    """)

    # Main content
    with gr.Row():
        with gr.Column(scale=2):
            # Text Input Section
            with gr.Group():
                gr.HTML("""<h3>📝 Text Input</h3>""")
                text_input = gr.Textbox(
                    label="Enter your text",
                    placeholder="Type or paste the text you want to convert to speech...",
                    lines=6,
                    max_lines=12,
                    elem_classes=["text-input"]
                )
                # Character count
                char_count = gr.Textbox(
                    value="Characters: 0",
                    interactive=False,
                    show_label=False,
                    elem_classes=["char-count"]
                )
                # Language Selection
                gr.HTML("""<h3 style="margin-top: 1rem;">🌐 Language</h3>""")
                language_dropdown = gr.Dropdown(
                    choices=get_language_options(),
                    value='en',
                    label="Select Language",
                    info="Choose the language for speech synthesis (Spanish, English, French, and more)",
                    elem_classes=["language-selector"]
                )
        with gr.Column(scale=1):
            # Voice Selection Section
            with gr.Group():
                gr.HTML("""<h3>🎭 Voice Selection</h3>""")
                voice_dropdown = gr.Dropdown(
                    choices=voice_choices,
                    # First choice is the first built-in voice (Bella).
                    value=voice_choices[0],
                    label="Select Voice",
                    info="Choose a voice for speech synthesis"
                )
                # Voice preview info
                voice_info = gr.Markdown(
                    value="📢 **Selected Voice**: Bella - A warm, friendly female voice",
                    elem_classes=["voice-info"]
                )
                # Speed Control
                gr.HTML("""<h3 style="margin-top: 1rem;">⚡ Speed</h3>""")
                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speech Speed",
                    info="Adjust the speed of the generated speech (0.5x - 2.0x)",
                    elem_classes=["speed-control"]
                )
                speed_display = gr.Textbox(
                    value="1.0x",
                    interactive=False,
                    show_label=False
                )

    # Voice Cloning Section (bound to a name so it can be opened from code)
    with gr.Accordion("🎤 Voice Cloning (Beta)", open=False) as clone_accordion:
        gr.Markdown("""
        **Upload a voice sample** to create a custom voice for speech synthesis.
        Requirements:
        - Audio format: WAV, MP3, FLAC
        - Duration: 3-30 seconds
        - Quality: Clear speech without background noise
        - Single speaker
        """)
        with gr.Row():
            with gr.Column(scale=2):
                voice_upload = gr.Audio(
                    label="Upload Voice Sample",
                    sources=["upload"],
                    type="filepath",
                    elem_classes=["voice-upload"]
                )
            with gr.Column(scale=1):
                voice_info_output = gr.Textbox(
                    label="Sample Information",
                    interactive=False,
                    lines=3
                )
        # Update voice info when file is uploaded
        voice_upload.change(
            fn=load_voice_sample_info,
            inputs=voice_upload,
            outputs=voice_info_output
        )

    # Show cloning options when voice clone is selected
    def on_voice_change(voice_selection):
        return gr.Accordion(open=is_clone_selection(voice_selection))

    # Generate Button
    with gr.Row():
        generate_btn = gr.Button(
            "🎵 Generate Speech",
            variant="primary",
            size="lg",
            elem_classes=["generate-btn"]
        )

    # Status Output
    status_output = gr.Textbox(
        label="Status",
        interactive=False,
        visible=False
    )

    # Audio Output
    with gr.Group(elem_classes=["audio-player"]):
        audio_output = gr.Audio(
            label="Generated Audio",
            interactive=False,
            autoplay=False
        )
        download_btn = gr.DownloadButton(
            "📥 Download Audio",
            value=None,
            variant="secondary",
            visible=False
        )

    # Examples Section
    with gr.Accordion("📋 Example Texts", open=False):
        gr.Markdown("Click on any example to try it out:")
        examples = gr.Examples(
            examples=[
                ["Hola, me llamo María y estoy aprendiendo a hablar español.", "es"],
                ["Hello! This is a text-to-speech demo using Kokoro.", "en"],
                ["Bonjour! Comment allez-vous aujourd'hui?", "fr"],
                ["Olá! Tudo bem com você?", "pt"],
                ["こんにちは!元気ですか?", "jp"],
                ["你好!今天天气真好!", "zh"],
            ],
            inputs=[text_input, language_dropdown],
            label="Example Texts"
        )

    # Footer
    gr.HTML("""
    <div class="footer-text">
        <p>
            🔗 <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">Built with anycoder</a>
        </p>
        <p style="margin-top: 0.5rem; font-size: 0.8rem;">
            Powered by Kokoro TTS • A Hugging Face Space
        </p>
    </div>
    """)

    # ============================================================
    # Event Handlers
    # ============================================================
    # Update character count
    def update_char_count(text):
        # The textbox may report None when cleared programmatically.
        return f"Characters: {len(text or '')}"

    text_input.change(
        fn=update_char_count,
        inputs=text_input,
        outputs=char_count
    )

    # Update speed display
    def update_speed_display(speed):
        return f"{speed:.1f}x"

    speed_slider.change(
        fn=update_speed_display,
        inputs=speed_slider,
        outputs=speed_display
    )

    # Update voice info when selection changes
    def update_voice_info(voice_selection):
        voice_id = voice_id_by_label.get(voice_selection)
        if voice_id is None:
            return "🎤 **Voice Clone Mode**: Upload a sample to clone a voice"
        info = BUILTIN_VOICES[voice_id]
        tone = 'warm, friendly female' if info['gender'] == 'female' else 'deep, resonant male'
        return f"📢 **Selected Voice**: {info['name']} - A {tone} voice"

    voice_dropdown.change(
        fn=update_voice_info,
        inputs=voice_dropdown,
        outputs=voice_info
    )

    # Open the cloning accordion when the clone entry is chosen, close it otherwise
    voice_dropdown.change(
        fn=on_voice_change,
        inputs=voice_dropdown,
        outputs=clone_accordion
    )

    # Main generation function
    def handle_generation(text, voice, language, speed, voice_sample):
        """Resolve the UI selections, run synthesis and build the three output updates."""
        # Extract voice ID from display name (default voice when not a built-in label)
        voice_id = voice_id_by_label.get(voice, 'af_bella')

        # Only use the uploaded sample when the clone entry is selected, so a
        # leftover upload does not override a built-in voice.
        clone_mode = is_clone_selection(voice)
        clone_path = None
        if clone_mode and isinstance(voice_sample, str) and voice_sample:
            clone_path = voice_sample

        if clone_mode and clone_path is None:
            audio_path = None
            message = "⚠️ Voice Clone mode is selected. Please upload a voice sample first."
        else:
            # Generate speech
            audio_path, _sample_rate, message = generate_speech(
                text=text,
                voice=voice_id,
                language=language,
                speed=speed,
                voice_clone_audio=clone_path
            )

        # Return outputs
        if audio_path and os.path.exists(audio_path):
            return (
                gr.Audio(value=audio_path, visible=True),
                gr.DownloadButton(value=audio_path, visible=True),
                gr.Textbox(value=message, visible=True, elem_classes=["status-message", "success"]),
            )
        return (
            gr.Audio(visible=False),
            gr.DownloadButton(visible=False),
            gr.Textbox(value=message, visible=True, elem_classes=["status-message", "error"]),
        )

    # Connect generation button
    generate_btn.click(
        fn=handle_generation,
        inputs=[text_input, voice_dropdown, language_dropdown, speed_slider, voice_upload],
        outputs=[audio_output, download_btn, status_output],
        show_progress="full"
    )

    # Handle text submission with Enter key
    text_input.submit(
        fn=handle_generation,
        inputs=[text_input, voice_dropdown, language_dropdown, speed_slider, voice_upload],
        outputs=[audio_output, download_btn, status_output],
        show_progress="full"
    )
# ============================================================
# Launch Application
# ============================================================
if __name__ == "__main__":
    # The page title is a Blocks attribute, not a launch() option, so it is
    # set on the app object. "description" and "article" are gr.Interface
    # options with no launch() equivalent; the header HTML in the layout
    # already carries that text, so they are not passed here.
    # NOTE(review): confirm against the installed Gradio version's
    # Blocks.launch() signature.
    demo.title = "Kokoro TTS Studio"

    app_theme = gr.themes.Soft(
        primary_hue="indigo",
        secondary_hue="purple",
        neutral_hue="slate",
        font=gr.themes.GoogleFont("Inter"),
        text_size="lg",
        spacing_size="md",
        radius_size="md"
    ).set(
        button_primary_background_fill="linear-gradient(135deg, #667eea 0%, #764ba2 100%)",
        button_primary_background_fill_hover="linear-gradient(135deg, #7c8ff0 0%, #865cb8 100%)",
        button_primary_text_color="white",
        button_secondary_background_fill="#f1f5f9",
        button_secondary_text_color="#475569",
        block_background_fill="white",
        block_border_color="#e2e8f0",
        block_radius="12px",
        block_title_text_weight="600",
        input_background_fill="#f8fafc",
        input_border_color="#e2e8f0",
    )

    demo.launch(
        theme=app_theme,
        css=CUSTOM_CSS,
        footer_links=[
            {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
            {"label": "Kokoro TTS", "url": "https://github.com/remsky/Kokoro-ONNX"},
            {"label": "Hugging Face", "url": "https://huggingface.co/"}
        ],
        show_error=True,
        quiet=False
    )