Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import torch | |
| import os | |
| from datetime import datetime | |
| # Try to import the pocket-tts model, with graceful fallback | |
| try: | |
| from pocket_tts import TextToSpeech, Voice, VoiceProfile, Speaker | |
| MODEL_AVAILABLE = True | |
| except ImportError: | |
| MODEL_AVAILABLE = False | |
| print("pocket-tts not installed. Run: pip install pocket-tts") | |
# Voice catalogue: maps a voice identifier (prefixed with its language code,
# e.g. "en_US_...") to the human-readable label shown to the user.
VOICE_OPTIONS = dict(
    en_US_male_1="American Male (Deep)",
    en_US_female_1="American Female (Clear)",
    en_US_female_2="American Female (Warm)",
    en_UK_male_1="British Male (Formal)",
    en_UK_female_1="British Female (Elegant)",
    en_AU_male_1="Australian Male (Casual)",
    en_AU_female_1="Australian Female (Friendly)",
)

# Language choices as (label, language code) pairs, the form the language
# dropdown consumes.
LANGUAGE_OPTIONS = list(
    zip(
        ("English (US)", "English (UK)", "English (Australia)"),
        ("en_US", "en_UK", "en_AU"),
    )
)

# Speed multipliers (0.5x to 2.0x). Not referenced by the visible code, which
# configures its speed slider directly.
SPEED_OPTIONS = [0.5, 0.75, 1.0, 1.25, 1.5, 2.0]

# Pitch offsets in semitones (-12 to +12 in steps of 6). Not referenced by the
# visible code, which configures its pitch slider directly.
PITCH_OPTIONS = list(range(-12, 13, 6))
def load_model(device="cpu"):
    """
    Instantiate the Pocket-TTS model, or report why it could not be created.

    Args:
        device: Device name forwarded to ``TextToSpeech`` ('cpu' or 'cuda')

    Returns:
        A ``TextToSpeech`` instance, or None when the package could not be
        imported or the constructor raised.
    """
    # NOTE(review): assumes the installed pocket-tts exposes
    # `TextToSpeech(device=...)` -- confirm against the package's actual API.
    if MODEL_AVAILABLE:
        try:
            return TextToSpeech(device=device)
        except Exception as exc:
            # Best effort: the UI still starts and reports the failure later.
            print(f"Error loading model: {exc}")
    return None
def generate_speech(
    text: str,
    voice: str,
    language: str,
    speed: float,
    pitch: int,
    sample_rate: int,
    model_instance
) -> tuple:
    """
    Synthesize ``text`` with the given Pocket-TTS model.

    Args:
        text: Input text to synthesize (surrounding whitespace is ignored)
        voice: Voice identifier, used for both the profile and the speaker
        language: Language code (accepted for interface parity; not used here)
        speed: Speech speed multiplier, forwarded to the model
        pitch: Pitch adjustment, forwarded to the model as ``pitch_shift``
        sample_rate: Output audio sample rate, forwarded to the model
        model_instance: Loaded TTS model, or None if loading failed

    Returns:
        A 2-tuple ``(audio, message)``:
        - on success: ``((sample_rate, audio_data), None)``
        - on failure: ``(None, message)`` where message explains the problem
    """
    cleaned = text.strip() if text else ""

    # Input validation comes first so the user gets the most specific message.
    if not cleaned:
        return None, "Please enter some text to synthesize."
    if len(cleaned) < 2:
        return None, "Text is too short. Please enter at least 2 characters."

    # Model availability checks.
    if not MODEL_AVAILABLE:
        return None, "Model not available. Please install pocket-tts: pip install pocket-tts"
    if model_instance is None:
        return None, "Model failed to load. Please check your installation."

    try:
        # NOTE(review): assumes pocket-tts provides Voice / VoiceProfile /
        # Speaker and a `tts(...)` method with these keywords -- confirm.
        identity = Voice(
            profile=VoiceProfile.from_id(voice),
            speaker=Speaker.from_id(voice),
        )
        waveform = model_instance.tts(
            text=cleaned,
            voice=identity,
            speed=speed,
            pitch_shift=pitch,
            sample_rate=sample_rate,
        )
    except Exception as exc:
        # Any synthesis failure is reported as a message, never raised.
        return None, f"Error generating speech: {exc}"

    return (sample_rate, waveform), None
def clear_all():
    """Return the default value for every input, in UI output order.

    The tuple is (text, language, voice, speed, pitch, sample rate, audio);
    the default voice is the first entry of ``VOICE_OPTIONS``.
    """
    default_voice = tuple(VOICE_OPTIONS)[0]
    return "", "en_US", default_voice, 1.0, 0, 24000, None
def get_voice_list(language: str, voices=None):
    """Get the voices available for the selected language.

    Voice identifiers are prefixed with their full language code
    (e.g. ``en_US_male_1``), so the match is made on ``"<language>_"``.
    Matching only on the part before the first underscore (``"en"``) would
    select every voice for every language, making the filter a no-op.

    Args:
        language: Language code such as ``"en_US"``. A bare prefix such as
            ``"en"`` still matches every voice that starts with ``"en_"``.
        voices: Optional mapping of voice identifier -> label to filter.
            Defaults to the module-level ``VOICE_OPTIONS``.

    Returns:
        Tuple ``(voice_ids, labels)`` of two parallel lists. When no voice
        matches the language, all voices are returned instead.
    """
    catalog = VOICE_OPTIONS if voices is None else voices
    prefix = f"{language}_"
    matching = {
        voice_id: label
        for voice_id, label in catalog.items()
        if voice_id.startswith(prefix)
    }
    # If no language-specific voices exist, fall back to the full catalogue.
    if not matching:
        matching = catalog
    return list(matching.keys()), list(matching.values())
# Custom CSS for the app.
# Passed to gr.Blocks(css=...) in create_app(). The class names below are the
# ones referenced by the raw HTML snippets built there:
#   .header-title / .header-subtitle  -> page header
#   .section-title                    -> section headings
#   .info-box / .success-box          -> status messages
#   .tips-box                         -> tips accordion
#   .built-with                       -> footer
# .audio-player is defined here but not referenced by the visible code.
CUSTOM_CSS = """
:root {
--primary-color: #6366f1;
--secondary-color: #8b5cf6;
}
.gradio-container {
max-width: 1200px !important;
}
.header-title {
text-align: center;
background: linear-gradient(135deg, #6366f1, #8b5cf6, #a855f7);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
font-size: 2.5rem;
font-weight: 700;
margin-bottom: 0.5rem;
}
.header-subtitle {
text-align: center;
color: #6b7280;
font-size: 1.1rem;
margin-bottom: 1.5rem;
}
.built-with {
text-align: center;
margin-top: 1rem;
padding: 0.75rem;
background: linear-gradient(135deg, #f0f9ff, #e0f2fe);
border-radius: 0.5rem;
border: 1px solid #bae6fd;
}
.built-with a {
color: #0369a1;
text-decoration: none;
font-weight: 500;
}
.built-with a:hover {
text-decoration: underline;
}
.audio-player {
border-radius: 0.75rem;
overflow: hidden;
}
.section-title {
font-size: 1.1rem;
font-weight: 600;
color: #374151;
margin-bottom: 0.5rem;
padding-bottom: 0.25rem;
border-bottom: 2px solid #e5e7eb;
}
.info-box {
background: #fef3c7;
border: 1px solid #fcd34d;
border-radius: 0.5rem;
padding: 0.75rem;
margin: 0.5rem 0;
font-size: 0.9rem;
color: #92400e;
}
.success-box {
background: #d1fae5;
border: 1px solid #6ee7b7;
border-radius: 0.5rem;
padding: 0.75rem;
margin: 0.5rem 0;
font-size: 0.9rem;
color: #065f46;
}
.tips-box {
background: #f3f4f6;
border: 1px solid #d1d5db;
border-radius: 0.5rem;
padding: 0.75rem;
margin: 0.5rem 0;
font-size: 0.85rem;
color: #4b5563;
}
.tips-box ul {
margin: 0.5rem 0 0 0;
padding-left: 1.25rem;
}
.tips-box li {
margin: 0.25rem 0;
}
"""
# Create custom theme
def _build_custom_theme():
    """Return the app theme: gradio's Soft theme with indigo/violet overrides."""
    soft = gr.themes.Soft(
        primary_hue="indigo",
        secondary_hue="violet",
        neutral_hue="slate",
        text_size="lg",
        spacing_size="lg",
        radius_size="md",
        font=gr.themes.GoogleFont("Inter"),
    )
    # Override individual theme variables on top of the Soft defaults.
    return soft.set(
        button_primary_background_fill="*primary_600",
        button_primary_background_fill_hover="*primary_700",
        button_secondary_background_fill="*secondary_200",
        button_secondary_background_fill_hover="*secondary_300",
        block_title_text_weight="600",
        block_title_text_color="*primary_700",
        input_background_fill="*neutral_100",
    )


custom_theme = _build_custom_theme()
def create_app():
    """Create the Gradio application.

    Builds the full UI (text input, voice settings, audio output, help
    accordions, footer) and wires the event handlers.

    Returns:
        The ``gr.Blocks`` application, ready for ``launch()``.
    """
    # Load model on startup. Handlers reach it through a closure instead of a
    # gr.State, because gr.State copies its initial value and a TTS model is
    # both large and not guaranteed to be copyable.
    model = load_model("cpu")

    default_language = "en_US"

    def voice_choices(language):
        """Return (label, voice id) dropdown choices for ``language``."""
        voice_ids, labels = get_voice_list(language)
        return list(zip(labels, voice_ids))

    initial_voices = voice_choices(default_language)

    # NOTE(review): newer Gradio releases expect theme/css on launch() rather
    # than on Blocks() -- confirm against the pinned Gradio version.
    with gr.Blocks(
        theme=custom_theme,
        css=CUSTOM_CSS,
        title="Pocket-TTS - Text to Speech Converter",
        fill_height=True
    ) as demo:
        # Header with branding
        gr.HTML("""
        <div class="header-section">
        <h1 class="header-title">🎙️ Pocket-TTS</h1>
        <p class="header-subtitle">High-Quality Text-to-Speech Synthesis with Natural Voices</p>
        </div>
        """)

        # Main content
        with gr.Row(equal_height=True):
            with gr.Column(scale=2):
                # Text input section
                gr.HTML('<p class="section-title">📝 Text Input</p>')
                text_input = gr.Textbox(
                    label="Text to Synthesize",
                    placeholder="Enter your text here... e.g., Hello! This is a text-to-speech demonstration.",
                    lines=5,
                    max_lines=10,
                    info="Enter the text you want to convert to speech",
                    interactive=True,
                    elem_id="text-input"
                )

                # Quick text buttons
                with gr.Row():
                    gr.Button("👋 Hello World", size="sm").click(
                        lambda: "Hello World! Welcome to the Pocket-TTS demo.",
                        outputs=text_input
                    )
                    gr.Button("📖 Sample Text", size="sm").click(
                        lambda: "The quick brown fox jumps over the lazy dog. This is a sample sentence to test the text-to-speech system.",
                        outputs=text_input
                    )
                    gr.Button("🧪 Long Text", size="sm").click(
                        lambda: "Artificial intelligence has revolutionized the way we interact with technology. From virtual assistants to autonomous vehicles, AI is everywhere. Text-to-speech systems have improved dramatically, offering more natural and expressive voices than ever before.",
                        outputs=text_input
                    )

                # Voice settings section
                gr.HTML('<p class="section-title">🎵 Voice Settings</p>')
                with gr.Row():
                    with gr.Column(scale=1):
                        language_dropdown = gr.Dropdown(
                            choices=LANGUAGE_OPTIONS,
                            value=default_language,
                            label="Language",
                            info="Select the language of your text",
                            elem_id="language"
                        )
                    with gr.Column(scale=2):
                        # Show the human-readable labels and only the voices
                        # that belong to the initially selected language.
                        voice_dropdown = gr.Dropdown(
                            choices=initial_voices,
                            value=initial_voices[0][1] if initial_voices else None,
                            label="Voice",
                            info="Choose a voice for synthesis",
                            elem_id="voice"
                        )

                # Advanced settings accordion
                with gr.Accordion("⚙️ Advanced Settings", open=False):
                    with gr.Row():
                        with gr.Column():
                            speed_slider = gr.Slider(
                                minimum=0.5,
                                maximum=2.0,
                                value=1.0,
                                step=0.25,
                                label="Speed",
                                info="Speech speed (0.5x - 2.0x)",
                                elem_id="speed"
                            )
                        with gr.Column():
                            pitch_slider = gr.Slider(
                                minimum=-12,
                                maximum=12,
                                value=0,
                                step=1,
                                label="Pitch",
                                info="Pitch shift (-12 to +12 semitones)",
                                elem_id="pitch"
                            )
                    with gr.Row():
                        with gr.Column():
                            # The default type ("value") is required here:
                            # with type="index" the handler would receive the
                            # position 0-4 instead of the rate in Hz.
                            sample_rate_dropdown = gr.Dropdown(
                                choices=[(str(sr), sr) for sr in [16000, 22050, 24000, 44100, 48000]],
                                value=24000,
                                label="Sample Rate",
                                info="Output audio sample rate (Hz)",
                                elem_id="sample-rate"
                            )

                # Generate button
                generate_btn = gr.Button(
                    "🎙️ Generate Speech",
                    variant="primary",
                    size="lg",
                    elem_id="generate-btn"
                )

            with gr.Column(scale=1):
                # Output section
                gr.HTML('<p class="section-title">🔊 Audio Output</p>')
                audio_output = gr.Audio(
                    label="Generated Audio",
                    type="numpy",
                    interactive=False,
                    elem_id="audio-output"
                )

                # Download button (appears when audio is generated)
                # NOTE(review): nothing ever assigns a file to this button, so
                # it has nothing to download -- confirm the intended source.
                download_btn = gr.DownloadButton(
                    "📥 Download Audio",
                    variant="secondary",
                    size="sm",
                    visible=False,
                    elem_id="download-btn"
                )

                # Status message
                status_output = gr.Markdown(
                    value="",
                    visible=False,
                    elem_id="status"
                )

                # Model info
                with gr.Accordion("ℹ️ Model Information", open=False):
                    gr.Markdown("""
                    **Pocket-TTS** by Kyutai Labs
                    - Lightweight text-to-speech model
                    - Optimized for CPU inference
                    - Multiple voice options
                    - Real-time synthesis support
                    **Requirements:**
                    - Python 3.8+
                    - PyTorch
                    - 2GB+ RAM
                    """)

                # Tips section
                with gr.Accordion("💡 Tips", open=False):
                    gr.HTML("""
                    <div class="tips-box">
                    <strong>Tips for best results:</strong>
                    <ul>
                    <li>Use proper punctuation for natural pauses</li>
                    <li>Try different voices for different contexts</li>
                    <li>Adjust speed for clarity (slower = clearer)</li>
                    <li>Pitch works best within ±6 semitones</li>
                    </ul>
                    </div>
                    """)

        # Footer with "Built with anycoder"
        gr.HTML("""
        <div class="built-with">
        <p>🚀 Powered by Pocket-TTS from Kyutai Labs</p>
        <p>🔧 Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">anycoder</a> — Deploy ML models in minutes</p>
        </div>
        """)

        # Event handlers
        def update_voices(language):
            """Update voice dropdown choices and selection for ``language``."""
            choices = voice_choices(language)
            return gr.Dropdown(
                choices=choices,
                value=choices[0][1] if choices else None
            )

        def update_status(message, success=False):
            """Build the status Markdown update; an empty message hides it."""
            if success:
                return gr.Markdown(
                    value=f'<div class="success-box">✅ {message}</div>',
                    visible=True
                )
            elif message:
                return gr.Markdown(
                    value=f'<div class="info-box">ℹ️ {message}</div>',
                    visible=True
                )
            return gr.Markdown(value="", visible=False)

        def on_generate(text, voice, language, speed, pitch, sample_rate):
            """Run synthesis and turn its message into a visible status."""
            audio, message = generate_speech(
                text, voice, language, speed, pitch, sample_rate, model
            )
            if message:
                return audio, update_status(message)
            return audio, update_status("Speech generated successfully.", success=True)

        def on_clear():
            """Reset every input, the voice choices and the status message."""
            text, language, voice, speed, pitch, sample_rate, audio = clear_all()
            voice_update = gr.Dropdown(choices=voice_choices(language), value=voice)
            return (
                text,
                language,
                voice_update,
                speed,
                pitch,
                sample_rate,
                audio,
                update_status(""),
            )

        # Connect events
        language_dropdown.change(
            update_voices,
            inputs=language_dropdown,
            outputs=voice_dropdown
        )
        generate_btn.click(
            on_generate,
            inputs=[
                text_input,
                voice_dropdown,
                language_dropdown,
                speed_slider,
                pitch_slider,
                sample_rate_dropdown
            ],
            outputs=[audio_output, status_output],
            show_progress="full"
        )

        # Enable download button when audio is generated
        audio_output.change(
            lambda x: (gr.DownloadButton(visible=True) if x is not None else gr.DownloadButton(visible=False)),
            inputs=audio_output,
            outputs=download_btn
        )

        # Clear button functionality
        clear_btn = gr.Button("🗑️ Clear", size="sm", variant="stop")
        clear_btn.click(
            on_clear,
            outputs=[
                text_input,
                language_dropdown,
                voice_dropdown,
                speed_slider,
                pitch_slider,
                sample_rate_dropdown,
                audio_output,
                status_output
            ]
        )

    return demo
def main():
    """Main entry point: build the app and start the Gradio server.

    ``title``, ``description`` and ``article`` are not ``Blocks.launch()``
    parameters (the title is already set on ``gr.Blocks`` in ``create_app``),
    so passing them raised ``TypeError`` at startup; they are no longer sent.
    ``theme``, ``css`` and ``footer_links`` are accepted by ``launch()`` only
    in some Gradio versions, so each option is forwarded only when the
    installed ``launch()`` signature declares it.
    """
    import inspect

    demo = create_app()

    launch_options = {
        "theme": custom_theme,
        "css": CUSTOM_CSS,
        "footer_links": [
            {"label": "Kyutai Labs", "url": "https://kyutai.org"},
            {"label": "GitHub", "url": "https://github.com/kyutai-labs/pocket-tts"},
            {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
        ],
        "show_error": True,
        "quiet": False,
    }

    # Drop any option the installed Gradio does not know, instead of crashing.
    accepted = inspect.signature(demo.launch).parameters
    supported_options = {
        name: value for name, value in launch_options.items() if name in accepted
    }

    # Launch the application
    demo.launch(**supported_options)


if __name__ == "__main__":
    main()