"""Gradio front end for text-to-speech synthesis with Pocket-TTS."""

import html
import os
from datetime import datetime

import gradio as gr
import torch

# Try to import the pocket-tts model, with graceful fallback.
# NOTE(review): confirm that the installed pocket-tts release really exports
# these four names; this module only assumes that it does.
try:
    from pocket_tts import TextToSpeech, Voice, VoiceProfile, Speaker

    MODEL_AVAILABLE = True
except ImportError:
    MODEL_AVAILABLE = False
    print("pocket-tts not installed. Run: pip install pocket-tts")

# Voice configuration: voice id -> human-readable label shown in the UI.
VOICE_OPTIONS = {
    "en_US_male_1": "American Male (Deep)",
    "en_US_female_1": "American Female (Clear)",
    "en_US_female_2": "American Female (Warm)",
    "en_UK_male_1": "British Male (Formal)",
    "en_UK_female_1": "British Female (Elegant)",
    "en_AU_male_1": "Australian Male (Casual)",
    "en_AU_female_1": "Australian Female (Friendly)",
}

# Language options as (label, value) pairs for the language dropdown.
LANGUAGE_OPTIONS = [
    ("English (US)", "en_US"),
    ("English (UK)", "en_UK"),
    ("English (Australia)", "en_AU"),
]

# Speed options (0.5x to 2.0x)
SPEED_OPTIONS = [0.5, 0.75, 1.0, 1.25, 1.5, 2.0]

# Pitch options (-12 to +12 semitones)
PITCH_OPTIONS = [-12, -6, 0, 6, 12]

# Output sample rates (Hz) offered in the advanced settings.
SAMPLE_RATE_OPTIONS = [16000, 22050, 24000, 44100, 48000]

# Long-form description rendered at the bottom of the page.
APP_ARTICLE = (
    "## About Pocket-TTS\n\n"
    "Pocket-TTS is a lightweight, efficient text-to-speech model developed by "
    "Kyutai Labs. It offers natural-sounding voice synthesis optimized for "
    "CPU inference."
)

# Static page fragments. The class names refer to rules in CUSTOM_CSS below.
HEADER_HTML = """
<div>
  <div class="header-title">🎙️ Pocket-TTS</div>
  <div class="header-subtitle">High-Quality Text-to-Speech Synthesis with Natural Voices</div>
</div>
"""

MODEL_INFO_MD = """
**Pocket-TTS** by Kyutai Labs

- Lightweight text-to-speech model
- Optimized for CPU inference
- Multiple voice options
- Real-time synthesis support

**Requirements:**

- Python 3.8+
- PyTorch
- 2GB+ RAM
"""

# NOTE(review): only the heading of the tips box is available; the individual
# tips still need to be written.
TIPS_HTML = """
<div class="tips-box">
  <strong>Tips for best results:</strong>
</div>
"""

FOOTER_HTML = """
<div class="built-with">
  <div>🚀 Powered by <a href="https://github.com/kyutai-labs/pocket-tts" target="_blank">Pocket-TTS</a> from <a href="https://kyutai.org" target="_blank">Kyutai Labs</a></div>
  <div>🔧 <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">Built with anycoder</a> — Deploy ML models in minutes</div>
</div>
"""


def load_model(device="cpu"):
    """Load the Pocket-TTS model.

    Args:
        device: Device to load the model on ('cpu' or 'cuda').

    Returns:
        A ``TextToSpeech`` instance, or ``None`` when pocket-tts is not
        installed or constructing the model raises.
    """
    if not MODEL_AVAILABLE:
        return None
    try:
        return TextToSpeech(device=device)
    except Exception as e:
        # Any load failure degrades to "no model"; the UI reports it later.
        print(f"Error loading model: {e}")
        return None


def generate_speech(
    text: str,
    voice: str,
    language: str,
    speed: float,
    pitch: int,
    sample_rate: int,
    model_instance,
) -> tuple:
    """Generate speech from text using Pocket-TTS.

    Args:
        text: Input text to synthesize.
        voice: Voice identifier (a key of ``VOICE_OPTIONS``).
        language: Language code. Accepted for interface compatibility; it is
            not forwarded to the model.
        speed: Speech speed multiplier (0.5 - 2.0).
        pitch: Pitch adjustment in semitones (-12 to +12).
        sample_rate: Output audio sample rate in Hz.
        model_instance: Loaded TTS model, or ``None`` if loading failed.

    Returns:
        ``((sample_rate, audio), None)`` on success, or ``(None, message)``
        when validation or synthesis fails.
    """
    # Strip once and reuse for both validation and synthesis.
    cleaned = text.strip() if text else ""
    if not cleaned:
        return None, "Please enter some text to synthesize."
    if len(cleaned) < 2:
        return None, "Text is too short. Please enter at least 2 characters."

    if not MODEL_AVAILABLE:
        return None, "Model not available. Please install pocket-tts: pip install pocket-tts"
    if model_instance is None:
        return None, "Model failed to load. Please check your installation."

    try:
        # The same id selects both the voice profile and the speaker.
        voice_config = Voice(
            profile=VoiceProfile.from_id(voice),
            speaker=Speaker.from_id(voice),
        )
        audio = model_instance.tts(
            text=cleaned,
            voice=voice_config,
            speed=speed,
            pitch_shift=pitch,
            sample_rate=sample_rate,
        )
    except Exception as e:
        # Report every synthesis failure to the user instead of crashing the UI.
        return None, f"Error generating speech: {str(e)}"
    return (sample_rate, audio), None


def clear_all():
    """Return the default value for every input, in the order the UI expects.

    Returns:
        ``(text, language, voice, speed, pitch, sample_rate, audio)``.
    """
    return "", "en_US", next(iter(VOICE_OPTIONS)), 1.0, 0, 24000, None


def get_voice_list(language: str):
    """Get the voices available for the selected language.

    Args:
        language: Language code such as ``"en_UK"``.

    Returns:
        ``(voice_ids, voice_labels)`` as two parallel lists. When no voice id
        starts with ``"<language>_"`` every voice is returned.
    """
    # Match on the full language code: every id starts with "en", so matching
    # on the part before the first underscore would never narrow the list.
    prefix = f"{language}_"
    lang_voices = {k: v for k, v in VOICE_OPTIONS.items() if k.startswith(prefix)}

    # If no language-specific voices exist, fall back to all voices.
    if not lang_voices:
        lang_voices = VOICE_OPTIONS

    return list(lang_voices.keys()), list(lang_voices.values())


# Custom CSS for the app
CUSTOM_CSS = """
:root {
    --primary-color: #6366f1;
    --secondary-color: #8b5cf6;
}
.gradio-container {
    max-width: 1200px !important;
}
.header-title {
    text-align: center;
    background: linear-gradient(135deg, #6366f1, #8b5cf6, #a855f7);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    background-clip: text;
    font-size: 2.5rem;
    font-weight: 700;
    margin-bottom: 0.5rem;
}
.header-subtitle {
    text-align: center;
    color: #6b7280;
    font-size: 1.1rem;
    margin-bottom: 1.5rem;
}
.built-with {
    text-align: center;
    margin-top: 1rem;
    padding: 0.75rem;
    background: linear-gradient(135deg, #f0f9ff, #e0f2fe);
    border-radius: 0.5rem;
    border: 1px solid #bae6fd;
}
.built-with a {
    color: #0369a1;
    text-decoration: none;
    font-weight: 500;
}
.built-with a:hover {
    text-decoration: underline;
}
.audio-player {
    border-radius: 0.75rem;
    overflow: hidden;
}
.section-title {
    font-size: 1.1rem;
    font-weight: 600;
    color: #374151;
    margin-bottom: 0.5rem;
    padding-bottom: 0.25rem;
    border-bottom: 2px solid #e5e7eb;
}
.info-box {
    background: #fef3c7;
    border: 1px solid #fcd34d;
    border-radius: 0.5rem;
    padding: 0.75rem;
    margin: 0.5rem 0;
    font-size: 0.9rem;
    color: #92400e;
}
.success-box {
    background: #d1fae5;
    border: 1px solid #6ee7b7;
    border-radius: 0.5rem;
    padding: 0.75rem;
    margin: 0.5rem 0;
    font-size: 0.9rem;
    color: #065f46;
}
.tips-box {
    background: #f3f4f6;
    border: 1px solid #d1d5db;
    border-radius: 0.5rem;
    padding: 0.75rem;
    margin: 0.5rem 0;
    font-size: 0.85rem;
    color: #4b5563;
}
.tips-box ul {
    margin: 0.5rem 0 0 0;
    padding-left: 1.25rem;
}
.tips-box li {
    margin: 0.25rem 0;
}
"""

# Create custom theme
custom_theme = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="violet",
    neutral_hue="slate",
    text_size="lg",
    spacing_size="lg",
    radius_size="md",
    font=gr.themes.GoogleFont("Inter"),
).set(
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
    button_secondary_background_fill="*secondary_200",
    button_secondary_background_fill_hover="*secondary_300",
    block_title_text_weight="600",
    block_title_text_color="*primary_700",
    input_background_fill="*neutral_100",
)


def create_app():
    """Create the Gradio application.

    Loads the model once on the CPU, builds the layout and wires the event
    handlers. Theme and CSS are applied by ``main`` when launching.

    Returns:
        The assembled ``gr.Blocks`` instance (not yet launched).
    """
    # Load model on startup; None when pocket-tts is missing or loading fails.
    model = load_model("cpu")

    with gr.Blocks(
        title="Pocket-TTS - Text to Speech Converter",
        fill_height=True,
    ) as demo:
        # Header with branding
        gr.HTML(HEADER_HTML)

        # Main content
        with gr.Row(equal_height=True):
            with gr.Column(scale=2):
                # Text input section
                gr.HTML('<div class="section-title">📝 Text Input</div>')
                text_input = gr.Textbox(
                    label="Text to Synthesize",
                    placeholder="Enter your text here... e.g., Hello! This is a text-to-speech demonstration.",
                    lines=5,
                    max_lines=10,
                    info="Enter the text you want to convert to speech",
                    interactive=True,
                    elem_id="text-input",
                )

                # Quick text buttons
                with gr.Row():
                    gr.Button("👋 Hello World", size="sm").click(
                        lambda: "Hello World! Welcome to the Pocket-TTS demo.",
                        outputs=text_input,
                    )
                    gr.Button("📖 Sample Text", size="sm").click(
                        lambda: "The quick brown fox jumps over the lazy dog. This is a sample sentence to test the text-to-speech system.",
                        outputs=text_input,
                    )
                    gr.Button("🧪 Long Text", size="sm").click(
                        lambda: "Artificial intelligence has revolutionized the way we interact with technology. From virtual assistants to autonomous vehicles, AI is everywhere. Text-to-speech systems have improved dramatically, offering more natural and expressive voices than ever before.",
                        outputs=text_input,
                    )

                # Voice settings section
                gr.HTML('<div class="section-title">🎵 Voice Settings</div>')
                with gr.Row():
                    with gr.Column(scale=1):
                        language_dropdown = gr.Dropdown(
                            choices=LANGUAGE_OPTIONS,
                            value="en_US",
                            label="Language",
                            info="Select the language of your text",
                            elem_id="language",
                        )
                    with gr.Column(scale=2):
                        # Show the friendly label while the handler still
                        # receives the voice id.
                        voice_dropdown = gr.Dropdown(
                            choices=[(label, key) for key, label in VOICE_OPTIONS.items()],
                            value=next(iter(VOICE_OPTIONS)),
                            label="Voice",
                            info="Choose a voice for synthesis",
                            elem_id="voice",
                        )

                # Advanced settings accordion
                with gr.Accordion("⚙️ Advanced Settings", open=False):
                    with gr.Row():
                        with gr.Column():
                            speed_slider = gr.Slider(
                                minimum=0.5,
                                maximum=2.0,
                                value=1.0,
                                step=0.25,
                                label="Speed",
                                info="Speech speed (0.5x - 2.0x)",
                                elem_id="speed",
                            )
                        with gr.Column():
                            pitch_slider = gr.Slider(
                                minimum=-12,
                                maximum=12,
                                value=0,
                                step=1,
                                label="Pitch",
                                info="Pitch shift (-12 to +12 semitones)",
                                elem_id="pitch",
                            )
                    with gr.Row():
                        with gr.Column():
                            # Default dropdown type ("value"): the handler must
                            # receive the rate in Hz, not the option's index.
                            sample_rate_dropdown = gr.Dropdown(
                                choices=[(str(sr), sr) for sr in SAMPLE_RATE_OPTIONS],
                                value=24000,
                                label="Sample Rate",
                                info="Output audio sample rate (Hz)",
                                elem_id="sample-rate",
                            )

                # Generate button
                generate_btn = gr.Button(
                    "🎙️ Generate Speech",
                    variant="primary",
                    size="lg",
                    elem_id="generate-btn",
                )

            with gr.Column(scale=1):
                # Output section
                gr.HTML('<div class="section-title">🔊 Audio Output</div>')
                audio_output = gr.Audio(
                    label="Generated Audio",
                    type="numpy",
                    interactive=False,
                    elem_id="audio-output",
                )

                # Download button (appears when audio is generated).
                # NOTE(review): nothing ever assigns a file to this button, so
                # it is shown without a download target; confirm the intended
                # behavior.
                download_btn = gr.DownloadButton(
                    "📥 Download Audio",
                    variant="secondary",
                    size="sm",
                    visible=False,
                    elem_id="download-btn",
                )

                # Status message
                status_output = gr.Markdown(
                    value="",
                    visible=False,
                    elem_id="status",
                )

                # Model info
                with gr.Accordion("ℹ️ Model Information", open=False):
                    gr.Markdown(MODEL_INFO_MD)

                # Tips section
                with gr.Accordion("💡 Tips", open=False):
                    gr.HTML(TIPS_HTML)

        # Long-form description of the project.
        gr.Markdown(APP_ARTICLE)

        # Footer with "Built with anycoder"
        gr.HTML(FOOTER_HTML)

        # Event handlers
        def update_voices(language):
            """Select the first voice that belongs to the chosen language."""
            voices, _labels = get_voice_list(language)
            return gr.Dropdown(value=voices[0] if voices else None)

        def update_status(message, success=False):
            """Build the status panel update for ``message``.

            An empty message hides the panel. The text is HTML-escaped
            because it may contain an exception message.
            """
            if not message:
                return gr.Markdown(value="", visible=False)
            safe = html.escape(str(message))
            if success:
                body = f'<div class="success-box">✅ {safe}</div>'
            else:
                body = f'<div class="info-box">ℹ️ {safe}</div>'
            return gr.Markdown(value=body, visible=True)

        def on_generate(text, voice, language, speed, pitch, sample_rate):
            """Run synthesis and show the outcome in the status panel."""
            # The model is captured by closure so the handler always uses the
            # single instance loaded above.
            audio, error = generate_speech(
                text, voice, language, speed, pitch, sample_rate, model
            )
            if error is not None:
                return None, update_status(error)
            return audio, update_status("Speech generated successfully.", success=True)

        def on_clear():
            """Reset every input and hide any stale status message."""
            return (*clear_all(), update_status(""))

        # Connect events
        language_dropdown.change(
            update_voices,
            inputs=language_dropdown,
            outputs=voice_dropdown,
        )

        generate_btn.click(
            on_generate,
            inputs=[
                text_input,
                voice_dropdown,
                language_dropdown,
                speed_slider,
                pitch_slider,
                sample_rate_dropdown,
            ],
            outputs=[audio_output, status_output],
            show_progress="full",
        )

        # Show the download button only while audio is present.
        audio_output.change(
            lambda x: gr.DownloadButton(visible=x is not None),
            inputs=audio_output,
            outputs=download_btn,
        )

        # Clear button functionality
        clear_btn = gr.Button("🗑️ Clear", size="sm", variant="stop")
        clear_btn.click(
            on_clear,
            outputs=[
                text_input,
                language_dropdown,
                voice_dropdown,
                speed_slider,
                pitch_slider,
                sample_rate_dropdown,
                audio_output,
                status_output,
            ],
        )

    return demo


def main():
    """Main entry point: build the app and launch the server."""
    demo = create_app()

    # Only launch()-level options are passed here. The page title is set on
    # the Blocks, and the description/article text is rendered inside the app,
    # because launch() has no title, description or article parameters.
    demo.launch(
        theme=custom_theme,
        css=CUSTOM_CSS,
        footer_links=[
            {"label": "Kyutai Labs", "url": "https://kyutai.org"},
            {"label": "GitHub", "url": "https://github.com/kyutai-labs/pocket-tts"},
            {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
        ],
        show_error=True,
        quiet=False,
    )


if __name__ == "__main__":
    main()