Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import numpy as np | |
| import os | |
| from huggingface_hub import login | |
| from pocket_tts import TTSModel | |
# Log in to the Hugging Face Hub only when the Space exposes a token
# (the token is intended for gated models).
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    print("HF_TOKEN found, logging in...")
    login(token=hf_token)

# The model is loaded a single time, at import, and then shared by every
# request handled by this process.
print("Loading PocketTTS model...")
model = TTSModel.load_model()
print("Model loaded.")

# Names of the built-in voices offered to the user.
VOICES = [
    "alba",
    "marius",
    "javert",
    "jean",
    "fantine",
    "cosette",
    "eponine",
    "azelma",
]
| import traceback | |
def generate_speech(text, voice_mode, voice_dropdown, voice_upload):
    """Synthesize speech for ``text`` using a built-in or an uploaded voice.

    Args:
        text: The text to speak. ``None``, an empty string or a string made
            only of whitespace produces no audio.
        voice_mode: ``"Kyutai Voices"`` selects ``voice_dropdown``; any other
            value selects ``voice_upload``.
        voice_dropdown: Name of the built-in voice to use.
        voice_upload: File path of the uploaded reference audio, or a falsy
            value when nothing was uploaded.

    Returns:
        ``None`` when there is nothing to speak, otherwise a
        ``(sample_rate, samples)`` tuple where ``samples`` is a NumPy
        ``int16`` array.

    Raises:
        gr.Error: If voice cloning is requested without an upload, if the
            model fails, or if the generated audio cannot be converted.
    """
    # Nothing to synthesize: skip the model entirely.
    if not text or not str(text).strip():
        return None

    if voice_mode == "Kyutai Voices":
        voice_path = voice_dropdown
    else:
        if not voice_upload:
            raise gr.Error("Please upload an audio file for voice cloning.")
        voice_path = voice_upload
    print(f"Generating with voice: {voice_path}")

    try:
        voice_state = model.get_state_for_audio_prompt(voice_path)
        audio = model.generate_audio(voice_state, text)
    except Exception as e:
        # Full traceback goes to the server log; the user sees a short message.
        print(f"Error in model processing: {traceback.format_exc()}")
        raise gr.Error(f"Model error: {str(e)}") from e

    try:
        # Convert to 16-bit PCM to avoid Gradio warnings. Samples are clipped
        # to [-1, 1] first: casting an out-of-range float to int16 wraps
        # around and produces loud clicks instead of saturating.
        audio_np = audio.cpu().numpy()
        audio_int16 = (np.clip(audio_np, -1.0, 1.0) * 32767).astype(np.int16)
        return (model.sample_rate, audio_int16)
    except Exception as e:
        print(f"Unexpected error: {traceback.format_exc()}")
        raise gr.Error(f"An unexpected error occurred: {str(e)}") from e
# Try the community theme from the Hub first. Any failure while fetching it
# is reported and the built-in Soft theme is used instead.
try:
    theme = gr.Theme.from_hub("JohnSmith9982/small_and_pretty")
except Exception as theme_error:
    print(f"Warning: Could not load custom theme: {theme_error}. Using default Soft theme.")
    theme = gr.themes.Soft()
# Custom stylesheet for the page, kept as one CSS string and handed to
# launch() through its `css` argument at the bottom of the file.
# It hides the footer, lets .gradio-container span the full width, and styles
# the header (.header-section, .main-title, .logo-*), the description, the
# pill-shaped links (.links-row), the social icons, the disclaimer and the
# #voice-mode radio group.
# Rules inside `@media (min-width: 768px)` override sizes/spacing on wider screens.
| css = """ | |
| footer {visibility: hidden} | |
| .gradio-container { | |
| max-width: 100% !important; | |
| padding: 0 !important; | |
| } | |
| @media (min-width: 768px) { | |
| .gradio-container { | |
| padding-left: 2% !important; | |
| padding-right: 2% !important; | |
| } | |
| } | |
| .header-section { | |
| text-align: left; | |
| margin-bottom: 1.5rem; | |
| } | |
| .main-title { | |
| color: #10b981; | |
| font-weight: 800; | |
| font-size: 1.8rem; | |
| margin: 5px 0; | |
| } | |
| @media (min-width: 768px) { | |
| .main-title { | |
| font-size: 2.2rem; | |
| } | |
| } | |
| .logo-container { | |
| display: flex; | |
| justify-content: flex-start; | |
| align-items: center; | |
| gap: 10px; | |
| margin-bottom: 10px; | |
| } | |
| .logo-img { | |
| height: 40px; | |
| border-radius: 8px; | |
| } | |
| @media (min-width: 768px) { | |
| .logo-img { | |
| height: 50px; | |
| } | |
| .logo-container { | |
| gap: 15px; | |
| } | |
| } | |
| .description { | |
| max-width: 900px; | |
| margin: 10px 0; | |
| font-size: 0.95rem; | |
| line-height: 1.5; | |
| color: #4b5563; | |
| } | |
| .links-row { | |
| display: flex; | |
| flex-wrap: wrap; | |
| justify-content: flex-start; | |
| gap: 8px; | |
| margin: 10px 0; | |
| font-size: 0.85rem; | |
| } | |
| @media (min-width: 768px) { | |
| .links-row { | |
| gap: 10px; | |
| font-size: 0.9rem; | |
| } | |
| } | |
| .links-row a { | |
| color: #10b981; | |
| text-decoration: none; | |
| padding: 3px 10px; | |
| border: 1px solid #10b981; | |
| border-radius: 15px; | |
| transition: all 0.2s; | |
| white-space: nowrap; | |
| } | |
| .links-row a:hover { | |
| background-color: #10b981; | |
| color: white; | |
| } | |
| .social-handles { | |
| display: flex; | |
| justify-content: center; | |
| gap: 20px; | |
| margin: 15px 0; | |
| } | |
| .social-icon { | |
| width: 28px; | |
| height: 28px; | |
| transition: all 0.3s ease; | |
| } | |
| .social-icon:hover { | |
| transform: scale(1.1) translateY(-3px); | |
| } | |
| .disclaimer { | |
| text-align: center; | |
| font-size: 0.8rem; | |
| color: #9ca3af; | |
| margin-top: 30px; | |
| padding: 15px; | |
| border-top: 1px solid #f3f4f6; | |
| } | |
| @media (min-width: 768px) { | |
| .disclaimer { | |
| margin-top: 40px; | |
| padding: 20px; | |
| } | |
| } | |
| #voice-mode .wrap { | |
| display: flex !important; | |
| flex-direction: row !important; | |
| width: 100% !important; | |
| } | |
| #voice-mode .wrap label { | |
| flex: 1 !important; | |
| justify-content: center !important; | |
| text-align: center !important; | |
| } | |
| """ | |
# Build the Gradio UI: header, input controls, audio output, examples, footer,
# and the event wiring that connects the controls to generate_speech.
# NOTE(review): indentation was lost in this copy of the file, so the exact
# nesting of the layout containers below cannot be read from here — confirm
# against the original source before restructuring.
| with gr.Blocks() as demo: | |
# Header: logos and title, a short description, and a row of external links.
| with gr.Column(elem_classes="header-section"): | |
| with gr.Row(): | |
| with gr.Column(scale=4): | |
| gr.HTML(""" | |
| <div class="logo-container"> | |
| <img src="https://cdn-avatars.huggingface.co/v1/production/uploads/6355a3c1805be5a8f30fea49/8xGdIOlfkopZfhbMitw_k.jpeg" class="logo-img" alt="Kyutai Logo"> | |
| <img src="https://raw.githubusercontent.com/kyutai-labs/pocket-tts/refs/heads/main/docs/logo.png" class="logo-img" alt="PocketTTS Logo"> | |
| <h1 class='main-title'>PocketTTS</h1> | |
| </div> | |
| """) | |
| gr.HTML(""" | |
| <div class="description"> | |
| <b>Lightweight CPU-based Text-to-Speech.</b> | |
| Forget GPUs and web APIs. Pocket TTS is a simple pip install away. | |
| <br> | |
| <small>Supports Python 3.10+ and PyTorch 2.5+ (CPU versions supported).</small> | |
| </div> | |
| """) | |
| gr.HTML(""" | |
| <div class="links-row"> | |
| <a href="https://kyutai.org/tts" target="_blank">🔊 Demo</a> | |
| <a href="https://github.com/kyutai-labs/pocket-tts" target="_blank">🐱💻 GitHub</a> | |
| <a href="https://huggingface.co/kyutai/pocket-tts" target="_blank">🤗 Model Card</a> | |
| <a href="https://huggingface.co/spaces/D3vShoaib/pocket-tts" target="_blank">🤗 Space</a> | |
| <a href="https://arxiv.org/abs/2509.06926" target="_blank">📄 Paper</a> | |
| <a href="https://github.com/kyutai-labs/pocket-tts/tree/main/docs" target="_blank">📚 Docs</a> | |
| </div> | |
| """) | |
# Main area: text box, voice-mode radio, the two voice sources, action buttons,
# and the audio player that receives the generated speech.
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| text_input = gr.Textbox( | |
| label="Text to Speak", | |
| placeholder="Enter text here...", | |
| lines=8, | |
| elem_id="text-input" | |
| ) | |
| voice_mode = gr.Radio( | |
| choices=["Kyutai Voices", "Voice Cloning"], | |
| value="Kyutai Voices", | |
| label="Voice Mode", | |
| elem_id="voice-mode" | |
| ) | |
# Visible by default; update_voice_ui shows it only for "Kyutai Voices".
| with gr.Column(visible=True) as standard_voice_col: | |
| voice_select = gr.Dropdown( | |
| choices=VOICES, | |
| value="alba", | |
| label="Select from Kyutai Voices", | |
| elem_id="voice-select" | |
| ) | |
# Hidden by default; update_voice_ui shows it for any other voice mode.
# type="filepath" means generate_speech receives the upload as a path.
| with gr.Column(visible=False) as cloning_voice_col: | |
| voice_upload = gr.Audio( | |
| label="Upload Voice for Cloning (WAV/MP3)", | |
| type="filepath", | |
| elem_id="voice-upload" | |
| ) | |
| with gr.Row(): | |
| clear_btn = gr.Button("🗑️ Clear", variant="secondary") | |
| generate_btn = gr.Button("⚡ Generate", variant="primary") | |
| with gr.Column(scale=1): | |
| audio_output = gr.Audio( | |
| label="Audio Output", | |
| autoplay=True, | |
| elem_id="audio-output" | |
| ) | |
| gr.Markdown(""" | |
| ### 🚀 Performance | |
| - **Latency**: ~200ms first chunk (local install) | |
| - **Speed**: 6x real-time | |
| - **Engine**: CPU Optimized | |
| - **Note**: Demo limited by Gradio hosting | |
| """) | |
# Example rows; each supplies text, voice mode, voice name and upload (None),
# in the same order as `inputs`.
| gr.Examples( | |
| examples=[ | |
| ["Hello! This is a test of the pocket-tts system. It's incredibly fast and runs right on your CPU.", "Kyutai Voices", "alba", None], | |
| ["The quick brown fox jumps over the lazy dog.", "Kyutai Voices", "marius", None], | |
| ["Would you like some tea? It's freshly brewed.", "Kyutai Voices", "javert", None] | |
| ], | |
| inputs=[text_input, voice_mode, voice_select, voice_upload], | |
| ) | |
# Footer: social links, author credit and a non-affiliation disclaimer.
| gr.HTML(""" | |
| <div class="disclaimer"> | |
| <div class="social-handles"> | |
| <a href="https://github.com/D3vShoaib" target="_blank"> | |
| <img src="https://img.icons8.com/color/48/github--v1.png" class="social-icon" alt="GitHub"> | |
| </a> | |
| <a href="https://linkedin.com/in/D3vShoaib" target="_blank"> | |
| <img src="https://img.icons8.com/color/48/linkedin.png" class="social-icon" alt="LinkedIn"> | |
| </a> | |
| <a href="https://twitter.com/D3vShoaib" target="_blank"> | |
| <img src="https://img.icons8.com/color/48/twitterx--v1.png" class="social-icon" alt="Twitter"> | |
| </a> | |
| <a href="https://instagram.com/d3vshoaib" target="_blank"> | |
| <img src="https://img.icons8.com/color/48/instagram-new--v1.png" class="social-icon" alt="Instagram"> | |
| </a> | |
| </div> | |
| <p>Built with ❤️ by <a href="https://github.com/D3vShoaib" style="color: #10b981; text-decoration: none; font-weight: 500;">D3vShoaib</a></p> | |
| <p>⚠️ I am not associated with Kyutai TTS and this is only for demonstration purposes.</p> | |
| </div> | |
| """) | |
| # Visibility toggling: show the voice column that matches the selected mode | |
# Returns two updates, ordered to match outputs=[standard_voice_col, cloning_voice_col].
| def update_voice_ui(mode): | |
| if mode == "Kyutai Voices": | |
| return gr.update(visible=True), gr.update(visible=False) | |
| else: | |
| return gr.update(visible=False), gr.update(visible=True) | |
| voice_mode.change( | |
| fn=update_voice_ui, | |
| inputs=[voice_mode], | |
| outputs=[standard_voice_col, cloning_voice_col] | |
| ) | |
| # Event handlers: the Generate button and textbox submit both call generate_speech | |
| generate_btn.click( | |
| fn=generate_speech, | |
| inputs=[text_input, voice_mode, voice_select, voice_upload], | |
| outputs=audio_output | |
| ) | |
| text_input.submit( | |
| fn=generate_speech, | |
| inputs=[text_input, voice_mode, voice_select, voice_upload], | |
| outputs=audio_output | |
| ) | |
# Clear: restore every input to its initial value and empty the audio output.
# The tuple order matches `outputs` below.
| clear_btn.click( | |
| fn=lambda: ("", "Kyutai Voices", "alba", None, None), | |
| outputs=[text_input, voice_mode, voice_select, voice_upload, audio_output] | |
| ) | |
if __name__ == "__main__":
    # Turn on request queuing, then start the server with the chosen theme
    # and stylesheet.
    # NOTE(review): passing theme/css to launch() presumably requires a Gradio
    # release where these options live on launch() rather than gr.Blocks() —
    # confirm against the Gradio version pinned for this Space.
    queued_app = demo.queue()
    queued_app.launch(theme=theme, css=css)