pocket-tts / app.py
D3vShoaib's picture
added HF login for model download needed for voice-cloning-model
fa6c114
import gradio as gr
import numpy as np
import os
from huggingface_hub import login
from pocket_tts import TTSModel
# Log in to the Hugging Face Hub when an HF_TOKEN environment variable is set
# (used for gated model downloads when running in a Space).
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    print("HF_TOKEN found, logging in...")
    login(token=hf_token)

# The model is loaded a single time at import so every request reuses it.
print("Loading PocketTTS model...")
model = TTSModel.load_model()
print("Model loaded.")

# Voice names offered in the "Kyutai Voices" dropdown.
VOICES = [
    "alba",
    "marius",
    "javert",
    "jean",
    "fantine",
    "cosette",
    "eponine",
    "azelma",
]
import traceback
def _to_int16_pcm(samples):
    """Convert float audio samples to 16-bit PCM.

    Samples are clipped to [-1.0, 1.0] before scaling. Without the clip, any
    overshoot would exceed the int16 range and the cast would yield garbage
    (typically wrapped-around) sample values instead of saturating.
    """
    clipped = np.clip(samples, -1.0, 1.0)
    return (clipped * 32767).astype(np.int16)


def generate_speech(text, voice_mode, voice_dropdown, voice_upload):
    """Synthesize ``text`` with the selected voice for a Gradio audio output.

    Args:
        text: Text to speak. Empty, ``None`` or whitespace-only text yields
            no audio.
        voice_mode: ``"Kyutai Voices"`` uses ``voice_dropdown``; any other
            value uses ``voice_upload``.
        voice_dropdown: Voice chosen in the dropdown.
        voice_upload: Uploaded reference audio for voice cloning, or a falsy
            value when nothing was uploaded.

    Returns:
        ``(model.sample_rate, samples)`` where ``samples`` is an ``np.int16``
        array, or ``None`` when there is nothing to speak.

    Raises:
        gr.Error: If cloning is requested without an upload, if the model
            fails, or if converting the generated audio fails.
    """
    # Nothing to synthesize: leave the output component empty.
    if not text or not str(text).strip():
        return None

    if voice_mode == "Kyutai Voices":
        voice_path = voice_dropdown
    else:
        if not voice_upload:
            raise gr.Error("Please upload an audio file for voice cloning.")
        voice_path = voice_upload

    print(f"Generating with voice: {voice_path}")
    try:
        voice_state = model.get_state_for_audio_prompt(voice_path)
        audio = model.generate_audio(voice_state, text)
    except Exception as e:
        # Full traceback goes to the server log; the user sees a short message.
        print(f"Error in model processing: {traceback.format_exc()}")
        raise gr.Error(f"Model error: {str(e)}") from e

    try:
        # Convert to 16-bit PCM to avoid Gradio warnings
        audio_np = audio.cpu().numpy()
        return (model.sample_rate, _to_int16_pcm(audio_np))
    except Exception as e:
        print(f"Unexpected error: {traceback.format_exc()}")
        raise gr.Error(f"An unexpected error occurred: {str(e)}") from e
def _load_theme():
    """Return the Hub theme, or the built-in Soft theme if loading it fails."""
    try:
        return gr.Theme.from_hub("JohnSmith9982/small_and_pretty")
    except Exception as exc:
        # Best-effort: a theme failure should not stop the app from starting.
        print(f"Warning: Could not load custom theme: {exc}. Using default Soft theme.")
        return gr.themes.Soft()


# Load custom theme with fallback
theme = _load_theme()
# Custom stylesheet handed to Gradio through `css=css` at launch. It:
#   - hides the Gradio footer and lets the container span the full width,
#   - styles the header pieces (.header-section, .main-title, .logo-*,
#     .description, .links-row) with larger sizes from 768px upwards,
#   - styles the footer pieces (.social-handles, .social-icon, .disclaimer),
#   - lays out the #voice-mode radio options in a row of equal-width labels.
css = """
footer {visibility: hidden}
.gradio-container {
max-width: 100% !important;
padding: 0 !important;
}
@media (min-width: 768px) {
.gradio-container {
padding-left: 2% !important;
padding-right: 2% !important;
}
}
.header-section {
text-align: left;
margin-bottom: 1.5rem;
}
.main-title {
color: #10b981;
font-weight: 800;
font-size: 1.8rem;
margin: 5px 0;
}
@media (min-width: 768px) {
.main-title {
font-size: 2.2rem;
}
}
.logo-container {
display: flex;
justify-content: flex-start;
align-items: center;
gap: 10px;
margin-bottom: 10px;
}
.logo-img {
height: 40px;
border-radius: 8px;
}
@media (min-width: 768px) {
.logo-img {
height: 50px;
}
.logo-container {
gap: 15px;
}
}
.description {
max-width: 900px;
margin: 10px 0;
font-size: 0.95rem;
line-height: 1.5;
color: #4b5563;
}
.links-row {
display: flex;
flex-wrap: wrap;
justify-content: flex-start;
gap: 8px;
margin: 10px 0;
font-size: 0.85rem;
}
@media (min-width: 768px) {
.links-row {
gap: 10px;
font-size: 0.9rem;
}
}
.links-row a {
color: #10b981;
text-decoration: none;
padding: 3px 10px;
border: 1px solid #10b981;
border-radius: 15px;
transition: all 0.2s;
white-space: nowrap;
}
.links-row a:hover {
background-color: #10b981;
color: white;
}
.social-handles {
display: flex;
justify-content: center;
gap: 20px;
margin: 15px 0;
}
.social-icon {
width: 28px;
height: 28px;
transition: all 0.3s ease;
}
.social-icon:hover {
transform: scale(1.1) translateY(-3px);
}
.disclaimer {
text-align: center;
font-size: 0.8rem;
color: #9ca3af;
margin-top: 30px;
padding: 15px;
border-top: 1px solid #f3f4f6;
}
@media (min-width: 768px) {
.disclaimer {
margin-top: 40px;
padding: 20px;
}
}
#voice-mode .wrap {
display: flex !important;
flex-direction: row !important;
width: 100% !important;
}
#voice-mode .wrap label {
flex: 1 !important;
justify-content: center !important;
text-align: center !important;
}
"""
# Build the Gradio UI: header, input/output columns, examples, footer, and
# the event wiring that connects the controls to generate_speech.
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation was lost — confirm it matches the intended layout.
with gr.Blocks() as demo:
    # Header: logos and title, a short description, and external links.
    with gr.Column(elem_classes="header-section"):
        with gr.Row():
            with gr.Column(scale=4):
                gr.HTML("""
<div class="logo-container">
<img src="https://cdn-avatars.huggingface.co/v1/production/uploads/6355a3c1805be5a8f30fea49/8xGdIOlfkopZfhbMitw_k.jpeg" class="logo-img" alt="Kyutai Logo">
<img src="https://raw.githubusercontent.com/kyutai-labs/pocket-tts/refs/heads/main/docs/logo.png" class="logo-img" alt="PocketTTS Logo">
<h1 class='main-title'>PocketTTS</h1>
</div>
""")
                gr.HTML("""
<div class="description">
<b>Lightweight CPU-based Text-to-Speech.</b>
Forget GPUs and web APIs. Pocket TTS is a simple pip install away.
<br>
<small>Supports Python 3.10+ and PyTorch 2.5+ (CPU versions supported).</small>
</div>
""")
                gr.HTML("""
<div class="links-row">
<a href="https://kyutai.org/tts" target="_blank">🔊 Demo</a>
<a href="https://github.com/kyutai-labs/pocket-tts" target="_blank">🐱‍💻 GitHub</a>
<a href="https://huggingface.co/kyutai/pocket-tts" target="_blank">🤗 Model Card</a>
<a href="https://huggingface.co/spaces/D3vShoaib/pocket-tts" target="_blank">🤗 Space</a>
<a href="https://arxiv.org/abs/2509.06926" target="_blank">📄 Paper</a>
<a href="https://github.com/kyutai-labs/pocket-tts/tree/main/docs" target="_blank">📚 Docs</a>
</div>
""")
    with gr.Row():
        # Left column: text, voice selection, and action buttons.
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter text here...",
                lines=8,
                elem_id="text-input"
            )
            voice_mode = gr.Radio(
                choices=["Kyutai Voices", "Voice Cloning"],
                value="Kyutai Voices",
                label="Voice Mode",
                elem_id="voice-mode"
            )
            # Only one of the two columns below is visible at a time; which
            # one is decided by update_voice_ui when voice_mode changes.
            with gr.Column(visible=True) as standard_voice_col:
                voice_select = gr.Dropdown(
                    choices=VOICES,
                    value="alba",
                    label="Select from Kyutai Voices",
                    elem_id="voice-select"
                )
            with gr.Column(visible=False) as cloning_voice_col:
                voice_upload = gr.Audio(
                    label="Upload Voice for Cloning (WAV/MP3)",
                    type="filepath",
                    elem_id="voice-upload"
                )
            with gr.Row():
                clear_btn = gr.Button("🗑️ Clear", variant="secondary")
                generate_btn = gr.Button("⚡ Generate", variant="primary")
        # Right column: generated audio and a short performance summary.
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Audio Output",
                autoplay=True,
                elem_id="audio-output"
            )
            gr.Markdown("""
### 🚀 Performance
- **Latency**: ~200ms first chunk (local install)
- **Speed**: 6x real-time
- **Engine**: CPU Optimized
- **Note**: Demo limited by Gradio hosting
""")
    # Each example row supplies values for the four inputs, in the order
    # listed in `inputs`.
    gr.Examples(
        examples=[
            ["Hello! This is a test of the pocket-tts system. It's incredibly fast and runs right on your CPU.", "Kyutai Voices", "alba", None],
            ["The quick brown fox jumps over the lazy dog.", "Kyutai Voices", "marius", None],
            ["Would you like some tea? It's freshly brewed.", "Kyutai Voices", "javert", None]
        ],
        inputs=[text_input, voice_mode, voice_select, voice_upload],
    )
    # Footer: social links, author credit, and disclaimer.
    gr.HTML("""
<div class="disclaimer">
<div class="social-handles">
<a href="https://github.com/D3vShoaib" target="_blank">
<img src="https://img.icons8.com/color/48/github--v1.png" class="social-icon" alt="GitHub">
</a>
<a href="https://linkedin.com/in/D3vShoaib" target="_blank">
<img src="https://img.icons8.com/color/48/linkedin.png" class="social-icon" alt="LinkedIn">
</a>
<a href="https://twitter.com/D3vShoaib" target="_blank">
<img src="https://img.icons8.com/color/48/twitterx--v1.png" class="social-icon" alt="Twitter">
</a>
<a href="https://instagram.com/d3vshoaib" target="_blank">
<img src="https://img.icons8.com/color/48/instagram-new--v1.png" class="social-icon" alt="Instagram">
</a>
</div>
<p>Built with ❤️ by <a href="https://github.com/D3vShoaib" style="color: #10b981; text-decoration: none; font-weight: 500;">D3vShoaib</a></p>
<p>⚠️ I am not associated with Kyutai TTS and this is only for demonstration purposes.</p>
</div>
""")
    # Visibility Toggling
    def update_voice_ui(mode):
        """Return visibility updates for (standard_voice_col, cloning_voice_col).

        The dropdown column is shown for "Kyutai Voices"; for any other mode
        the upload column is shown instead.
        """
        if mode == "Kyutai Voices":
            return gr.update(visible=True), gr.update(visible=False)
        else:
            return gr.update(visible=False), gr.update(visible=True)
    voice_mode.change(
        fn=update_voice_ui,
        inputs=[voice_mode],
        outputs=[standard_voice_col, cloning_voice_col]
    )
    # Event handlers
    # The Generate button and submitting the textbox run the same handler
    # with the same inputs.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_mode, voice_select, voice_upload],
        outputs=audio_output
    )
    text_input.submit(
        fn=generate_speech,
        inputs=[text_input, voice_mode, voice_select, voice_upload],
        outputs=audio_output
    )
    # Clear resets every input to its initial value and empties the output.
    clear_btn.click(
        fn=lambda: ("", "Kyutai Voices", "alba", None, None),
        outputs=[text_input, voice_mode, voice_select, voice_upload, audio_output]
    )
if __name__ == "__main__":
    # Turn on the request queue, then start the server with the theme and
    # stylesheet defined above.
    # NOTE(review): passing theme/css to launch() matches the Gradio 6 API;
    # older releases take them on gr.Blocks() — confirm the pinned version.
    queued_demo = demo.queue()
    queued_demo.launch(theme=theme, css=css)