pocket-tts

Runtime error

App Files Files Community

pocket-tts / app.py

D3vShoaib

added HF login for model download needed for voice-cloning-model

fa6c114 3 months ago

raw

history blame contribute delete

10.4 kB

	import gradio as gr
	import numpy as np
	import os
	from huggingface_hub import login
	from pocket_tts import TTSModel

	# Gated models need an authenticated Hub session inside a Space, so log in
	# whenever an HF_TOKEN value is present in the environment.
	if hf_token := os.environ.get("HF_TOKEN"):
	    print("HF_TOKEN found, logging in...")
	    login(token=hf_token)

	# Build the model a single time at import; generate_speech reuses this
	# module-level instance for every request.
	print("Loading PocketTTS model...")
	model = TTSModel.load_model()
	print("Model loaded.")

	# Voice names offered in the "Kyutai Voices" dropdown.
	VOICES = [
	    "alba",
	    "marius",
	    "javert",
	    "jean",
	    "fantine",
	    "cosette",
	    "eponine",
	    "azelma",
	]

	import traceback

	def generate_speech(text, voice_mode, voice_dropdown, voice_upload):
	    """Synthesize ``text`` with the chosen voice and return audio for Gradio.

	    Args:
	        text: Text to speak. ``None``, empty, or whitespace-only input
	            produces no audio.
	        voice_mode: ``"Kyutai Voices"`` selects ``voice_dropdown``; any other
	            value selects ``voice_upload`` (voice cloning).
	        voice_dropdown: Name of the built-in voice picked in the dropdown.
	        voice_upload: Uploaded reference audio for cloning (the UI's Audio
	            component is configured with ``type="filepath"``).

	    Returns:
	        ``None`` when there is nothing to speak, otherwise a
	        ``(model.sample_rate, samples)`` tuple where ``samples`` is an
	        ``int16`` NumPy array.

	    Raises:
	        gr.Error: If cloning is selected without an upload, if the model
	            fails, or on any other unexpected error. The original exception
	            is chained as ``__cause__``.
	    """
	    # Nothing to synthesize: skip the model entirely for blank input.
	    if not text or not text.strip():
	        return None

	    try:
	        if voice_mode == "Kyutai Voices":
	            voice_path = voice_dropdown
	        elif not voice_upload:
	            raise gr.Error("Please upload an audio file for voice cloning.")
	        else:
	            voice_path = voice_upload

	        print(f"Generating with voice: {voice_path}")
	        try:
	            voice_state = model.get_state_for_audio_prompt(voice_path)
	            audio = model.generate_audio(voice_state, text)
	        except Exception as e:
	            # Log the full traceback server-side; show the user a short message.
	            full_error = traceback.format_exc()
	            print(f"Error in model processing: {full_error}")
	            raise gr.Error(f"Model error: {str(e)}") from e

	        # Convert to 16-bit PCM to avoid Gradio warnings.
	        # Assumes float samples nominally in [-1, 1] -- TODO confirm. Clamp
	        # first so any overshoot saturates instead of overflowing the int16
	        # cast.
	        audio_np = audio.cpu().numpy()
	        audio_int16 = (np.clip(audio_np, -1.0, 1.0) * 32767).astype(np.int16)

	        return (model.sample_rate, audio_int16)
	    except gr.Error:
	        # Already user-facing; pass through untouched.
	        raise
	    except Exception as e:
	        full_error = traceback.format_exc()
	        print(f"Unexpected error: {full_error}")
	        raise gr.Error(f"An unexpected error occurred: {str(e)}") from e


	def _load_theme():
	    """Return the hub theme, or the built-in Soft theme if loading it fails."""
	    try:
	        return gr.Theme.from_hub("JohnSmith9982/small_and_pretty")
	    except Exception as e:
	        # Any failure while fetching the theme falls back to a bundled one.
	        print(f"Warning: Could not load custom theme: {e}. Using default Soft theme.")
	        return gr.themes.Soft()


	# Theme handed to demo.launch() at startup.
	theme = _load_theme()

	# Custom stylesheet handed to demo.launch(css=...). It hides the Gradio
	# footer, lets the container use the full width, and styles the header
	# (logos, title, description, link pills), the footer (social icons,
	# disclaimer), and the voice-mode radio addressed by elem_id "voice-mode".
	# Rules inside @media (min-width: 768px) override sizes on wider screens.
	css = """
	footer {visibility: hidden}
	.gradio-container {
	max-width: 100% !important;
	padding: 0 !important;
	}
	@media (min-width: 768px) {
	.gradio-container {
	padding-left: 2% !important;
	padding-right: 2% !important;
	}
	}
	.header-section {
	text-align: left;
	margin-bottom: 1.5rem;
	}
	.main-title {
	color: #10b981;
	font-weight: 800;
	font-size: 1.8rem;
	margin: 5px 0;
	}
	@media (min-width: 768px) {
	.main-title {
	font-size: 2.2rem;
	}
	}
	.logo-container {
	display: flex;
	justify-content: flex-start;
	align-items: center;
	gap: 10px;
	margin-bottom: 10px;
	}
	.logo-img {
	height: 40px;
	border-radius: 8px;
	}
	@media (min-width: 768px) {
	.logo-img {
	height: 50px;
	}
	.logo-container {
	gap: 15px;
	}
	}
	.description {
	max-width: 900px;
	margin: 10px 0;
	font-size: 0.95rem;
	line-height: 1.5;
	color: #4b5563;
	}
	.links-row {
	display: flex;
	flex-wrap: wrap;
	justify-content: flex-start;
	gap: 8px;
	margin: 10px 0;
	font-size: 0.85rem;
	}
	@media (min-width: 768px) {
	.links-row {
	gap: 10px;
	font-size: 0.9rem;
	}
	}
	.links-row a {
	color: #10b981;
	text-decoration: none;
	padding: 3px 10px;
	border: 1px solid #10b981;
	border-radius: 15px;
	transition: all 0.2s;
	white-space: nowrap;
	}
	.links-row a:hover {
	background-color: #10b981;
	color: white;
	}
	.social-handles {
	display: flex;
	justify-content: center;
	gap: 20px;
	margin: 15px 0;
	}
	.social-icon {
	width: 28px;
	height: 28px;
	transition: all 0.3s ease;
	}
	.social-icon:hover {
	transform: scale(1.1) translateY(-3px);
	}
	.disclaimer {
	text-align: center;
	font-size: 0.8rem;
	color: #9ca3af;
	margin-top: 30px;
	padding: 15px;
	border-top: 1px solid #f3f4f6;
	}
	@media (min-width: 768px) {
	.disclaimer {
	margin-top: 40px;
	padding: 20px;
	}
	}
	#voice-mode .wrap {
	display: flex !important;
	flex-direction: row !important;
	width: 100% !important;
	}

	#voice-mode .wrap label {
	flex: 1 !important;
	justify-content: center !important;
	text-align: center !important;
	}
	"""

	# Build the Gradio UI and bind it to `demo`. Components are created in this
	# order: header HTML, text/voice inputs and action buttons, the audio output
	# with a performance note, example inputs, footer HTML, then event wiring.
	with gr.Blocks() as demo:
	# Header: logos and title, a short description, and external links.
	with gr.Column(elem_classes="header-section"):
	with gr.Row():
	with gr.Column(scale=4):
	gr.HTML("""
	<div class="logo-container">
	<img src="https://cdn-avatars.huggingface.co/v1/production/uploads/6355a3c1805be5a8f30fea49/8xGdIOlfkopZfhbMitw_k.jpeg" class="logo-img" alt="Kyutai Logo">
	<img src="https://raw.githubusercontent.com/kyutai-labs/pocket-tts/refs/heads/main/docs/logo.png" class="logo-img" alt="PocketTTS Logo">
	<h1 class='main-title'>PocketTTS</h1>
	</div>
	""")
	gr.HTML("""
	<div class="description">
	<b>Lightweight CPU-based Text-to-Speech.</b>
	Forget GPUs and web APIs. Pocket TTS is a simple pip install away.
	<br>
	<small>Supports Python 3.10+ and PyTorch 2.5+ (CPU versions supported).</small>
	</div>
	""")
	gr.HTML("""
	<div class="links-row">
	<a href="https://kyutai.org/tts" target="_blank">🔊 Demo</a>
	<a href="https://github.com/kyutai-labs/pocket-tts" target="_blank">🐱‍💻 GitHub</a>
	<a href="https://huggingface.co/kyutai/pocket-tts" target="_blank">🤗 Model Card</a>
	<a href="https://huggingface.co/spaces/D3vShoaib/pocket-tts" target="_blank">🤗 Space</a>
	<a href="https://arxiv.org/abs/2509.06926" target="_blank">📄 Paper</a>
	<a href="https://github.com/kyutai-labs/pocket-tts/tree/main/docs" target="_blank">📚 Docs</a>
	</div>
	""")

	with gr.Row():
	with gr.Column(scale=1):
	# Text to synthesize; first input of generate_speech.
	text_input = gr.Textbox(
	label="Text to Speak",
	placeholder="Enter text here...",
	lines=8,
	elem_id="text-input"
	)
	# Switch between built-in voices and voice cloning; its change event
	# drives update_voice_ui, and the "voice-mode" id is targeted by the CSS.
	voice_mode = gr.Radio(
	choices=["Kyutai Voices", "Voice Cloning"],
	value="Kyutai Voices",
	label="Voice Mode",
	elem_id="voice-mode"
	)

	# Built-in voice picker; visible initially to match the default mode.
	with gr.Column(visible=True) as standard_voice_col:
	voice_select = gr.Dropdown(
	choices=VOICES,
	value="alba",
	label="Select from Kyutai Voices",
	elem_id="voice-select"
	)

	# Reference-audio upload for cloning; hidden until the mode is switched.
	with gr.Column(visible=False) as cloning_voice_col:
	voice_upload = gr.Audio(
	label="Upload Voice for Cloning (WAV/MP3)",
	type="filepath",
	elem_id="voice-upload"
	)
	with gr.Row():
	clear_btn = gr.Button("🗑️ Clear", variant="secondary")
	generate_btn = gr.Button("⚡ Generate", variant="primary")

	with gr.Column(scale=1):
	# Receives the value returned by generate_speech; autoplay is enabled.
	audio_output = gr.Audio(
	label="Audio Output",
	autoplay=True,
	elem_id="audio-output"
	)
	gr.Markdown("""
	### 🚀 Performance
	- Latency: ~200ms first chunk (local install)
	- Speed: 6x real-time
	- Engine: CPU Optimized
	- Note: Demo limited by Gradio hosting
	""")

	# Example rows are ordered [text, voice mode, voice, upload] to line up
	# with `inputs`; no fn/outputs are given here.
	gr.Examples(
	examples=[
	["Hello! This is a test of the pocket-tts system. It's incredibly fast and runs right on your CPU.", "Kyutai Voices", "alba", None],
	["The quick brown fox jumps over the lazy dog.", "Kyutai Voices", "marius", None],
	["Would you like some tea? It's freshly brewed.", "Kyutai Voices", "javert", None]
	],
	inputs=[text_input, voice_mode, voice_select, voice_upload],
	)

	# Footer: the author's social links and a non-affiliation disclaimer.
	gr.HTML("""
	<div class="disclaimer">
	<div class="social-handles">
	<a href="https://github.com/D3vShoaib" target="_blank">
	<img src="https://img.icons8.com/color/48/github--v1.png" class="social-icon" alt="GitHub">
	</a>
	<a href="https://linkedin.com/in/D3vShoaib" target="_blank">
	<img src="https://img.icons8.com/color/48/linkedin.png" class="social-icon" alt="LinkedIn">
	</a>
	<a href="https://twitter.com/D3vShoaib" target="_blank">
	<img src="https://img.icons8.com/color/48/twitterx--v1.png" class="social-icon" alt="Twitter">
	</a>
	<a href="https://instagram.com/d3vshoaib" target="_blank">
	<img src="https://img.icons8.com/color/48/instagram-new--v1.png" class="social-icon" alt="Instagram">
	</a>
	</div>
	<p>Built with ❤️ by <a href="https://github.com/D3vShoaib" style="color: #10b981; text-decoration: none; font-weight: 500;">D3vShoaib</a></p>
	<p>⚠️ I am not associated with Kyutai TTS and this is only for demonstration purposes.</p>
	</div>
	""")

	# Visibility Toggling
	# Returns two updates in the order (standard_voice_col, cloning_voice_col),
	# matching `outputs` of voice_mode.change below, so exactly one is shown.
	def update_voice_ui(mode):
	if mode == "Kyutai Voices":
	return gr.update(visible=True), gr.update(visible=False)
	else:
	return gr.update(visible=False), gr.update(visible=True)

	voice_mode.change(
	fn=update_voice_ui,
	inputs=[voice_mode],
	outputs=[standard_voice_col, cloning_voice_col]
	)

	# Event handlers
	# The Generate button and the textbox submit event both run generate_speech
	# with the same four inputs and send the result to the audio output.
	generate_btn.click(
	fn=generate_speech,
	inputs=[text_input, voice_mode, voice_select, voice_upload],
	outputs=audio_output
	)

	text_input.submit(
	fn=generate_speech,
	inputs=[text_input, voice_mode, voice_select, voice_upload],
	outputs=audio_output
	)

	# Clear: the tuple order matches `outputs`, restoring the initial text,
	# mode and voice and emptying both audio components.
	clear_btn.click(
	fn=lambda: ("", "Kyutai Voices", "alba", None, None),
	outputs=[text_input, voice_mode, voice_select, voice_upload, audio_output]
	)

	if __name__ == "__main__":
	    # Script entry point: enable the request queue, then start the server
	    # with the resolved theme and the custom stylesheet.
	    queued_demo = demo.queue()
	    queued_demo.launch(theme=theme, css=css)