Spaces:

D3vShoaib
/

anycoder-81f107bc

Runtime error

App Files Files Community

anycoder-81f107bc / app.py

D3vShoaib

Upload folder using huggingface_hub

8c16911 verified 26 days ago

raw

history blame contribute delete

17.1 kB

	import gradio as gr
	import torch
	import os
	from datetime import datetime

	# Optional dependency: pull in the pocket-tts classes when the package is
	# installed and record the outcome in MODEL_AVAILABLE, so the rest of the
	# module can degrade gracefully instead of failing at import time.
	try:
	    from pocket_tts import TextToSpeech, Voice, VoiceProfile, Speaker
	except ImportError:
	    MODEL_AVAILABLE = False
	    print("pocket-tts not installed. Run: pip install pocket-tts")
	else:
	    MODEL_AVAILABLE = True

	# Voice configuration: maps a voice identifier (the value handed to
	# generate_speech as `voice`) to a human-readable description of that voice.
	# Identifiers start with the language code used in LANGUAGE_OPTIONS.
	# NOTE(review): these ids are not checked against the pocket-tts package
	# anywhere in this file - confirm they exist in the installed model.
	VOICE_OPTIONS = {
	"en_US_male_1": "American Male (Deep)",
	"en_US_female_1": "American Female (Clear)",
	"en_US_female_2": "American Female (Warm)",
	"en_UK_male_1": "British Male (Formal)",
	"en_UK_female_1": "British Female (Elegant)",
	"en_AU_male_1": "Australian Male (Casual)",
	"en_AU_female_1": "Australian Female (Friendly)",
	}

	# Language options as (display label, language code) pairs; used as the
	# choices of the language dropdown in create_app.
	LANGUAGE_OPTIONS = [
	("English (US)", "en_US"),
	("English (UK)", "en_UK"),
	("English (Australia)", "en_AU"),
	]

	# Speed options (0.5x to 2.0x)
	# NOTE(review): not referenced elsewhere in the visible code; the speed
	# slider in create_app hard-codes its own range and step.
	SPEED_OPTIONS = [0.5, 0.75, 1.0, 1.25, 1.5, 2.0]

	# Pitch options (-12 to +12 semitones)
	# NOTE(review): not referenced elsewhere in the visible code; the pitch
	# slider in create_app hard-codes its own range and step.
	PITCH_OPTIONS = [-12, -6, 0, 6, 12]


	def load_model(device="cpu"):
	    """
	    Instantiate the Pocket-TTS model on the requested device.

	    Args:
	        device: Device string forwarded to the TextToSpeech constructor
	            ('cpu' or 'cuda').

	    Returns:
	        The TextToSpeech instance, or None when pocket-tts could not be
	        imported or the constructor raised (the error is printed).
	    """
	    if MODEL_AVAILABLE:
	        # Any construction failure is reported and turned into None so the
	        # UI can still start without a working model.
	        try:
	            return TextToSpeech(device=device)
	        except Exception as e:
	            print(f"Error loading model: {e}")
	    return None


	def generate_speech(
	    text: str,
	    voice: str,
	    language: str,
	    speed: float,
	    pitch: int,
	    sample_rate: int,
	    model_instance,
	) -> tuple:
	    """
	    Synthesize `text` with Pocket-TTS and report the outcome.

	    Args:
	        text: Input text to synthesize; surrounding whitespace is ignored.
	        voice: Voice identifier, used for both the voice profile and speaker.
	        language: Language code (accepted but not used by this function).
	        speed: Speech speed multiplier, forwarded to the model.
	        pitch: Pitch adjustment, forwarded to the model as `pitch_shift`.
	        sample_rate: Output sample rate, forwarded to the model and echoed
	            back in the result.
	        model_instance: Loaded TTS model, or None if loading failed.

	    Returns:
	        A 2-tuple `(audio, message)`. On success it is
	        `((sample_rate, audio_data), None)`; on any validation or synthesis
	        failure it is `(None, <human-readable message>)`.
	    """
	    cleaned = text.strip() if text else ""

	    # Input validation comes first so it works even without a model.
	    if not cleaned:
	        return None, "Please enter some text to synthesize."
	    if len(cleaned) < 2:
	        return None, "Text is too short. Please enter at least 2 characters."

	    # Model availability checks.
	    if not MODEL_AVAILABLE:
	        return None, "Model not available. Please install pocket-tts: pip install pocket-tts"
	    if model_instance is None:
	        return None, "Model failed to load. Please check your installation."

	    try:
	        selected_voice = Voice(
	            profile=VoiceProfile.from_id(voice),
	            speaker=Speaker.from_id(voice),
	        )
	        audio = model_instance.tts(
	            text=cleaned,
	            voice=selected_voice,
	            speed=speed,
	            pitch_shift=pitch,
	            sample_rate=sample_rate,
	        )
	    except Exception as e:
	        # Surface the failure as a message rather than crashing the handler.
	        return None, f"Error generating speech: {str(e)}"

	    return (sample_rate, audio), None


	def clear_all():
	    """
	    Return the default value for every resettable control.

	    The order is: text, language, voice, speed, pitch, sample rate, audio.
	    """
	    default_voice = list(VOICE_OPTIONS)[0]
	    return ("", "en_US", default_voice, 1.0, 0, 24000, None)


	def get_voice_list(language: str):
	    """
	    Get the voices available for the selected language.

	    Args:
	        language: Language code such as "en_US". A falsy value (None or "")
	            selects every voice.

	    Returns:
	        A pair of parallel lists `(voice_ids, labels)` taken from
	        VOICE_OPTIONS. If no voice matches the language, all voices are
	        returned so the caller always has something to offer.
	    """
	    # Voice ids are "<language code>_<rest>" (e.g. "en_US_male_1"), so match
	    # on the full code plus the separator. Splitting the code on "_" and
	    # keeping only "en" would match every voice for every language.
	    prefix = f"{language}_" if language else None

	    if prefix is None:
	        lang_voices = VOICE_OPTIONS
	    else:
	        lang_voices = {
	            voice_id: label
	            for voice_id, label in VOICE_OPTIONS.items()
	            if voice_id.startswith(prefix)
	        }

	    # If no language-specific voices, return all voices
	    if not lang_voices:
	        lang_voices = VOICE_OPTIONS

	    return list(lang_voices), list(lang_voices.values())


	# Custom CSS for the app, passed to gr.Blocks(css=...) in create_app.
	# It styles the classes used by the inline HTML snippets there: header-title,
	# header-subtitle, section-title, built-with, info-box, success-box, tips-box.
	# NOTE(review): .audio-player and the :root custom properties are not
	# referenced anywhere else in the visible code.
	CUSTOM_CSS = """
	:root {
	--primary-color: #6366f1;
	--secondary-color: #8b5cf6;
	}

	.gradio-container {
	max-width: 1200px !important;
	}

	.header-title {
	text-align: center;
	background: linear-gradient(135deg, #6366f1, #8b5cf6, #a855f7);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	background-clip: text;
	font-size: 2.5rem;
	font-weight: 700;
	margin-bottom: 0.5rem;
	}

	.header-subtitle {
	text-align: center;
	color: #6b7280;
	font-size: 1.1rem;
	margin-bottom: 1.5rem;
	}

	.built-with {
	text-align: center;
	margin-top: 1rem;
	padding: 0.75rem;
	background: linear-gradient(135deg, #f0f9ff, #e0f2fe);
	border-radius: 0.5rem;
	border: 1px solid #bae6fd;
	}

	.built-with a {
	color: #0369a1;
	text-decoration: none;
	font-weight: 500;
	}

	.built-with a:hover {
	text-decoration: underline;
	}

	.audio-player {
	border-radius: 0.75rem;
	overflow: hidden;
	}

	.section-title {
	font-size: 1.1rem;
	font-weight: 600;
	color: #374151;
	margin-bottom: 0.5rem;
	padding-bottom: 0.25rem;
	border-bottom: 2px solid #e5e7eb;
	}

	.info-box {
	background: #fef3c7;
	border: 1px solid #fcd34d;
	border-radius: 0.5rem;
	padding: 0.75rem;
	margin: 0.5rem 0;
	font-size: 0.9rem;
	color: #92400e;
	}

	.success-box {
	background: #d1fae5;
	border: 1px solid #6ee7b7;
	border-radius: 0.5rem;
	padding: 0.75rem;
	margin: 0.5rem 0;
	font-size: 0.9rem;
	color: #065f46;
	}

	.tips-box {
	background: #f3f4f6;
	border: 1px solid #d1d5db;
	border-radius: 0.5rem;
	padding: 0.75rem;
	margin: 0.5rem 0;
	font-size: 0.85rem;
	color: #4b5563;
	}

	.tips-box ul {
	margin: 0.5rem 0 0 0;
	padding-left: 1.25rem;
	}

	.tips-box li {
	margin: 0.25rem 0;
	}
	"""


	# Create custom theme: start from Gradio's Soft theme with indigo / violet /
	# slate hues, large text and spacing, medium radius and the "Inter" Google
	# font, then override individual button, block-title and input colours with
	# .set(). Values starting with "*" are passed through to Gradio as written.
	# The result is handed to gr.Blocks(theme=...) in create_app and to
	# demo.launch(theme=...) in main.
	custom_theme = gr.themes.Soft(
	primary_hue="indigo",
	secondary_hue="violet",
	neutral_hue="slate",
	text_size="lg",
	spacing_size="lg",
	radius_size="md",
	font=gr.themes.GoogleFont("Inter")
	).set(
	button_primary_background_fill="*primary_600",
	button_primary_background_fill_hover="*primary_700",
	button_secondary_background_fill="*secondary_200",
	button_secondary_background_fill_hover="*secondary_300",
	block_title_text_weight="600",
	block_title_text_color="*primary_700",
	input_background_fill="*neutral_100",
	)


	def create_app():
	    """
	    Create the Gradio application.

	    Builds the Blocks layout (text input, voice settings, advanced settings,
	    audio output, info accordions, footer) and wires the event handlers.
	    The TTS model is loaded once here, on CPU, and shared by every request
	    through a closure.

	    Returns:
	        The assembled gr.Blocks instance (not launched).
	    """
	    import html  # local import: only used to escape status text below

	    # Load model on startup
	    model = load_model("cpu")

	    def voice_choices_for(language):
	        """Return (choices, default) for the voice dropdown in `language`."""
	        voice_ids, labels = get_voice_list(language)
	        # (label, value) pairs: the dropdown shows the description while the
	        # handler still receives the voice id.
	        choices = list(zip(labels, voice_ids))
	        return choices, (voice_ids[0] if voice_ids else None)

	    initial_voice_choices, initial_voice = voice_choices_for("en_US")

	    with gr.Blocks(
	        theme=custom_theme,
	        css=CUSTOM_CSS,
	        title="Pocket-TTS - Text to Speech Converter",
	        fill_height=True
	    ) as demo:

	        # Header with branding
	        gr.HTML("""
	<div class="header-section">
	<h1 class="header-title">🎙️ Pocket-TTS</h1>
	<p class="header-subtitle">High-Quality Text-to-Speech Synthesis with Natural Voices</p>
	</div>
	""")

	        # Main content
	        with gr.Row(equal_height=True):
	            with gr.Column(scale=2):
	                # Text input section
	                gr.HTML('<p class="section-title">📝 Text Input</p>')

	                text_input = gr.Textbox(
	                    label="Text to Synthesize",
	                    placeholder="Enter your text here... e.g., Hello! This is a text-to-speech demonstration.",
	                    lines=5,
	                    max_lines=10,
	                    info="Enter the text you want to convert to speech",
	                    interactive=True,
	                    elem_id="text-input"
	                )

	                # Quick text buttons
	                with gr.Row():
	                    gr.Button("👋 Hello World", size="sm").click(
	                        lambda: "Hello World! Welcome to the Pocket-TTS demo.",
	                        outputs=text_input
	                    )
	                    gr.Button("📖 Sample Text", size="sm").click(
	                        lambda: "The quick brown fox jumps over the lazy dog. This is a sample sentence to test the text-to-speech system.",
	                        outputs=text_input
	                    )
	                    gr.Button("🧪 Long Text", size="sm").click(
	                        lambda: "Artificial intelligence has revolutionized the way we interact with technology. From virtual assistants to autonomous vehicles, AI is everywhere. Text-to-speech systems have improved dramatically, offering more natural and expressive voices than ever before.",
	                        outputs=text_input
	                    )

	                # Voice settings section
	                gr.HTML('<p class="section-title">🎵 Voice Settings</p>')

	                with gr.Row():
	                    with gr.Column(scale=1):
	                        language_dropdown = gr.Dropdown(
	                            choices=LANGUAGE_OPTIONS,
	                            value="en_US",
	                            label="Language",
	                            info="Select the language of your text",
	                            elem_id="language"
	                        )

	                    with gr.Column(scale=2):
	                        voice_dropdown = gr.Dropdown(
	                            choices=initial_voice_choices,
	                            value=initial_voice,
	                            label="Voice",
	                            info="Choose a voice for synthesis",
	                            elem_id="voice"
	                        )

	                # Advanced settings accordion
	                with gr.Accordion("⚙️ Advanced Settings", open=False):
	                    with gr.Row():
	                        with gr.Column():
	                            speed_slider = gr.Slider(
	                                minimum=0.5,
	                                maximum=2.0,
	                                value=1.0,
	                                step=0.25,
	                                label="Speed",
	                                info="Speech speed (0.5x - 2.0x)",
	                                elem_id="speed"
	                            )

	                        with gr.Column():
	                            pitch_slider = gr.Slider(
	                                minimum=-12,
	                                maximum=12,
	                                value=0,
	                                step=1,
	                                label="Pitch",
	                                info="Pitch shift (-12 to +12 semitones)",
	                                elem_id="pitch"
	                            )

	                    with gr.Row():
	                        with gr.Column():
	                            # Default type ("value") so the handler receives
	                            # the rate in Hz; type="index" would pass the
	                            # position (0..4) of the selected choice instead.
	                            sample_rate_dropdown = gr.Dropdown(
	                                choices=[(str(sr), sr) for sr in [16000, 22050, 24000, 44100, 48000]],
	                                value=24000,
	                                label="Sample Rate",
	                                info="Output audio sample rate (Hz)",
	                                elem_id="sample-rate"
	                            )

	                # Generate button
	                generate_btn = gr.Button(
	                    "🎙️ Generate Speech",
	                    variant="primary",
	                    size="lg",
	                    elem_id="generate-btn"
	                )

	            with gr.Column(scale=1):
	                # Output section
	                gr.HTML('<p class="section-title">🔊 Audio Output</p>')

	                audio_output = gr.Audio(
	                    label="Generated Audio",
	                    type="numpy",
	                    interactive=False,
	                    elem_id="audio-output"
	                )

	                # Download button (appears when audio is generated)
	                # NOTE(review): no file is ever assigned to this button's
	                # value in this function, so it has nothing to download -
	                # confirm intent or remove in favour of the audio player.
	                download_btn = gr.DownloadButton(
	                    "📥 Download Audio",
	                    variant="secondary",
	                    size="sm",
	                    visible=False,
	                    elem_id="download-btn"
	                )

	                # Status message
	                status_output = gr.Markdown(
	                    value="",
	                    visible=False,
	                    elem_id="status"
	                )

	                # Model info
	                with gr.Accordion("ℹ️ Model Information", open=False):
	                    gr.Markdown("""
	Pocket-TTS by Kyutai Labs

	- Lightweight text-to-speech model
	- Optimized for CPU inference
	- Multiple voice options
	- Real-time synthesis support

	Requirements:
	- Python 3.8+
	- PyTorch
	- 2GB+ RAM
	""")

	                # Tips section
	                with gr.Accordion("💡 Tips", open=False):
	                    gr.HTML("""
	<div class="tips-box">
	<strong>Tips for best results:</strong>
	<ul>
	<li>Use proper punctuation for natural pauses</li>
	<li>Try different voices for different contexts</li>
	<li>Adjust speed for clarity (slower = clearer)</li>
	<li>Pitch works best within ±6 semitones</li>
	</ul>
	</div>
	""")

	        # Footer with "Built with anycoder"
	        gr.HTML("""
	<div class="built-with">
	<p>🚀 Powered by Pocket-TTS from Kyutai Labs</p>
	<p>🔧 Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">anycoder</a> — Deploy ML models in minutes</p>
	</div>
	""")

	        # Event handlers
	        def update_voices(language):
	            """Update voice dropdown based on selected language."""
	            choices, default_voice = voice_choices_for(language)
	            # Replace the choices as well as the value; otherwise the list
	            # would keep offering voices from other languages.
	            return gr.Dropdown(choices=choices, value=default_voice)

	        def update_status(message, success=False):
	            """Build the status Markdown update for `message`."""
	            # Messages may embed exception text; escape markup characters so
	            # they cannot break the surrounding <div>.
	            safe = html.escape(str(message), quote=False) if message else ""
	            if success:
	                return gr.Markdown(
	                    value=f'<div class="success-box">✅ {safe}</div>',
	                    visible=True
	                )
	            elif message:
	                return gr.Markdown(
	                    value=f'<div class="info-box">ℹ️ {safe}</div>',
	                    visible=True
	                )
	            return gr.Markdown(value="", visible=False)

	        def on_generate(text, voice, language, speed, pitch, sample_rate):
	            """Run synthesis with the shared model and render its message."""
	            # The model is captured from the enclosing scope rather than
	            # passed through gr.State, which deep-copies its initial value
	            # for every session.
	            audio, message = generate_speech(
	                text, voice, language, speed, pitch, sample_rate, model
	            )
	            # Route the message through update_status so the (initially
	            # hidden) status area becomes visible when there is an error.
	            return audio, update_status(message)

	        def on_clear():
	            """Reset every input and hide any stale status message."""
	            return (*clear_all(), update_status(""))

	        # Connect events
	        language_dropdown.change(
	            update_voices,
	            inputs=language_dropdown,
	            outputs=voice_dropdown
	        )

	        generate_btn.click(
	            on_generate,
	            inputs=[
	                text_input,
	                voice_dropdown,
	                language_dropdown,
	                speed_slider,
	                pitch_slider,
	                sample_rate_dropdown,
	            ],
	            outputs=[audio_output, status_output],
	            show_progress="full"
	        )

	        # Enable download button when audio is generated
	        audio_output.change(
	            lambda x: gr.DownloadButton(visible=x is not None),
	            inputs=audio_output,
	            outputs=download_btn
	        )

	        # Clear button functionality
	        clear_btn = gr.Button("🗑️ Clear", size="sm", variant="stop")
	        clear_btn.click(
	            on_clear,
	            outputs=[
	                text_input,
	                language_dropdown,
	                voice_dropdown,
	                speed_slider,
	                pitch_slider,
	                sample_rate_dropdown,
	                audio_output,
	                status_output,
	            ]
	        )

	    return demo


	def main():
	    """
	    Main entry point for the application.

	    Builds the UI with create_app() and launches it, forwarding only the
	    launch options that the installed Gradio version actually accepts.
	    """
	    import inspect  # local import: only needed to inspect launch()'s signature

	    demo = create_app()

	    launch_options = {
	        "theme": custom_theme,
	        "title": "Pocket-TTS - Text to Speech",
	        "description": "High-Quality Text-to-Speech with Pocket-TTS by Kyutai Labs",
	        "article": "## About Pocket-TTS\n\nPocket-TTS is a lightweight, efficient text-to-speech model developed by Kyutai Labs. It offers natural-sounding voice synthesis optimized for CPU inference.",
	        "footer_links": [
	            {"label": "Kyutai Labs", "url": "https://kyutai.org"},
	            {"label": "GitHub", "url": "https://github.com/kyutai-labs/pocket-tts"},
	            {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"}
	        ],
	        "show_error": True,
	        "quiet": False,
	    }

	    # launch() raises TypeError on keyword arguments it does not declare, and
	    # the accepted set differs between Gradio releases.
	    # NOTE(review): title/description/article do not appear among the
	    # documented Blocks.launch() parameters, and theme/footer_links are only
	    # accepted by newer releases - confirm against the pinned Gradio version.
	    # Filtering against the real signature keeps startup from crashing.
	    launch_params = inspect.signature(demo.launch).parameters
	    accepts_any_keyword = any(
	        param.kind is inspect.Parameter.VAR_KEYWORD
	        for param in launch_params.values()
	    )
	    supported = {
	        name: value
	        for name, value in launch_options.items()
	        if accepts_any_keyword or name in launch_params
	    }

	    skipped = [name for name in launch_options if name not in supported]
	    if skipped:
	        print(
	            "Ignoring launch options not supported by this Gradio version: "
	            + ", ".join(skipped)
	        )

	    # Launch the application
	    demo.launch(**supported)


	if __name__ == "__main__":
	    main()