# Text_To_Speech / app.py
# NOTE: the lines below are Hugging Face Spaces page chrome (author avatar
# caption, commit message, commit hash) captured along with the source;
# commented out so the file remains valid Python.
# YashChowdhary's picture | Update app.py | 1868e79 verified
"""
Text-to-Speech Application
================================================
Created by: Yash Chowdhary
A comprehensive TTS application using Kokoro-82M model with full voice control.
Features:
- 28 built-in voices (American & British English, Male & Female)
- Speed control (0.5x - 2.0x)
- Pitch adjustment via audio post-processing
- Configurable pause insertion
- Style presets for different tones (Neutral, Dramatic, Whisper, etc.)
License: Apache 2.0 (same as Kokoro model)
"""
import gradio as gr
import numpy as np
import soundfile as sf
import io
import re
from typing import Optional, Tuple, Generator
from dataclasses import dataclass
from enum import Enum
# Kokoro TTS imports
from kokoro import KPipeline
# ============================================================================
# CONFIGURATION & CONSTANTS
# ============================================================================
SAMPLE_RATE = 24000 # Kokoro outputs 24kHz audio
MAX_CHAR_LIMIT = 5000 # Maximum characters per generation
# Voice definitions with metadata
# Format: voice_id -> (display_name, gender, accent, quality_grade, description)
VOICE_CATALOG = {
# American English - Female
"af_heart": ("Heart ❀️", "Female", "American", "A", "Premium quality, warm and natural"),
"af_bella": ("Bella πŸ”₯", "Female", "American", "A-", "Clear and expressive"),
"af_nicole": ("Nicole 🎧", "Female", "American", "B-", "Professional narrator style"),
"af_aoede": ("Aoede", "Female", "American", "C+", "Melodic and pleasant"),
"af_kore": ("Kore", "Female", "American", "C+", "Youthful and energetic"),
"af_sarah": ("Sarah", "Female", "American", "C+", "Friendly and approachable"),
"af_nova": ("Nova", "Female", "American", "C", "Modern and crisp"),
"af_sky": ("Sky", "Female", "American", "C-", "Light and airy"),
"af_alloy": ("Alloy", "Female", "American", "C", "Balanced and versatile"),
"af_jessica": ("Jessica", "Female", "American", "D", "Casual conversational"),
"af_river": ("River", "Female", "American", "D", "Gentle and flowing"),
# American English - Male
"am_michael": ("Michael", "Male", "American", "C+", "Authoritative and clear"),
"am_fenrir": ("Fenrir", "Male", "American", "C+", "Deep and resonant"),
"am_puck": ("Puck", "Male", "American", "C+", "Playful and dynamic"),
"am_echo": ("Echo", "Male", "American", "D", "Warm and reflective"),
"am_eric": ("Eric", "Male", "American", "D", "Professional and steady"),
"am_liam": ("Liam", "Male", "American", "D", "Young and natural"),
"am_onyx": ("Onyx", "Male", "American", "D", "Rich and smooth"),
"am_santa": ("Santa πŸŽ…", "Male", "American", "D-", "Jolly and festive"),
"am_adam": ("Adam", "Male", "American", "F+", "Basic male voice"),
# British English - Female
"bf_emma": ("Emma", "Female", "British", "B-", "Elegant British accent"),
"bf_isabella": ("Isabella", "Female", "British", "C", "Sophisticated and refined"),
"bf_alice": ("Alice", "Female", "British", "D", "Classic British tone"),
"bf_lily": ("Lily", "Female", "British", "D", "Soft and gentle"),
# British English - Male
"bm_george": ("George", "Male", "British", "C", "Distinguished gentleman"),
"bm_fable": ("Fable", "Male", "British", "C", "Storyteller quality"),
"bm_lewis": ("Lewis", "Male", "British", "D+", "Conversational British"),
"bm_daniel": ("Daniel", "Male", "British", "D", "Standard British male"),
}
@dataclass
class StylePreset:
    """Defines a style preset with associated audio parameters.

    The fields map directly onto KokoroTTSEngine.generate() arguments:
    ``speed`` is the rate multiplier, ``pitch_shift`` is in semitones, and
    ``pause_multiplier`` scales the base 300 ms inter-sentence pause (see
    generate_with_style()).
    """
    name: str                 # Human-readable name shown in the UI dropdown
    description: str          # One-line summary shown in the style info panel
    speed: float              # Speaking-rate multiplier (1.0 = normal)
    pitch_shift: float        # semitones (positive = higher, negative = lower)
    pause_multiplier: float   # Scales the default 300 ms inter-sentence pause
    recommended_voices: list  # Voice IDs (VOICE_CATALOG keys) suited to this style
# Style presets for different tones.
# Each entry bundles a speed / pitch / pause configuration plus a few voices
# whose timbre suits the style; "neutral" is the fallback preset used when an
# unknown key is requested (see STYLE_PRESETS.get(..., STYLE_PRESETS["neutral"])).
STYLE_PRESETS = {
    "neutral": StylePreset(
        name="Neutral Narrator",
        description="Clear, balanced narration suitable for most content",
        speed=1.0,
        pitch_shift=0,
        pause_multiplier=1.0,
        recommended_voices=["af_heart", "af_bella", "am_michael", "bf_emma"]
    ),
    "dramatic": StylePreset(
        name="Dramatic / Horror",
        description="Slower, deeper voice for suspenseful or dramatic content",
        speed=0.85,
        pitch_shift=-2,
        pause_multiplier=1.5,
        recommended_voices=["am_fenrir", "am_onyx", "bm_george", "af_nicole"]
    ),
    "excited": StylePreset(
        name="Excited / Surprised",
        description="Faster, higher energy delivery",
        speed=1.2,
        pitch_shift=1,
        pause_multiplier=0.7,
        recommended_voices=["af_kore", "am_puck", "af_nova", "af_sky"]
    ),
    "calm": StylePreset(
        name="Calm / Meditative",
        description="Slow, soothing voice for relaxation content",
        speed=0.8,
        pitch_shift=-1,
        pause_multiplier=1.8,
        recommended_voices=["af_heart", "bf_lily", "am_echo", "bf_emma"]
    ),
    "storyteller": StylePreset(
        name="Storyteller",
        description="Engaging pace for audiobooks and stories",
        speed=0.95,
        pitch_shift=0,
        pause_multiplier=1.2,
        recommended_voices=["bm_fable", "af_bella", "am_michael", "bf_isabella"]
    ),
    "professional": StylePreset(
        name="Professional / Corporate",
        description="Clear, authoritative delivery for business content",
        speed=1.05,
        pitch_shift=0,
        pause_multiplier=1.0,
        recommended_voices=["af_nicole", "am_eric", "bf_emma", "bm_george"]
    ),
    "cheerful": StylePreset(
        name="Cheerful / Friendly",
        description="Warm, upbeat tone for friendly content",
        speed=1.1,
        pitch_shift=0.5,
        pause_multiplier=0.9,
        recommended_voices=["af_sarah", "am_puck", "af_kore", "am_liam"]
    ),
}
# ============================================================================
# AUDIO PROCESSING UTILITIES
# ============================================================================
def pitch_shift_audio(audio: np.ndarray, sample_rate: int, semitones: float) -> np.ndarray:
    """
    Shift the pitch of *audio* by *semitones* while preserving its duration.

    Uses a lightweight two-step resampling approach (no external DSP
    dependencies): first resample to a different length, which moves the
    pitch; then resample back to the original length, which restores the
    duration.  Formants shift along with the pitch, so this is best for
    modest adjustments.

    Args:
        audio: Mono audio samples.
        sample_rate: Sample rate of the audio (not used by this method;
            kept for API symmetry with the other utilities).
        semitones: Shift amount; positive raises pitch, negative lowers it.

    Returns:
        float32 array with the same length as the input (the input object
        itself when semitones == 0).
    """
    if semitones == 0:
        return audio

    # One semitone corresponds to a frequency ratio of 2**(1/12).
    ratio = 2 ** (semitones / 12)

    src_len = len(audio)
    src_positions = np.arange(src_len)

    # Step 1: resample to src_len / ratio samples -> pitch scales by `ratio`.
    stretched_len = int(src_len / ratio)
    stretched = np.interp(
        np.linspace(0, src_len - 1, stretched_len), src_positions, audio
    )

    # Step 2: resample back to the original sample count -> duration restored.
    restored = np.interp(
        np.linspace(0, len(stretched) - 1, src_len),
        np.arange(len(stretched)),
        stretched,
    )
    return restored.astype(np.float32)
def insert_pauses(audio_segments: list, pause_duration_ms: int, sample_rate: int) -> np.ndarray:
    """
    Join audio segments, separating consecutive segments with silence.

    Args:
        audio_segments: Audio arrays to concatenate, in playback order.
        pause_duration_ms: Length of each inserted silence, in milliseconds.
        sample_rate: Sample rate used to convert milliseconds to samples.

    Returns:
        A single concatenated array.  No silence is appended after the last
        segment; an empty float32 array is returned for an empty input list.
    """
    if not audio_segments:
        return np.array([], dtype=np.float32)

    # Silence gap shared between all joins.
    gap = np.zeros(int(sample_rate * pause_duration_ms / 1000), dtype=np.float32)

    # Interleave the gap between every pair of neighbouring segments.
    parts = []
    for segment in audio_segments:
        if parts:
            parts.append(gap)
        parts.append(segment)
    return np.concatenate(parts)
def normalize_audio(audio: np.ndarray, target_db: float = -3.0) -> np.ndarray:
    """
    Scale audio so its absolute peak sits at *target_db* (dB relative to
    full scale).

    Args:
        audio: Input samples.
        target_db: Desired peak level in dB (default -3 dB, leaving a
            little headroom below clipping).

    Returns:
        float32 array with the requested peak.  The input is returned
        unchanged when it is empty or entirely silent (avoids divide-by-zero).
    """
    if len(audio) == 0:
        return audio

    peak = np.abs(audio).max()
    if peak == 0:
        # All-zero signal: nothing to scale.
        return audio

    # Convert dB to linear amplitude: amplitude = 10 ** (dB / 20).
    desired_peak = 10 ** (target_db / 20)
    return (audio * (desired_peak / peak)).astype(np.float32)
def preprocess_text(text: str, add_pauses: bool = True) -> str:
    """
    Lightly clean text before synthesis.

    Collapses runs of whitespace to single spaces and expands a handful of
    common abbreviations so the TTS model pronounces them naturally.

    Args:
        text: Raw input text.
        add_pauses: Accepted for API compatibility; this flag is not
            currently used by the implementation (TODO: wire up pause hints).

    Returns:
        The cleaned text.
    """
    # Collapse all whitespace (newlines, tabs, repeats) to single spaces.
    cleaned = re.sub(r'\s+', ' ', text).strip()

    # Expand abbreviations the model would otherwise mispronounce.
    expansions = (
        (r'\bDr\.', 'Doctor'),
        (r'\bMr\.', 'Mister'),
        (r'\bMrs\.', 'Missus'),
        (r'\bMs\.', 'Miss'),
        (r'\bProf\.', 'Professor'),
        (r'\betc\.', 'etcetera'),
        (r'\be\.g\.', 'for example'),
        (r'\bi\.e\.', 'that is'),
    )
    for pattern, spoken in expansions:
        cleaned = re.sub(pattern, spoken, cleaned, flags=re.IGNORECASE)
    return cleaned
# ============================================================================
# TTS ENGINE
# ============================================================================
class KokoroTTSEngine:
    """
    Wrapper class for Kokoro TTS with additional processing capabilities.

    Owns one KPipeline per supported accent and layers speed, pitch and
    pause control (via the module-level audio utilities) on top of the raw
    model output.
    """
    def __init__(self):
        """Initialize the TTS engine with both American and British English pipelines."""
        print("Initializing Kokoro TTS Engine...")
        # Initialize pipelines for both accents.  The first letter of each
        # voice ID ('a'/'b') selects the matching pipeline in generate().
        self.pipelines = {
            'a': KPipeline(lang_code='a'),  # American English
            'b': KPipeline(lang_code='b'),  # British English
        }
        # Add custom pronunciation for "Kokoro" (accent-specific phonemes
        # injected into each pipeline's grapheme-to-phoneme lexicon).
        self.pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
        self.pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
        # Pre-load voice packs for faster inference
        print("Pre-loading voice packs...")
        for voice_id in VOICE_CATALOG.keys():
            lang_code = voice_id[0]  # 'a' or 'b'
            try:
                self.pipelines[lang_code].load_voice(voice_id)
            except Exception as e:
                # Best-effort: a voice that fails to pre-load here will be
                # loaded lazily on first use (or fail at generation time).
                print(f"Warning: Could not pre-load voice {voice_id}: {e}")
        print("TTS Engine initialized successfully!")
    def generate(
        self,
        text: str,
        voice: str = "af_heart",
        speed: float = 1.0,
        pitch_shift: float = 0.0,
        pause_between_sentences_ms: int = 300,
    ) -> Tuple[int, np.ndarray]:
        """
        Generate speech from text with full parameter control.

        Args:
            text: Input text to synthesize (silently truncated to MAX_CHAR_LIMIT).
            voice: Voice ID from VOICE_CATALOG.
            speed: Speed multiplier (clamped to 0.5 .. 2.0).
            pitch_shift: Pitch adjustment in semitones (clamped to -5 .. +5).
            pause_between_sentences_ms: Pause duration between sentences.

        Returns:
            Tuple of (sample_rate, audio_array).  A single zero sample is
            returned when the text is empty or generation fails, so the
            caller always gets playable audio.
        """
        # Validate inputs
        text = preprocess_text(text.strip()[:MAX_CHAR_LIMIT])
        if not text:
            return SAMPLE_RATE, np.zeros(1, dtype=np.float32)
        speed = max(0.5, min(2.0, speed))
        pitch_shift = max(-5, min(5, pitch_shift))
        # Get the appropriate pipeline; unknown prefixes fall back to
        # American English.
        lang_code = voice[0] if voice[0] in self.pipelines else 'a'
        pipeline = self.pipelines[lang_code]
        # Generate audio segments.  The pipeline yields one chunk per text
        # segment; the 3-tuple layout (graphemes, phonemes, audio) follows
        # Kokoro's KPipeline API — TODO confirm against the pinned version.
        audio_segments = []
        try:
            for _, phonemes, audio in pipeline(text, voice=voice, speed=speed):
                if audio is not None:
                    # Torch tensors expose .numpy(); plain arrays pass through.
                    audio_segments.append(audio.numpy() if hasattr(audio, 'numpy') else audio)
        except Exception as e:
            # Deliberate best-effort: log and return silence rather than
            # crash the UI callback.
            print(f"Generation error: {e}")
            return SAMPLE_RATE, np.zeros(1, dtype=np.float32)
        if not audio_segments:
            return SAMPLE_RATE, np.zeros(1, dtype=np.float32)
        # Combine segments with pauses
        combined_audio = insert_pauses(audio_segments, pause_between_sentences_ms, SAMPLE_RATE)
        # Apply pitch shift if requested
        if pitch_shift != 0:
            combined_audio = pitch_shift_audio(combined_audio, SAMPLE_RATE, pitch_shift)
        # Normalize the final audio
        combined_audio = normalize_audio(combined_audio)
        return SAMPLE_RATE, combined_audio
    def generate_with_style(
        self,
        text: str,
        voice: str,
        style_preset: str,
        custom_speed: Optional[float] = None,
        custom_pitch: Optional[float] = None,
        custom_pause: Optional[int] = None,
    ) -> Tuple[int, np.ndarray]:
        """
        Generate speech using a style preset with optional custom overrides.

        Args:
            text: Input text to synthesize.
            voice: Voice ID.
            style_preset: Style preset name from STYLE_PRESETS (unknown
                names fall back to "neutral").
            custom_speed: Override the preset speed (optional).
            custom_pitch: Override the preset pitch (optional).
            custom_pause: Override the preset pause in ms (optional).

        Returns:
            Tuple of (sample_rate, audio_array).
        """
        preset = STYLE_PRESETS.get(style_preset, STYLE_PRESETS["neutral"])
        speed = custom_speed if custom_speed is not None else preset.speed
        pitch = custom_pitch if custom_pitch is not None else preset.pitch_shift
        # The preset's pause_multiplier scales a 300 ms base pause.
        pause = custom_pause if custom_pause is not None else int(300 * preset.pause_multiplier)
        return self.generate(
            text=text,
            voice=voice,
            speed=speed,
            pitch_shift=pitch,
            pause_between_sentences_ms=pause,
        )
# ============================================================================
# GRADIO INTERFACE
# ============================================================================
# Rank of a grade modifier: '+' outranks plain, which outranks '-'
# (e.g. C+ > C > C-).  Unknown modifiers sort last within their letter.
_GRADE_MODIFIER_RANK = {'+': 0, '': 1, '-': 2}


def _grade_sort_key(grade: str) -> tuple:
    """Sort key placing better quality grades first (A, A-, B+, B, B-, ...)."""
    return (grade[0], _GRADE_MODIFIER_RANK.get(grade[1:], 3))


def create_voice_choices():
    """
    Build the (label, voice_id) choices for the Gradio voice dropdown.

    Voices are grouped by accent (American/British) and gender, and ordered
    best-quality-first within each group.

    BUG FIX: the original sorted the raw grade strings lexicographically,
    which placed e.g. "C" ahead of "C+" (since "C" < "C+" < "C-" as strings)
    even though C+ is the higher grade.  An explicit grade rank is used now.

    Returns:
        List of (display_label, voice_id) tuples.
    """
    choices = []
    # Group voices by (accent, gender) so the dropdown reads naturally.
    groups = {
        ("American", "Female"): [],
        ("American", "Male"): [],
        ("British", "Female"): [],
        ("British", "Male"): [],
    }
    for voice_id, (name, gender, accent, grade, desc) in VOICE_CATALOG.items():
        groups[(accent, gender)].append((voice_id, name, grade))
    # Build labelled choices, best-graded voices first within each group.
    for (accent, gender), voices in groups.items():
        flag = "🇺🇸" if accent == "American" else "🇬🇧"
        gender_icon = "🚺" if gender == "Female" else "🚹"
        for voice_id, name, grade in sorted(voices, key=lambda v: _grade_sort_key(v[2])):
            label = f"{flag} {gender_icon} {name} [{grade}]"
            choices.append((label, voice_id))
    return choices
def create_style_choices():
    """Return (display_name, preset_key) pairs for the style dropdown."""
    choices = []
    for key, preset in STYLE_PRESETS.items():
        choices.append((preset.name, key))
    return choices
# Initialize the TTS engine globally, once at import time, so the model and
# voice packs are loaded before the first request arrives.
print("Loading Kokoro TTS Engine...")
tts_engine = KokoroTTSEngine()
def generate_speech(
    text: str,
    voice: str,
    style: str,
    speed: float,
    pitch: float,
    pause: int,
    use_style_defaults: bool,
) -> Tuple[int, np.ndarray]:
    """
    Main generation function for the Gradio interface.

    Args:
        text: Text to synthesize.
        voice: Voice ID from the dropdown.
        style: Style preset key.
        speed: Manual speed multiplier (ignored when use_style_defaults).
        pitch: Manual pitch shift in semitones (ignored when use_style_defaults).
        pause: Manual inter-sentence pause in ms (ignored when use_style_defaults).
        use_style_defaults: When True, the style preset's parameters are
            used instead of the manual slider values.

    Returns:
        (sample_rate, audio) tuple for the gr.Audio output, or None when
        the input text is empty (a gr.Warning is shown instead).

    Raises:
        gr.Error: If synthesis raises, so the failure is visible in the UI.
    """
    if not text.strip():
        gr.Warning("Please enter some text to synthesize.")
        return None
    try:
        if use_style_defaults:
            sample_rate, audio = tts_engine.generate_with_style(
                text=text,
                voice=voice,
                style_preset=style,
            )
        else:
            sample_rate, audio = tts_engine.generate(
                text=text,
                voice=voice,
                speed=speed,
                pitch_shift=pitch,
                pause_between_sentences_ms=pause,
            )
        return (sample_rate, audio)
    except Exception as e:
        # BUG FIX: gr.Error is an exception type and must be *raised* to be
        # displayed.  The original code merely instantiated it and returned
        # None, so generation failures were silent in the UI.
        raise gr.Error(f"Generation failed: {str(e)}") from e
def update_style_info(style: str) -> str:
    """Render the markdown summary shown for the selected style preset.

    Unknown preset keys fall back to the "neutral" preset; recommended
    voices that are missing from VOICE_CATALOG are silently skipped.
    """
    preset = STYLE_PRESETS.get(style, STYLE_PRESETS["neutral"])
    voice_names = [
        VOICE_CATALOG[vid][0]
        for vid in preset.recommended_voices
        if vid in VOICE_CATALOG
    ]
    lines = [
        f"**{preset.name}**",
        preset.description,
        f"- **Speed:** {preset.speed}x",
        f"- **Pitch Shift:** {preset.pitch_shift:+.1f} semitones",
        f"- **Pause Multiplier:** {preset.pause_multiplier}x",
        f"**Recommended Voices:** {', '.join(voice_names)}",
    ]
    return "\n".join(lines) + "\n"
def update_controls_from_style(style: str, use_defaults: bool):
    """
    Sync the speed/pitch/pause sliders with the chosen style preset.

    When *use_defaults* is False the sliders are left untouched (no-op
    gr.update() for each); otherwise each slider is set to the preset's
    value, with the pause derived from the 300 ms base times the preset's
    multiplier.
    """
    if not use_defaults:
        return gr.update(), gr.update(), gr.update()
    preset = STYLE_PRESETS.get(style, STYLE_PRESETS["neutral"])
    updates = (
        gr.update(value=preset.speed),
        gr.update(value=preset.pitch_shift),
        gr.update(value=int(300 * preset.pause_multiplier)),
    )
    return updates
# Sample texts for demonstration.
# Keys match the sample buttons in the UI; each click handler returns one of
# these strings into the text box.  The literals intentionally keep their
# embedded newlines/indentation — preprocess_text() collapses whitespace
# before synthesis.
SAMPLE_TEXTS = {
    "welcome": """Welcome to Kokoro Text-to-Speech! This is an open-source model with 82 million parameters,
capable of producing natural-sounding speech. Try different voices and styles to find your perfect combination.""",
    "horror": """The old house creaked as I pushed open the door. Something moved in the shadows.
A whisper echoed through the empty halls... "You shouldn't have come here."
I turned to run, but the door had vanished.""",
    "news": """Breaking news tonight: Scientists have made a groundbreaking discovery that could change
our understanding of the universe. The research team announced their findings at a press conference
held earlier today at the National Science Foundation.""",
    "story": """Once upon a time, in a kingdom far away, there lived a young princess who dreamed of adventure.
One day, she discovered a magical map hidden in the castle library.
Little did she know, this map would lead her to the greatest journey of her life.""",
    "technical": """The system architecture consists of three main components: the frontend user interface,
the backend API server, and the database layer. Each component is designed for scalability and
can be deployed independently using container orchestration.""",
}
def load_sample_text(sample_key: str) -> str:
    """Return the demo text registered under *sample_key* ('' if unknown)."""
    try:
        return SAMPLE_TEXTS[sample_key]
    except KeyError:
        return ""
# Build the Gradio interface.
# Layout: a two-column row (inputs on the left, controls + audio output on
# the right), with event wiring at the bottom of the `with` block.
with gr.Blocks(
    title="Text-to-Speech",
    theme=gr.themes.Soft(),
    css="""
    .main-title {
        text-align: center;
        margin-bottom: 1rem;
    }
    .info-box {
        border: 1px solid var(--border-color-primary);
        border-radius: 8px;
        padding: 1rem;
        margin: 0.5rem 0;
    }
    .info-box strong {
        color: var(--body-text-color);
    }
    """
) as demo:
    # Header
    gr.Markdown(
        """
        # 🎙️ Text-to-Speech
        **Created by Yash Chowdhary**
        An open-source, high-quality TTS system powered by [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)
        Features: 28 voices • Style presets • Speed/Pitch/Pause control • CPU-friendly
        """,
        elem_classes=["main-title"]
    )
    with gr.Row():
        # Left column - Input controls
        with gr.Column(scale=1):
            # Text input
            text_input = gr.Textbox(
                label="📝 Text to Synthesize",
                placeholder="Enter your text here...",
                lines=6,
                max_lines=15,
                info=f"Maximum {MAX_CHAR_LIMIT} characters"
            )
            # Sample text buttons: each click handler returns the canned
            # text, which Gradio writes into text_input.
            with gr.Accordion("📚 Sample Texts", open=False):
                with gr.Row():
                    gr.Button("Welcome", size="sm").click(
                        lambda: SAMPLE_TEXTS["welcome"], outputs=text_input
                    )
                    gr.Button("Horror 👻", size="sm").click(
                        lambda: SAMPLE_TEXTS["horror"], outputs=text_input
                    )
                    gr.Button("News 📰", size="sm").click(
                        lambda: SAMPLE_TEXTS["news"], outputs=text_input
                    )
                with gr.Row():
                    gr.Button("Story 📖", size="sm").click(
                        lambda: SAMPLE_TEXTS["story"], outputs=text_input
                    )
                    gr.Button("Technical 💻", size="sm").click(
                        lambda: SAMPLE_TEXTS["technical"], outputs=text_input
                    )
            # Voice selection
            voice_dropdown = gr.Dropdown(
                choices=create_voice_choices(),
                value="af_heart",
                label="🎭 Voice",
                info="Select a voice (sorted by quality grade)"
            )
            # Style preset
            style_dropdown = gr.Dropdown(
                choices=create_style_choices(),
                value="neutral",
                label="🎨 Style Preset",
                info="Choose a style for different content types"
            )
            # Style info display
            style_info = gr.Markdown(
                value=update_style_info("neutral"),
                elem_classes=["info-box"]
            )
            # Use style defaults checkbox
            use_style_defaults = gr.Checkbox(
                label="Use Style Preset Defaults",
                value=True,
                info="When checked, style preset values override manual controls"
            )
        # Right column - Advanced controls and output
        with gr.Column(scale=1):
            # Advanced controls (manual overrides; only used when the
            # "Use Style Preset Defaults" checkbox is unchecked)
            with gr.Accordion("⚙️ Advanced Controls", open=True):
                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.05,
                    label="🏃 Speed",
                    info="Speaking rate (0.5x = slow, 2.0x = fast)"
                )
                pitch_slider = gr.Slider(
                    minimum=-5.0,
                    maximum=5.0,
                    value=0.0,
                    step=0.5,
                    label="🎵 Pitch Shift (semitones)",
                    info="Adjust voice pitch (-5 = deeper, +5 = higher)"
                )
                pause_slider = gr.Slider(
                    minimum=0,
                    maximum=1000,
                    value=300,
                    step=50,
                    label="⏸️ Pause Between Sentences (ms)",
                    info="Silence duration between sentences"
                )
            # Generate button
            generate_btn = gr.Button(
                "🎙️ Generate Speech",
                variant="primary",
                size="lg"
            )
            # Audio output
            audio_output = gr.Audio(
                label="🔊 Generated Audio",
                type="numpy",
                interactive=False,
                autoplay=True
            )
            # Download info
            gr.Markdown(
                """
                💡 **Tips:**
                - Click the download button (⬇️) on the audio player to save
                - Try different voices with the same text to compare
                - Use style presets as starting points, then customize
                """
            )
    # Footer
    gr.Markdown(
        """
        ---
        **Created by Yash Chowdhary** | Powered by [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) (Apache 2.0)
        **Resources:** [Model Card](https://huggingface.co/hexgrad/Kokoro-82M) |
        [Voice List](https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md) |
        [GitHub](https://github.com/hexgrad/kokoro)
        """
    )
    # Event handlers
    # Keep the style info panel in sync with the selected preset.
    style_dropdown.change(
        fn=update_style_info,
        inputs=[style_dropdown],
        outputs=[style_info]
    )
    # Push the preset's speed/pitch/pause into the sliders when the preset
    # changes or when the "use defaults" checkbox is toggled.
    style_dropdown.change(
        fn=update_controls_from_style,
        inputs=[style_dropdown, use_style_defaults],
        outputs=[speed_slider, pitch_slider, pause_slider]
    )
    use_style_defaults.change(
        fn=update_controls_from_style,
        inputs=[style_dropdown, use_style_defaults],
        outputs=[speed_slider, pitch_slider, pause_slider]
    )
    generate_btn.click(
        fn=generate_speech,
        inputs=[
            text_input,
            voice_dropdown,
            style_dropdown,
            speed_slider,
            pitch_slider,
            pause_slider,
            use_style_defaults,
        ],
        outputs=[audio_output]
    )
# Launch configuration
if __name__ == "__main__":
    # queue() enables request queuing for long-running generations;
    # binding 0.0.0.0:7860 is the standard setup for Hugging Face Spaces /
    # containerized deployments.
    demo.queue().launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_api=True
    )