# Text_To_Speech / app.py
# NOTE: the lines below are Hugging Face Spaces page chrome (author avatar
# caption, commit message, commit hash) captured along with the source;
# commented out so the file remains valid Python.
# YashChowdhary's picture | Update app.py | 1868e79 verified
"""
Text-to-Speech Application
================================================
Created by: Yash Chowdhary
A comprehensive TTS application using Kokoro-82M model with full voice control.
Features:
- 28 built-in voices (American & British English, Male & Female)
- Speed control (0.5x - 2.0x)
- Pitch adjustment via audio post-processing
- Configurable pause insertion
- Style presets for different tones (Neutral, Dramatic, Whisper, etc.)
License: Apache 2.0 (same as Kokoro model)
"""
import gradio as gr
import numpy as np
import soundfile as sf
import io
import re
from typing import Optional, Tuple, Generator
from dataclasses import dataclass
from enum import Enum
# Kokoro TTS imports
from kokoro import KPipeline
# ============================================================================
# CONFIGURATION & CONSTANTS
# ============================================================================
SAMPLE_RATE = 24000 # Kokoro outputs 24kHz audio
MAX_CHAR_LIMIT = 5000 # Maximum characters per generation
# Voice definitions with metadata
# Format: voice_id -> (display_name, gender, accent, quality_grade, description)
VOICE_CATALOG = {
# American English - Female
"af_heart": ("Heart ❀️", "Female", "American", "A", "Premium quality, warm and natural"),
"af_bella": ("Bella πŸ”₯", "Female", "American", "A-", "Clear and expressive"),
"af_nicole": ("Nicole 🎧", "Female", "American", "B-", "Professional narrator style"),
"af_aoede": ("Aoede", "Female", "American", "C+", "Melodic and pleasant"),
"af_kore": ("Kore", "Female", "American", "C+", "Youthful and energetic"),
"af_sarah": ("Sarah", "Female", "American", "C+", "Friendly and approachable"),
"af_nova": ("Nova", "Female", "American", "C", "Modern and crisp"),
"af_sky": ("Sky", "Female", "American", "C-", "Light and airy"),
"af_alloy": ("Alloy", "Female", "American", "C", "Balanced and versatile"),
"af_jessica": ("Jessica", "Female", "American", "D", "Casual conversational"),
"af_river": ("River", "Female", "American", "D", "Gentle and flowing"),
# American English - Male
"am_michael": ("Michael", "Male", "American", "C+", "Authoritative and clear"),
"am_fenrir": ("Fenrir", "Male", "American", "C+", "Deep and resonant"),
"am_puck": ("Puck", "Male", "American", "C+", "Playful and dynamic"),
"am_echo": ("Echo", "Male", "American", "D", "Warm and reflective"),
"am_eric": ("Eric", "Male", "American", "D", "Professional and steady"),
"am_liam": ("Liam", "Male", "American", "D", "Young and natural"),
"am_onyx": ("Onyx", "Male", "American", "D", "Rich and smooth"),
"am_santa": ("Santa πŸŽ…", "Male", "American", "D-", "Jolly and festive"),
"am_adam": ("Adam", "Male", "American", "F+", "Basic male voice"),
# British English - Female
"bf_emma": ("Emma", "Female", "British", "B-", "Elegant British accent"),
"bf_isabella": ("Isabella", "Female", "British", "C", "Sophisticated and refined"),
"bf_alice": ("Alice", "Female", "British", "D", "Classic British tone"),
"bf_lily": ("Lily", "Female", "British", "D", "Soft and gentle"),
# British English - Male
"bm_george": ("George", "Male", "British", "C", "Distinguished gentleman"),
"bm_fable": ("Fable", "Male", "British", "C", "Storyteller quality"),
"bm_lewis": ("Lewis", "Male", "British", "D+", "Conversational British"),
"bm_daniel": ("Daniel", "Male", "British", "D", "Standard British male"),
}
@dataclass
class StylePreset:
    """Defines a style preset with associated audio parameters.

    The fields map directly onto KokoroTTSEngine.generate() arguments:
    ``speed`` is the rate multiplier, ``pitch_shift`` is in semitones, and
    ``pause_multiplier`` scales the base 300 ms inter-sentence pause (see
    generate_with_style()).
    """
    name: str                 # Human-readable name shown in the UI dropdown
    description: str          # One-line summary shown in the style info panel
    speed: float              # Speaking-rate multiplier (1.0 = normal)
    pitch_shift: float        # semitones (positive = higher, negative = lower)
    pause_multiplier: float   # Scales the default 300 ms inter-sentence pause
    recommended_voices: list  # Voice IDs (VOICE_CATALOG keys) suited to this style
# Style presets for different tones.
# Each entry bundles a speed / pitch / pause configuration plus a few voices
# whose timbre suits the style; "neutral" is the fallback preset used when an
# unknown key is requested (see STYLE_PRESETS.get(..., STYLE_PRESETS["neutral"])).
STYLE_PRESETS = {
    "neutral": StylePreset(
        name="Neutral Narrator",
        description="Clear, balanced narration suitable for most content",
        speed=1.0,
        pitch_shift=0,
        pause_multiplier=1.0,
        recommended_voices=["af_heart", "af_bella", "am_michael", "bf_emma"]
    ),
    "dramatic": StylePreset(
        name="Dramatic / Horror",
        description="Slower, deeper voice for suspenseful or dramatic content",
        speed=0.85,
        pitch_shift=-2,
        pause_multiplier=1.5,
        recommended_voices=["am_fenrir", "am_onyx", "bm_george", "af_nicole"]
    ),
    "excited": StylePreset(
        name="Excited / Surprised",
        description="Faster, higher energy delivery",
        speed=1.2,
        pitch_shift=1,
        pause_multiplier=0.7,
        recommended_voices=["af_kore", "am_puck", "af_nova", "af_sky"]
    ),
    "calm": StylePreset(
        name="Calm / Meditative",
        description="Slow, soothing voice for relaxation content",
        speed=0.8,
        pitch_shift=-1,
        pause_multiplier=1.8,
        recommended_voices=["af_heart", "bf_lily", "am_echo", "bf_emma"]
    ),
    "storyteller": StylePreset(
        name="Storyteller",
        description="Engaging pace for audiobooks and stories",
        speed=0.95,
        pitch_shift=0,
        pause_multiplier=1.2,
        recommended_voices=["bm_fable", "af_bella", "am_michael", "bf_isabella"]
    ),
    "professional": StylePreset(
        name="Professional / Corporate",
        description="Clear, authoritative delivery for business content",
        speed=1.05,
        pitch_shift=0,
        pause_multiplier=1.0,
        recommended_voices=["af_nicole", "am_eric", "bf_emma", "bm_george"]
    ),
    "cheerful": StylePreset(
        name="Cheerful / Friendly",
        description="Warm, upbeat tone for friendly content",
        speed=1.1,
        pitch_shift=0.5,
        pause_multiplier=0.9,
        recommended_voices=["af_sarah", "am_puck", "af_kore", "am_liam"]
    ),
}
# ============================================================================
# AUDIO PROCESSING UTILITIES
# ============================================================================
def pitch_shift_audio(audio: np.ndarray, sample_rate: int, semitones: float) -> np.ndarray:
    """
    Shift the pitch of *audio* by *semitones* while preserving its duration.

    Uses a lightweight two-step resampling approach (no external DSP
    dependencies): first resample to a different length, which moves the
    pitch; then resample back to the original length, which restores the
    duration.  Formants shift along with the pitch, so this is best for
    modest adjustments.

    Args:
        audio: Mono audio samples.
        sample_rate: Sample rate of the audio (not used by this method;
            kept for API symmetry with the other utilities).
        semitones: Shift amount; positive raises pitch, negative lowers it.

    Returns:
        float32 array with the same length as the input (the input object
        itself when semitones == 0).
    """
    if semitones == 0:
        return audio

    # One semitone corresponds to a frequency ratio of 2**(1/12).
    ratio = 2 ** (semitones / 12)

    src_len = len(audio)
    src_positions = np.arange(src_len)

    # Step 1: resample to src_len / ratio samples -> pitch scales by `ratio`.
    stretched_len = int(src_len / ratio)
    stretched = np.interp(
        np.linspace(0, src_len - 1, stretched_len), src_positions, audio
    )

    # Step 2: resample back to the original sample count -> duration restored.
    restored = np.interp(
        np.linspace(0, len(stretched) - 1, src_len),
        np.arange(len(stretched)),
        stretched,
    )
    return restored.astype(np.float32)
def insert_pauses(audio_segments: list, pause_duration_ms: int, sample_rate: int) -> np.ndarray:
    """
    Join audio segments, separating consecutive segments with silence.

    Args:
        audio_segments: Audio arrays to concatenate, in playback order.
        pause_duration_ms: Length of each inserted silence, in milliseconds.
        sample_rate: Sample rate used to convert milliseconds to samples.

    Returns:
        A single concatenated array.  No silence is appended after the last
        segment; an empty float32 array is returned for an empty input list.
    """
    if not audio_segments:
        return np.array([], dtype=np.float32)

    # Silence gap shared between all joins.
    gap = np.zeros(int(sample_rate * pause_duration_ms / 1000), dtype=np.float32)

    # Interleave the gap between every pair of neighbouring segments.
    parts = []
    for segment in audio_segments:
        if parts:
            parts.append(gap)
        parts.append(segment)
    return np.concatenate(parts)
def normalize_audio(audio: np.ndarray, target_db: float = -3.0) -> np.ndarray:
    """
    Scale audio so its absolute peak sits at *target_db* (dB relative to
    full scale).

    Args:
        audio: Input samples.
        target_db: Desired peak level in dB (default -3 dB, leaving a
            little headroom below clipping).

    Returns:
        float32 array with the requested peak.  The input is returned
        unchanged when it is empty or entirely silent (avoids divide-by-zero).
    """
    if len(audio) == 0:
        return audio

    peak = np.abs(audio).max()
    if peak == 0:
        # All-zero signal: nothing to scale.
        return audio

    # Convert dB to linear amplitude: amplitude = 10 ** (dB / 20).
    desired_peak = 10 ** (target_db / 20)
    return (audio * (desired_peak / peak)).astype(np.float32)
def preprocess_text(text: str, add_pauses: bool = True) -> str:
    """
    Lightly clean text before synthesis.

    Collapses runs of whitespace to single spaces and expands a handful of
    common abbreviations so the TTS model pronounces them naturally.

    Args:
        text: Raw input text.
        add_pauses: Accepted for API compatibility; this flag is not
            currently used by the implementation (TODO: wire up pause hints).

    Returns:
        The cleaned text.
    """
    # Collapse all whitespace (newlines, tabs, repeats) to single spaces.
    cleaned = re.sub(r'\s+', ' ', text).strip()

    # Expand abbreviations the model would otherwise mispronounce.
    expansions = (
        (r'\bDr\.', 'Doctor'),
        (r'\bMr\.', 'Mister'),
        (r'\bMrs\.', 'Missus'),
        (r'\bMs\.', 'Miss'),
        (r'\bProf\.', 'Professor'),
        (r'\betc\.', 'etcetera'),
        (r'\be\.g\.', 'for example'),
        (r'\bi\.e\.', 'that is'),
    )
    for pattern, spoken in expansions:
        cleaned = re.sub(pattern, spoken, cleaned, flags=re.IGNORECASE)
    return cleaned
# ============================================================================
# TTS ENGINE
# ============================================================================
class KokoroTTSEngine:
    """
    Wrapper class for Kokoro TTS with additional processing capabilities.

    Owns one KPipeline per supported accent and layers speed, pitch and
    pause control (via the module-level audio utilities) on top of the raw
    model output.
    """
    def __init__(self):
        """Initialize the TTS engine with both American and British English pipelines."""
        print("Initializing Kokoro TTS Engine...")
        # Initialize pipelines for both accents.  The first letter of each
        # voice ID ('a'/'b') selects the matching pipeline in generate().
        self.pipelines = {
            'a': KPipeline(lang_code='a'),  # American English
            'b': KPipeline(lang_code='b'),  # British English
        }
        # Add custom pronunciation for "Kokoro" (accent-specific phonemes
        # injected into each pipeline's grapheme-to-phoneme lexicon).
        self.pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
        self.pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
        # Pre-load voice packs for faster inference
        print("Pre-loading voice packs...")
        for voice_id in VOICE_CATALOG.keys():
            lang_code = voice_id[0]  # 'a' or 'b'
            try:
                self.pipelines[lang_code].load_voice(voice_id)
            except Exception as e:
                # Best-effort: a voice that fails to pre-load here will be
                # loaded lazily on first use (or fail at generation time).
                print(f"Warning: Could not pre-load voice {voice_id}: {e}")
        print("TTS Engine initialized successfully!")
    def generate(
        self,
        text: str,
        voice: str = "af_heart",
        speed: float = 1.0,
        pitch_shift: float = 0.0,
        pause_between_sentences_ms: int = 300,
    ) -> Tuple[int, np.ndarray]:
        """
        Generate speech from text with full parameter control.

        Args:
            text: Input text to synthesize (silently truncated to MAX_CHAR_LIMIT).
            voice: Voice ID from VOICE_CATALOG.
            speed: Speed multiplier (clamped to 0.5 .. 2.0).
            pitch_shift: Pitch adjustment in semitones (clamped to -5 .. +5).
            pause_between_sentences_ms: Pause duration between sentences.

        Returns:
            Tuple of (sample_rate, audio_array).  A single zero sample is
            returned when the text is empty or generation fails, so the
            caller always gets playable audio.
        """
        # Validate inputs
        text = preprocess_text(text.strip()[:MAX_CHAR_LIMIT])
        if not text:
            return SAMPLE_RATE, np.zeros(1, dtype=np.float32)
        speed = max(0.5, min(2.0, speed))
        pitch_shift = max(-5, min(5, pitch_shift))
        # Get the appropriate pipeline; unknown prefixes fall back to
        # American English.
        lang_code = voice[0] if voice[0] in self.pipelines else 'a'
        pipeline = self.pipelines[lang_code]
        # Generate audio segments.  The pipeline yields one chunk per text
        # segment; the 3-tuple layout (graphemes, phonemes, audio) follows
        # Kokoro's KPipeline API — TODO confirm against the pinned version.
        audio_segments = []
        try:
            for _, phonemes, audio in pipeline(text, voice=voice, speed=speed):
                if audio is not None:
                    # Torch tensors expose .numpy(); plain arrays pass through.
                    audio_segments.append(audio.numpy() if hasattr(audio, 'numpy') else audio)
        except Exception as e:
            # Deliberate best-effort: log and return silence rather than
            # crash the UI callback.
            print(f"Generation error: {e}")
            return SAMPLE_RATE, np.zeros(1, dtype=np.float32)
        if not audio_segments:
            return SAMPLE_RATE, np.zeros(1, dtype=np.float32)
        # Combine segments with pauses
        combined_audio = insert_pauses(audio_segments, pause_between_sentences_ms, SAMPLE_RATE)
        # Apply pitch shift if requested
        if pitch_shift != 0:
            combined_audio = pitch_shift_audio(combined_audio, SAMPLE_RATE, pitch_shift)
        # Normalize the final audio
        combined_audio = normalize_audio(combined_audio)
        return SAMPLE_RATE, combined_audio
    def generate_with_style(
        self,
        text: str,
        voice: str,
        style_preset: str,
        custom_speed: Optional[float] = None,
        custom_pitch: Optional[float] = None,
        custom_pause: Optional[int] = None,
    ) -> Tuple[int, np.ndarray]:
        """
        Generate speech using a style preset with optional custom overrides.

        Args:
            text: Input text to synthesize.
            voice: Voice ID.
            style_preset: Style preset name from STYLE_PRESETS (unknown
                names fall back to "neutral").
            custom_speed: Override the preset speed (optional).
            custom_pitch: Override the preset pitch (optional).
            custom_pause: Override the preset pause in ms (optional).

        Returns:
            Tuple of (sample_rate, audio_array).
        """
        preset = STYLE_PRESETS.get(style_preset, STYLE_PRESETS["neutral"])
        speed = custom_speed if custom_speed is not None else preset.speed
        pitch = custom_pitch if custom_pitch is not None else preset.pitch_shift
        # The preset's pause_multiplier scales a 300 ms base pause.
        pause = custom_pause if custom_pause is not None else int(300 * preset.pause_multiplier)
        return self.generate(
            text=text,
            voice=voice,
            speed=speed,
            pitch_shift=pitch,
            pause_between_sentences_ms=pause,
        )
# ============================================================================
# GRADIO INTERFACE
# ============================================================================
# Rank of a grade modifier: '+' outranks plain, which outranks '-'
# (e.g. C+ > C > C-).  Unknown modifiers sort last within their letter.
_GRADE_MODIFIER_RANK = {'+': 0, '': 1, '-': 2}


def _grade_sort_key(grade: str) -> tuple:
    """Sort key placing better quality grades first (A, A-, B+, B, B-, ...)."""
    return (grade[0], _GRADE_MODIFIER_RANK.get(grade[1:], 3))


def create_voice_choices():
    """
    Build the (label, voice_id) choices for the Gradio voice dropdown.

    Voices are grouped by accent (American/British) and gender, and ordered
    best-quality-first within each group.

    BUG FIX: the original sorted the raw grade strings lexicographically,
    which placed e.g. "C" ahead of "C+" (since "C" < "C+" < "C-" as strings)
    even though C+ is the higher grade.  An explicit grade rank is used now.

    Returns:
        List of (display_label, voice_id) tuples.
    """
    choices = []
    # Group voices by (accent, gender) so the dropdown reads naturally.
    groups = {
        ("American", "Female"): [],
        ("American", "Male"): [],
        ("British", "Female"): [],
        ("British", "Male"): [],
    }
    for voice_id, (name, gender, accent, grade, desc) in VOICE_CATALOG.items():
        groups[(accent, gender)].append((voice_id, name, grade))
    # Build labelled choices, best-graded voices first within each group.
    for (accent, gender), voices in groups.items():
        flag = "🇺🇸" if accent == "American" else "🇬🇧"
        gender_icon = "🚺" if gender == "Female" else "🚹"
        for voice_id, name, grade in sorted(voices, key=lambda v: _grade_sort_key(v[2])):
            label = f"{flag} {gender_icon} {name} [{grade}]"
            choices.append((label, voice_id))
    return choices
def create_style_choices():
    """Return (display_name, preset_key) pairs for the style dropdown."""
    choices = []
    for key, preset in STYLE_PRESETS.items():
        choices.append((preset.name, key))
    return choices
# Initialize the TTS engine globally, once at import time, so the model and
# voice packs are loaded before the first request arrives.
print("Loading Kokoro TTS Engine...")
tts_engine = KokoroTTSEngine()
def generate_speech(
    text: str,
    voice: str,
    style: str,
    speed: float,
    pitch: float,
    pause: int,
    use_style_defaults: bool,
) -> Tuple[int, np.ndarray]:
    """
    Main generation function for the Gradio interface.

    Args:
        text: Text to synthesize.
        voice: Voice ID from the dropdown.
        style: Style preset key.
        speed: Manual speed multiplier (ignored when use_style_defaults).
        pitch: Manual pitch shift in semitones (ignored when use_style_defaults).
        pause: Manual inter-sentence pause in ms (ignored when use_style_defaults).
        use_style_defaults: When True, the style preset's parameters are
            used instead of the manual slider values.

    Returns:
        (sample_rate, audio) tuple for the gr.Audio output, or None when
        the input text is empty (a gr.Warning is shown instead).

    Raises:
        gr.Error: If synthesis raises, so the failure is visible in the UI.
    """
    if not text.strip():
        gr.Warning("Please enter some text to synthesize.")
        return None
    try:
        if use_style_defaults:
            sample_rate, audio = tts_engine.generate_with_style(
                text=text,
                voice=voice,
                style_preset=style,
            )
        else:
            sample_rate, audio = tts_engine.generate(
                text=text,
                voice=voice,
                speed=speed,
                pitch_shift=pitch,
                pause_between_sentences_ms=pause,
            )
        return (sample_rate, audio)
    except Exception as e:
        # BUG FIX: gr.Error is an exception type and must be *raised* to be
        # displayed.  The original code merely instantiated it and returned
        # None, so generation failures were silent in the UI.
        raise gr.Error(f"Generation failed: {str(e)}") from e
def update_style_info(style: str) -> str:
    """Render the markdown summary shown for the selected style preset.

    Unknown preset keys fall back to the "neutral" preset; recommended
    voices that are missing from VOICE_CATALOG are silently skipped.
    """
    preset = STYLE_PRESETS.get(style, STYLE_PRESETS["neutral"])
    voice_names = [
        VOICE_CATALOG[vid][0]
        for vid in preset.recommended_voices
        if vid in VOICE_CATALOG
    ]
    lines = [
        f"**{preset.name}**",
        preset.description,
        f"- **Speed:** {preset.speed}x",
        f"- **Pitch Shift:** {preset.pitch_shift:+.1f} semitones",
        f"- **Pause Multiplier:** {preset.pause_multiplier}x",
        f"**Recommended Voices:** {', '.join(voice_names)}",
    ]
    return "\n".join(lines) + "\n"
def update_controls_from_style(style: str, use_defaults: bool):
    """
    Sync the speed/pitch/pause sliders with the chosen style preset.

    When *use_defaults* is False the sliders are left untouched (no-op
    gr.update() for each); otherwise each slider is set to the preset's
    value, with the pause derived from the 300 ms base times the preset's
    multiplier.
    """
    if not use_defaults:
        return gr.update(), gr.update(), gr.update()
    preset = STYLE_PRESETS.get(style, STYLE_PRESETS["neutral"])
    updates = (
        gr.update(value=preset.speed),
        gr.update(value=preset.pitch_shift),
        gr.update(value=int(300 * preset.pause_multiplier)),
    )
    return updates
# Sample texts for demonstration.
# Keys match the sample buttons in the UI; each click handler returns one of
# these strings into the text box.  The literals intentionally keep their
# embedded newlines/indentation — preprocess_text() collapses whitespace
# before synthesis.
SAMPLE_TEXTS = {
    "welcome": """Welcome to Kokoro Text-to-Speech! This is an open-source model with 82 million parameters,
capable of producing natural-sounding speech. Try different voices and styles to find your perfect combination.""",
    "horror": """The old house creaked as I pushed open the door. Something moved in the shadows.
A whisper echoed through the empty halls... "You shouldn't have come here."
I turned to run, but the door had vanished.""",
    "news": """Breaking news tonight: Scientists have made a groundbreaking discovery that could change
our understanding of the universe. The research team announced their findings at a press conference
held earlier today at the National Science Foundation.""",
    "story": """Once upon a time, in a kingdom far away, there lived a young princess who dreamed of adventure.
One day, she discovered a magical map hidden in the castle library.
Little did she know, this map would lead her to the greatest journey of her life.""",
    "technical": """The system architecture consists of three main components: the frontend user interface,
the backend API server, and the database layer. Each component is designed for scalability and
can be deployed independently using container orchestration.""",
}
def load_sample_text(sample_key: str) -> str:
    """Return the demo text registered under *sample_key* ('' if unknown)."""
    try:
        return SAMPLE_TEXTS[sample_key]
    except KeyError:
        return ""
# Build the Gradio interface.
# Layout: a two-column row (inputs on the left, controls + audio output on
# the right), with event wiring at the bottom of the `with` block.
with gr.Blocks(
    title="Text-to-Speech",
    theme=gr.themes.Soft(),
    css="""
    .main-title {
        text-align: center;
        margin-bottom: 1rem;
    }
    .info-box {
        border: 1px solid var(--border-color-primary);
        border-radius: 8px;
        padding: 1rem;
        margin: 0.5rem 0;
    }
    .info-box strong {
        color: var(--body-text-color);
    }
    """
) as demo:
    # Header
    gr.Markdown(
        """
        # 🎙️ Text-to-Speech
        **Created by Yash Chowdhary**
        An open-source, high-quality TTS system powered by [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)
        Features: 28 voices • Style presets • Speed/Pitch/Pause control • CPU-friendly
        """,
        elem_classes=["main-title"]
    )
    with gr.Row():
        # Left column - Input controls
        with gr.Column(scale=1):
            # Text input
            text_input = gr.Textbox(
                label="📝 Text to Synthesize",
                placeholder="Enter your text here...",
                lines=6,
                max_lines=15,
                info=f"Maximum {MAX_CHAR_LIMIT} characters"
            )
            # Sample text buttons: each click handler returns the canned
            # text, which Gradio writes into text_input.
            with gr.Accordion("📚 Sample Texts", open=False):
                with gr.Row():
                    gr.Button("Welcome", size="sm").click(
                        lambda: SAMPLE_TEXTS["welcome"], outputs=text_input
                    )
                    gr.Button("Horror 👻", size="sm").click(
                        lambda: SAMPLE_TEXTS["horror"], outputs=text_input
                    )
                    gr.Button("News 📰", size="sm").click(
                        lambda: SAMPLE_TEXTS["news"], outputs=text_input
                    )
                with gr.Row():
                    gr.Button("Story 📖", size="sm").click(
                        lambda: SAMPLE_TEXTS["story"], outputs=text_input
                    )
                    gr.Button("Technical 💻", size="sm").click(
                        lambda: SAMPLE_TEXTS["technical"], outputs=text_input
                    )
            # Voice selection
            voice_dropdown = gr.Dropdown(
                choices=create_voice_choices(),
                value="af_heart",
                label="🎭 Voice",
                info="Select a voice (sorted by quality grade)"
            )
            # Style preset
            style_dropdown = gr.Dropdown(
                choices=create_style_choices(),
                value="neutral",
                label="🎨 Style Preset",
                info="Choose a style for different content types"
            )
            # Style info display
            style_info = gr.Markdown(
                value=update_style_info("neutral"),
                elem_classes=["info-box"]
            )
            # Use style defaults checkbox
            use_style_defaults = gr.Checkbox(
                label="Use Style Preset Defaults",
                value=True,
                info="When checked, style preset values override manual controls"
            )
        # Right column - Advanced controls and output
        with gr.Column(scale=1):
            # Advanced controls (manual overrides; only used when the
            # "Use Style Preset Defaults" checkbox is unchecked)
            with gr.Accordion("⚙️ Advanced Controls", open=True):
                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.05,
                    label="🏃 Speed",
                    info="Speaking rate (0.5x = slow, 2.0x = fast)"
                )
                pitch_slider = gr.Slider(
                    minimum=-5.0,
                    maximum=5.0,
                    value=0.0,
                    step=0.5,
                    label="🎵 Pitch Shift (semitones)",
                    info="Adjust voice pitch (-5 = deeper, +5 = higher)"
                )
                pause_slider = gr.Slider(
                    minimum=0,
                    maximum=1000,
                    value=300,
                    step=50,
                    label="⏸️ Pause Between Sentences (ms)",
                    info="Silence duration between sentences"
                )
            # Generate button
            generate_btn = gr.Button(
                "🎙️ Generate Speech",
                variant="primary",
                size="lg"
            )
            # Audio output
            audio_output = gr.Audio(
                label="🔊 Generated Audio",
                type="numpy",
                interactive=False,
                autoplay=True
            )
            # Download info
            gr.Markdown(
                """
                💡 **Tips:**
                - Click the download button (⬇️) on the audio player to save
                - Try different voices with the same text to compare
                - Use style presets as starting points, then customize
                """
            )
    # Footer
    gr.Markdown(
        """
        ---
        **Created by Yash Chowdhary** | Powered by [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) (Apache 2.0)
        **Resources:** [Model Card](https://huggingface.co/hexgrad/Kokoro-82M) |
        [Voice List](https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md) |
        [GitHub](https://github.com/hexgrad/kokoro)
        """
    )
    # Event handlers
    # Keep the style info panel in sync with the selected preset.
    style_dropdown.change(
        fn=update_style_info,
        inputs=[style_dropdown],
        outputs=[style_info]
    )
    # Push the preset's speed/pitch/pause into the sliders when the preset
    # changes or when the "use defaults" checkbox is toggled.
    style_dropdown.change(
        fn=update_controls_from_style,
        inputs=[style_dropdown, use_style_defaults],
        outputs=[speed_slider, pitch_slider, pause_slider]
    )
    use_style_defaults.change(
        fn=update_controls_from_style,
        inputs=[style_dropdown, use_style_defaults],
        outputs=[speed_slider, pitch_slider, pause_slider]
    )
    generate_btn.click(
        fn=generate_speech,
        inputs=[
            text_input,
            voice_dropdown,
            style_dropdown,
            speed_slider,
            pitch_slider,
            pause_slider,
            use_style_defaults,
        ],
        outputs=[audio_output]
    )
# Launch configuration
if __name__ == "__main__":
    # queue() enables request queuing for long-running generations;
    # binding 0.0.0.0:7860 is the standard setup for Hugging Face Spaces /
    # containerized deployments.
    demo.queue().launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_api=True
    )