tts / app.py
sadimanna's picture
added files
1168147
import io
import tempfile

import librosa
import numpy as np
import streamlit as st
import torch
from scipy.io.wavfile import write
from scipy.signal import butter, lfilter
from transformers import VitsModel, AutoTokenizer
#============================================
# Voice Presets
#============================================
# Maps the label offered in the "Voice Preset" selectbox to the
# post-processing settings applied after synthesis:
#   "pitch"  - pitch shift in semitones
#   "speed"  - time-stretch rate (1.0 leaves the duration unchanged)
#   "effect" - None, "bass" or "robotic"
VOICE_PRESETS = {
    # None marks manual mode: the settings come from the sliders instead.
    "Custom (Manual)": None,
    "Neutral": {"pitch": 0, "speed": 1.0, "effect": None},
    "Deep": {"pitch": -4, "speed": 0.9, "effect": "bass"},
    "Child-like": {"pitch": 5, "speed": 1.15, "effect": None},
    "Robotic": {"pitch": 0, "speed": 1.0, "effect": "robotic"},
}
#============================================
# Audio Post-Processing Functions
#============================================
def apply_pitch_speed(audio, sr, pitch=0, speed=1.0):
    """Time-stretch and then pitch-shift a waveform with librosa.

    Args:
        audio: Waveform samples, passed to librosa unchanged.
        sr: Sampling rate in Hz (only used for the pitch shift).
        pitch: Number of semitone steps to shift by; 0 skips the shift.
        speed: Time-stretch rate; 1.0 skips the stretch.

    Returns:
        The processed waveform. With the default ``pitch`` and ``speed``
        the input object is returned as-is.
    """
    processed = audio

    # Stretch first so the pitch shift operates on the final-length signal.
    if speed != 1.0:
        processed = librosa.effects.time_stretch(processed, rate=speed)

    if pitch == 0:
        return processed
    return librosa.effects.pitch_shift(processed, sr=sr, n_steps=pitch)
def bass_boost(audio, sr, gain=1.5, cutoff=200):
    """Boost low frequencies by mixing a low-passed copy back into the signal.

    Args:
        audio: Waveform samples.
        sr: Sampling rate in Hz.
        gain: Multiplier applied to the low-passed copy before mixing.
        cutoff: Low-pass cutoff frequency in Hz.

    Returns:
        ``audio + gain * lowpass(audio)``; the result is not normalized.
    """
    nyquist = sr / 2
    # Second-order Butterworth low-pass; butter() takes the cutoff as a
    # fraction of the Nyquist frequency.
    numerator, denominator = butter(2, cutoff / nyquist, btype="low")
    low_band = lfilter(numerator, denominator, audio)
    return audio + gain * low_band
def robotic_effect(audio, sr, freq=30):
    """Modulate the waveform's amplitude with a sine wave.

    Args:
        audio: Waveform samples.
        sr: Sampling rate in Hz, used to turn sample indices into seconds.
        freq: Frequency of the modulating sine in Hz.

    Returns:
        The element-wise product of ``audio`` and the sine modulator.
    """
    # Timestamp (in seconds) of every sample.
    sample_times = np.arange(len(audio)) / sr
    carrier = np.sin(2 * np.pi * freq * sample_times)
    return audio * carrier
# ------------------------
# Page config
# ------------------------
# Configure the browser tab title and use the centered page layout.
st.set_page_config(
    page_title="MMS-TTS English",
    layout="centered",
)

# The title's emoji had been stored as mis-decoded bytes and rendered as
# garbage characters; it is restored to the intended speaker glyph.
st.title("🔊 MMS-TTS English (Speed & Pitch Control)")

# Short introduction rendered as Markdown below the title.
st.markdown(
    """
Generate English speech using **facebook/mms-tts-eng**
Post-process audio to control **speed** and **pitch**.
"""
)
# ------------------------
# Load model (cached)
# ------------------------
@st.cache_resource
def load_model():
    """Load the facebook/mms-tts-eng VITS model and its tokenizer.

    The function is wrapped in ``st.cache_resource`` so the loaded objects
    are cached instead of being re-created each time it is called.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is switched to eval mode.
    """
    checkpoint = "facebook/mms-tts-eng"
    tts_model = VitsModel.from_pretrained(checkpoint)
    tts_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    # Inference only: put the model in evaluation mode.
    tts_model.eval()
    return tts_model, tts_tokenizer
# Obtain the model and tokenizer used by the generation step.
model, tokenizer = load_model()

# ------------------------
# UI Controls
# ------------------------
st.caption(
    "Select a preset for fixed voice styles, or choose Custom (Manual) to control pitch and speed yourself."
)

preset_name = st.selectbox("Voice Preset", options=list(VOICE_PRESETS), index=0)
preset = VOICE_PRESETS[preset_name]

# Only the "Custom (Manual)" entry maps to None; every other preset is a dict.
is_custom = preset is None
# The sliders are only editable in manual mode.
sliders_locked = not is_custom

text = st.text_area(
    "Input Text",
    height=150,
    placeholder="Enter English text here...",
)

speed = st.slider(
    "Speech Speed",
    min_value=0.5,
    max_value=1.5,
    value=1.0,
    step=0.05,
    disabled=sliders_locked,
)

pitch = st.slider(
    "Pitch Shift (semitones)",
    min_value=-6,
    max_value=6,
    value=0,
    step=1,
    disabled=sliders_locked,
)

if is_custom:
    # Manual mode: build the settings from the slider values, with no effect.
    preset = {"pitch": pitch, "speed": speed, "effect": None}
else:
    # Fixed preset: show the user which settings will be applied.
    effect_label = preset["effect"] or "None"
    st.info(
        f"Preset selected: **{preset_name}**\n\n"
        f"- Pitch: {preset['pitch']} semitones\n"
        f"- Speed: {preset['speed']}x\n"
        f"- Effect: {effect_label}"
    )
# ------------------------
# Generate Button
# ------------------------
# The button label's emoji had been stored as mis-decoded bytes; restored.
generate = st.button("🎙️ Generate Audio")

# ------------------------
# Generation
# ------------------------
if generate:
    if not text.strip():
        st.warning("Please enter text.")
    else:
        wav_bytes = None

        with st.spinner("Generating speech..."):
            inputs = tokenizer(text, return_tensors="pt")
            with torch.no_grad():
                waveform = model(**inputs).waveform

            # atleast_1d keeps a one-sample result indexable after squeeze().
            audio = np.atleast_1d(waveform.squeeze().cpu().numpy())
            sr = model.config.sampling_rate

            if audio.size:
                # Apply pitch + speed
                audio = apply_pitch_speed(
                    audio,
                    sr,
                    pitch=preset["pitch"],
                    speed=preset["speed"],
                )

                # Apply effect
                effect = preset["effect"]
                if effect == "bass":
                    audio = bass_boost(audio, sr)
                elif effect == "robotic":
                    audio = robotic_effect(audio, sr)

                # Peak-normalize. A silent signal has a peak of 0; dividing
                # by it would fill the buffer with NaN, so it is left as-is.
                peak = float(np.max(np.abs(audio)))
                if peak > 0:
                    audio = audio / peak
                audio_int16 = np.int16(audio * 32767)

                # Encode the WAV in memory. This avoids leaving a temporary
                # file on disk after every click, and avoids reopening a
                # still-open temp file by name.
                wav_buffer = io.BytesIO()
                write(wav_buffer, sr, audio_int16)
                wav_bytes = wav_buffer.getvalue()

        if wav_bytes is None:
            # An empty waveform cannot be normalized or played back.
            st.error("No audio was produced for this text.")
        else:
            st.success("Audio generated successfully!")
            st.audio(wav_bytes, format="audio/wav")
            st.download_button(
                "⬇️ Download WAV",
                data=wav_bytes,
                file_name="mms_tts_output.wav",
                mime="audio/wav",
            )
# ------------------------