| import streamlit as st |
| import torch |
| import numpy as np |
| import tempfile |
| from transformers import VitsModel, AutoTokenizer |
| from scipy.io.wavfile import write |
| import librosa |
| from scipy.signal import butter, lfilter |
|
|
| |
| |
| |
|
|
# Voice styles offered in the preset selector, in display order.
# The first entry maps to None, which the UI treats as "use the manual
# sliders"; every other entry fixes the pitch shift (semitones), the speed
# factor and an optional named effect ("bass", "robotic" or None).
VOICE_PRESETS = {
    "Custom (Manual)": None,
    "Neutral": dict(pitch=0, speed=1.0, effect=None),
    "Deep": dict(pitch=-4, speed=0.9, effect="bass"),
    "Child-like": dict(pitch=5, speed=1.15, effect=None),
    "Robotic": dict(pitch=0, speed=1.0, effect="robotic"),
}
|
|
|
|
| |
| |
| |
|
|
def apply_pitch_speed(audio, sr, pitch=0, speed=1.0):
    """Time-stretch and then pitch-shift a waveform with librosa.

    Args:
        audio: Waveform samples to process.
        sr: Sample rate in Hz; only needed for the pitch shift.
        pitch: Pitch shift passed to librosa as ``n_steps``; 0 skips the
            pitch-shift stage.
        speed: Stretch factor passed to librosa as ``rate``; 1.0 skips the
            time-stretch stage.

    Returns:
        The processed waveform. When both settings are at their neutral
        values the input object is returned untouched.
    """
    processed = audio

    # Stretching runs first, so the pitch shift sees the already
    # re-timed signal.
    wants_stretch = speed != 1.0
    if wants_stretch:
        processed = librosa.effects.time_stretch(processed, rate=speed)

    wants_shift = pitch != 0
    if wants_shift:
        processed = librosa.effects.pitch_shift(processed, sr=sr, n_steps=pitch)

    return processed
|
|
|
|
def bass_boost(audio, sr, gain=1.5, cutoff=200):
    """Emphasise low frequencies by mixing a low-passed copy back in.

    Args:
        audio: Waveform samples.
        sr: Sample rate in Hz.
        gain: Multiplier applied to the low-passed copy before mixing.
        cutoff: Low-pass cutoff frequency in Hz.

    Returns:
        ``audio`` plus ``gain`` times its second-order Butterworth
        low-passed version.
    """
    nyquist = sr / 2
    # butter() expects the cutoff as a fraction of the Nyquist frequency.
    numerator, denominator = butter(2, cutoff / nyquist, btype="low")
    low_band = lfilter(numerator, denominator, audio)
    return audio + gain * low_band
|
|
|
|
def robotic_effect(audio, sr, freq=30):
    """Multiply the waveform by a sine wave (ring modulation).

    Args:
        audio: Waveform samples.
        sr: Sample rate in Hz, used to convert sample indices to seconds.
        freq: Frequency of the modulating sine in Hz.

    Returns:
        The sample-wise product of ``audio`` and the sine modulator.
    """
    # Timestamp (in seconds) of every sample.
    sample_times = np.arange(len(audio)) / sr
    carrier = np.sin(2 * np.pi * freq * sample_times)
    return audio * carrier
|
|
|
|
| |
| |
| |
# Browser-tab title and a centered page layout.
st.set_page_config(
    page_title="MMS-TTS English",
    layout="centered",
)

# NOTE: the emoji at the start of the title had been stored as mojibake (it
# rendered as a Thai digit). The original glyph could not be recovered, so a
# speaker icon stands in for it.
st.title("🔊 MMS-TTS English (Speed & Pitch Control)")

# Short description under the title. The text is kept flush-left on purpose:
# leading indentation inside the string would be rendered by Markdown as a
# code block.
st.markdown(
    """
Generate English speech using **facebook/mms-tts-eng**
Post-process audio to control **speed** and **pitch**.
"""
)
|
|
| |
| |
| |
@st.cache_resource
def load_model():
    """Load the MMS-TTS English checkpoint and its tokenizer.

    Decorated with ``st.cache_resource`` so Streamlit reuses the loaded
    objects across script reruns instead of loading them again.

    Returns:
        A ``(model, tokenizer)`` tuple; the model has been switched to
        evaluation mode.
    """
    checkpoint = "facebook/mms-tts-eng"
    tts_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    tts_model = VitsModel.from_pretrained(checkpoint)
    tts_model.eval()
    return tts_model, tts_tokenizer


# Module-level handles used by the generation step further down.
model, tokenizer = load_model()
|
|
| |
| |
| |
# --- Voice controls ---------------------------------------------------------
st.caption(
    "Select a preset for fixed voice styles, or choose Custom (Manual) to control pitch and speed yourself."
)

preset_name = st.selectbox("Voice Preset", options=list(VOICE_PRESETS), index=0)
preset = VOICE_PRESETS[preset_name]
# The "Custom (Manual)" entry is stored as None in VOICE_PRESETS.
is_custom = preset is None

text = st.text_area(
    "Input Text",
    height=150,
    placeholder="Enter English text here...",
)

# The sliders are only interactive in custom mode; with a preset selected
# they are shown greyed out and their values are ignored.
sliders_locked = not is_custom
speed = st.slider(
    "Speech Speed",
    min_value=0.5,
    max_value=1.5,
    value=1.0,
    step=0.05,
    disabled=sliders_locked,
)
pitch = st.slider(
    "Pitch Shift (semitones)",
    min_value=-6,
    max_value=6,
    value=0,
    step=1,
    disabled=sliders_locked,
)

if is_custom:
    # Build an ad-hoc preset from the slider values; custom mode has no effect.
    preset = {"pitch": pitch, "speed": speed, "effect": None}
else:
    # Summarise the fixed settings of the chosen preset for the user.
    effect_label = preset["effect"] or "None"
    preset_summary = (
        f"Preset selected: **{preset_name}**\n\n"
        f"- Pitch: {preset['pitch']} semitones\n"
        f"- Speed: {preset['speed']}x\n"
        f"- Effect: {effect_label}"
    )
    st.info(preset_summary)
|
|
| |
| |
| |
|
|
def _encode_wav(audio, sr):
    """Peak-normalise a waveform and encode it as 16-bit PCM WAV bytes.

    Args:
        audio: Array-like of float samples.
        sr: Sample rate in Hz written into the WAV header.

    Returns:
        The complete WAV file content as ``bytes``.

    Silent or empty input is encoded unscaled: dividing by a zero peak would
    fill the signal with NaNs, and taking the maximum of an empty array
    raises.
    """
    import io  # function-scope import: the file does not import io at the top

    samples = np.asarray(audio)
    peak = np.max(np.abs(samples)) if samples.size else 0.0
    if peak > 0:
        samples = samples / peak
    pcm = np.int16(samples * 32767)

    # Encoding into memory avoids leaving a temporary .wav file on disk for
    # every click, as NamedTemporaryFile(delete=False) did.
    buffer = io.BytesIO()
    write(buffer, sr, pcm)
    return buffer.getvalue()


# Button labels: the emoji had been stored as mojibake. The download arrow
# was recoverable from the garbled text; the microphone is a best guess.
generate = st.button("🎙️ Generate Audio")

if generate:
    if not text.strip():
        st.warning("Please enter text.")
    else:
        with st.spinner("Generating speech..."):
            inputs = tokenizer(text, return_tensors="pt")

            # Inference only, so gradient tracking is switched off.
            with torch.no_grad():
                waveform = model(**inputs).waveform

            audio = waveform.squeeze().cpu().numpy()
            sr = model.config.sampling_rate

            # Speed and pitch come from the preset, or from the sliders in
            # custom mode.
            audio = apply_pitch_speed(
                audio,
                sr,
                pitch=preset["pitch"],
                speed=preset["speed"],
            )

            # Optional named effect attached to the preset.
            if preset["effect"] == "bass":
                audio = bass_boost(audio, sr)
            elif preset["effect"] == "robotic":
                audio = robotic_effect(audio, sr)

            wav_bytes = _encode_wav(audio, sr)

        st.success("Audio generated successfully!")
        st.audio(wav_bytes, format="audio/wav")
        st.download_button(
            "⬇️ Download WAV",
            data=wav_bytes,
            file_name="mms_tts_output.wav",
            mime="audio/wav",
        )
| |