# Index_tts_streamlit / src/streamlit_app.py
# Author: VictorKola
# Commit message: Update src/streamlit_app.py
# Commit: eff35ed (verified)
import io
import os
import tempfile

import soundfile as sf
import streamlit as st
import torch
from streamlit_audiorecorder import audiorecorder
from TTS.api import TTS
# Page chrome: browser-tab title and centered layout, followed by the
# headline and the numbered usage steps shown to the visitor.
st.set_page_config(page_title="Voice Clone TTS", layout="centered")

# Markdown body rendered under the title; kept flush-left so the rendered
# text is not treated as an indented code block.
USAGE_STEPS_MD = """
1. **Record** your voice or **upload** an existing audio file (WAV/MP3).
2. Enter the **text** you want spoken in _your_ voice.
3. (Optional) Paste an **API key** if required by your model/service.
4. Click **Generate** to hear the cloned speech.
"""

st.title("🎙️ Voice-Cloning Text-to-Speech")
st.markdown(USAGE_STEPS_MD)
# 1) AUDIO INPUT: record or upload
def _recording_to_bytes(recording):
    """Normalize whatever the recorder widget returned into raw audio bytes.

    Accepts ``bytes``/``bytearray`` as-is, objects exposing ``export()``
    (duck-typed pydub ``AudioSegment``; exported as WAV), and objects
    exposing ``tobytes()`` (duck-typed NumPy byte array).

    Returns:
        The audio payload as ``bytes``, or ``None`` when nothing was
        recorded or the value has an unrecognized type.
    """
    if isinstance(recording, (bytes, bytearray)):
        return bytes(recording) or None
    if hasattr(recording, "export"):
        # An empty segment means the user has not recorded anything yet.
        if len(recording) == 0:
            return None
        return recording.export(format="wav").read() or None
    if hasattr(recording, "tobytes"):
        return recording.tobytes() or None
    return None


st.header("1. Provide your voice sample")

# Always bound, so the validation below never has to probe locals().
audio_bytes = None

col1, col2 = st.columns(2)
with col1:
    st.write("**Record in-page**")
    # NOTE(review): the recorder component is believed to return a pydub
    # AudioSegment (older releases: a NumPy byte array) rather than bytes,
    # hence the normalization step — confirm against the installed version.
    audio_bytes = _recording_to_bytes(audiorecorder("Click to record", "Recording…"))
    if audio_bytes is not None:
        st.audio(audio_bytes, format="audio/wav")
with col2:
    st.write("**Or upload file**")
    upload = st.file_uploader("Upload WAV/MP3", type=["wav", "mp3"])
    if upload is not None:
        # An uploaded file takes precedence over a recording. getvalue()
        # returns the whole buffer regardless of the current stream position,
        # so it stays correct if the same object is seen again on a rerun.
        audio_bytes = upload.getvalue()
        st.audio(audio_bytes, format=upload.type)

# Halt the script run until a non-empty sample is available.
if not audio_bytes:
    st.warning("Please record or upload a valid audio sample before proceeding.")
    st.stop()
# 2) USER TEXT & (optional) KEY
st.header("2. Text & API key")

# Multi-line box holding the sentence(s) to synthesize, pre-filled with a
# sample phrase so the app can be tried immediately.
text_input = st.text_area(
    label="Enter text to speak in your voice",
    value="Hello, this is my cloned voice!",
    height=120,
)

# Masked single-line field for an optional credential.
# NOTE(review): nothing in the visible part of this script reads `api_key`
# afterwards — confirm whether it is still needed.
api_key = st.text_input(
    label="API Key (if your model needs one)",
    type="password",
)
# 3) LOAD & CACHE THE TTS PIPELINE
@st.cache_resource(show_spinner=False)
def load_tts_model():
    """Construct the TTS pipeline used for voice cloning.

    The function is wrapped in ``st.cache_resource`` (with the spinner
    disabled) so the model object is built once and reused.

    Returns:
        A ``TTS`` instance for the configured model, with the GPU flag set
        from ``torch.cuda.is_available()``.
    """
    # Coqui TTS takes its own GPU flag; enable it only when CUDA is present.
    use_gpu = torch.cuda.is_available()
    # Replace the model id below with your chosen multispeaker/cloning model.
    # NOTE(review): this id does not look like a Coqui "tts_models/..." name —
    # confirm that TTS can actually resolve it.
    return TTS(
        model_name="IndexTeam/IndexTTS-1.5",
        progress_bar=False,
        gpu=use_gpu,
    )


# Module-level handle used by the generation step.
tts = load_tts_model()
# 4) GENERATE
# On click: validate the text, decode the reference sample, synthesize speech
# with the sample as speaker reference, and play the result in the page.
if st.button("▶️ Generate Speech"):
    if not text_input.strip():
        st.error("Please enter some text to synthesize.")
        st.stop()

    with st.spinner("Cloning your voice…"):
        # Decode the reference sample exactly once (samples + sample rate).
        # soundfile reports unreadable/unsupported input via RuntimeError
        # (its LibsndfileError derives from it), so surface that to the user
        # instead of a raw traceback.
        try:
            ref_samples, ref_rate = sf.read(io.BytesIO(audio_bytes))
        except RuntimeError as exc:
            st.error(f"Could not decode the voice sample: {exc}")
            st.stop()

        # The TTS call takes the reference as a path, so write a WAV into a
        # scratch directory that is removed as soon as synthesis returns.
        # Writing to a fresh path (not an open NamedTemporaryFile) also works
        # on platforms that refuse to reopen a file that is still open.
        with tempfile.TemporaryDirectory() as workdir:
            ref_path = os.path.join(workdir, "reference.wav")
            sf.write(ref_path, ref_samples, samplerate=ref_rate)
            # do the TTS with your voice as reference
            wav = tts.tts(text=text_input, speaker_wav=ref_path)

        # Encode the result in memory; no output file is left on disk.
        out_buffer = io.BytesIO()
        sf.write(
            out_buffer,
            wav,
            samplerate=tts.synthesizer.output_sample_rate,
            format="WAV",  # required because a file-like object has no extension
        )

    st.success("✅ Done!")
    st.audio(out_buffer.getvalue(), format="audio/wav")