# Voice-clone / app.py
# Archimedis's picture
# Update app.py
# 8720acc verified
import streamlit as st
import soundfile as sf
import librosa
import numpy as np
import tempfile
import os
import torch
# --- Page Configuration ---
# The microphone emoji is spelled with escape sequences (U+1F399 + U+FE0F
# variation selector) so it cannot be corrupted by a re-encoding of this file;
# the previous literal had been mangled into mojibake.
MIC_EMOJI = "\U0001F399\uFE0F"

# set_page_config is kept as the first Streamlit call in the script.
st.set_page_config(
    page_title="VoiceClone Pro",
    page_icon=MIC_EMOJI,
    layout="centered",
)

# --- Header Section ---
st.title(f"{MIC_EMOJI} VoiceClone Pro")

# Inline CSS: full-width rounded buttons and rounded text inputs.
# unsafe_allow_html is needed for the <style> tag to be emitted as HTML.
st.markdown("""
<style>
.stButton>button { width: 100%; border-radius: 20px; }
.stTextInput>div>div>input { border-radius: 10px; }
</style>
""", unsafe_allow_html=True)

st.caption("Enterprise-Grade Zero-Shot Voice Cloning. No Training Required.")
# --- Model Loading ---
@st.cache_resource
def load_engine():
    """Create the F5-TTS engine, reporting failures through the return value.

    The import is done lazily inside the function so that a missing
    ``f5_tts`` package is reported to the caller instead of crashing the
    script at import time.

    Returns:
        The ``F5TTS`` instance on success, ``None`` if an ``ImportError`` was
        raised while importing or constructing it, or the exception message
        (``str``) for any other failure. The value is wrapped by
        ``st.cache_resource``, so whatever is returned here is what later
        calls receive.
    """
    try:
        from f5_tts.api import F5TTS

        return F5TTS()
    except ImportError:
        outcome = None
    except Exception as exc:
        # Callers distinguish this case by checking for a str result.
        outcome = str(exc)
    return outcome
with st.spinner("Initializing AI Engine... (This may take 1-2 mins on first boot)"):
    engine = load_engine()

# --- Error Handling ---
# load_engine signals failure through its return value: None means an
# ImportError occurred, a str carries the message of any other exception.
startup_error = None
if engine is None:
    startup_error = "Critical Error: F5-TTS library not found. Please check requirements.txt."
elif isinstance(engine, str):
    startup_error = f"Model Load Error: {engine}"

if startup_error is not None:
    st.error(startup_error)
    st.stop()
else:
    st.success("System Online")
# --- Audio Pre-processing (The Fix) ---
def preprocess_audio(input_path):
    """Convert an audio file to a trimmed, mono WAV in a new temp file.

    The audio is loaded as mono at its original sample rate (``sr=None``; no
    resampling is done here), leading and trailing silence below the
    ``top_db=20`` threshold is trimmed, and the result is written to a fresh
    ``.wav`` temporary file.

    Args:
        input_path: Path of the audio file to read.

    Returns:
        Path of the newly created WAV file. The caller owns this file and is
        responsible for deleting it.

    Raises:
        Whatever ``librosa`` or ``soundfile`` raise on unreadable input or a
        failed write. If the write fails, the temporary file is removed
        before the exception propagates.
    """
    # 1. Load as mono; sr=None keeps the file's native sample rate.
    samples, sample_rate = librosa.load(input_path, sr=None, mono=True)

    # 2. Trim silence at the start/end of the clip.
    trimmed, _ = librosa.effects.trim(samples, top_db=20)

    # 3. Reserve a temp path, closing our handle before soundfile writes to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        clean_path = tmp.name

    try:
        sf.write(clean_path, trimmed, sample_rate)
    except Exception:
        # delete=False means nothing else will remove the file; avoid leaking
        # it when the write fails.
        try:
            os.unlink(clean_path)
        except OSError:
            pass
        raise
    return clean_path
st.divider()

# --- User Interface ---
# Labels shown next to each step count on the quality slider.
STEP_LABELS = {8: "Fastest", 16: "Balanced", 32: "Standard", 64: "High Def"}

# Gear emoji (U+2699 + U+FE0F) written as escapes; the previous literal had
# been corrupted into mojibake.
GEAR_EMOJI = "\u2699\uFE0F"

col1, col2 = st.columns([1, 2])

with col1:
    st.subheader("1. Reference")
    st.info("Upload a 10-15s clear audio clip.")
    ref_audio = st.file_uploader("Drop Audio Here", type=["wav", "mp3", "aac", "m4a"])

    st.divider()
    st.subheader(f"{GEAR_EMOJI} Settings")

    # Quality vs Speed slider; the chosen value is passed to the engine as
    # its nfe_step argument.
    quality_steps = st.select_slider(
        "Quality vs. Speed",
        options=[8, 16, 32, 64],
        value=32,  # Default to 32 for stability, use 16 for speed
        format_func=lambda x: f"{x} Steps ({STEP_LABELS.get(x, 'High Def')})",
    )

    # Arguments are: min 0.5, max 2.0, default 1.0, step 0.1.
    speaking_rate = st.slider("Speaking Pace", 0.5, 2.0, 1.0, 0.1)

with col2:
    st.subheader("2. Script")
    text_input = st.text_area(
        "Enter text to speak:",
        height=150,
        placeholder="Hello! I am speaking with the exact clone of your voice...",
    )
# --- Generation Logic ---
if st.button("Generate Clone", type="primary"):
    if not ref_audio:
        st.warning("Please upload a reference audio file first.")
    elif not text_input or not text_input.strip():
        # Whitespace-only input is treated the same as no input.
        st.warning("Please enter text to generate.")
    else:
        # Every temp file created for this request is recorded here so the
        # finally-clause can remove it on both the success and failure paths.
        temp_paths = []
        try:
            with st.status("Processing...", expanded=True) as status:
                # 1. Handle File Upload: keep the original extension
                #    (falling back to .wav) for the raw temp copy.
                file_ext = os.path.splitext(ref_audio.name)[1] or ".wav"
                with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as raw_tmp:
                    raw_tmp_path = raw_tmp.name
                    temp_paths.append(raw_tmp_path)
                    raw_tmp.write(ref_audio.getbuffer())

                # 2. Preprocess (The Fix for Tensor Error)
                status.write("Normalizing audio (Stereo -> Mono)...")
                clean_ref_path = preprocess_audio(raw_tmp_path)
                temp_paths.append(clean_ref_path)

                # 3. Run Inference
                status.write(f"Synthesizing ({quality_steps} steps)...")
                # Unpack 3 values (Audio, SampleRate, Spectrogram)
                wav, sr, _ = engine.infer(
                    ref_file=clean_ref_path,
                    ref_text="",
                    gen_text=text_input,
                    nfe_step=quality_steps,
                    speed=speaking_rate,
                )

                # 4. Save Output to a per-request temp file rather than a
                #    fixed name in the working directory, which concurrent
                #    sessions would overwrite, then load it into memory so
                #    the file can be deleted right away.
                status.write("Finalizing audio stream...")
                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as out_tmp:
                    output_path = out_tmp.name
                temp_paths.append(output_path)
                sf.write(output_path, wav, sr)
                with open(output_path, "rb") as out_file:
                    audio_bytes = out_file.read()

                status.update(label="Cloning Complete!", state="complete", expanded=False)

            # --- Result Display ---
            st.divider()
            st.subheader("Result")
            st.audio(audio_bytes, format="audio/wav")
            st.download_button(
                label="Download Audio",
                data=audio_bytes,
                file_name="cloned_voice.wav",
                mime="audio/wav",
            )
        except Exception as e:
            # Top-level UI boundary: report the failure instead of crashing.
            st.error(f"Generation Failed: {str(e)}")
            st.caption("Tip: Try a different audio file (shorter, clearer) if this persists.")
        finally:
            # Cleanup Temp Files (best effort; a missing file is not an error).
            for path in temp_paths:
                try:
                    os.unlink(path)
                except OSError:
                    pass