# VoiceClone Pro: a Streamlit front-end for zero-shot voice cloning with F5-TTS.
#
# Flow: load the (cached) F5-TTS engine, let the user upload a reference clip
# and type a script, normalise the clip to mono, run inference, and offer the
# generated audio for playback and download.
import contextlib
import io
import os
import tempfile

import librosa
import numpy as np
import soundfile as sf
import streamlit as st
import torch

# Session-state key under which the last generated WAV (as bytes) is kept, so
# the result survives the script rerun triggered by the download button.
RESULT_KEY = "cloned_audio"

# Human-readable label for each selectable number of inference steps.
QUALITY_LABELS = {8: "Fastest", 16: "Balanced", 32: "Standard", 64: "High Def"}

# --- Page Configuration ---
st.set_page_config(
    page_title="VoiceClone Pro",
    page_icon="🎙️",
    layout="centered"
)

# --- Header Section ---
st.title("🎙️ VoiceClone Pro")
st.markdown(""" """, unsafe_allow_html=True)
st.caption("Enterprise-Grade Zero-Shot Voice Cloning. No Training Required.")


# --- Model Loading ---
@st.cache_resource
def load_engine():
    """Create the F5-TTS engine once and let Streamlit cache it.

    Returns:
        An initialised ``F5TTS`` instance.

    Raises:
        ImportError: if the ``f5_tts`` package (or one of its imports) is
            missing.
        Exception: whatever ``F5TTS()`` raises while initialising.

    Errors are raised rather than returned so that a failed load is not
    stored in the resource cache and the next rerun can try again.
    """
    from f5_tts.api import F5TTS

    return F5TTS()


engine = None
load_error = None
with st.spinner("Initializing AI Engine... (This may take 1-2 mins on first boot)"):
    try:
        engine = load_engine()
    except ImportError:
        load_error = "Critical Error: F5-TTS library not found. Please check requirements.txt."
    except Exception as exc:
        load_error = f"Model Load Error: {exc}"

# --- Error Handling ---
# st.stop() is called outside the try block above so its control-flow
# exception can never be swallowed by the generic handler.
if load_error is not None:
    st.error(load_error)
    st.stop()
else:
    st.success("System Online")


def _remove_quietly(path):
    """Delete *path* if it is set, ignoring files that are already gone."""
    if path:
        with contextlib.suppress(OSError):
            os.unlink(path)


# --- Audio Pre-processing (The Fix) ---
def preprocess_audio(input_path):
    """Convert a reference clip to a trimmed, mono WAV temp file.

    Forces audio to Mono and standardizes the file format to fix
    Tensor Mismatch errors.

    Args:
        input_path: path of the uploaded audio file.

    Returns:
        Path of a newly created ``.wav`` temp file. The caller owns the file
        and is responsible for deleting it.

    Raises:
        ValueError: if nothing is left of the clip after silence trimming.
    """
    # 1. Load with Librosa (Forces Mono mixing)
    # sr=None preserves the original sample rate; F5-TTS handles any final
    # resampling if needed.
    y, sr = librosa.load(input_path, sr=None, mono=True)

    # 2. Trim Silence (Removes dead air at start/end which confuses the model)
    y, _ = librosa.effects.trim(y, top_db=20)
    if y.size == 0:
        raise ValueError("Reference audio is empty after silence trimming.")

    # 3. Create a clean temp file. The handle is closed before soundfile
    # opens the path, because a still-open NamedTemporaryFile cannot be
    # reopened by name on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        clean_path = tmp.name
    try:
        sf.write(clean_path, y, sr)
    except Exception:
        # Do not leave a half-written temp file behind.
        _remove_quietly(clean_path)
        raise
    return clean_path


st.divider()

# --- User Interface ---
col1, col2 = st.columns([1, 2])

with col1:
    st.subheader("1. Reference")
    st.info("Upload a 10-15s clear audio clip.")
    ref_audio = st.file_uploader("Drop Audio Here", type=["wav", "mp3", "aac", "m4a"])

    st.divider()
    st.subheader("⚙️ Settings")

    # Quality vs Speed Slider
    quality_steps = st.select_slider(
        "Quality vs. Speed",
        options=[8, 16, 32, 64],
        value=32,  # Default to 32 for stability, use 16 for speed
        format_func=lambda x: f"{x} Steps ({QUALITY_LABELS.get(x, 'High Def')})"
    )

    speaking_rate = st.slider("Speaking Pace", 0.5, 2.0, 1.0, 0.1)

with col2:
    st.subheader("2. Script")
    text_input = st.text_area(
        "Enter text to speak:",
        height=150,
        placeholder="Hello! I am speaking with the exact clone of your voice..."
    )

# --- Generation Logic ---
if st.button("Generate Clone", type="primary"):
    if not ref_audio:
        st.warning("Please upload a reference audio file first.")
    elif not text_input or not text_input.strip():
        st.warning("Please enter text to generate.")
    else:
        # Drop any earlier result so a failed run never shows stale audio.
        st.session_state.pop(RESULT_KEY, None)
        raw_tmp_path = None
        clean_ref_path = None
        try:
            with st.status("Processing...", expanded=True) as status:
                # 1. Handle File Upload
                file_ext = os.path.splitext(ref_audio.name)[1] or ".wav"

                # Save raw upload
                with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as raw_tmp:
                    raw_tmp.write(ref_audio.getbuffer())
                    raw_tmp_path = raw_tmp.name

                # 2. Preprocess (The Fix for Tensor Error)
                status.write("Normalizing audio (Stereo -> Mono)...")
                clean_ref_path = preprocess_audio(raw_tmp_path)

                # 3. Run Inference
                status.write(f"Synthesizing ({quality_steps} steps)...")

                # Unpack 3 values (Audio, SampleRate, Spectrogram)
                wav, sr, _ = engine.infer(
                    ref_file=clean_ref_path,
                    ref_text="",
                    gen_text=text_input,
                    nfe_step=quality_steps,
                    speed=speaking_rate
                )

                # 4. Save Output
                # Encode to WAV in memory instead of a fixed file name in the
                # working directory, which concurrent sessions would overwrite.
                status.write("Finalizing audio stream...")
                wav_buffer = io.BytesIO()
                sf.write(wav_buffer, wav, sr, format="WAV")
                st.session_state[RESULT_KEY] = wav_buffer.getvalue()

                status.update(label="Cloning Complete!", state="complete", expanded=False)

        except Exception as e:
            st.error(f"Generation Failed: {str(e)}")
            st.caption("Tip: Try a different audio file (shorter, clearer) if this persists.")
        finally:
            # Cleanup Temp Files, on success and on failure alike.
            _remove_quietly(raw_tmp_path)
            _remove_quietly(clean_ref_path)

# --- Result Display ---
# Rendered from session state rather than inside the button branch: pressing
# the download button reruns the script with st.button() returning False,
# which would otherwise make the result disappear.
result_bytes = st.session_state.get(RESULT_KEY)
if result_bytes is not None:
    st.divider()
    st.subheader("Result")
    st.audio(result_bytes, format="audio/wav")

    st.download_button(
        label="Download Audio",
        data=result_bytes,
        file_name="cloned_voice.wav",
        mime="audio/wav"
    )