Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import soundfile as sf | |
| import librosa | |
| import numpy as np | |
| import tempfile | |
| import os | |
| import torch | |
# --- Page Configuration ---
# NOTE(review): the icon literal in this file was mojibake (it looks like the
# UTF-8 bytes of an emoji decoded with a Thai code page). It is restored here
# as U+1F399 U+FE0F (studio microphone) and written with escapes so it cannot
# be re-mangled by an encoding round-trip -- confirm this is the intended icon.
APP_ICON = "\U0001F399\uFE0F"

st.set_page_config(
    page_title="VoiceClone Pro",
    page_icon=APP_ICON,
    layout="centered",
)

# --- Header Section ---
st.title(f"{APP_ICON} VoiceClone Pro")

# Inline CSS: full-width pill-shaped buttons and rounded text inputs.
# The block is kept flush-left so Markdown does not treat it as a code block.
st.markdown("""
<style>
.stButton>button { width: 100%; border-radius: 20px; }
.stTextInput>div>div>input { border-radius: 10px; }
</style>
""", unsafe_allow_html=True)

st.caption("Enterprise-Grade Zero-Shot Voice Cloning. No Training Required.")
# --- Model Loading ---
@st.cache_resource(show_spinner=False)
def _build_engine():
    """Construct the F5-TTS model once per server process.

    Streamlit re-executes the whole script on every widget interaction, so
    without caching the model would be rebuilt on each rerun. Failures are
    raised (not returned) so that a failed attempt is never stored in the
    cache and the next rerun can retry.
    """
    # Imported lazily so a missing package surfaces as ImportError here
    # instead of breaking the import of the whole app.
    from f5_tts.api import F5TTS

    return F5TTS()


def load_engine():
    """Return the shared F5-TTS engine, or a failure indicator.

    Returns:
        The model instance on success; ``None`` when an ``ImportError`` is
        raised while loading (library not installed); otherwise a non-empty
        ``str`` describing the error.
    """
    try:
        return _build_engine()
    except ImportError:
        return None
    except Exception as e:
        # Fall back to the exception type so the caller never shows a blank
        # message for exceptions raised without text.
        return str(e) or type(e).__name__
# --- Engine Start-up ---
with st.spinner("Initializing AI Engine... (This may take 1-2 mins on first boot)"):
    engine = load_engine()

# --- Error Handling ---
# load_engine() signals failure through its return value: None when the
# library cannot be imported, a str carrying the error message otherwise.
# st.stop() ends the script run, so each guard below is terminal.
if engine is None:
    st.error("Critical Error: F5-TTS library not found. Please check requirements.txt.")
    st.stop()

if isinstance(engine, str):
    st.error(f"Model Load Error: {engine}")
    st.stop()

st.success("System Online")
# --- Audio Pre-processing (The Fix) ---
def preprocess_audio(input_path):
    """Convert a reference clip into a trimmed, mono WAV file.

    The clip is down-mixed to mono and leading/trailing silence is removed.
    The sample rate is deliberately left unchanged (``sr=None``); any
    resampling is left to the downstream model.

    Args:
        input_path: Path of the audio file to read.

    Returns:
        Path of a newly created temporary ``.wav`` file. The caller owns the
        file and is responsible for deleting it.

    Raises:
        ValueError: If nothing is left after trimming (empty or silent clip).
    """
    # 1. Load with librosa; mono=True mixes multi-channel audio down to one.
    samples, sample_rate = librosa.load(input_path, sr=None, mono=True)

    # 2. Trim silence at both ends (anything more than 20 dB below the peak).
    trimmed, _ = librosa.effects.trim(samples, top_db=20)
    if trimmed.size == 0:
        raise ValueError("Reference audio is empty or silent after trimming.")

    # 3. Reserve a temp path and close the handle *before* writing: reopening
    #    a NamedTemporaryFile by name while it is still open is not portable.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        clean_path = tmp.name

    try:
        sf.write(clean_path, trimmed, sample_rate)
    except Exception:
        # Do not leave an orphaned temp file behind when the write fails.
        os.unlink(clean_path)
        raise
    return clean_path
st.divider()

# --- User Interface ---
# Human-readable label for each supported step count (insertion order is the
# order shown on the slider).
STEP_LABELS = {8: "Fastest", 16: "Balanced", 32: "Standard", 64: "High Def"}

# NOTE(review): the heading icon was mojibake in the source; restored as
# U+2699 U+FE0F (gear) -- confirm this is the intended icon.
SETTINGS_ICON = "\u2699\uFE0F"

col1, col2 = st.columns([1, 2])

with col1:
    st.subheader("1. Reference")
    st.info("Upload a 10-15s clear audio clip.")
    ref_audio = st.file_uploader("Drop Audio Here", type=["wav", "mp3", "aac", "m4a"])

    st.divider()
    st.subheader(f"{SETTINGS_ICON} Settings")

    # Quality vs Speed slider; the chosen value is forwarded to the engine
    # as its step count.
    quality_steps = st.select_slider(
        "Quality vs. Speed",
        options=list(STEP_LABELS),
        value=32,  # Default to 32 for stability, use 16 for speed
        format_func=lambda steps: f"{steps} Steps ({STEP_LABELS[steps]})",
    )

    # Range 0.5-2.0, default 1.0, step 0.1.
    speaking_rate = st.slider("Speaking Pace", 0.5, 2.0, 1.0, 0.1)

with col2:
    st.subheader("2. Script")
    text_input = st.text_area(
        "Enter text to speak:",
        height=150,
        placeholder="Hello! I am speaking with the exact clone of your voice...",
    )
# --- Generation Logic ---
# Session-state key under which the most recent result (WAV bytes) is kept so
# it survives the rerun that any later widget interaction triggers.
_RESULT_KEY = "clone_wav_bytes"


def _discard_temp_file(path):
    """Best-effort removal of a temporary file; ``None`` is ignored."""
    if path is None:
        return
    try:
        os.unlink(path)
    except OSError:
        # Cleanup must never mask the outcome of the generation itself.
        pass


def _encode_wav(samples, sample_rate):
    """Return *samples* encoded as WAV bytes without touching the disk."""
    import io  # local import: only this helper needs it

    buffer = io.BytesIO()
    # A file-like target has no extension to infer the container from, so
    # the format is passed explicitly.
    sf.write(buffer, samples, sample_rate, format="WAV")
    return buffer.getvalue()


if st.button("Generate Clone", type="primary"):
    if not ref_audio:
        st.warning("Please upload a reference audio file first.")
    elif not text_input or not text_input.strip():
        # Whitespace-only input is treated the same as no input.
        st.warning("Please enter text to generate.")
    else:
        # Drop any earlier result so a failed run does not show stale audio.
        st.session_state.pop(_RESULT_KEY, None)
        raw_tmp_path = None
        clean_ref_path = None
        try:
            with st.status("Processing...", expanded=True) as status:
                # 1. Persist the upload to disk, keeping its extension so the
                #    decoder can recognise the container.
                file_ext = os.path.splitext(ref_audio.name)[1] or ".wav"
                with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as raw_tmp:
                    # Record the path first so it is cleaned up even if the
                    # write below fails.
                    raw_tmp_path = raw_tmp.name
                    raw_tmp.write(ref_audio.getbuffer())

                # 2. Preprocess (The Fix for Tensor Error)
                status.write("Normalizing audio (Stereo -> Mono)...")
                clean_ref_path = preprocess_audio(raw_tmp_path)

                # 3. Run inference; the engine returns
                #    (audio, sample rate, spectrogram).
                status.write(f"Synthesizing ({quality_steps} steps)...")
                wav, sr, _ = engine.infer(
                    ref_file=clean_ref_path,
                    ref_text="",
                    gen_text=text_input,
                    nfe_step=quality_steps,
                    speed=speaking_rate,
                )

                # 4. Encode in memory rather than to a fixed file name in the
                #    working directory, which every session would share.
                status.write("Finalizing audio stream...")
                st.session_state[_RESULT_KEY] = _encode_wav(wav, sr)

                status.update(label="Cloning Complete!", state="complete", expanded=False)
        except Exception as e:
            st.error(f"Generation Failed: {str(e)}")
            st.caption("Tip: Try a different audio file (shorter, clearer) if this persists.")
        finally:
            # Runs on success and on failure, so temp files never accumulate.
            _discard_temp_file(raw_tmp_path)
            _discard_temp_file(clean_ref_path)

# --- Result Display ---
# Rendered outside the button branch: clicking the download button reruns the
# script with the generate button reporting False, which would otherwise make
# the result disappear.
clone_wav_bytes = st.session_state.get(_RESULT_KEY)
if clone_wav_bytes is not None:
    st.divider()
    st.subheader("Result")
    st.audio(clone_wav_bytes, format="audio/wav")
    st.download_button(
        label="Download Audio",
        data=clone_wav_bytes,
        file_name="cloned_voice.wav",
        mime="audio/wav",
    )