Spaces:

Archimedis
/

Voice-clone

Sleeping

App Files Files Community

Voice-clone / app.py

Archimedis

Update app.py

8720acc verified about 2 months ago

raw

history blame contribute delete

5.22 kB

	import streamlit as st
	import soundfile as sf
	import librosa
	import numpy as np
	import tempfile
	import os
	import torch

	# --- Page setup ---
	# Browser-tab title, icon and centered layout; this is the first Streamlit
	# call made by the script.
	st.set_page_config(page_title="VoiceClone Pro", page_icon="🎙️", layout="centered")

	# --- Header ---
	st.title("🎙️ VoiceClone Pro")

	# CSS overrides: full-width rounded buttons and rounded text inputs.
	# Injected as raw HTML, hence unsafe_allow_html below.
	_CUSTOM_CSS = """
	<style>
	.stButton>button { width: 100%; border-radius: 20px; }
	.stTextInput>div>div>input { border-radius: 10px; }
	</style>
	"""
	st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

	st.caption("Enterprise-Grade Zero-Shot Voice Cloning. No Training Required.")

	# --- Model Loading ---
	@st.cache_resource
	def _load_engine_cached():
	    """Construct the F5-TTS model; the result is cached by Streamlit.

	    Exceptions are deliberately allowed to propagate: st.cache_resource
	    stores return values only, so a failed attempt is not cached and the
	    next script rerun tries again.
	    """
	    # Imported lazily so a missing package is reported through load_engine()
	    # instead of crashing the script at import time.
	    from f5_tts.api import F5TTS

	    return F5TTS()


	def load_engine():
	    """Return the shared F5-TTS engine, or a value describing why it failed.

	    Previously this function itself was cached, which meant the error
	    indicators (None / message string) were cached too and a transient
	    failure persisted until the server restarted. Only a successful load
	    is cached now; the return contract is unchanged.

	    Returns:
	        The F5TTS instance on success, ``None`` if an ImportError was
	        raised while importing or building the model, or the exception
	        message as a ``str`` for any other failure.
	    """
	    try:
	        return _load_engine_cached()
	    except ImportError:
	        return None
	    except Exception as e:
	        # Callers distinguish this case with isinstance(engine, str).
	        return str(e)

	# Build the engine behind a spinner.
	with st.spinner("Initializing AI Engine... (This may take 1-2 mins on first boot)"):
	    engine = load_engine()

	# --- Startup checks ---
	# load_engine() signals failure through its return value: None when the
	# import failed, a str carrying the error message otherwise. st.stop() ends
	# the script run, so each failure branch is terminal and no else is needed.
	if engine is None:
	    st.error("Critical Error: F5-TTS library not found. Please check requirements.txt.")
	    st.stop()

	if isinstance(engine, str):
	    st.error(f"Model Load Error: {engine}")
	    st.stop()

	st.success("System Online")

	# --- Audio Pre-processing (The Fix) ---
	def preprocess_audio(input_path, top_db=20):
	    """Write a mono, silence-trimmed WAV copy of an audio file.

	    The audio is loaded at its original sample rate (no resampling is done
	    here), mixed down to mono, and has leading/trailing silence removed.

	    Args:
	        input_path: Path of the audio file to clean.
	        top_db: Threshold in dB below the peak that counts as silence when
	            trimming. Defaults to 20, the value previously hard-coded.

	    Returns:
	        Path of a new temporary ``.wav`` file. The caller owns the file and
	        is responsible for deleting it.

	    Raises:
	        ValueError: If no samples remain after loading and trimming.
	    """
	    # sr=None keeps the source sample rate; mono=True mixes channels down.
	    samples, sample_rate = librosa.load(input_path, sr=None, mono=True)

	    # Drop leading/trailing audio quieter than top_db below the peak.
	    trimmed, _ = librosa.effects.trim(samples, top_db=top_db)
	    if trimmed.size == 0:
	        raise ValueError("Reference audio contains no samples after trimming.")

	    # Reserve a unique path and close the handle before soundfile opens the
	    # file by name; reopening a still-open temp file fails on some platforms.
	    fd, clean_path = tempfile.mkstemp(suffix=".wav")
	    os.close(fd)
	    try:
	        sf.write(clean_path, trimmed, sample_rate)
	    except Exception:
	        # Do not leave a stray temp file behind when the write fails.
	        os.unlink(clean_path)
	        raise
	    return clean_path

	st.divider()

	# --- Input panels ---
	# Preset name shown next to each step count offered by the quality slider.
	_STEP_LABELS = {8: "Fastest", 16: "Balanced", 32: "Standard"}


	def _describe_steps(steps):
	    """Format a slider option as '<n> Steps (<preset name>)'."""
	    preset = _STEP_LABELS.get(steps, "High Def")
	    return f"{steps} Steps ({preset})"


	# Two columns in a 1:2 width ratio: reference + settings, then the script.
	col1, col2 = st.columns([1, 2])

	with col1:
	    st.subheader("1. Reference")
	    st.info("Upload a 10-15s clear audio clip.")
	    ref_audio = st.file_uploader(
	        "Drop Audio Here",
	        type=["wav", "mp3", "aac", "m4a"],
	    )

	    st.divider()
	    st.subheader("⚙️ Settings")

	    # Step count later passed to the engine as nfe_step; starts at 32.
	    quality_steps = st.select_slider(
	        "Quality vs. Speed",
	        options=[8, 16, 32, 64],
	        value=32,
	        format_func=_describe_steps,
	    )

	    # Pace from 0.5 to 2.0 in steps of 0.1, starting at 1.0.
	    speaking_rate = st.slider("Speaking Pace", 0.5, 2.0, 1.0, 0.1)

	with col2:
	    st.subheader("2. Script")
	    text_input = st.text_area(
	        "Enter text to speak:",
	        height=150,
	        placeholder="Hello! I am speaking with the exact clone of your voice...",
	    )

	# --- Generation Logic ---
	# NOTE(review): the result below is rendered only during the run in which
	# the button was clicked; a later rerun will not show it again. Consider
	# keeping the audio in st.session_state if it should persist.
	if st.button("Generate Clone", type="primary"):
	    if not ref_audio:
	        st.warning("Please upload a reference audio file first.")
	    elif not text_input.strip():
	        # Whitespace-only scripts are treated the same as an empty one.
	        st.warning("Please enter text to generate.")
	    else:
	        # Every temp file created below is recorded here so the finally
	        # clause can remove it whether generation succeeds or fails.
	        temp_paths = []
	        try:
	            with st.status("Processing...", expanded=True) as status:
	                # 1. Persist the upload so it can be read back by path.
	                file_ext = os.path.splitext(ref_audio.name)[1] or ".wav"
	                with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as raw_tmp:
	                    raw_tmp_path = raw_tmp.name
	                    temp_paths.append(raw_tmp_path)
	                    raw_tmp.write(ref_audio.getbuffer())

	                # 2. Mono mix-down and silence trim.
	                status.write("Normalizing audio (Stereo -> Mono)...")
	                clean_ref_path = preprocess_audio(raw_tmp_path)
	                temp_paths.append(clean_ref_path)

	                # 3. Run inference.
	                status.write(f"Synthesizing ({quality_steps} steps)...")

	                # infer() yields three values; the third is unused here.
	                wav, sr, _ = engine.infer(
	                    ref_file=clean_ref_path,
	                    ref_text="",
	                    gen_text=text_input,
	                    nfe_step=quality_steps,
	                    speed=speaking_rate,
	                )

	                # 4. Encode to WAV through a per-request temp file rather
	                # than a fixed name in the working directory, which
	                # concurrent sessions would overwrite for each other.
	                status.write("Finalizing audio stream...")
	                out_fd, output_path = tempfile.mkstemp(suffix=".wav")
	                os.close(out_fd)
	                temp_paths.append(output_path)
	                sf.write(output_path, wav, sr)
	                with open(output_path, "rb") as out_file:
	                    audio_bytes = out_file.read()

	                status.update(label="Cloning Complete!", state="complete", expanded=False)

	            # --- Result Display ---
	            # The bytes are held in memory, so the temp file can be removed.
	            st.divider()
	            st.subheader("Result")
	            st.audio(audio_bytes, format="audio/wav")
	            st.download_button(
	                label="Download Audio",
	                data=audio_bytes,
	                file_name="cloned_voice.wav",
	                mime="audio/wav",
	            )

	        except Exception as e:
	            st.error(f"Generation Failed: {str(e)}")
	            st.caption("Tip: Try a different audio file (shorter, clearer) if this persists.")
	        finally:
	            for path in temp_paths:
	                try:
	                    os.unlink(path)
	                except OSError:
	                    # Best-effort cleanup: an undeletable temp file must
	                    # not hide the result or the original error.
	                    pass