# E2-F5-TTS / app.py
# Chouio's picture
# Update app.py
# b744140 verified
import spaces
import gradio as gr
from f5_tts.infer.utils_infer import remove_silence_for_generated_wav
from f5_tts.api import F5TTS
import tempfile
import os
import requests
import gdown
import zipfile
from pathlib import Path
# Initialize F5TTS
# A single module-level instance is created at import time and reused by
# run_tts for every request.
# NOTE(review): presumably this loads the model weights, which would make
# import slow -- confirm against the f5_tts.api documentation.
f5tts = F5TTS()
@spaces.GPU
def run_tts(ref_audio, ref_text, gen_text, remove_silence=False):
    """Synthesize ``gen_text`` in the voice of ``ref_audio`` and return a WAV path.

    Args:
        ref_audio: Filesystem path to the reference audio clip.
        ref_text: Transcript of the reference clip, passed through to
            ``f5tts.infer`` unchanged.
        gen_text: Text to synthesize.
        remove_silence: Forwarded to ``f5tts.infer``.

    Returns:
        Path of the temporary ``.wav`` file that ``f5tts.infer`` wrote.

    Raises:
        gr.Error: If no reference audio or no generation text was supplied.
    """
    if not ref_audio:
        raise gr.Error("Please provide a reference audio clip.")
    if not gen_text or not gen_text.strip():
        raise gr.Error("Please enter the text to generate.")

    # mkstemp creates the file atomically; the deprecated tempfile.mktemp only
    # returns a name, which another process could claim before it is used.
    fd, output_wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # infer() reopens the file by path, so drop our handle

    try:
        f5tts.infer(
            ref_file=ref_audio,
            ref_text=ref_text,
            gen_text=gen_text,
            file_wave=output_wav_path,
            remove_silence=remove_silence,
        )
    except Exception:
        # Do not leave an empty temp file behind when synthesis fails.
        try:
            os.remove(output_wav_path)
        except OSError:
            pass  # best-effort cleanup; the original error matters more
        raise
    return output_wav_path
def download_voice(voice_url, voice_name, progress=gr.Progress()):
    """Download a ZIP-packaged voice and unpack it into ``downloaded_voices/<voice_name>``.

    Args:
        voice_url: HTTP(S) link hosted on huggingface.co or drive.google.com.
        voice_name: Name of the folder to create for the voice. It must be a
            plain name without path separators.
        progress: Gradio progress tracker used to report download state.

    Returns:
        A human-readable status message; failures are reported in the
        message rather than raised, because the result is shown in the UI.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlparse

    voice_url = (voice_url or "").strip()
    voice_name = (voice_name or "").strip()
    if not voice_url or not voice_name:
        return "Please provide both URL and voice name."

    # voice_name is user input that becomes a directory name: reject anything
    # that could make the path escape the downloaded_voices folder.
    if voice_name in (".", "..") or any(ch in voice_name for ch in ("/", "\\", "\0")):
        return "❌ Invalid voice name. Use a simple folder name without path separators."

    # Match on the parsed hostname, not a substring of the whole URL, so that
    # e.g. "http://example.com/?x=huggingface.co" is not accepted.
    parsed = urlparse(voice_url)
    host = (parsed.hostname or "").lower()
    is_http = parsed.scheme in ("http", "https")
    is_huggingface = is_http and (host == "huggingface.co" or host.endswith(".huggingface.co"))
    is_google_drive = is_http and (host == "drive.google.com" or host.endswith(".drive.google.com"))
    if not (is_huggingface or is_google_drive):
        return "Unsupported URL. Only Hugging Face and Google Drive links are supported."

    base_path = "downloaded_voices"
    voice_dir = os.path.join(base_path, voice_name)
    zip_path = os.path.join(voice_dir, f"{voice_name}.zip")

    try:
        # Creates base_path as well when it does not exist yet.
        os.makedirs(voice_dir, exist_ok=True)

        if is_huggingface:
            progress(0, desc="Downloading from Hugging Face...")
            # The timeout prevents a stalled server from hanging the request;
            # the context manager releases the connection in every case.
            with requests.get(voice_url, stream=True, timeout=30) as response:
                response.raise_for_status()
                total_size = int(response.headers.get("content-length", 0))
                downloaded = 0
                with open(zip_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if not chunk:
                            continue
                        f.write(chunk)
                        downloaded += len(chunk)
                        if total_size > 0:
                            progress(downloaded / total_size, desc=f"Downloading: {downloaded//1024}KB/{total_size//1024}KB")
        else:
            progress(0, desc="Downloading from Google Drive...")
            gdown.download(url=voice_url, output=zip_path, quiet=False, fuzzy=True)
            # Check for the file itself so a failed download is reported
            # clearly instead of surfacing later as a missing-archive error.
            if not os.path.isfile(zip_path):
                raise RuntimeError("Google Drive download failed (check the link and its sharing permissions).")

        # Extract ZIP file
        progress(0.8, desc="Extracting files...")
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(voice_dir)

        # Remove ZIP file after extraction
        if os.path.exists(zip_path):
            os.remove(zip_path)

        # Sorted so the status message is stable between runs.
        files = sorted(os.listdir(voice_dir))
        if not files:
            return "Voice directory is empty after extraction. Download may have failed."

        file_list = "\n".join(f" - {file}" for file in files)
        return f"✅ Voice '{voice_name}' successfully downloaded!\n📁 Location: {voice_dir}\n📋 Files:\n{file_list}"
    except Exception as e:
        # UI boundary: turn any failure into a status message. Only the partial
        # archive is removed; the directory may hold files from an earlier run.
        try:
            if os.path.exists(zip_path):
                os.remove(zip_path)
        except OSError:
            pass  # best-effort cleanup; report the original error below
        return f"❌ Error downloading voice: {str(e)}"
def list_available_voices():
    """Return a Markdown summary of every voice folder under ``downloaded_voices``.

    Each sub-directory yields one entry with its name, path and the files it
    contains. A placeholder message is returned when the folder is missing or
    holds no sub-directories.
    """
    voices_root = "downloaded_voices"
    if not os.path.exists(voices_root):
        return "No voices downloaded yet."

    # Pair every entry with its full path; non-directories are filtered below.
    candidates = [(name, os.path.join(voices_root, name)) for name in os.listdir(voices_root)]
    sections = [
        f"🎤 **{name}**\n📍 Path: {path}\n📋 Files: {', '.join(os.listdir(path))}\n"
        for name, path in candidates
        if os.path.isdir(path)
    ]

    if sections:
        return "\n".join(sections)
    return "No voices found in the downloaded_voices directory."
def load_voice_audio(voice_name, audio_file):
    """Resolve an audio file inside a downloaded voice folder.

    Args:
        voice_name: Name of a folder under ``downloaded_voices``.
        audio_file: File name inside that folder.

    Returns:
        A ``(audio_path, message)`` tuple. ``audio_path`` is ``None`` whenever
        the selection is missing, invalid or does not exist; ``message``
        describes the outcome.
    """
    # Dropdowns deliver None/"" when nothing is selected; os.path.join would
    # raise TypeError on None, so answer with a message instead.
    if not voice_name:
        return None, "Please select a voice first."
    if not audio_file:
        return None, "Please select an audio file first."

    base_path = "downloaded_voices"
    voice_path = os.path.join(base_path, voice_name)

    # Compare resolved paths so values such as ".." cannot point outside the
    # downloaded_voices folder.
    base_real = os.path.realpath(base_path)
    voice_real = os.path.realpath(voice_path)
    if not voice_real.startswith(base_real + os.sep) or not os.path.isdir(voice_path):
        return None, f"Voice '{voice_name}' not found."

    audio_path = os.path.join(voice_path, audio_file)
    audio_real = os.path.realpath(audio_path)
    # isfile (not exists) so that a sub-directory is never returned as audio.
    if not audio_real.startswith(voice_real + os.sep) or not os.path.isfile(audio_path):
        return None, f"Audio file '{audio_file}' not found in voice '{voice_name}' directory."

    return audio_path, f"✅ Loaded audio: {audio_file} from voice '{voice_name}'"
# Create Gradio interface with tabs.
# Three tabs: direct generation from an uploaded reference clip, downloading
# voice archives, and generation from a previously downloaded voice.
with gr.Blocks(title="🗣️ F5-TTS Demo with Voice Download") as demo:
    gr.Markdown("# 🗣️ F5-TTS Demo with Voice Management")
    gr.Markdown("Upload a reference voice, give reference and generation text, and hear it in the same voice! Plus, download pre-made voices from Hugging Face or Google Drive.")

    with gr.Tabs():
        with gr.TabItem("🔊 Generate Speech"):
            with gr.Row():
                with gr.Column():
                    ref_audio = gr.Audio(label="Reference Audio", type="filepath")
                    ref_text = gr.Textbox(
                        label="Reference Text",
                        placeholder="some call me nature, others call me mother nature.",
                        lines=3
                    )
                    gen_text = gr.Textbox(
                        label="Generation Text",
                        placeholder="I don't really care what you call me...",
                        lines=5
                    )
                    remove_silence = gr.Checkbox(label="Remove Silence from Output?", value=False)
                    generate_btn = gr.Button("Generate Speech", variant="primary")
                with gr.Column():
                    output_audio = gr.Audio(label="Generated Speech")
                    # NOTE: no event writes to this component yet; run_tts
                    # only returns the path of the generated audio.
                    spectrogram = gr.Image(label="Spectrogram (if available)")

            generate_btn.click(
                fn=run_tts,
                inputs=[ref_audio, ref_text, gen_text, remove_silence],
                outputs=[output_audio]
            )

        with gr.TabItem("📥 Download Voices"):
            gr.Markdown("## 📥 Download Pre-made Voices")
            gr.Markdown("Download voices from Hugging Face or Google Drive. The voice should be in ZIP format containing audio files and metadata.")
            with gr.Row():
                with gr.Column():
                    voice_url = gr.Textbox(
                        label="Voice URL (Hugging Face or Google Drive)",
                        placeholder="https://huggingface.co/Chouio/Adam/resolve/main/AdamDefinitive.zip",
                        lines=2
                    )
                    voice_name = gr.Textbox(
                        label="Voice Name (for folder)",
                        placeholder="my_voice"
                    )
                    download_btn = gr.Button("Download Voice", variant="primary")
                    download_status = gr.Textbox(label="Status", interactive=False)
                with gr.Column():
                    gr.Markdown("### 📋 Available Voices")
                    refresh_btn = gr.Button("Refresh List")
                    voices_list = gr.Markdown(label="Available Voices", value="No voices downloaded yet.")

            download_btn.click(
                fn=download_voice,
                inputs=[voice_url, voice_name],
                outputs=[download_status]
            )
            refresh_btn.click(
                fn=list_available_voices,
                outputs=[voices_list]
            )

        with gr.TabItem("🎭 Use Downloaded Voice"):
            gr.Markdown("## 🎭 Use Downloaded Voice for TTS")
            gr.Markdown("Select a downloaded voice and use its audio files for reference.")
            with gr.Row():
                with gr.Column():
                    # Voice selector
                    available_voices = gr.Dropdown(label="Select Voice", choices=[])
                    refresh_voices_btn = gr.Button("Refresh Voices")
                    # Audio file selector
                    voice_audio_files = gr.Dropdown(label="Select Audio File", choices=[])
                    load_audio_btn = gr.Button("Load Selected Audio")
                    # Shows the outcome of "Load Selected Audio". Kept apart
                    # from the reference text so the status message is never
                    # used as the transcript of the reference clip.
                    load_status = gr.Textbox(label="Load Status", interactive=False)
                    # Reference text (auto-filled or manual)
                    ref_text_downloaded = gr.Textbox(
                        label="Reference Text",
                        placeholder="Reference text will be auto-filled or you can enter manually",
                        lines=3
                    )
                    # Generation text
                    gen_text_downloaded = gr.Textbox(
                        label="Generation Text",
                        placeholder="Enter text to generate in this voice...",
                        lines=5
                    )
                    remove_silence_downloaded = gr.Checkbox(label="Remove Silence from Output?", value=False)
                    generate_from_voice_btn = gr.Button("Generate with This Voice", variant="primary")
                with gr.Column():
                    # type="filepath" so run_tts receives a path, as it does
                    # from ref_audio in the first tab, rather than raw samples.
                    loaded_audio = gr.Audio(label="Loaded Reference Audio", type="filepath")
                    output_audio_downloaded = gr.Audio(label="Generated Speech")

            # Refresh available voices
            def refresh_voice_list():
                """Return a Dropdown update listing every downloaded voice folder."""
                base_path = "downloaded_voices"
                voices = []
                if os.path.exists(base_path):
                    voices = sorted(
                        item
                        for item in os.listdir(base_path)
                        if os.path.isdir(os.path.join(base_path, item))
                    )
                # A bare list would be taken as the Dropdown's *value*;
                # gr.update replaces its choices instead.
                return gr.update(choices=voices, value=None)

            refresh_voices_btn.click(
                fn=refresh_voice_list,
                outputs=[available_voices]
            )

            # Update audio files when voice is selected
            def update_audio_files(voice_name):
                """Return a Dropdown update listing the audio files of ``voice_name``."""
                audio_files = []
                if voice_name:
                    voice_path = os.path.join("downloaded_voices", voice_name)
                    if os.path.isdir(voice_path):
                        audio_files = sorted(
                            file
                            for file in os.listdir(voice_path)
                            if file.lower().endswith(('.wav', '.mp3', '.flac', '.ogg'))
                        )
                # Replace the choices and clear any stale selection.
                return gr.update(choices=audio_files, value=None)

            available_voices.change(
                fn=update_audio_files,
                inputs=[available_voices],
                outputs=[voice_audio_files]
            )

            # Load selected audio; the message goes to the status box only.
            load_audio_btn.click(
                fn=load_voice_audio,
                inputs=[available_voices, voice_audio_files],
                outputs=[loaded_audio, load_status]
            )

            # Generate speech using downloaded voice
            generate_from_voice_btn.click(
                fn=run_tts,
                inputs=[loaded_audio, ref_text_downloaded, gen_text_downloaded, remove_silence_downloaded],
                outputs=[output_audio_downloaded]
            )
# Start the Gradio server only when this file is run as a script, so that
# importing the module does not launch the app.
if __name__ == "__main__":
    demo.launch()