# E2-F5-TTS
#
# Runtime error
#
# File size: 11,755 Bytes

import os
import tempfile
import zipfile
from pathlib import Path
from urllib.parse import urlparse

# NOTE(review): `spaces` is kept ahead of the f5_tts imports, as in the original
# ordering — ZeroGPU reportedly expects it before CUDA-initializing packages; confirm.
import spaces
import gdown
import gradio as gr
import requests
from f5_tts.api import F5TTS
from f5_tts.infer.utils_infer import remove_silence_for_generated_wav

# Initialize F5TTS
# One module-level instance, created at import time and reused by every
# run_tts call below (presumably so the model is only set up once — confirm).
f5tts = F5TTS()

@spaces.GPU
def run_tts(ref_audio, ref_text, gen_text, remove_silence=False):
    """Synthesize ``gen_text`` using ``ref_audio`` as the voice reference.

    Args:
        ref_audio: Path to the reference audio file; forwarded to
            ``f5tts.infer`` as ``ref_file``.
        ref_text: Text accompanying the reference audio; forwarded unchanged.
        gen_text: Text to synthesize.
        remove_silence: Forwarded to ``f5tts.infer``.

    Returns:
        Path of the temporary ``.wav`` file that ``f5tts.infer`` was asked to
        write.

    Raises:
        gr.Error: If no reference audio or no generation text was supplied.
    """
    # Fail with a readable UI message instead of an opaque error from inside
    # the inference library when a required input is missing.
    if not ref_audio:
        raise gr.Error("Please provide a reference audio file.")
    if not gen_text or not gen_text.strip():
        raise gr.Error("Please enter the text to generate.")

    # mkstemp creates the file atomically; tempfile.mktemp only invents a name
    # and is deprecated because another process can claim it first.
    fd, output_wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    try:
        f5tts.infer(
            ref_file=ref_audio,
            ref_text=ref_text,
            gen_text=gen_text,
            file_wave=output_wav_path,
            remove_silence=remove_silence,
        )
    except Exception:
        # Do not leave the placeholder file behind when synthesis fails.
        try:
            os.remove(output_wav_path)
        except OSError:
            pass
        raise
    return output_wav_path

def download_voice(voice_url, voice_name, progress=gr.Progress()):
    """Download a zipped voice pack and unpack it into ``downloaded_voices/<voice_name>``.

    Args:
        voice_url: http(s) link hosted on huggingface.co (or a subdomain) or
            drive.google.com.
        voice_name: Folder name for the voice; must be a single path component.
        progress: Gradio progress tracker used to report download/extract state.

    Returns:
        A human-readable status string. Failures are reported in the returned
        string rather than raised.
    """
    voice_url = (voice_url or "").strip()
    voice_name = (voice_name or "").strip()
    if not voice_url or not voice_name:
        return "Please provide both URL and voice name."

    # voice_name is user input that becomes a directory name; refuse anything
    # that could point outside downloaded_voices/.
    if voice_name in (".", "..") or any(sep in voice_name for sep in ("/", "\\")):
        return "Invalid voice name. Use a plain folder name without path separators."

    base_path = "downloaded_voices"
    os.makedirs(base_path, exist_ok=True)

    # Decide the download type from the parsed host, not a substring match,
    # so "http://example.com/?huggingface.co" is not accepted.
    try:
        parsed = urlparse(voice_url)
        host = (parsed.hostname or "").lower()
    except ValueError:
        parsed, host = None, ""
    is_web_url = parsed is not None and parsed.scheme in ("http", "https")
    is_huggingface = is_web_url and (
        host == "huggingface.co" or host.endswith(".huggingface.co")
    )
    is_google_drive = is_web_url and host == "drive.google.com"

    if not (is_huggingface or is_google_drive):
        return "Unsupported URL. Only Hugging Face and Google Drive links are supported."

    # Create voice directory
    voice_dir = os.path.join(base_path, voice_name)
    os.makedirs(voice_dir, exist_ok=True)

    # Download file
    zip_path = os.path.join(voice_dir, f"{voice_name}.zip")

    try:
        if is_huggingface:
            progress(0, desc="Downloading from Hugging Face...")
            # The context manager releases the streamed connection; the
            # timeout keeps a stalled server from hanging the request forever.
            with requests.get(voice_url, stream=True, timeout=30) as response:
                response.raise_for_status()
                total_size = int(response.headers.get('content-length', 0))

                downloaded = 0
                with open(zip_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if not chunk:
                            continue
                        f.write(chunk)
                        downloaded += len(chunk)
                        if total_size > 0:
                            # Download fills 0-0.8 of the bar so it does not
                            # jump backwards when extraction reports 0.8.
                            progress(
                                0.8 * downloaded / total_size,
                                desc=f"Downloading: {downloaded//1024}KB/{total_size//1024}KB",
                            )
        else:
            progress(0, desc="Downloading from Google Drive...")
            gdown.download(url=voice_url, output=zip_path, quiet=False, fuzzy=True)
            if not os.path.exists(zip_path):
                raise RuntimeError("Google Drive download did not produce a file.")

        # Extract ZIP file
        progress(0.8, desc="Extracting files...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(voice_dir)

        # Remove ZIP file after extraction
        if os.path.exists(zip_path):
            os.remove(zip_path)

        # Check if the voice was properly extracted
        files = os.listdir(voice_dir) if os.path.exists(voice_dir) else []
        if not files:
            return "Voice directory is empty after extraction. Download may have failed."

        # List downloaded files
        file_list = "\n".join(f"  - {file}" for file in files)

        return f"✅ Voice '{voice_name}' successfully downloaded!\n📁 Location: {voice_dir}\n📋 Files:\n{file_list}"

    except Exception as e:
        # UI boundary: report the failure as text instead of crashing the handler.
        # Best-effort cleanup of the partial archive.
        try:
            if os.path.exists(zip_path):
                os.remove(zip_path)
        except OSError:
            pass
        # rmdir only succeeds on an empty directory, so files that were
        # already in the voice folder are never removed.
        try:
            os.rmdir(voice_dir)
        except OSError:
            pass
        return f"❌ Error downloading voice: {str(e)}"

def list_available_voices():
    """Return a Markdown summary of every voice folder under ``downloaded_voices``.

    Each sub-directory contributes one entry with its name, path and the
    comma-separated names of the files it contains. Plain files directly
    inside ``downloaded_voices`` are ignored.

    Returns:
        The joined entries, or a short notice when the directory is missing
        or holds no voice folders.
    """
    root = "downloaded_voices"
    if not os.path.exists(root):
        return "No voices downloaded yet."

    # Pair each entry with its full path once, then keep only directories.
    candidates = ((name, os.path.join(root, name)) for name in os.listdir(root))
    summaries = [
        f"🎤 **{name}**\n📍 Path: {folder}\n📋 Files: {', '.join(os.listdir(folder))}\n"
        for name, folder in candidates
        if os.path.isdir(folder)
    ]

    if summaries:
        return "\n".join(summaries)
    return "No voices found in the downloaded_voices directory."

def load_voice_audio(voice_name, audio_file):
    """Resolve an audio file inside a downloaded voice folder.

    Args:
        voice_name: Name of a folder under ``downloaded_voices``; may be
            ``None`` or empty when nothing is selected in the UI.
        audio_file: File name inside that folder; may be ``None`` or empty.

    Returns:
        ``(audio_path, message)`` on success, or ``(None, message)`` when a
        selection is missing or the voice folder / audio file does not exist.
    """
    base_path = "downloaded_voices"

    # An empty dropdown yields None, which os.path.join would reject with a
    # TypeError; report it as a normal status message instead.
    if not voice_name:
        return None, "Please select a voice first."

    voice_path = os.path.join(base_path, voice_name)
    if not os.path.isdir(voice_path):
        return None, f"Voice '{voice_name}' not found."

    # An empty file name would otherwise resolve to the voice directory itself
    # and be returned as if it were an audio file.
    if not audio_file:
        return None, "Please select an audio file first."

    audio_path = os.path.join(voice_path, audio_file)
    if not os.path.isfile(audio_path):
        return None, f"Audio file '{audio_file}' not found in voice '{voice_name}' directory."

    return audio_path, f"✅ Loaded audio: {audio_file} from voice '{voice_name}'"

# Create Gradio interface with tabs:
#   1. direct synthesis from an uploaded reference clip,
#   2. downloading zipped voice packs,
#   3. synthesis using a reference clip from a downloaded voice pack.
with gr.Blocks(title="🗣️ F5-TTS Demo with Voice Download") as demo:
    gr.Markdown("# 🗣️ F5-TTS Demo with Voice Management")
    gr.Markdown("Upload a reference voice, give reference and generation text, and hear it in the same voice! Plus, download pre-made voices from Hugging Face or Google Drive.")

    with gr.Tabs():
        with gr.TabItem("🔊 Generate Speech"):
            with gr.Row():
                with gr.Column():
                    # type="filepath" so run_tts receives a path it can pass on as ref_file.
                    ref_audio = gr.Audio(label="Reference Audio", type="filepath")
                    ref_text = gr.Textbox(
                        label="Reference Text",
                        placeholder="some call me nature, others call me mother nature.",
                        lines=3
                    )
                    gen_text = gr.Textbox(
                        label="Generation Text",
                        placeholder="I don't really care what you call me...",
                        lines=5
                    )
                    remove_silence = gr.Checkbox(label="Remove Silence from Output?", value=False)
                    generate_btn = gr.Button("Generate Speech", variant="primary")

                with gr.Column():
                    output_audio = gr.Audio(label="Generated Speech")
                    # Displayed but not wired to any event in this file.
                    spectrogram = gr.Image(label="Spectrogram (if available)")

            generate_btn.click(
                fn=run_tts,
                inputs=[ref_audio, ref_text, gen_text, remove_silence],
                outputs=[output_audio]
            )

        with gr.TabItem("📥 Download Voices"):
            gr.Markdown("## 📥 Download Pre-made Voices")
            gr.Markdown("Download voices from Hugging Face or Google Drive. The voice should be in ZIP format containing audio files and metadata.")

            with gr.Row():
                with gr.Column():
                    voice_url = gr.Textbox(
                        label="Voice URL (Hugging Face or Google Drive)",
                        placeholder="https://huggingface.co/Chouio/Adam/resolve/main/AdamDefinitive.zip",
                        lines=2
                    )
                    voice_name = gr.Textbox(
                        label="Voice Name (for folder)",
                        placeholder="my_voice"
                    )
                    download_btn = gr.Button("Download Voice", variant="primary")
                    download_status = gr.Textbox(label="Status", interactive=False)

                with gr.Column():
                    gr.Markdown("### 📋 Available Voices")
                    refresh_btn = gr.Button("Refresh List")
                    voices_list = gr.Markdown(label="Available Voices", value="No voices downloaded yet.")

            download_btn.click(
                fn=download_voice,
                inputs=[voice_url, voice_name],
                outputs=[download_status]
            )

            refresh_btn.click(
                fn=list_available_voices,
                outputs=[voices_list]
            )

        with gr.TabItem("🎭 Use Downloaded Voice"):
            gr.Markdown("## 🎭 Use Downloaded Voice for TTS")
            gr.Markdown("Select a downloaded voice and use its audio files for reference.")

            with gr.Row():
                with gr.Column():
                    # Voice selector
                    available_voices = gr.Dropdown(label="Select Voice", choices=[])
                    refresh_voices_btn = gr.Button("Refresh Voices")

                    # Audio file selector
                    voice_audio_files = gr.Dropdown(label="Select Audio File", choices=[])
                    load_audio_btn = gr.Button("Load Selected Audio")
                    # Dedicated status line: the load message must not land in
                    # the reference-text box, whose content is sent to run_tts.
                    load_status = gr.Textbox(label="Status", interactive=False)

                    # Reference text (entered by the user)
                    ref_text_downloaded = gr.Textbox(
                        label="Reference Text",
                        placeholder="Reference text will be auto-filled or you can enter manually",
                        lines=3
                    )

                    # Generation text
                    gen_text_downloaded = gr.Textbox(
                        label="Generation Text",
                        placeholder="Enter text to generate in this voice...",
                        lines=5
                    )

                    remove_silence_downloaded = gr.Checkbox(label="Remove Silence from Output?", value=False)
                    generate_from_voice_btn = gr.Button("Generate with This Voice", variant="primary")

                with gr.Column():
                    # type="filepath" matches ref_audio in the first tab, so
                    # run_tts gets a file path here as well.
                    loaded_audio = gr.Audio(label="Loaded Reference Audio", type="filepath")
                    output_audio_downloaded = gr.Audio(label="Generated Speech")

            # Refresh available voices
            def refresh_voice_list():
                """Return a Dropdown update listing the folders in downloaded_voices."""
                base_path = "downloaded_voices"
                if not os.path.exists(base_path):
                    return gr.update(choices=[])

                voices = sorted(
                    item
                    for item in os.listdir(base_path)
                    if os.path.isdir(os.path.join(base_path, item))
                )
                # Returning a bare list would set the dropdown's *value*;
                # gr.update(choices=...) replaces the selectable options.
                return gr.update(choices=voices)

            refresh_voices_btn.click(
                fn=refresh_voice_list,
                outputs=[available_voices]
            )

            # Update audio files when voice is selected
            def update_audio_files(voice_name):
                """Return a Dropdown update listing the audio files of ``voice_name``."""
                # value=None clears a selection left over from another voice.
                if not voice_name:
                    return gr.update(choices=[], value=None)

                base_path = "downloaded_voices"
                voice_path = os.path.join(base_path, voice_name)

                if not os.path.exists(voice_path):
                    return gr.update(choices=[], value=None)

                audio_files = sorted(
                    file
                    for file in os.listdir(voice_path)
                    if file.lower().endswith(('.wav', '.mp3', '.flac', '.ogg'))
                )
                return gr.update(choices=audio_files, value=None)

            available_voices.change(
                fn=update_audio_files,
                inputs=[available_voices],
                outputs=[voice_audio_files]
            )

            # Load selected audio; the message goes to the status box only.
            load_audio_btn.click(
                fn=load_voice_audio,
                inputs=[available_voices, voice_audio_files],
                outputs=[loaded_audio, load_status]
            )

            # Generate speech using downloaded voice
            generate_from_voice_btn.click(
                fn=run_tts,
                inputs=[loaded_audio, ref_text_downloaded, gen_text_downloaded, remove_silence_downloaded],
                outputs=[output_audio_downloaded]
            )

# Launch the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()