"""Gradio demo for F5-TTS: generate speech from a reference voice and manage downloadable voices."""

import os
import tempfile
import zipfile
from pathlib import Path
from urllib.parse import urlparse

# NOTE(review): `spaces` was the first import in the original file; it is kept
# ahead of the other third-party packages in case the ZeroGPU runtime needs to
# be imported before torch-based libraries such as f5_tts — confirm.
import spaces

import gdown
import gradio as gr
import requests
from f5_tts.api import F5TTS
from f5_tts.infer.utils_infer import remove_silence_for_generated_wav

# Create one module-level F5TTS instance (default constructor arguments) at
# import time; run_tts() reuses it for every synthesis request.
f5tts = F5TTS()

@spaces.GPU
def run_tts(ref_audio, ref_text, gen_text, remove_silence=False):
    """Synthesize ``gen_text`` in the voice of ``ref_audio`` and return a WAV path.

    Args:
        ref_audio: Reference audio, forwarded to ``F5TTS.infer`` as ``ref_file``
            (the "Generate Speech" tab passes a file path).
        ref_text: Transcript of the reference audio, forwarded unchanged.
        gen_text: Text to be spoken.
        remove_silence: Forwarded to ``F5TTS.infer`` as ``remove_silence``.

    Returns:
        Path of a freshly created temporary ``.wav`` file that
        ``F5TTS.infer`` was asked to write (``file_wave``).

    Raises:
        gr.Error: If no reference audio or no generation text was supplied.
    """
    if ref_audio is None:
        raise gr.Error("Please provide reference audio.")
    if not gen_text or not gen_text.strip():
        raise gr.Error("Please provide text to generate.")

    # mkstemp creates the file atomically; the deprecated tempfile.mktemp only
    # returns a name, which another process could claim before it is used.
    fd, output_wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    try:
        # The returned (wav, sample rate, extra) tuple is not needed: the
        # audio is written to ``file_wave`` and the UI only consumes the path.
        f5tts.infer(
            ref_file=ref_audio,
            ref_text=ref_text,
            gen_text=gen_text,
            file_wave=output_wav_path,
            remove_silence=remove_silence,
        )
    except Exception:
        # Do not leave an empty temp file behind when inference fails.
        try:
            os.remove(output_wav_path)
        except OSError:
            pass
        raise
    return output_wav_path

# Seconds to wait for the server to connect / send data before giving up.
_DOWNLOAD_TIMEOUT_S = 30


def _classify_voice_url(voice_url):
    """Return ``"huggingface"``, ``"google_drive"`` or ``None`` for *voice_url*.

    The decision is based on the parsed hostname rather than a substring
    match, so a URL that merely mentions a supported site in its path or
    query string is rejected.
    """
    try:
        parsed = urlparse(voice_url)
    except ValueError:
        return None
    if parsed.scheme not in ("http", "https"):
        return None
    host = (parsed.hostname or "").lower()
    if host == "huggingface.co" or host.endswith(".huggingface.co"):
        return "huggingface"
    if host == "drive.google.com":
        return "google_drive"
    return None


def _is_plain_folder_name(name):
    """Return True if *name* is a single path component (no traversal)."""
    if name in {".", ".."}:
        return False
    return not any(sep in name for sep in ("/", "\\", "\0"))


def _stream_to_file(url, dest_path, progress):
    """Stream *url* into *dest_path*, reporting progress when the size is known."""
    with requests.get(url, stream=True, timeout=_DOWNLOAD_TIMEOUT_S) as response:
        response.raise_for_status()
        total_size = int(response.headers.get('content-length', 0))
        downloaded = 0
        with open(dest_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if not chunk:
                    continue
                f.write(chunk)
                downloaded += len(chunk)
                if total_size > 0:
                    # Scale to 0-0.8 so the bar does not jump backwards when
                    # the extraction step reports 0.8.
                    progress(
                        0.8 * downloaded / total_size,
                        desc=f"Downloading: {downloaded//1024}KB/{total_size//1024}KB",
                    )


def download_voice(voice_url, voice_name, progress=gr.Progress()):
    """Download a voice ZIP archive and unpack it under ``downloaded_voices``.

    Args:
        voice_url: ``http(s)`` URL on ``huggingface.co`` (or a subdomain) or
            on ``drive.google.com`` pointing at a ZIP archive.
        voice_name: Folder name to create below ``downloaded_voices``; must be
            a single path component.
        progress: Gradio progress tracker used to report download/extraction
            status.

    Returns:
        A human-readable status string. Failures are reported in the string
        instead of being raised, because the result is shown in a status box.
    """
    if not voice_url or not voice_name:
        return "Please provide both URL and voice name."

    voice_url = voice_url.strip()
    voice_name = voice_name.strip()
    if not voice_url or not voice_name:
        return "Please provide both URL and voice name."

    source = _classify_voice_url(voice_url)
    if source is None:
        return "Unsupported URL. Only Hugging Face and Google Drive links are supported."

    # The name is joined into a filesystem path, so refuse anything that could
    # escape the downloaded_voices directory.
    if not _is_plain_folder_name(voice_name):
        return "Invalid voice name. Use a plain folder name without path separators."

    base_path = "downloaded_voices"
    voice_dir = os.path.join(base_path, voice_name)
    zip_path = os.path.join(voice_dir, f"{voice_name}.zip")

    try:
        os.makedirs(voice_dir, exist_ok=True)

        if source == "huggingface":
            progress(0, desc="Downloading from Hugging Face...")
            _stream_to_file(voice_url, zip_path, progress)
        else:
            progress(0, desc="Downloading from Google Drive...")
            gdown.download(url=voice_url, output=zip_path, quiet=False, fuzzy=True)

        # Extract ZIP file
        progress(0.8, desc="Extracting files...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(voice_dir)

        # The archive itself must not show up in the listing below.
        os.remove(zip_path)

        files = sorted(os.listdir(voice_dir))
        if not files:
            return "Voice directory is empty after extraction. Download may have failed."

        file_list = "\n".join(f"  - {file}" for file in files)
        return f"βœ… Voice '{voice_name}' successfully downloaded!\nπŸ“ Location: {voice_dir}\nπŸ“‹ Files:\n{file_list}"

    except Exception as e:
        # UI boundary: surface every failure as a status message.
        return f"❌ Error downloading voice: {str(e)}"

    finally:
        # Best-effort cleanup of a partial or leftover archive. The voice
        # directory is kept because it might contain other files.
        try:
            if os.path.exists(zip_path):
                os.remove(zip_path)
        except OSError:
            pass

def list_available_voices():
    """Build a Markdown overview of the voice folders in ``downloaded_voices``.

    Returns:
        One entry per sub-directory (name, path and the files it contains)
        joined by newlines, or an explanatory message when the base directory
        is missing or holds no sub-directories.
    """
    base_path = "downloaded_voices"
    if not os.path.exists(base_path):
        return "No voices downloaded yet."

    def describe(item):
        # Only directories count as voices; anything else is skipped.
        item_path = os.path.join(base_path, item)
        if not os.path.isdir(item_path):
            return None
        files = os.listdir(item_path)
        return f"🎀 **{item}**\nπŸ“ Path: {item_path}\nπŸ“‹ Files: {', '.join(files)}\n"

    entries = [
        entry
        for entry in map(describe, os.listdir(base_path))
        if entry is not None
    ]

    if entries:
        return "\n".join(entries)
    return "No voices found in the downloaded_voices directory."

def load_voice_audio(voice_name, audio_file):
    """Resolve an audio file inside a downloaded voice folder.

    Args:
        voice_name: Name of a folder below ``downloaded_voices``.
        audio_file: Name of a file directly inside that folder.

    Returns:
        ``(audio_path, message)``. ``audio_path`` is ``None`` when nothing was
        selected or the voice/file cannot be found; ``message`` explains the
        outcome.
    """
    base_path = "downloaded_voices"

    # Nothing selected yet (e.g. empty dropdowns): os.path.join would raise on
    # None, and an empty file name would resolve to the voice directory itself.
    if not voice_name or not audio_file:
        return None, "Please select both a voice and an audio file."

    def is_plain_name(name):
        # Both values end up in a filesystem path; allow single components only.
        return name not in {".", ".."} and not any(sep in name for sep in ("/", "\\"))

    voice_path = os.path.join(base_path, voice_name)
    if not is_plain_name(voice_name) or not os.path.isdir(voice_path):
        return None, f"Voice '{voice_name}' not found."

    audio_path = os.path.join(voice_path, audio_file)
    if not is_plain_name(audio_file) or not os.path.isfile(audio_path):
        return None, f"Audio file '{audio_file}' not found in voice '{voice_name}' directory."

    return audio_path, f"βœ… Loaded audio: {audio_file} from voice '{voice_name}'"

# Build the Gradio interface: three tabs for direct generation, downloading
# voices, and generating from a previously downloaded voice.
with gr.Blocks(title="πŸ—£οΈ F5-TTS Demo with Voice Download") as demo:
    gr.Markdown("# πŸ—£οΈ F5-TTS Demo with Voice Management")
    gr.Markdown("Upload a reference voice, give reference and generation text, and hear it in the same voice! Plus, download pre-made voices from Hugging Face or Google Drive.")

    with gr.Tabs():
        with gr.TabItem("πŸ”Š Generate Speech"):
            with gr.Row():
                with gr.Column():
                    # type="filepath" so run_tts receives a path for ref_file.
                    ref_audio = gr.Audio(label="Reference Audio", type="filepath")
                    ref_text = gr.Textbox(
                        label="Reference Text",
                        placeholder="some call me nature, others call me mother nature.",
                        lines=3
                    )
                    gen_text = gr.Textbox(
                        label="Generation Text",
                        placeholder="I don't really care what you call me...",
                        lines=5
                    )
                    remove_silence = gr.Checkbox(label="Remove Silence from Output?", value=False)
                    generate_btn = gr.Button("Generate Speech", variant="primary")

                with gr.Column():
                    output_audio = gr.Audio(label="Generated Speech")
                    # NOTE(review): no event handler writes to this component.
                    spectrogram = gr.Image(label="Spectrogram (if available)")

            generate_btn.click(
                fn=run_tts,
                inputs=[ref_audio, ref_text, gen_text, remove_silence],
                outputs=[output_audio]
            )

        with gr.TabItem("πŸ“₯ Download Voices"):
            gr.Markdown("## πŸ“₯ Download Pre-made Voices")
            gr.Markdown("Download voices from Hugging Face or Google Drive. The voice should be in ZIP format containing audio files and metadata.")

            with gr.Row():
                with gr.Column():
                    voice_url = gr.Textbox(
                        label="Voice URL (Hugging Face or Google Drive)",
                        placeholder="https://huggingface.co/Chouio/Adam/resolve/main/AdamDefinitive.zip",
                        lines=2
                    )
                    voice_name = gr.Textbox(
                        label="Voice Name (for folder)",
                        placeholder="my_voice"
                    )
                    download_btn = gr.Button("Download Voice", variant="primary")
                    download_status = gr.Textbox(label="Status", interactive=False)

                with gr.Column():
                    gr.Markdown("### πŸ“‹ Available Voices")
                    refresh_btn = gr.Button("Refresh List")
                    voices_list = gr.Markdown(label="Available Voices", value="No voices downloaded yet.")

            download_btn.click(
                fn=download_voice,
                inputs=[voice_url, voice_name],
                outputs=[download_status]
            )

            refresh_btn.click(
                fn=list_available_voices,
                outputs=[voices_list]
            )

        with gr.TabItem("🎭 Use Downloaded Voice"):
            gr.Markdown("## 🎭 Use Downloaded Voice for TTS")
            gr.Markdown("Select a downloaded voice and use its audio files for reference.")

            with gr.Row():
                with gr.Column():
                    # Voice selector
                    available_voices = gr.Dropdown(label="Select Voice", choices=[])
                    refresh_voices_btn = gr.Button("Refresh Voices")

                    # Audio file selector
                    voice_audio_files = gr.Dropdown(label="Select Audio File", choices=[])
                    load_audio_btn = gr.Button("Load Selected Audio")
                    # Dedicated status field, so the load message is not
                    # written into the reference text and used as transcript.
                    load_status = gr.Textbox(label="Status", interactive=False)

                    # Reference text (entered manually)
                    ref_text_downloaded = gr.Textbox(
                        label="Reference Text",
                        placeholder="Reference text will be auto-filled or you can enter manually",
                        lines=3
                    )

                    # Generation text
                    gen_text_downloaded = gr.Textbox(
                        label="Generation Text",
                        placeholder="Enter text to generate in this voice...",
                        lines=5
                    )

                    remove_silence_downloaded = gr.Checkbox(label="Remove Silence from Output?", value=False)
                    generate_from_voice_btn = gr.Button("Generate with This Voice", variant="primary")

                with gr.Column():
                    # type="filepath": this component is also an input of
                    # run_tts, which forwards it to F5TTS.infer as ref_file.
                    loaded_audio = gr.Audio(label="Loaded Reference Audio", type="filepath")
                    output_audio_downloaded = gr.Audio(label="Generated Speech")

            def refresh_voice_list():
                """Return a Dropdown update listing the downloaded voice folders."""
                base_path = "downloaded_voices"
                if not os.path.isdir(base_path):
                    return gr.update(choices=[])

                voices = sorted(
                    item
                    for item in os.listdir(base_path)
                    if os.path.isdir(os.path.join(base_path, item))
                )
                # Returning a bare list would set the dropdown's *value*;
                # gr.update(choices=...) replaces the selectable options.
                return gr.update(choices=voices)

            refresh_voices_btn.click(
                fn=refresh_voice_list,
                outputs=[available_voices]
            )

            def update_audio_files(voice_name):
                """Return a Dropdown update with the audio files of *voice_name*."""
                if not voice_name:
                    return gr.update(choices=[], value=None)

                voice_path = os.path.join("downloaded_voices", voice_name)
                if not os.path.isdir(voice_path):
                    return gr.update(choices=[], value=None)

                audio_files = sorted(
                    file
                    for file in os.listdir(voice_path)
                    if file.lower().endswith(('.wav', '.mp3', '.flac', '.ogg'))
                )
                # Clear the selection: a file chosen for the previous voice
                # may not exist in this one.
                return gr.update(choices=audio_files, value=None)

            # Update audio files when voice is selected
            available_voices.change(
                fn=update_audio_files,
                inputs=[available_voices],
                outputs=[voice_audio_files]
            )

            # Load selected audio; the message goes to the status field.
            load_audio_btn.click(
                fn=load_voice_audio,
                inputs=[available_voices, voice_audio_files],
                outputs=[loaded_audio, load_status]
            )

            # Generate speech using downloaded voice
            generate_from_voice_btn.click(
                fn=run_tts,
                inputs=[loaded_audio, ref_text_downloaded, gen_text_downloaded, remove_silence_downloaded],
                outputs=[output_audio_downloaded]
            )

# Launch the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()