Chouio committed on
Commit
43f167f
·
verified ·
1 Parent(s): 06f823f

Update api.py

Browse files
Files changed (1) hide show
  1. api.py +501 -21
api.py CHANGED
@@ -1,30 +1,510 @@
 
 
 
 
 
 
 
 
 
 
1
  import requests
 
2
  import zipfile
3
  import os
 
4
 
5
def descargar_modelo_rvc(url, destino="ckpts"):
    """Download a ZIP archive over HTTP and extract it into ``destino``.

    Args:
        url: Address of the ZIP archive. The download is a plain HTTP GET.
            NOTE(review): the earlier docstring mentioned Google Drive;
            share-page links presumably do not serve the archive bytes to a
            plain GET -- confirm before relying on that.
        destino: Target directory, created when missing. The archive is
            stored there as ``modelo_rvc.zip`` and is kept after extraction.

    Returns:
        Path of the first (alphabetically) top-level folder of the archive
        inside ``destino``, or None when the archive has no top-level folder.

    Raises:
        requests.RequestException: On connection errors, timeouts or an HTTP
            error status.
        zipfile.BadZipFile: When the downloaded file is not a valid ZIP.
    """
    os.makedirs(destino, exist_ok=True)
    zip_path = os.path.join(destino, "modelo_rvc.zip")

    # Stream to disk in chunks; reading ``r.content`` would buffer the whole
    # archive in memory and make ``stream=True`` pointless.
    with requests.get(url, stream=True, timeout=(10, 60)) as r:
        # Fail here on 4xx/5xx instead of saving an error page as a ".zip".
        r.raise_for_status()
        with open(zip_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=1 << 20):
                if chunk:
                    f.write(chunk)

    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(destino)
        # Derive the folder names from the archive itself so that folders
        # left in ``destino`` by earlier downloads are never returned.
        primeros = {
            nombre.split("/", 1)[0]
            for nombre in zip_ref.namelist()
            if "/" in nombre
        }

    for nombre in sorted(p for p in primeros if p):
        ruta = os.path.join(destino, nombre)
        if os.path.isdir(ruta):
            return ruta

    return None  # The archive contained no folder
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import soundfile as sf
2
+ import torch
3
+ import tqdm
4
+ from cached_path import cached_path
5
+ from model import DiT, UNetT
6
+ from model.utils import save_spectrogram
7
+ from model.utils_infer import load_vocoder, load_model, infer_process, remove_silence_for_generated_wav
8
+ from model.utils import seed_everything
9
+ import random
10
+ import sys
11
  import requests
12
+ import gdown
13
  import zipfile
14
  import os
15
+ from pathlib import Path
16
 
17
class F5TTS:
    """Load a vocoder and an F5-TTS / E2-TTS model, run inference, and provide
    static helpers that download and unpack voice archives."""

    def __init__(
        self,
        model_type="F5-TTS",
        ckpt_file="",
        vocab_file="",
        ode_method="euler",
        use_ema=True,
        local_path=None,
        device=None,
    ):
        """Store audio constants, pick a device and load both models.

        Args:
            model_type: "F5-TTS" or "E2-TTS"; anything else raises ValueError.
            ckpt_file: Checkpoint path. When empty, the default checkpoint for
                ``model_type`` is resolved through ``cached_path``.
            vocab_file: Passed through to ``load_model``.
            ode_method: Passed through to ``load_model``.
            use_ema: Passed through to ``load_model``.
            local_path: Passed to ``load_vocoder``; a non-None value also makes
                its first (boolean) argument True.
            device: Device string. When falsy it is auto-detected: cuda, then
                mps, then cpu.
        """
        # Only target_sample_rate and seed are read by this class; the other
        # values are stored as plain attributes.
        self.final_wave = None
        self.target_sample_rate = 24000
        self.n_mel_channels = 100
        self.hop_length = 256
        self.target_rms = 0.1
        self.seed = -1  # replaced by the seed actually used in infer()

        self.device = device or (
            "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
        )

        self.load_vocoder_model(local_path)
        self.load_ema_model(model_type, ckpt_file, vocab_file, ode_method, use_ema)

    def load_vocoder_model(self, local_path):
        """Load the vocoder via ``load_vocoder`` and keep it in ``self.vocos``."""
        self.vocos = load_vocoder(local_path is not None, local_path, self.device)

    def load_ema_model(self, model_type, ckpt_file, vocab_file, ode_method, use_ema):
        """Select class/config/checkpoint for ``model_type`` and load the model.

        Raises:
            ValueError: If ``model_type`` is neither "F5-TTS" nor "E2-TTS".
        """
        if model_type == "F5-TTS":
            if not ckpt_file:
                ckpt_file = str(cached_path("hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors"))
            model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
            model_cls = DiT
        elif model_type == "E2-TTS":
            if not ckpt_file:
                ckpt_file = str(cached_path("hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.safetensors"))
            model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
            model_cls = UNetT
        else:
            raise ValueError(f"Unknown model type: {model_type}")

        self.ema_model = load_model(model_cls, model_cfg, ckpt_file, vocab_file, ode_method, use_ema, self.device)

    def export_wav(self, wav, file_wave, remove_silence=False):
        """Write ``wav`` to ``file_wave`` at ``self.target_sample_rate``.

        When ``remove_silence`` is true the written file is post-processed by
        ``remove_silence_for_generated_wav``.
        """
        sf.write(file_wave, wav, self.target_sample_rate)
        if remove_silence:
            remove_silence_for_generated_wav(file_wave)

    def export_spectrogram(self, spect, file_spect):
        """Save ``spect`` to ``file_spect`` via ``save_spectrogram``."""
        save_spectrogram(spect, file_spect)

    def infer(
        self,
        ref_file,
        ref_text,
        gen_text,
        show_info=print,
        progress=None,
        target_rms=0.1,
        cross_fade_duration=0.15,
        sway_sampling_coef=-1,
        cfg_strength=2,
        nfe_step=32,
        speed=1.0,
        fix_duration=None,
        remove_silence=False,
        file_wave=None,
        file_spect=None,
        seed=-1,
    ):
        """Run ``infer_process`` and optionally export the results.

        Args:
            ref_file, ref_text, gen_text: Forwarded positionally to
                ``infer_process``.
            show_info, target_rms, cross_fade_duration, sway_sampling_coef,
            cfg_strength, nfe_step, speed, fix_duration: Forwarded unchanged
                to ``infer_process`` as keyword arguments.
            progress: Forwarded to ``infer_process``. ``None`` (the default)
                selects the ``tqdm`` module, which was the previous default;
                resolving it here keeps the class definable without touching
                ``tqdm`` at definition time.
            remove_silence: Forwarded to ``export_wav`` when ``file_wave`` is
                given.
            file_wave: When not None, the waveform is written to this path.
            file_spect: When not None, the spectrogram is written to this path.
            seed: ``-1`` draws a random seed in ``[0, sys.maxsize]``. The seed
                used is stored in ``self.seed``.

        Returns:
            The ``(wav, sr, spect)`` tuple returned by ``infer_process``.
        """
        if progress is None:
            progress = tqdm

        if seed == -1:
            seed = random.randint(0, sys.maxsize)
        seed_everything(seed)
        self.seed = seed

        wav, sr, spect = infer_process(
            ref_file,
            ref_text,
            gen_text,
            self.ema_model,
            show_info=show_info,
            progress=progress,
            target_rms=target_rms,
            cross_fade_duration=cross_fade_duration,
            nfe_step=nfe_step,
            cfg_strength=cfg_strength,
            sway_sampling_coef=sway_sampling_coef,
            speed=speed,
            fix_duration=fix_duration,
            device=self.device,
        )

        if file_wave is not None:
            self.export_wav(wav, file_wave, remove_silence)
        if file_spect is not None:
            self.export_spectrogram(spect, file_spect)

        return wav, sr, spect

    @staticmethod
    def _remove_file(path):
        """Delete ``path`` if present; a missing or undeletable file is ignored."""
        try:
            os.remove(path)
        except OSError:
            pass

    @staticmethod
    def download_from_huggingface(url, output_path):
        """Stream ``url`` into ``output_path`` with a byte-based progress bar.

        Returns:
            True on success, False on any failure (the error is printed).
        """
        try:
            # (connect, read) timeouts: without them a stalled server would
            # block the call forever. The ``with`` releases the connection.
            with requests.get(url, stream=True, timeout=(10, 60)) as response:
                response.raise_for_status()
                total_size = int(response.headers.get('content-length', 0))

                # Count bytes rather than chunks so the bar matches its unit
                # and also works when the last chunk is short.
                with open(output_path, 'wb') as f, tqdm.tqdm(
                    total=total_size or None,
                    unit='B',
                    unit_scale=True,
                    desc="Downloading from Hugging Face",
                ) as bar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                            bar.update(len(chunk))
            return True
        except Exception as e:
            print(f"Error downloading from Hugging Face: {e}")
            return False

    @staticmethod
    def download_from_google_drive(url, output_path):
        """Download ``url`` into ``output_path`` using ``gdown``.

        Returns:
            True on success, False when ``gdown`` raises or returns no file.
        """
        try:
            result = gdown.download(url=url, output=output_path, quiet=False, fuzzy=True)
        except Exception as e:
            print(f"Error downloading from Google Drive: {e}")
            return False

        # gdown can signal some failures by returning None instead of raising;
        # treating that as success would hand a missing file to the extractor.
        if result is None:
            print("Error downloading from Google Drive: no file was downloaded")
            return False
        return True

    @staticmethod
    def extract_zip(zip_path, extract_path):
        """Extract ``zip_path`` into ``extract_path``.

        Returns:
            True on success, False on any failure (the error is printed).
        """
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extract_path)
            return True
        except Exception as e:
            print(f"Error extracting ZIP file: {e}")
            return False

    @staticmethod
    def download_and_setup_voice(voice_url, voice_name, base_path="voices"):
        """
        Download and setup a voice from URL (Hugging Face or Google Drive)

        Args:
            voice_url (str): URL to download the voice from. The source is
                chosen from the URL's host name, not from a substring match.
            voice_name (str): Name for the voice folder; must be a single path
                component (no separators, not "." or "..").
            base_path (str): Base directory to store voices

        Returns:
            str: Path to the downloaded voice folder, or None if failed
        """
        from urllib.parse import urlsplit

        # Validate everything before touching the filesystem or the network.
        try:
            host = (urlsplit(voice_url).hostname or "").lower()
        except ValueError:
            host = ""

        def on_domain(domain):
            return host == domain or host.endswith("." + domain)

        is_huggingface = on_domain("huggingface.co")
        is_google_drive = on_domain("drive.google.com")

        if not (is_huggingface or is_google_drive):
            print("Unsupported URL. Only Hugging Face and Google Drive links are supported.")
            return None

        # A name with separators or ".." would place files outside base_path.
        separators = [sep for sep in (os.sep, os.altsep) if sep]
        if (
            not voice_name
            or voice_name in (os.curdir, os.pardir)
            or any(sep in voice_name for sep in separators)
        ):
            print(f"Invalid voice name: {voice_name!r}")
            return None

        # makedirs also creates base_path when it is missing.
        voice_dir = os.path.join(base_path, voice_name)
        os.makedirs(voice_dir, exist_ok=True)

        zip_path = os.path.join(voice_dir, f"{voice_name}.zip")

        if is_huggingface:
            success = F5TTS.download_from_huggingface(voice_url, zip_path)
        else:
            success = F5TTS.download_from_google_drive(voice_url, zip_path)

        if not success:
            F5TTS._remove_file(zip_path)  # do not leave a partial archive
            print("Download failed")
            return None

        success = F5TTS.extract_zip(zip_path, voice_dir)
        # The archive is no longer needed whether or not extraction worked.
        F5TTS._remove_file(zip_path)
        if not success:
            print("Extraction failed")
            return None

        if not os.path.isdir(voice_dir) or not os.listdir(voice_dir):
            print("Voice directory is empty after extraction")
            return None

        print(f"Voice '{voice_name}' successfully downloaded and setup at: {voice_dir}")
        return voice_dir
218
 
219
+ if __name__ == "__main__":
220
+ f5tts = F5TTS()
221
+ wav, sr, spect = f5tts.infer(
222
+ ref_file="tests/ref_audio/test_en_1_ref_short.wav",
223
+ ref_text="some call me nature, others call me mother nature.",
224
+ gen_text="""I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequen
225
+ </think>
226
 
227
+ I notice that I was cut off while generating the modified `api.txt` file. Let me continue with the complete modifications for both files.
 
 
228
 
229
+ ## 📁 `api.txt` completely modified
 
 
230
 
231
+ ```python
232
+ import soundfile as sf
233
+ import torch
234
+ import tqdm
235
+ from cached_path import cached_path
236
+ from model import DiT, UNetT
237
+ from model.utils import save_spectrogram
238
+ from model.utils_infer import load_vocoder, load_model, infer_process, remove_silence_for_generated_wav
239
+ from model.utils import seed_everything
240
+ import random
241
+ import sys
242
+ import requests
243
+ import gdown
244
+ import zipfile
245
+ import os
246
+ from pathlib import Path
247
+
248
class F5TTS:
    """Load a vocoder and an F5-TTS / E2-TTS model, run inference, and provide
    static helpers that download, unpack and list voice archives."""

    def __init__(
        self,
        model_type="F5-TTS",
        ckpt_file="",
        vocab_file="",
        ode_method="euler",
        use_ema=True,
        local_path=None,
        device=None,
    ):
        """Store audio constants, pick a device and load both models.

        Args:
            model_type: "F5-TTS" or "E2-TTS"; anything else raises ValueError.
            ckpt_file: Checkpoint path. When empty, the default checkpoint for
                ``model_type`` is resolved through ``cached_path``.
            vocab_file: Passed through to ``load_model``.
            ode_method: Passed through to ``load_model``.
            use_ema: Passed through to ``load_model``.
            local_path: Passed to ``load_vocoder``; a non-None value also makes
                its first (boolean) argument True.
            device: Device string. When falsy it is auto-detected: cuda, then
                mps, then cpu.
        """
        # Only target_sample_rate and seed are read by this class; the other
        # values are stored as plain attributes.
        self.final_wave = None
        self.target_sample_rate = 24000
        self.n_mel_channels = 100
        self.hop_length = 256
        self.target_rms = 0.1
        self.seed = -1  # replaced by the seed actually used in infer()

        self.device = device or (
            "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
        )

        self.load_vocoder_model(local_path)
        self.load_ema_model(model_type, ckpt_file, vocab_file, ode_method, use_ema)

    def load_vocoder_model(self, local_path):
        """Load the vocoder via ``load_vocoder`` and keep it in ``self.vocos``."""
        self.vocos = load_vocoder(local_path is not None, local_path, self.device)

    def load_ema_model(self, model_type, ckpt_file, vocab_file, ode_method, use_ema):
        """Select class/config/checkpoint for ``model_type`` and load the model.

        Raises:
            ValueError: If ``model_type`` is neither "F5-TTS" nor "E2-TTS".
        """
        if model_type == "F5-TTS":
            if not ckpt_file:
                ckpt_file = str(cached_path("hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors"))
            model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
            model_cls = DiT
        elif model_type == "E2-TTS":
            if not ckpt_file:
                ckpt_file = str(cached_path("hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.safetensors"))
            model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
            model_cls = UNetT
        else:
            raise ValueError(f"Unknown model type: {model_type}")

        self.ema_model = load_model(model_cls, model_cfg, ckpt_file, vocab_file, ode_method, use_ema, self.device)

    def export_wav(self, wav, file_wave, remove_silence=False):
        """Write ``wav`` to ``file_wave`` at ``self.target_sample_rate``.

        When ``remove_silence`` is true the written file is post-processed by
        ``remove_silence_for_generated_wav``.
        """
        sf.write(file_wave, wav, self.target_sample_rate)
        if remove_silence:
            remove_silence_for_generated_wav(file_wave)

    def export_spectrogram(self, spect, file_spect):
        """Save ``spect`` to ``file_spect`` via ``save_spectrogram``."""
        save_spectrogram(spect, file_spect)

    def infer(
        self,
        ref_file,
        ref_text,
        gen_text,
        show_info=print,
        progress=None,
        target_rms=0.1,
        cross_fade_duration=0.15,
        sway_sampling_coef=-1,
        cfg_strength=2,
        nfe_step=32,
        speed=1.0,
        fix_duration=None,
        remove_silence=False,
        file_wave=None,
        file_spect=None,
        seed=-1,
    ):
        """Run ``infer_process`` and optionally export the results.

        Args:
            ref_file, ref_text, gen_text: Forwarded positionally to
                ``infer_process``.
            show_info, target_rms, cross_fade_duration, sway_sampling_coef,
            cfg_strength, nfe_step, speed, fix_duration: Forwarded unchanged
                to ``infer_process`` as keyword arguments.
            progress: Forwarded to ``infer_process``. ``None`` (the default)
                selects the ``tqdm`` module, which was the previous default;
                resolving it here keeps the class definable without touching
                ``tqdm`` at definition time.
            remove_silence: Forwarded to ``export_wav`` when ``file_wave`` is
                given.
            file_wave: When not None, the waveform is written to this path.
            file_spect: When not None, the spectrogram is written to this path.
            seed: ``-1`` draws a random seed in ``[0, sys.maxsize]``. The seed
                used is stored in ``self.seed``.

        Returns:
            The ``(wav, sr, spect)`` tuple returned by ``infer_process``.
        """
        if progress is None:
            progress = tqdm

        if seed == -1:
            seed = random.randint(0, sys.maxsize)
        seed_everything(seed)
        self.seed = seed

        wav, sr, spect = infer_process(
            ref_file,
            ref_text,
            gen_text,
            self.ema_model,
            show_info=show_info,
            progress=progress,
            target_rms=target_rms,
            cross_fade_duration=cross_fade_duration,
            nfe_step=nfe_step,
            cfg_strength=cfg_strength,
            sway_sampling_coef=sway_sampling_coef,
            speed=speed,
            fix_duration=fix_duration,
            device=self.device,
        )

        if file_wave is not None:
            self.export_wav(wav, file_wave, remove_silence)
        if file_spect is not None:
            self.export_spectrogram(spect, file_spect)

        return wav, sr, spect

    @staticmethod
    def _remove_file(path):
        """Delete ``path`` if present; a missing or undeletable file is ignored."""
        try:
            os.remove(path)
        except OSError:
            pass

    @staticmethod
    def download_from_huggingface(url, output_path):
        """Stream ``url`` into ``output_path`` with a byte-based progress bar.

        Returns:
            True on success, False on any failure (the error is printed).
        """
        try:
            # (connect, read) timeouts: without them a stalled server would
            # block the call forever. The ``with`` releases the connection.
            with requests.get(url, stream=True, timeout=(10, 60)) as response:
                response.raise_for_status()
                total_size = int(response.headers.get('content-length', 0))

                # Count bytes rather than chunks so the bar matches its unit
                # and also works when the last chunk is short.
                with open(output_path, 'wb') as f, tqdm.tqdm(
                    total=total_size or None,
                    unit='B',
                    unit_scale=True,
                    desc="Downloading from Hugging Face",
                ) as bar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                            bar.update(len(chunk))
            return True
        except Exception as e:
            print(f"Error downloading from Hugging Face: {e}")
            return False

    @staticmethod
    def download_from_google_drive(url, output_path):
        """Download ``url`` into ``output_path`` using ``gdown``.

        Returns:
            True on success, False when ``gdown`` raises or returns no file.
        """
        try:
            result = gdown.download(url=url, output=output_path, quiet=False, fuzzy=True)
        except Exception as e:
            print(f"Error downloading from Google Drive: {e}")
            return False

        # gdown can signal some failures by returning None instead of raising;
        # treating that as success would hand a missing file to the extractor.
        if result is None:
            print("Error downloading from Google Drive: no file was downloaded")
            return False
        return True

    @staticmethod
    def extract_zip(zip_path, extract_path):
        """Extract ``zip_path`` into ``extract_path``.

        Returns:
            True on success, False on any failure (the error is printed).
        """
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extract_path)
            return True
        except Exception as e:
            print(f"Error extracting ZIP file: {e}")
            return False

    @staticmethod
    def download_and_setup_voice(voice_url, voice_name, base_path="voices"):
        """
        Download and setup a voice from URL (Hugging Face or Google Drive)

        Args:
            voice_url (str): URL to download the voice from. The source is
                chosen from the URL's host name, not from a substring match.
            voice_name (str): Name for the voice folder; must be a single path
                component (no separators, not "." or "..").
            base_path (str): Base directory to store voices

        Returns:
            str: Path to the downloaded voice folder, or None if failed
        """
        from urllib.parse import urlsplit

        # Validate everything before touching the filesystem or the network.
        try:
            host = (urlsplit(voice_url).hostname or "").lower()
        except ValueError:
            host = ""

        def on_domain(domain):
            return host == domain or host.endswith("." + domain)

        is_huggingface = on_domain("huggingface.co")
        is_google_drive = on_domain("drive.google.com")

        if not (is_huggingface or is_google_drive):
            print("Unsupported URL. Only Hugging Face and Google Drive links are supported.")
            return None

        # A name with separators or ".." would place files outside base_path.
        separators = [sep for sep in (os.sep, os.altsep) if sep]
        if (
            not voice_name
            or voice_name in (os.curdir, os.pardir)
            or any(sep in voice_name for sep in separators)
        ):
            print(f"Invalid voice name: {voice_name!r}")
            return None

        # makedirs also creates base_path when it is missing.
        voice_dir = os.path.join(base_path, voice_name)
        os.makedirs(voice_dir, exist_ok=True)

        zip_path = os.path.join(voice_dir, f"{voice_name}.zip")

        if is_huggingface:
            success = F5TTS.download_from_huggingface(voice_url, zip_path)
        else:
            success = F5TTS.download_from_google_drive(voice_url, zip_path)

        if not success:
            F5TTS._remove_file(zip_path)  # do not leave a partial archive
            print("Download failed")
            return None

        success = F5TTS.extract_zip(zip_path, voice_dir)
        # The archive is no longer needed whether or not extraction worked.
        F5TTS._remove_file(zip_path)
        if not success:
            print("Extraction failed")
            return None

        if not os.path.isdir(voice_dir) or not os.listdir(voice_dir):
            print("Voice directory is empty after extraction")
            return None

        print(f"Voice '{voice_name}' successfully downloaded and setup at: {voice_dir}")
        return voice_dir

    @staticmethod
    def list_available_voices(base_path="voices"):
        """List the voice folders found directly under ``base_path``.

        Returns:
            A list of dicts with keys ``'name'``, ``'path'`` and ``'files'``,
            one per sub-directory, sorted by name (file lists are sorted too).
            Empty when ``base_path`` is missing or is not a directory.
        """
        # isdir (not exists): a plain file at base_path would make listdir raise.
        if not os.path.isdir(base_path):
            return []

        voices = []
        # Sort so the result does not depend on filesystem enumeration order.
        for item in sorted(os.listdir(base_path)):
            item_path = os.path.join(base_path, item)
            if os.path.isdir(item_path):
                voices.append({
                    'name': item,
                    'path': item_path,
                    'files': sorted(os.listdir(item_path)),
                })
        return voices
467
 
468
if __name__ == "__main__":
    # Demo run: fetch one voice archive, report what is installed locally,
    # then attempt a single synthesis with the bundled test reference.
    print("=== F5TTS Voice Management Example ===")

    # Placeholder source; swap in the URL of the voice you actually want.
    voice_name = "Adam_Voice"
    example_url = "https://huggingface.co/Chouio/Adam/resolve/main/AdamDefinitive.zip"

    print(f"Downloading voice from: {example_url}")
    voice_path = F5TTS.download_and_setup_voice(example_url, voice_name)

    if voice_path:
        print(f"Voice downloaded successfully to: {voice_path}")
        print("Available files in voice directory:")
        for entry in os.listdir(voice_path):
            print(f" - {entry}")

    # Report every voice folder currently present.
    available_voices = F5TTS.list_available_voices()
    print(f"\nAvailable voices ({len(available_voices)}):")
    for info in available_voices:
        joined_files = ', '.join(info['files'])
        print(f" - {info['name']}")
        print(f" Path: {info['path']}")
        print(f" Files: {joined_files}")

    # Loading the models happens outside the try below, so a load failure
    # is not reported as an inference failure.
    f5tts = F5TTS()

    demo_request = dict(
        ref_file="tests/ref_audio/test_en_1_ref_short.wav",
        ref_text="some call me nature, others call me mother nature.",
        gen_text="""I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequences.""",
        file_wave="tests/out.wav",
        file_spect="tests/out.png",
        seed=-1,  # -1 asks infer() to draw a random seed
    )

    # The reference audio must exist on disk for this call to succeed.
    try:
        wav, sr, spect = f5tts.infer(**demo_request)
        print("seed :", f5tts.seed)
        print("Inference completed successfully!")
    except Exception as e:
        print(f"Inference failed: {e}")
        print("Note: This example requires actual audio files in the specified paths.")