baenacoco committed
Commit abeb645 · verified · 1 Parent(s): 9d2c48c

Upload folder using huggingface_hub

Files changed (5)
  1. README.md +6 -7
  2. app.py +384 -0
  3. hub_utils.py +64 -0
  4. packages.txt +1 -0
  5. requirements.txt +12 -0
README.md CHANGED
@@ -1,12 +1,11 @@
 ---
-title: Talking Head Voice Train
-emoji: 🐨
-colorFrom: gray
-colorTo: green
+title: Talking Head - Voice Train
+emoji: 🗣️
+colorFrom: purple
+colorTo: pink
 sdk: gradio
-sdk_version: 6.9.0
+sdk_version: 5.9.1
 app_file: app.py
 pinned: false
+hardware: a100-large
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
app.py ADDED
@@ -0,0 +1,384 @@
+"""Space 3: Train Voice (Whisper + F5-TTS fine-tuning)
+
+Downloads audio from Hub -> Whisper transcription -> F5-TTS fine-tune -> saves model to Hub.
+GPU: A100 (Whisper large-v3 + F5-TTS training)
+"""
+import gc
+import json
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import traceback
+from pathlib import Path
+
+import gradio as gr
+import numpy as np
+import soundfile as sf
+import torch
+
+from hub_utils import download_step, upload_step, list_projects, HF_DATASET_REPO_ID
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
+logger = logging.getLogger(__name__)
+
+# ── Config ──
+IS_HF_SPACE = os.environ.get("SPACE_ID") is not None
+_data_path = Path("/data")
+if IS_HF_SPACE and _data_path.exists() and os.access(_data_path, os.W_OK):
+    BASE_DIR = _data_path
+else:
+    BASE_DIR = Path("data")
+
+AUDIO_DIR = BASE_DIR / "audio"
+VOICE_MODEL_DIR = BASE_DIR / "voice_model"
+TEMP_DIR = BASE_DIR / "temp"
+HF_CACHE_DIR = BASE_DIR / "hf_cache"
+
+for d in [AUDIO_DIR, VOICE_MODEL_DIR, TEMP_DIR, HF_CACHE_DIR]:
+    d.mkdir(parents=True, exist_ok=True)
+
+os.environ["HF_HOME"] = str(HF_CACHE_DIR)
+os.environ["TRANSFORMERS_CACHE"] = str(HF_CACHE_DIR)
+
+AUDIO_SAMPLE_RATE = 16000
+F5_SAMPLE_RATE = 24000
+VOICE_FINETUNE_EPOCHS = 100
+VOICE_FINETUNE_LR = 1e-5
+VOICE_FINETUNE_BATCH_SIZE = 3200
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+APP_VERSION = "1.0.0"
+
+
+def _clear_cache():
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+
+# ── Whisper transcription ──
+
+def _transcribe_segments(segment_paths, progress_callback=None):
+    import whisper
+    logger.info("Loading Whisper for transcription...")
+    model = whisper.load_model("medium", device=DEVICE)
+    transcripts = []
+    for i, seg_path in enumerate(segment_paths):
+        if progress_callback:
+            progress_callback(i / len(segment_paths) * 0.3, f"Transcribiendo segmento {i+1}/{len(segment_paths)}...")
+        result = model.transcribe(seg_path, language="es", fp16=torch.cuda.is_available())
+        text = result["text"].strip()
+        if text:
+            transcripts.append({"audio_path": seg_path, "text": text})
+    del model
+    _clear_cache()
+    logger.info(f"Transcribed {len(transcripts)} segments")
+    return transcripts
+
+
+# ── Dataset preparation ──
+
+def _prepare_finetune_dataset(transcripts):
+    dataset_dir = TEMP_DIR / "voice_finetune_data"
+    if dataset_dir.exists():
+        shutil.rmtree(dataset_dir)
+    dataset_dir.mkdir(parents=True)
+    wavs_dir = dataset_dir / "wavs"
+    wavs_dir.mkdir()
+
+    metadata = []
+    for i, item in enumerate(transcripts):
+        audio, sr = sf.read(item["audio_path"])
+        if sr != F5_SAMPLE_RATE:
+            import torchaudio
+            audio_tensor = torch.from_numpy(audio).float()
+            if audio_tensor.dim() == 1:
+                audio_tensor = audio_tensor.unsqueeze(0)
+            resampler = torchaudio.transforms.Resample(sr, F5_SAMPLE_RATE)
+            audio_tensor = resampler(audio_tensor)
+            audio = audio_tensor.squeeze(0).numpy()
+
+        max_samples = 15 * F5_SAMPLE_RATE
+        min_samples = 2 * F5_SAMPLE_RATE
+
+        if len(audio) <= max_samples:
+            clips = [(audio, item["text"])]
+        else:
+            n_parts = max(1, len(audio) // (10 * F5_SAMPLE_RATE))
+            part_size = len(audio) // n_parts
+            clips = []
+            words = item["text"].split()
+            words_per_part = max(1, len(words) // n_parts)
+            for j in range(n_parts):
+                start = j * part_size
+                end = min((j + 1) * part_size, len(audio))
+                if end - start < min_samples:
+                    continue
+                text_start = j * words_per_part
+                text_end = min((j + 1) * words_per_part, len(words))
+                part_text = " ".join(words[text_start:text_end])
+                if part_text:
+                    clips.append((audio[start:end], part_text))
+
+        for j, (clip_audio, clip_text) in enumerate(clips):
+            fname = f"clip_{i:04d}_{j:02d}.wav"
+            wav_path = wavs_dir / fname
+            sf.write(str(wav_path), clip_audio, F5_SAMPLE_RATE)
+            duration = len(clip_audio) / F5_SAMPLE_RATE
+            metadata.append({"audio_file": str(wav_path.resolve()), "text": clip_text, "duration": round(duration, 2)})
+
+    meta_path = dataset_dir / "metadata.csv"
+    with open(meta_path, "w") as f:
+        f.write("audio_file|text\n")
+        for item in metadata:
+            f.write(f"{item['audio_file']}|{item['text']}\n")
+
+    logger.info(f"Prepared {len(metadata)} clips for fine-tuning")
+    return dataset_dir
+
+
+def _ensure_vocab_file():
+    from importlib.resources import files as pkg_files
+    vocab_path = Path(pkg_files("f5_tts").joinpath("../../data/Emilia_ZH_EN_pinyin/vocab.txt"))
+    if vocab_path.exists():
+        return
+    vocab_path.parent.mkdir(parents=True, exist_ok=True)
+    logger.info("Downloading pretrained vocab.txt for F5-TTS...")
+    import urllib.request
+    url = "https://raw.githubusercontent.com/SWivid/F5-TTS/main/data/Emilia_ZH_EN_pinyin/vocab.txt"
+    urllib.request.urlretrieve(url, str(vocab_path))
+
+
+def _prepare_arrow_dataset(dataset_dir, progress_callback=None):
+    if progress_callback:
+        progress_callback(0.32, "Preparando dataset Arrow...")
+
+    _ensure_vocab_file()
+
+    meta_csv = dataset_dir / "metadata.csv"
+    arrow_dir = dataset_dir / "arrow_data"
+    arrow_dir.mkdir(parents=True, exist_ok=True)
+
+    import csv
+    from datasets import Dataset as HFDataset, Audio
+    from f5_tts.model.utils import convert_char_to_pinyin
+
+    audio_text_pairs = []
+    with open(meta_csv, "r", encoding="utf-8-sig") as f:
+        reader = csv.reader(f, delimiter="|")
+        next(reader, None)
+        for row in reader:
+            if len(row) >= 2:
+                audio_text_pairs.append((row[0].strip(), row[1].strip()))
+
+    if not audio_text_pairs:
+        raise RuntimeError("No audio-text pairs found in metadata.csv")
+
+    texts = [pair[1] for pair in audio_text_pairs]
+    pinyin_texts = convert_char_to_pinyin(texts, polyphone=True)
+
+    valid_audio_paths = []
+    valid_texts = []
+    durations = []
+    for i, (audio_path, text) in enumerate(audio_text_pairs):
+        if not Path(audio_path).exists():
+            continue
+        audio_info = sf.info(audio_path)
+        duration = audio_info.duration
+        if duration < 0.3 or duration > 30:
+            continue
+        valid_audio_paths.append(audio_path)
+        valid_texts.append(pinyin_texts[i])
+        durations.append(duration)
+
+    if not valid_audio_paths:
+        raise RuntimeError("No valid audio clips after filtering")
+
+    ds = HFDataset.from_dict({"audio_path": valid_audio_paths, "text": valid_texts})
+    ds = ds.cast_column("audio_path", Audio())
+    ds.save_to_disk(str(arrow_dir / "raw"))
+    ds.to_parquet(str(arrow_dir / "raw.parquet"))
+
+    with open(arrow_dir / "duration.json", "w") as f:
+        json.dump({"duration": durations}, f)
+
+    from importlib.resources import files as pkg_files
+    pretrained_vocab = Path(pkg_files("f5_tts").joinpath("../../data/Emilia_ZH_EN_pinyin/vocab.txt"))
+    if pretrained_vocab.exists():
+        shutil.copy2(str(pretrained_vocab), str(arrow_dir / "vocab.txt"))
+
+    logger.info(f"Arrow dataset: {len(valid_audio_paths)} samples, {sum(durations)/3600:.2f}h total")
+    return arrow_dir
+
+
+def finetune_voice(segment_paths, epochs, learning_rate, batch_size, progress_callback=None):
+    if not segment_paths:
+        raise ValueError("No audio segments found.")
+
+    _clear_cache()
+
+    transcripts = _transcribe_segments(segment_paths, progress_callback)
+    if not transcripts:
+        raise ValueError("Could not transcribe any audio segments")
+
+    if progress_callback:
+        progress_callback(0.3, "Preparando dataset...")
+    dataset_dir = _prepare_finetune_dataset(transcripts)
+    arrow_dir = _prepare_arrow_dataset(dataset_dir, progress_callback)
+
+    if progress_callback:
+        progress_callback(0.35, "Iniciando fine-tuning F5-TTS...")
+
+    VOICE_MODEL_DIR.mkdir(parents=True, exist_ok=True)
+
+    from importlib.resources import files as pkg_files
+    f5_data_root = Path(pkg_files("f5_tts").joinpath("../../data"))
+    f5_data_root.mkdir(parents=True, exist_ok=True)
+
+    dataset_name = "voice_finetune"
+    tokenizer = "char"
+    expected_dir = f5_data_root / f"{dataset_name}_{tokenizer}"
+    if expected_dir.exists():
+        shutil.rmtree(expected_dir)
+    shutil.copytree(str(arrow_dir), str(expected_dir))
+
+    cmd = [
+        sys.executable, "-m", "f5_tts.train.finetune_cli",
+        "--exp_name", "F5TTS_v1_Base",
+        "--dataset_name", dataset_name,
+        "--learning_rate", str(learning_rate),
+        "--batch_size_per_gpu", str(batch_size),
+        "--epochs", str(epochs),
+        "--finetune",
+        "--save_per_updates", "500",
+        "--last_per_updates", "200",
+        "--num_warmup_updates", "100",
+        "--tokenizer", tokenizer,
+    ]
+
+    logger.info(f"Running F5-TTS finetune: {' '.join(cmd)}")
+
+    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1)
+    for line in process.stdout:
+        line = line.strip()
+        if line:
+            logger.info(f"[F5-TTS] {line}")
+            if progress_callback and "step" in line.lower():
+                progress_callback(0.4, f"Training: {line[:80]}...")
+
+    process.wait()
+    if process.returncode != 0:
+        raise RuntimeError(f"F5-TTS fine-tuning failed with exit code {process.returncode}")
+
+    ckpt_dir = Path(f"ckpts/{dataset_name}")
+    if ckpt_dir.exists():
+        for f in ckpt_dir.glob("*.pt"):
+            shutil.copy2(str(f), str(VOICE_MODEL_DIR / f.name))
+        for f in ckpt_dir.glob("*.safetensors"):
+            shutil.copy2(str(f), str(VOICE_MODEL_DIR / f.name))
+
+    ref_path = VOICE_MODEL_DIR / "reference.wav"
+    if segment_paths:
+        shutil.copy2(segment_paths[0], str(ref_path))
+
+    shutil.rmtree(dataset_dir, ignore_errors=True)
+    _clear_cache()
+
+    return str(VOICE_MODEL_DIR)
+
+
+# ── Gradio handlers ──
+
+def download_audio_from_hub(project_name, progress=gr.Progress()):
+    if not project_name or not project_name.strip():
+        return "Error: Debes introducir un nombre de proyecto"
+    name = project_name.strip()
+    try:
+        if AUDIO_DIR.exists():
+            shutil.rmtree(AUDIO_DIR)
+        AUDIO_DIR.mkdir(parents=True)
+
+        download_step(name, "step2_audio", str(BASE_DIR))
+        # Files are downloaded to BASE_DIR/{name}/step2_audio/ - move to AUDIO_DIR
+        src = BASE_DIR / name / "step2_audio"
+        if src.exists():
+            for f in src.iterdir():
+                shutil.move(str(f), str(AUDIO_DIR / f.name))
+            shutil.rmtree(BASE_DIR / name, ignore_errors=True)
+
+        segments = sorted(AUDIO_DIR.glob("segment_*.wav"))
+        return f"OK - Descargados {len(segments)} segmentos de audio"
+    except Exception as e:
+        return f"Error: {e}"
+
+
+def train_voice_handler(project_name, epochs, lr, progress=gr.Progress()):
+    if not project_name or not project_name.strip():
+        return "Error: Debes introducir un nombre de proyecto"
+
+    segment_paths = sorted(str(p) for p in AUDIO_DIR.glob("segment_*.wav"))
+    if not segment_paths:
+        return "Error: No hay segmentos de audio. Descarga primero desde el Hub."
+
+    logger.info(f"=== Voice Training Started === epochs={epochs}, lr={lr}")
+    try:
+        result = finetune_voice(
+            segment_paths, epochs=int(epochs), learning_rate=lr,
+            batch_size=VOICE_FINETUNE_BATCH_SIZE,
+            progress_callback=lambda p, m: progress(p, desc=m),
+        )
+        logger.info(f"=== Voice Training Complete === {result}")
+        return f"OK - Modelo de voz guardado en: {result}"
+    except Exception as e:
+        logger.error(f"=== Voice Training Failed ===\n{traceback.format_exc()}")
+        return f"Error: {e}"
+
+
+def save_to_hub(project_name):
+    if not project_name or not project_name.strip():
+        return "Error: Debes introducir un nombre de proyecto"
+    name = project_name.strip()
+    models = list(VOICE_MODEL_DIR.glob("*.pt")) + list(VOICE_MODEL_DIR.glob("*.safetensors"))
+    if not models:
+        return "Error: No hay modelo de voz para guardar. Entrena primero."
+    try:
+        return upload_step(name, "step3_voice", str(VOICE_MODEL_DIR))
+    except Exception as e:
+        return f"Error: {e}"
+
+
+# ── UI ──
+
+with gr.Blocks(title="Talking Head - Voice Train", theme=gr.themes.Soft()) as demo:
+    gr.Markdown(f"# Talking Head - Entrenar Voz `v{APP_VERSION}`\nWhisper transcripcion + F5-TTS fine-tuning")
+
+    project_name = gr.Textbox(
+        label="Nombre del proyecto",
+        placeholder="mi_proyecto",
+        info="Obligatorio. Se usa como carpeta en el Hub.",
+    )
+
+    gr.Markdown("### 1. Descargar audio del Hub")
+    download_btn = gr.Button("Descargar audio del Hub", variant="secondary")
+    download_status = gr.Textbox(label="Estado descarga", interactive=False)
+
+    gr.Markdown("### 2. Entrenar modelo de voz")
+    with gr.Row():
+        voice_epochs = gr.Slider(10, 300, value=VOICE_FINETUNE_EPOCHS, step=10, label="Epochs")
+        voice_lr = gr.Number(value=VOICE_FINETUNE_LR, label="Learning Rate")
+    train_btn = gr.Button("Entrenar Voz", variant="primary")
+    train_status = gr.Textbox(label="Estado entrenamiento", interactive=False)
+
+    gr.Markdown("### 3. Guardar modelo en Hub")
+    save_btn = gr.Button("Guardar en Hub", variant="secondary")
+    save_status = gr.Textbox(label="Estado guardado", interactive=False)
+
+    download_btn.click(download_audio_from_hub, inputs=[project_name], outputs=[download_status])
+    train_btn.click(train_voice_handler, inputs=[project_name, voice_epochs, voice_lr], outputs=[train_status])
+    save_btn.click(save_to_hub, inputs=[project_name], outputs=[save_status])
+
+if __name__ == "__main__":
+    demo.queue().launch(server_name="0.0.0.0", server_port=7860)
hub_utils.py ADDED
@@ -0,0 +1,64 @@
+"""Hub utilities for uploading/downloading step data to HF Dataset repo."""
+import os
+import logging
+from pathlib import Path
+from huggingface_hub import HfApi, hf_hub_download, list_repo_tree
+
+logger = logging.getLogger(__name__)
+
+HF_DATASET_REPO_ID = "baenacoco/talking-head-avatar"
+
+
+def _get_api():
+    token = os.environ.get("HF_TOKEN")
+    if not token:
+        raise ValueError("HF_TOKEN no encontrado en variables de entorno")
+    api = HfApi(token=token)
+    api.create_repo(repo_id=HF_DATASET_REPO_ID, repo_type="dataset", exist_ok=True)
+    return api
+
+
+def upload_step(name: str, step_folder: str, local_dir: str):
+    """Upload a local directory to {name}/{step_folder}/ in the dataset repo."""
+    api = _get_api()
+    api.upload_folder(
+        folder_path=local_dir,
+        path_in_repo=f"{name}/{step_folder}",
+        repo_id=HF_DATASET_REPO_ID,
+        repo_type="dataset",
+    )
+    logger.info(f"Uploaded {local_dir} -> {name}/{step_folder}")
+    return f"Subido a Hub: {name}/{step_folder}"
+
+
+def download_step(name: str, step_folder: str, local_dir: str):
+    """Download {name}/{step_folder}/ from the dataset repo to a local directory."""
+    from huggingface_hub import snapshot_download
+    token = os.environ.get("HF_TOKEN")
+    snapshot_download(
+        repo_id=HF_DATASET_REPO_ID,
+        repo_type="dataset",
+        local_dir=local_dir,
+        allow_patterns=[f"{name}/{step_folder}/**"],
+        token=token,
+    )
+    logger.info(f"Downloaded {name}/{step_folder} -> {local_dir}")
+    return f"Descargado de Hub: {name}/{step_folder}"
+
+
+def list_projects() -> list[str]:
+    """List project names (top-level folders) in the dataset repo."""
+    token = os.environ.get("HF_TOKEN")
+    try:
+        api = HfApi(token=token)
+        entries = list(api.list_repo_tree(
+            repo_id=HF_DATASET_REPO_ID, repo_type="dataset", path_in_repo="",
+        ))
+        return sorted(set(
+            e.rfilename.split("/")[0] if hasattr(e, "rfilename") else e.path.split("/")[0]
+            for e in entries
+            if ("/" in getattr(e, "rfilename", "")) or hasattr(e, "path")
+        ))
+    except Exception as e:
+        logger.warning(f"Could not list projects: {e}")
+        return []
packages.txt ADDED
@@ -0,0 +1 @@
+ffmpeg
requirements.txt ADDED
@@ -0,0 +1,12 @@
+setuptools>=69.0.0
+gradio>=5.9.1
+torch>=2.1.0
+torchaudio>=2.1.0
+numpy>=1.24.0
+soundfile>=0.12.0
+huggingface_hub>=0.20.0
+datasets>=2.14.0
+openai-whisper>=20231117
+f5-tts>=0.3.0
+sentencepiece>=0.1.99
+protobuf>=3.20.0