Commit: 6eb7e7c (1 parent)
Parent(s): 8c2b5ab
Wrapping PyHARP
Browse files
- README.md +1 -1
- app.py +65 -53
- requirements.txt +2 -1
- tts/infer_cli.py +2 -1
README.md
CHANGED
|
@@ -4,7 +4,7 @@ emoji: 🎤
|
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: green
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 5.
|
| 8 |
app_file: app.py
|
| 9 |
pinned: true
|
| 10 |
short_description: MegaTTS 3 but with voice cloning!
|
|
|
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: green
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 5.28.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: true
|
| 10 |
short_description: MegaTTS 3 but with voice cloning!
|
app.py
CHANGED
|
@@ -12,6 +12,18 @@ from pydub.effects import normalize
|
|
| 12 |
from huggingface_hub import snapshot_download
|
| 13 |
from tts.infer_cli import MegaTTS3DiTInfer, convert_to_wav, cut_wav
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
def download_weights():
|
| 17 |
"""Download model weights from HuggingFace if not already present."""
|
|
@@ -87,10 +99,10 @@ def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
|
|
| 87 |
# Generate speech with proper error handling
|
| 88 |
try:
|
| 89 |
resource_context = infer_pipe.preprocess(file_content)
|
| 90 |
-
|
| 91 |
# Clean up memory after successful generation
|
| 92 |
cleanup_memory()
|
| 93 |
-
return
|
| 94 |
except RuntimeError as cuda_error:
|
| 95 |
if "CUDA" in str(cuda_error):
|
| 96 |
print(f"CUDA error detected: {cuda_error}")
|
|
@@ -99,7 +111,7 @@ def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
|
|
| 99 |
gr.Warning("CUDA error occurred. Model has been reset. Please try again.")
|
| 100 |
else:
|
| 101 |
gr.Warning("CUDA error occurred and model reset failed. Please restart the application.")
|
| 102 |
-
return None
|
| 103 |
else:
|
| 104 |
raise cuda_error
|
| 105 |
|
|
@@ -108,7 +120,14 @@ def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
|
|
| 108 |
gr.Warning(f"Speech generation failed: {str(e)}")
|
| 109 |
# Clean up CUDA memory on any error
|
| 110 |
cleanup_memory()
|
| 111 |
-
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
def cleanup_memory():
|
| 114 |
"""Clean up GPU and system memory."""
|
|
@@ -169,57 +188,50 @@ def preprocess_audio_robust(audio_path, target_sr=22050, max_duration=30):
|
|
| 169 |
|
| 170 |
with gr.Blocks(title="MegaTTS3 Voice Cloning") as demo:
|
| 171 |
gr.Markdown("# MegaTTS 3 Voice Cloning")
|
| 172 |
-
gr.Markdown("MegaTTS 3 is a text-to-speech model trained by ByteDance with exceptional voice cloning capabilities. The original authors did not release the WavVAE encoder, so voice cloning was not publicly available; however, thanks to [@ACoderPassBy](https://modelscope.cn/models/ACoderPassBy/MegaTTS-SFT)'s WavVAE encoder, we can now clone voices with MegaTTS 3!")
|
| 173 |
-
gr.Markdown("This is by no means the best voice cloning solution, but it works pretty well for some specific use-cases. Try out multiple and see which one works best for you.")
|
| 174 |
-
gr.Markdown("**Please use this Space responsibly and do not abuse it!** This demo is for research and educational purposes only!")
|
| 175 |
-
gr.Markdown("h/t to MysteryShack on Discord for the info about the unofficial WavVAE encoder!")
|
| 176 |
-
gr.Markdown("Upload a reference audio clip and enter text to generate speech with the cloned voice.")
|
| 177 |
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
maximum=10.0,
|
| 211 |
-
step=0.1
|
| 212 |
-
)
|
| 213 |
-
|
| 214 |
-
generate_btn = gr.Button("Generate Speech", variant="primary")
|
| 215 |
-
|
| 216 |
-
with gr.Column():
|
| 217 |
-
output_audio = gr.Audio(label="Generated Audio")
|
| 218 |
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
|
|
|
| 223 |
)
|
| 224 |
|
| 225 |
if __name__ == '__main__':
|
|
|
|
| 12 |
from huggingface_hub import snapshot_download
|
| 13 |
from tts.infer_cli import MegaTTS3DiTInfer, convert_to_wav, cut_wav
|
| 14 |
|
| 15 |
+
from pyharp.core import ModelCard, build_endpoint
|
| 16 |
+
from audiotools import AudioSignal
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
model_card = ModelCard(
|
| 20 |
+
name="MegaTTS 3 Voice Cloning",
|
| 21 |
+
description=("MegaTTS 3 is a text-to-speech model trained by ByteDance with exceptional voice cloning capabilities.\n"
|
| 22 |
+
"Please use this Space responsibly and do not abuse it! This demo is for research and educational purposes only."),
|
| 23 |
+
author="Ziyue Jiang et al.",
|
| 24 |
+
tags=["voice cloning"]
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
|
| 28 |
def download_weights():
|
| 29 |
"""Download model weights from HuggingFace if not already present."""
|
|
|
|
| 99 |
# Generate speech with proper error handling
|
| 100 |
try:
|
| 101 |
resource_context = infer_pipe.preprocess(file_content)
|
| 102 |
+
fs, wav = infer_pipe.forward(resource_context, inp_text, time_step=infer_timestep, p_w=p_w, t_w=t_w)
|
| 103 |
# Clean up memory after successful generation
|
| 104 |
cleanup_memory()
|
| 105 |
+
return fs, wav
|
| 106 |
except RuntimeError as cuda_error:
|
| 107 |
if "CUDA" in str(cuda_error):
|
| 108 |
print(f"CUDA error detected: {cuda_error}")
|
|
|
|
| 111 |
gr.Warning("CUDA error occurred. Model has been reset. Please try again.")
|
| 112 |
else:
|
| 113 |
gr.Warning("CUDA error occurred and model reset failed. Please restart the application.")
|
| 114 |
+
return None, None
|
| 115 |
else:
|
| 116 |
raise cuda_error
|
| 117 |
|
|
|
|
| 120 |
gr.Warning(f"Speech generation failed: {str(e)}")
|
| 121 |
# Clean up CUDA memory on any error
|
| 122 |
cleanup_memory()
|
| 123 |
+
return None, None
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def process_fn(inp_audio, inp_text, infer_timestep, p_w, t_w):
|
| 127 |
+
fs, wav = generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w)
|
| 128 |
+
sig = AudioSignal(wav, sample_rate=fs)
|
| 129 |
+
return save_audio(sig)
|
| 130 |
+
|
| 131 |
|
| 132 |
def cleanup_memory():
|
| 133 |
"""Clean up GPU and system memory."""
|
|
|
|
| 188 |
|
| 189 |
with gr.Blocks(title="MegaTTS3 Voice Cloning") as demo:
|
| 190 |
gr.Markdown("# MegaTTS 3 Voice Cloning")
|
| 191 |
+
# gr.Markdown("MegaTTS 3 is a text-to-speech model trained by ByteDance with exceptional voice cloning capabilities. The original authors did not release the WavVAE encoder, so voice cloning was not publicly available; however, thanks to [@ACoderPassBy](https://modelscope.cn/models/ACoderPassBy/MegaTTS-SFT)'s WavVAE encoder, we can now clone voices with MegaTTS 3!")
|
| 192 |
+
# gr.Markdown("This is by no means the best voice cloning solution, but it works pretty well for some specific use-cases. Try out multiple and see which one works best for you.")
|
| 193 |
+
# gr.Markdown("**Please use this Space responsibly and do not abuse it!** This demo is for research and educational purposes only!")
|
| 194 |
+
# gr.Markdown("h/t to MysteryShack on Discord for the info about the unofficial WavVAE encoder!")
|
| 195 |
+
# gr.Markdown("Upload a reference audio clip and enter text to generate speech with the cloned voice.")
|
| 196 |
|
| 197 |
+
reference_audio = gr.Audio(
|
| 198 |
+
label="Reference Audio",
|
| 199 |
+
type="filepath"
|
| 200 |
+
)
|
| 201 |
+
text_input = gr.Textbox(
|
| 202 |
+
label="Text to Generate",
|
| 203 |
+
placeholder="Enter the text you want to synthesize..."
|
| 204 |
+
)
|
| 205 |
+
|
| 206 |
+
infer_timestep = gr.Number(
|
| 207 |
+
label="Inference Timesteps",
|
| 208 |
+
value=32,
|
| 209 |
+
minimum=1,
|
| 210 |
+
maximum=100,
|
| 211 |
+
step=1
|
| 212 |
+
)
|
| 213 |
+
p_w = gr.Number(
|
| 214 |
+
label="Intelligibility Weight",
|
| 215 |
+
value=1.4,
|
| 216 |
+
minimum=0.1,
|
| 217 |
+
maximum=5.0,
|
| 218 |
+
step=0.1
|
| 219 |
+
)
|
| 220 |
+
t_w = gr.Number(
|
| 221 |
+
label="Similarity Weight",
|
| 222 |
+
value=3.0,
|
| 223 |
+
minimum=0.1,
|
| 224 |
+
maximum=10.0,
|
| 225 |
+
step=0.1
|
| 226 |
+
)
|
| 227 |
+
|
| 228 |
+
output_audio = gr.Audio(type="filepath", label="Generated Audio")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
|
| 230 |
+
_ = build_endpoint(
|
| 231 |
+
model_card=model_card,
|
| 232 |
+
input_components=[reference_audio, text_input, infer_timestep, p_w, t_w],
|
| 233 |
+
output_components=[output_audio],
|
| 234 |
+
process_fn=process_fn
|
| 235 |
)
|
| 236 |
|
| 237 |
if __name__ == '__main__':
|
requirements.txt
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
torch
|
| 2 |
torchaudio
|
| 3 |
numpy
|
|
@@ -15,6 +16,6 @@ x-transformers==1.44.4
|
|
| 15 |
torchdiffeq==0.2.5
|
| 16 |
openai-whisper==20240930
|
| 17 |
httpx==0.28.1
|
| 18 |
-
gradio==5.
|
| 19 |
hf-transfer
|
| 20 |
soundfile
|
|
|
|
| 1 |
+
git+https://github.com/TEAMuP-dev/pyharp.git@v0.3.0
|
| 2 |
torch
|
| 3 |
torchaudio
|
| 4 |
numpy
|
|
|
|
| 16 |
torchdiffeq==0.2.5
|
| 17 |
openai-whisper==20240930
|
| 18 |
httpx==0.28.1
|
| 19 |
+
gradio==5.28.0
|
| 20 |
hf-transfer
|
| 21 |
soundfile
|
tts/infer_cli.py
CHANGED
|
@@ -250,7 +250,8 @@ class MegaTTS3DiTInfer():
|
|
| 250 |
wav_pred_.append(wav_pred)
|
| 251 |
|
| 252 |
wav_pred = combine_audio_segments(wav_pred_, sr=self.sr).astype(float)
|
| 253 |
-
return
|
|
|
|
| 254 |
|
| 255 |
|
| 256 |
if __name__ == '__main__':
|
|
|
|
| 250 |
wav_pred_.append(wav_pred)
|
| 251 |
|
| 252 |
wav_pred = combine_audio_segments(wav_pred_, sr=self.sr).astype(float)
|
| 253 |
+
return self.sr, wav_pred
|
| 254 |
+
# return to_wav_bytes(wav_pred, self.sr)
|
| 255 |
|
| 256 |
|
| 257 |
if __name__ == '__main__':
|