Commit: 6eb7e7c (1 parent)
Parent(s): 8c2b5ab
Wrapping PyHARP
Browse files
- README.md +1 -1
- app.py +65 -53
- requirements.txt +2 -1
- tts/infer_cli.py +2 -1
README.md
CHANGED
|
@@ -4,7 +4,7 @@ emoji: 🎤
|
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: green
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 5.
|
| 8 |
app_file: app.py
|
| 9 |
pinned: true
|
| 10 |
short_description: MegaTTS 3 but with voice cloning!
|
|
|
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: green
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 5.28.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: true
|
| 10 |
short_description: MegaTTS 3 but with voice cloning!
|
app.py
CHANGED
|
@@ -12,6 +12,18 @@ from pydub.effects import normalize
|
|
| 12 |
from huggingface_hub import snapshot_download
|
| 13 |
from tts.infer_cli import MegaTTS3DiTInfer, convert_to_wav, cut_wav
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
def download_weights():
|
| 17 |
"""Download model weights from HuggingFace if not already present."""
|
|
@@ -87,10 +99,10 @@ def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
|
|
| 87 |
# Generate speech with proper error handling
|
| 88 |
try:
|
| 89 |
resource_context = infer_pipe.preprocess(file_content)
|
| 90 |
-
|
| 91 |
# Clean up memory after successful generation
|
| 92 |
cleanup_memory()
|
| 93 |
-
return
|
| 94 |
except RuntimeError as cuda_error:
|
| 95 |
if "CUDA" in str(cuda_error):
|
| 96 |
print(f"CUDA error detected: {cuda_error}")
|
|
@@ -99,7 +111,7 @@ def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
|
|
| 99 |
gr.Warning("CUDA error occurred. Model has been reset. Please try again.")
|
| 100 |
else:
|
| 101 |
gr.Warning("CUDA error occurred and model reset failed. Please restart the application.")
|
| 102 |
-
return None
|
| 103 |
else:
|
| 104 |
raise cuda_error
|
| 105 |
|
|
@@ -108,7 +120,14 @@ def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
|
|
| 108 |
gr.Warning(f"Speech generation failed: {str(e)}")
|
| 109 |
# Clean up CUDA memory on any error
|
| 110 |
cleanup_memory()
|
| 111 |
-
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
def cleanup_memory():
|
| 114 |
"""Clean up GPU and system memory."""
|
|
@@ -169,57 +188,50 @@ def preprocess_audio_robust(audio_path, target_sr=22050, max_duration=30):
|
|
| 169 |
|
| 170 |
with gr.Blocks(title="MegaTTS3 Voice Cloning") as demo:
|
| 171 |
gr.Markdown("# MegaTTS 3 Voice Cloning")
|
| 172 |
-
gr.Markdown("MegaTTS 3 is a text-to-speech model trained by ByteDance with exceptional voice cloning capabilities. The original authors did not release the WavVAE encoder, so voice cloning was not publicly available; however, thanks to [@ACoderPassBy](https://modelscope.cn/models/ACoderPassBy/MegaTTS-SFT)'s WavVAE encoder, we can now clone voices with MegaTTS 3!")
|
| 173 |
-
gr.Markdown("This is by no means the best voice cloning solution, but it works pretty well for some specific use-cases. Try out multiple and see which one works best for you.")
|
| 174 |
-
gr.Markdown("**Please use this Space responsibly and do not abuse it!** This demo is for research and educational purposes only!")
|
| 175 |
-
gr.Markdown("h/t to MysteryShack on Discord for the info about the unofficial WavVAE encoder!")
|
| 176 |
-
gr.Markdown("Upload a reference audio clip and enter text to generate speech with the cloned voice.")
|
| 177 |
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
maximum=10.0,
|
| 211 |
-
step=0.1
|
| 212 |
-
)
|
| 213 |
-
|
| 214 |
-
generate_btn = gr.Button("Generate Speech", variant="primary")
|
| 215 |
-
|
| 216 |
-
with gr.Column():
|
| 217 |
-
output_audio = gr.Audio(label="Generated Audio")
|
| 218 |
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
|
|
|
| 223 |
)
|
| 224 |
|
| 225 |
if __name__ == '__main__':
|
|
|
|
| 12 |
from huggingface_hub import snapshot_download
|
| 13 |
from tts.infer_cli import MegaTTS3DiTInfer, convert_to_wav, cut_wav
|
| 14 |
|
| 15 |
+
from pyharp.core import ModelCard, build_endpoint
|
| 16 |
+
from audiotools import AudioSignal
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
model_card = ModelCard(
|
| 20 |
+
name="MegaTTS 3 Voice Cloning",
|
| 21 |
+
description=("MegaTTS 3 is a text-to-speech model trained by ByteDance with exceptional voice cloning capabilities.\n"
|
| 22 |
+
"Please use this Space responsibly and do not abuse it! This demo is for research and educational purposes only."),
|
| 23 |
+
author="Ziyue Jiang et al.",
|
| 24 |
+
tags=["voice cloning"]
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
|
| 28 |
def download_weights():
|
| 29 |
"""Download model weights from HuggingFace if not already present."""
|
|
|
|
| 99 |
# Generate speech with proper error handling
|
| 100 |
try:
|
| 101 |
resource_context = infer_pipe.preprocess(file_content)
|
| 102 |
+
fs, wav = infer_pipe.forward(resource_context, inp_text, time_step=infer_timestep, p_w=p_w, t_w=t_w)
|
| 103 |
# Clean up memory after successful generation
|
| 104 |
cleanup_memory()
|
| 105 |
+
return fs, wav
|
| 106 |
except RuntimeError as cuda_error:
|
| 107 |
if "CUDA" in str(cuda_error):
|
| 108 |
print(f"CUDA error detected: {cuda_error}")
|
|
|
|
| 111 |
gr.Warning("CUDA error occurred. Model has been reset. Please try again.")
|
| 112 |
else:
|
| 113 |
gr.Warning("CUDA error occurred and model reset failed. Please restart the application.")
|
| 114 |
+
return None, None
|
| 115 |
else:
|
| 116 |
raise cuda_error
|
| 117 |
|
|
|
|
| 120 |
gr.Warning(f"Speech generation failed: {str(e)}")
|
| 121 |
# Clean up CUDA memory on any error
|
| 122 |
cleanup_memory()
|
| 123 |
+
return None, None
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def process_fn(inp_audio, inp_text, infer_timestep, p_w, t_w):
|
| 127 |
+
fs, wav = generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w)
|
| 128 |
+
sig = AudioSignal(wav, sample_rate=fs)
|
| 129 |
+
return save_audio(sig)
|
| 130 |
+
|
| 131 |
|
| 132 |
def cleanup_memory():
|
| 133 |
"""Clean up GPU and system memory."""
|
|
|
|
| 188 |
|
| 189 |
with gr.Blocks(title="MegaTTS3 Voice Cloning") as demo:
|
| 190 |
gr.Markdown("# MegaTTS 3 Voice Cloning")
|
| 191 |
+
# gr.Markdown("MegaTTS 3 is a text-to-speech model trained by ByteDance with exceptional voice cloning capabilities. The original authors did not release the WavVAE encoder, so voice cloning was not publicly available; however, thanks to [@ACoderPassBy](https://modelscope.cn/models/ACoderPassBy/MegaTTS-SFT)'s WavVAE encoder, we can now clone voices with MegaTTS 3!")
|
| 192 |
+
# gr.Markdown("This is by no means the best voice cloning solution, but it works pretty well for some specific use-cases. Try out multiple and see which one works best for you.")
|
| 193 |
+
# gr.Markdown("**Please use this Space responsibly and do not abuse it!** This demo is for research and educational purposes only!")
|
| 194 |
+
# gr.Markdown("h/t to MysteryShack on Discord for the info about the unofficial WavVAE encoder!")
|
| 195 |
+
# gr.Markdown("Upload a reference audio clip and enter text to generate speech with the cloned voice.")
|
| 196 |
|
| 197 |
+
reference_audio = gr.Audio(
|
| 198 |
+
label="Reference Audio",
|
| 199 |
+
type="filepath"
|
| 200 |
+
)
|
| 201 |
+
text_input = gr.Textbox(
|
| 202 |
+
label="Text to Generate",
|
| 203 |
+
placeholder="Enter the text you want to synthesize..."
|
| 204 |
+
)
|
| 205 |
+
|
| 206 |
+
infer_timestep = gr.Number(
|
| 207 |
+
label="Inference Timesteps",
|
| 208 |
+
value=32,
|
| 209 |
+
minimum=1,
|
| 210 |
+
maximum=100,
|
| 211 |
+
step=1
|
| 212 |
+
)
|
| 213 |
+
p_w = gr.Number(
|
| 214 |
+
label="Intelligibility Weight",
|
| 215 |
+
value=1.4,
|
| 216 |
+
minimum=0.1,
|
| 217 |
+
maximum=5.0,
|
| 218 |
+
step=0.1
|
| 219 |
+
)
|
| 220 |
+
t_w = gr.Number(
|
| 221 |
+
label="Similarity Weight",
|
| 222 |
+
value=3.0,
|
| 223 |
+
minimum=0.1,
|
| 224 |
+
maximum=10.0,
|
| 225 |
+
step=0.1
|
| 226 |
+
)
|
| 227 |
+
|
| 228 |
+
output_audio = gr.Audio(type="filepath", label="Generated Audio")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
|
| 230 |
+
_ = build_endpoint(
|
| 231 |
+
model_card=model_card,
|
| 232 |
+
input_components=[reference_audio, text_input, infer_timestep, p_w, t_w],
|
| 233 |
+
output_components=[output_audio],
|
| 234 |
+
process_fn=process_fn
|
| 235 |
)
|
| 236 |
|
| 237 |
if __name__ == '__main__':
|
requirements.txt
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
torch
|
| 2 |
torchaudio
|
| 3 |
numpy
|
|
@@ -15,6 +16,6 @@ x-transformers==1.44.4
|
|
| 15 |
torchdiffeq==0.2.5
|
| 16 |
openai-whisper==20240930
|
| 17 |
httpx==0.28.1
|
| 18 |
-
gradio==5.
|
| 19 |
hf-transfer
|
| 20 |
soundfile
|
|
|
|
| 1 |
+
git+https://github.com/TEAMuP-dev/pyharp.git@v0.3.0
|
| 2 |
torch
|
| 3 |
torchaudio
|
| 4 |
numpy
|
|
|
|
| 16 |
torchdiffeq==0.2.5
|
| 17 |
openai-whisper==20240930
|
| 18 |
httpx==0.28.1
|
| 19 |
+
gradio==5.28.0
|
| 20 |
hf-transfer
|
| 21 |
soundfile
|
tts/infer_cli.py
CHANGED
|
@@ -250,7 +250,8 @@ class MegaTTS3DiTInfer():
|
|
| 250 |
wav_pred_.append(wav_pred)
|
| 251 |
|
| 252 |
wav_pred = combine_audio_segments(wav_pred_, sr=self.sr).astype(float)
|
| 253 |
-
return
|
|
|
|
| 254 |
|
| 255 |
|
| 256 |
if __name__ == '__main__':
|
|
|
|
| 250 |
wav_pred_.append(wav_pred)
|
| 251 |
|
| 252 |
wav_pred = combine_audio_segments(wav_pred_, sr=self.sr).astype(float)
|
| 253 |
+
return self.sr, wav_pred
|
| 254 |
+
# return to_wav_bytes(wav_pred, self.sr)
|
| 255 |
|
| 256 |
|
| 257 |
if __name__ == '__main__':
|