ellagranger committed on
Commit
6eb7e7c
·
1 Parent(s): 8c2b5ab

Wrapping PyHARP

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. app.py +65 -53
  3. requirements.txt +2 -1
  4. tts/infer_cli.py +2 -1
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🎤
4
  colorFrom: blue
5
  colorTo: green
6
  sdk: gradio
7
- sdk_version: 5.38.0
8
  app_file: app.py
9
  pinned: true
10
  short_description: MegaTTS 3 but with voice cloning!
 
4
  colorFrom: blue
5
  colorTo: green
6
  sdk: gradio
7
+ sdk_version: 5.28.0
8
  app_file: app.py
9
  pinned: true
10
  short_description: MegaTTS 3 but with voice cloning!
app.py CHANGED
@@ -12,6 +12,18 @@ from pydub.effects import normalize
12
  from huggingface_hub import snapshot_download
13
  from tts.infer_cli import MegaTTS3DiTInfer, convert_to_wav, cut_wav
14
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  def download_weights():
17
  """Download model weights from HuggingFace if not already present."""
@@ -87,10 +99,10 @@ def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
87
  # Generate speech with proper error handling
88
  try:
89
  resource_context = infer_pipe.preprocess(file_content)
90
- wav_bytes = infer_pipe.forward(resource_context, inp_text, time_step=infer_timestep, p_w=p_w, t_w=t_w)
91
  # Clean up memory after successful generation
92
  cleanup_memory()
93
- return wav_bytes
94
  except RuntimeError as cuda_error:
95
  if "CUDA" in str(cuda_error):
96
  print(f"CUDA error detected: {cuda_error}")
@@ -99,7 +111,7 @@ def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
99
  gr.Warning("CUDA error occurred. Model has been reset. Please try again.")
100
  else:
101
  gr.Warning("CUDA error occurred and model reset failed. Please restart the application.")
102
- return None
103
  else:
104
  raise cuda_error
105
 
@@ -108,7 +120,14 @@ def generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w):
108
  gr.Warning(f"Speech generation failed: {str(e)}")
109
  # Clean up CUDA memory on any error
110
  cleanup_memory()
111
- return None
 
 
 
 
 
 
 
112
 
113
  def cleanup_memory():
114
  """Clean up GPU and system memory."""
@@ -169,57 +188,50 @@ def preprocess_audio_robust(audio_path, target_sr=22050, max_duration=30):
169
 
170
  with gr.Blocks(title="MegaTTS3 Voice Cloning") as demo:
171
  gr.Markdown("# MegaTTS 3 Voice Cloning")
172
- gr.Markdown("MegaTTS 3 is a text-to-speech model trained by ByteDance with exceptional voice cloning capabilities. The original authors did not release the WavVAE encoder, so voice cloning was not publicly available; however, thanks to [@ACoderPassBy](https://modelscope.cn/models/ACoderPassBy/MegaTTS-SFT)'s WavVAE encoder, we can now clone voices with MegaTTS 3!")
173
- gr.Markdown("This is by no means the best voice cloning solution, but it works pretty well for some specific use-cases. Try out multiple and see which one works best for you.")
174
- gr.Markdown("**Please use this Space responsibly and do not abuse it!** This demo is for research and educational purposes only!")
175
- gr.Markdown("h/t to MysteryShack on Discord for the info about the unofficial WavVAE encoder!")
176
- gr.Markdown("Upload a reference audio clip and enter text to generate speech with the cloned voice.")
177
 
178
- with gr.Row():
179
- with gr.Column():
180
- reference_audio = gr.Audio(
181
- label="Reference Audio",
182
- type="filepath",
183
- sources=["upload", "microphone"]
184
- )
185
- text_input = gr.Textbox(
186
- label="Text to Generate",
187
- placeholder="Enter the text you want to synthesize...",
188
- lines=3
189
- )
190
-
191
- with gr.Accordion("Advanced Options", open=False):
192
- infer_timestep = gr.Number(
193
- label="Inference Timesteps",
194
- value=32,
195
- minimum=1,
196
- maximum=100,
197
- step=1
198
- )
199
- p_w = gr.Number(
200
- label="Intelligibility Weight",
201
- value=1.4,
202
- minimum=0.1,
203
- maximum=5.0,
204
- step=0.1
205
- )
206
- t_w = gr.Number(
207
- label="Similarity Weight",
208
- value=3.0,
209
- minimum=0.1,
210
- maximum=10.0,
211
- step=0.1
212
- )
213
-
214
- generate_btn = gr.Button("Generate Speech", variant="primary")
215
-
216
- with gr.Column():
217
- output_audio = gr.Audio(label="Generated Audio")
218
 
219
- generate_btn.click(
220
- fn=generate_speech,
221
- inputs=[reference_audio, text_input, infer_timestep, p_w, t_w],
222
- outputs=[output_audio]
 
223
  )
224
 
225
  if __name__ == '__main__':
 
12
  from huggingface_hub import snapshot_download
13
  from tts.infer_cli import MegaTTS3DiTInfer, convert_to_wav, cut_wav
14
 
15
+ from pyharp.core import ModelCard, build_endpoint
16
+ from audiotools import AudioSignal
17
+
18
+
19
+ model_card = ModelCard(
20
+ name="MegaTTS 3 Voice Cloning",
21
+ description=("MegaTTS 3 is a text-to-speech model trained by ByteDance with exceptional voice cloning capabilities.\n"
22
+ "Please use this Space responsibly and do not abuse it! This demo is for research and educational purposes only."),
23
+ author="Ziyue Jiang et al.",
24
+ tags=["voice cloning"]
25
+ )
26
+
27
 
28
  def download_weights():
29
  """Download model weights from HuggingFace if not already present."""
 
99
  # Generate speech with proper error handling
100
  try:
101
  resource_context = infer_pipe.preprocess(file_content)
102
+ fs, wav = infer_pipe.forward(resource_context, inp_text, time_step=infer_timestep, p_w=p_w, t_w=t_w)
103
  # Clean up memory after successful generation
104
  cleanup_memory()
105
+ return fs, wav
106
  except RuntimeError as cuda_error:
107
  if "CUDA" in str(cuda_error):
108
  print(f"CUDA error detected: {cuda_error}")
 
111
  gr.Warning("CUDA error occurred. Model has been reset. Please try again.")
112
  else:
113
  gr.Warning("CUDA error occurred and model reset failed. Please restart the application.")
114
+ return None, None
115
  else:
116
  raise cuda_error
117
 
 
120
  gr.Warning(f"Speech generation failed: {str(e)}")
121
  # Clean up CUDA memory on any error
122
  cleanup_memory()
123
+ return None, None
124
+
125
+
126
+ def process_fn(inp_audio, inp_text, infer_timestep, p_w, t_w):
127
+ fs, wav = generate_speech(inp_audio, inp_text, infer_timestep, p_w, t_w)
128
+ sig = AudioSignal(wav, sample_rate=fs)
129
+ return save_audio(sig)
130
+
131
 
132
  def cleanup_memory():
133
  """Clean up GPU and system memory."""
 
188
 
189
  with gr.Blocks(title="MegaTTS3 Voice Cloning") as demo:
190
  gr.Markdown("# MegaTTS 3 Voice Cloning")
191
+ # gr.Markdown("MegaTTS 3 is a text-to-speech model trained by ByteDance with exceptional voice cloning capabilities. The original authors did not release the WavVAE encoder, so voice cloning was not publicly available; however, thanks to [@ACoderPassBy](https://modelscope.cn/models/ACoderPassBy/MegaTTS-SFT)'s WavVAE encoder, we can now clone voices with MegaTTS 3!")
192
+ # gr.Markdown("This is by no means the best voice cloning solution, but it works pretty well for some specific use-cases. Try out multiple and see which one works best for you.")
193
+ # gr.Markdown("**Please use this Space responsibly and do not abuse it!** This demo is for research and educational purposes only!")
194
+ # gr.Markdown("h/t to MysteryShack on Discord for the info about the unofficial WavVAE encoder!")
195
+ # gr.Markdown("Upload a reference audio clip and enter text to generate speech with the cloned voice.")
196
 
197
+ reference_audio = gr.Audio(
198
+ label="Reference Audio",
199
+ type="filepath"
200
+ )
201
+ text_input = gr.Textbox(
202
+ label="Text to Generate",
203
+ placeholder="Enter the text you want to synthesize..."
204
+ )
205
+
206
+ infer_timestep = gr.Number(
207
+ label="Inference Timesteps",
208
+ value=32,
209
+ minimum=1,
210
+ maximum=100,
211
+ step=1
212
+ )
213
+ p_w = gr.Number(
214
+ label="Intelligibility Weight",
215
+ value=1.4,
216
+ minimum=0.1,
217
+ maximum=5.0,
218
+ step=0.1
219
+ )
220
+ t_w = gr.Number(
221
+ label="Similarity Weight",
222
+ value=3.0,
223
+ minimum=0.1,
224
+ maximum=10.0,
225
+ step=0.1
226
+ )
227
+
228
+ output_audio = gr.Audio(type="filepath", label="Generated Audio")
 
 
 
 
 
 
 
 
229
 
230
+ _ = build_endpoint(
231
+ model_card=model_card,
232
+ input_components=[reference_audio, text_input, infer_timestep, p_w, t_w],
233
+ output_components=[output_audio],
234
+ process_fn=process_fn
235
  )
236
 
237
  if __name__ == '__main__':
requirements.txt CHANGED
@@ -1,3 +1,4 @@
 
1
  torch
2
  torchaudio
3
  numpy
@@ -15,6 +16,6 @@ x-transformers==1.44.4
15
  torchdiffeq==0.2.5
16
  openai-whisper==20240930
17
  httpx==0.28.1
18
- gradio==5.23.1
19
  hf-transfer
20
  soundfile
 
1
+ git+https://github.com/TEAMuP-dev/pyharp.git@v0.3.0
2
  torch
3
  torchaudio
4
  numpy
 
16
  torchdiffeq==0.2.5
17
  openai-whisper==20240930
18
  httpx==0.28.1
19
+ gradio==5.28.0
20
  hf-transfer
21
  soundfile
tts/infer_cli.py CHANGED
@@ -250,7 +250,8 @@ class MegaTTS3DiTInfer():
250
  wav_pred_.append(wav_pred)
251
 
252
  wav_pred = combine_audio_segments(wav_pred_, sr=self.sr).astype(float)
253
- return to_wav_bytes(wav_pred, self.sr)
 
254
 
255
 
256
  if __name__ == '__main__':
 
250
  wav_pred_.append(wav_pred)
251
 
252
  wav_pred = combine_audio_segments(wav_pred_, sr=self.sr).astype(float)
253
+ return self.sr, wav_pred
254
+ # return to_wav_bytes(wav_pred, self.sr)
255
 
256
 
257
  if __name__ == '__main__':