meg-huggingface committed on
Commit
9a41dfd
·
1 Parent(s): f3a2306

Passing in the consent audio to the voice cloning client.

Browse files
Files changed (1) hide show
  1. app.py +49 -13
app.py CHANGED
@@ -1,16 +1,20 @@
1
  import gradio as gr
2
  # import spaces
 
3
 
4
  import src.generate as generate
5
  import src.process as process
6
 
7
- chatterbox_space = gr.load("spaces/ResembleAI/Chatterbox")
 
8
 
 
 
9
  # ------------------- UI printing functions -------------------
10
  def clear_all():
11
  # target, user_transcript, score_html, diff_html, result_html,
12
- # tts_text, clone_status, tts_audio
13
- return "", "", "", "", "", "", "", None
14
 
15
 
16
  def make_result_html(pass_threshold, passed, ratio):
@@ -119,7 +123,7 @@ def transcribe_check(audio_path, target_sentence, model_id, device_pref,
119
  score_html: HTML string to display the score
120
  diff_html: HTML string for displaying the differences between target and user utterance
121
  result_html: HTML string describing the results, or an error message
122
- clone_audio: Bool for whether to allow audio cloning: This makes the audio cloning component visible
123
  """
124
  clone_audio = False
125
  # Transcribe user input
@@ -142,6 +146,18 @@ def transcribe_check(audio_path, target_sentence, model_id, device_pref,
142
 
143
  return user_transcript, score_html, result_html, diff_html, gr.Row(visible=clone_audio)
144
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
  # ------------------- UI -------------------
147
  with gr.Blocks(title="Say the Sentence (English)") as demo:
@@ -164,8 +180,7 @@ with gr.Blocks(title="Say the Sentence (English)") as demo:
164
  btn_clear = gr.Button("🧹 Clear")
165
 
166
  with gr.Row():
167
- audio = gr.Audio(sources=["microphone"], type="filepath",
168
- label="Record your voice")
169
 
170
  with gr.Accordion("Advanced settings", open=False):
171
  model_id = gr.Dropdown(
@@ -196,10 +211,31 @@ with gr.Blocks(title="Say the Sentence (English)") as demo:
196
  diff_html = gr.HTML(
197
  label="Word-level diff (red = expected but missing / green = extra or replacement)")
198
 
 
199
  with gr.Row(visible=False) as tts_ui:
200
- with gr.Row():
201
- gr.Markdown("## 🔁 Voice cloning (gated)")
202
- chatterbox_space.render()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
  # -------- Events --------
205
  # Use pre-specified sentence bank by default
@@ -207,18 +243,18 @@ with gr.Blocks(title="Say the Sentence (English)") as demo:
207
  # Or use LLM generation:
208
  # btn_gen.click(fn=generate.gen_sentence_llm, outputs=target)
209
 
 
210
  btn_clear.click(
211
  fn=clear_all,
212
- outputs=[target, user_transcript, score_html, result_html, diff_html,]
213
- # tts_text, clone_status, tts_audio]
214
  )
215
 
216
  btn_check.click(
217
  fn=transcribe_check,
218
- inputs=[audio, target, model_id, device_pref, pass_threshold],
219
  outputs=[user_transcript, score_html, result_html, diff_html, tts_ui]
220
  )
221
 
222
 
223
  if __name__ == "__main__":
224
- demo.launch()
 
1
  import gradio as gr
2
  # import spaces
3
+ from gradio_client import Client, handle_file
4
 
5
  import src.generate as generate
6
  import src.process as process
7
 
8
+ # TODO: Abusing the 'global' notation for now so we can be flexible to multiple clients.
9
+ global client
10
 
11
+ # TODO: Ideally, instead of the Client method we're using for an external voice cloning app, we use the .load() function and pass in arguments to it directly while displaying the developer's desired UI.
12
+ #chatterbox_space = gr.load("spaces/ResembleAI/Chatterbox")
13
  # ------------------- UI printing functions -------------------
14
  def clear_all():
15
  # target, user_transcript, score_html, diff_html, result_html,
16
+ # TODO(?): Add tts_text, tts_audio, clone_status (Maybe? Was there before.)
17
+ return "", "", "", "", "", "", "", None,
18
 
19
 
20
  def make_result_html(pass_threshold, passed, ratio):
 
123
  score_html: HTML string to display the score
124
  diff_html: HTML string for displaying the differences between target and user utterance
125
  result_html: HTML string describing the results, or an error message
126
+ clone_audio: Bool for whether to allow audio cloning: This makes the audio cloning components visible
127
  """
128
  clone_audio = False
129
  # Transcribe user input
 
146
 
147
  return user_transcript, score_html, result_html, diff_html, gr.Row(visible=clone_audio)
148
 
149
+ def clone_voice(audio_input, text_input):
150
+ # TODO: Note that this is the 'global' hack to pass in the client.
151
+ global client
152
+ # Additional specifications for Chatterbox include:
153
+ # exaggeration_input=0.5,
154
+ # temperature_input=0.8,
155
+ # seed_num_input=0,
156
+ # cfgw_input=0.5,
157
+ # api_name="/generate_tts_audio"
158
+ return client.predict(text_input=text_input,
159
+ audio_prompt_path_input=handle_file(audio_input))
160
+
161
 
162
  # ------------------- UI -------------------
163
  with gr.Blocks(title="Say the Sentence (English)") as demo:
 
180
  btn_clear = gr.Button("🧹 Clear")
181
 
182
  with gr.Row():
183
+ consent_audio = gr.Audio(sources=["microphone"], type="filepath", label="Record your voice", key='consent_audio')
 
184
 
185
  with gr.Accordion("Advanced settings", open=False):
186
  model_id = gr.Dropdown(
 
211
  diff_html = gr.HTML(
212
  label="Word-level diff (red = expected but missing / green = extra or replacement)")
213
 
214
+ # TODO: Ideally this is gr.Blocks, but that seems to have a visibility-change bug.
215
  with gr.Row(visible=False) as tts_ui:
216
+ # Using the render decorator so that we can easily pass in the consent audio after it's recorded.
217
+ @gr.render(inputs=consent_audio)
218
+ def show_tts(audio_input):
219
+ # TODO: Abusing global, since we can't send a Client as a component to a function.
220
+ global client
221
+ if audio_input:
222
+ client = Client("ResembleAI/Chatterbox")
223
+ with gr.Row():
224
+ gr.Markdown("# 🔁 Voice cloning")
225
+ with gr.Row():
226
+ with gr.Column():
227
+ gr.Markdown("## Audio input")
228
+ # Prepopulating with the consent audio.
229
+ tts_audio = gr.Audio(audio_input, interactive=True, type="filepath")
230
+ with gr.Row():
231
+ with gr.Column():
232
+ gr.Markdown("## Text input")
233
+ tts_text = gr.Textbox(
234
+ "Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible.", interactive=True)
235
+ with gr.Row():
236
+ clone_btn = gr.Button("Clone!")
237
+ cloned_audio = gr.Audio()
238
+ clone_btn.click(fn=clone_voice, inputs=[tts_audio, tts_text], outputs=[cloned_audio])
239
 
240
  # -------- Events --------
241
  # Use pre-specified sentence bank by default
 
243
  # Or use LLM generation:
244
  # btn_gen.click(fn=generate.gen_sentence_llm, outputs=target)
245
 
246
+ # TODO(?): clearing tts_text, tts_audio, clone_status (not sure what that was)
247
  btn_clear.click(
248
  fn=clear_all,
249
+ outputs=[target, user_transcript, score_html, result_html, diff_html]
 
250
  )
251
 
252
  btn_check.click(
253
  fn=transcribe_check,
254
+ inputs=[consent_audio, target, model_id, device_pref, pass_threshold],
255
  outputs=[user_transcript, score_html, result_html, diff_html, tts_ui]
256
  )
257
 
258
 
259
  if __name__ == "__main__":
260
+ demo.launch(show_error=True)