meg-huggingface committed on
Commit
b61f112
·
1 Parent(s): e3f1c3d

Works for chatterbox TTS, as long as you don't use the saved/cached example

Browse files
Files changed (1) hide show
  1. app.py +11 -79
app.py CHANGED
@@ -2,8 +2,8 @@ import gradio as gr
2
 
3
  import src.generate as generate
4
  import src.process as process
5
- import src.tts as tts
6
 
 
7
 
8
  # ------------------- UI printing functions -------------------
9
  def clear_all():
@@ -117,7 +117,9 @@ def transcribe_check(audio_path, target_sentence, model_id, device_pref,
117
  score_html: HTML string to display the score
118
  diff_html: HTML string for displaying the differences between target and user utterance
119
  result_html: HTML string describing the results, or an error message
 
120
  """
 
121
  # Transcribe user input
122
  error_msg, user_transcript = get_user_transcript(audio_path,
123
  target_sentence, model_id,
@@ -131,53 +133,12 @@ def transcribe_check(audio_path, target_sentence, model_id, device_pref,
131
  sentence_match = process.SentenceMatcher(target_sentence,
132
  user_transcript,
133
  pass_threshold)
 
 
134
  # Create the output to print out
135
  score_html, result_html, diff_html = make_html(sentence_match)
136
- return user_transcript, score_html, result_html, diff_html
137
 
138
-
139
- # ------------------- Voice cloning gate -------------------
140
- def clone_if_pass(
141
- audio_path, # ref voice (the same recorded clip)
142
- target_sentence, # sentence user was supposed to say
143
- user_transcript, # what ASR heard
144
- tts_text, # what we want to synthesize (in cloned voice)
145
- pass_threshold, # must meet or exceed this
146
- tts_model_id, # e.g., "coqui/XTTS-v2"
147
- tts_language, # e.g., "en"
148
- ):
149
- """
150
- If user correctly read the target (>= threshold), clone their voice from the
151
- recorded audio and speak 'tts_text'. Otherwise, refuse.
152
- """
153
- # Basic validations
154
- if audio_path is None:
155
- return None, "Record audio first (reference voice is required)."
156
- if not target_sentence:
157
- return None, "Generate a target sentence first."
158
- if not user_transcript:
159
- return None, "Transcribe first to verify the sentence."
160
- if not tts_text:
161
- return None, "Enter the sentence to synthesize."
162
-
163
- # Recompute pass/fail to avoid relying on UI state
164
- sm = process.SentenceMatcher(target_sentence, user_transcript,
165
- pass_threshold)
166
- if not sm.passed:
167
- return None, (
168
- f"❌ Cloning blocked: your reading did not reach the threshold "
169
- f"({sm.ratio * 100:.1f}% < {int(pass_threshold * 100)}%)."
170
- )
171
-
172
- # Run zero-shot cloning
173
- out = tts.run_tts_clone(audio_path, tts_text, model_id=tts_model_id,
174
- language=tts_language)
175
- if isinstance(out, Exception):
176
- return None, f"Voice cloning failed: {out}"
177
- sr, wav = out
178
- # Gradio Audio can take a tuple (sr, np.array)
179
- return (
180
- sr, wav), f"✅ Cloned and synthesized with {tts_model_id} ({tts_language})."
181
 
182
 
183
  # ------------------- UI -------------------
@@ -233,33 +194,10 @@ with gr.Blocks(title="Say the Sentence (English)") as demo:
233
  diff_html = gr.HTML(
234
  label="Word-level diff (red = expected but missing / green = extra or replacement)")
235
 
236
- # gr.Markdown("## 🔁 Voice cloning (gated)")
237
- # with gr.Row():
238
- # tts_text = gr.Textbox(
239
- # label="Text to synthesize (voice clone)",
240
- # placeholder="Type the sentence you want the cloned voice to say",
241
- # )
242
- # with gr.Row():
243
- # tts_model_id = gr.Dropdown(
244
- # choices=[
245
- # "coqui/XTTS-v2",
246
- # # add others if you like, e.g. "myshell-ai/MeloTTS"
247
- # ],
248
- # value="coqui/XTTS-v2",
249
- # label="TTS (voice cloning) model",
250
- # )
251
- # tts_language = gr.Dropdown(
252
- # choices=["en", "de", "fr", "es", "it", "pt", "pl", "tr", "ru", "nl",
253
- # "cs", "ar", "zh"],
254
- # value="en",
255
- # label="Language",
256
- # )
257
-
258
- # with gr.Row():
259
- # btn_clone = gr.Button("🔁 Clone voice (if passed)", variant="secondary")
260
- # with gr.Row():
261
- # tts_audio = gr.Audio(label="Cloned speech output", interactive=False)
262
- # clone_status = gr.Label(label="Cloning status")
263
 
264
  # -------- Events --------
265
  # Use pre-specified sentence bank by default
@@ -276,15 +214,9 @@ with gr.Blocks(title="Say the Sentence (English)") as demo:
276
  btn_check.click(
277
  fn=transcribe_check,
278
  inputs=[audio, target, model_id, device_pref, pass_threshold],
279
- outputs=[user_transcript, score_html, result_html, diff_html]
280
  )
281
 
282
- # btn_clone.click(
283
- # fn=clone_if_pass,
284
- # inputs=[audio, target, user_transcript, tts_text, pass_threshold,
285
- # tts_model_id, tts_language],
286
- # outputs=[tts_audio, clone_status],
287
- # )
288
 
289
  if __name__ == "__main__":
290
  demo.launch()
 
2
 
3
  import src.generate as generate
4
  import src.process as process
 
5
 
6
+ chatterbox_space = gr.load("spaces/ResembleAI/Chatterbox")
7
 
8
  # ------------------- UI printing functions -------------------
9
  def clear_all():
 
117
  score_html: HTML string to display the score
118
  diff_html: HTML string for displaying the differences between target and user utterance
119
  result_html: HTML string describing the results, or an error message
120
+ clone_audio: Bool for whether to allow audio cloning: This makes the audio cloning component visible
121
  """
122
+ clone_audio = False
123
  # Transcribe user input
124
  error_msg, user_transcript = get_user_transcript(audio_path,
125
  target_sentence, model_id,
 
133
  sentence_match = process.SentenceMatcher(target_sentence,
134
  user_transcript,
135
  pass_threshold)
136
+ if sentence_match.passed:
137
+ clone_audio = True
138
  # Create the output to print out
139
  score_html, result_html, diff_html = make_html(sentence_match)
 
140
 
141
+ return user_transcript, score_html, result_html, diff_html, gr.Row(visible=clone_audio)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
 
144
  # ------------------- UI -------------------
 
194
  diff_html = gr.HTML(
195
  label="Word-level diff (red = expected but missing / green = extra or replacement)")
196
 
197
+ with gr.Row(visible=False) as tts_ui:
198
+ with gr.Row():
199
+ gr.Markdown("## 🔁 Voice cloning (gated)")
200
+ chatterbox_space.render()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
  # -------- Events --------
203
  # Use pre-specified sentence bank by default
 
214
  btn_check.click(
215
  fn=transcribe_check,
216
  inputs=[audio, target, model_id, device_pref, pass_threshold],
217
+ outputs=[user_transcript, score_html, result_html, diff_html, tts_ui]
218
  )
219
 
 
 
 
 
 
 
220
 
221
  if __name__ == "__main__":
222
  demo.launch()