Spaces:
Running on T4
Running on T4
meg-huggingface committed on
Commit ·
b61f112
1
Parent(s): e3f1c3d
Works for chatterbox TTS, as long as you don't use the saved/cached example
Browse files
app.py
CHANGED
|
@@ -2,8 +2,8 @@ import gradio as gr
|
|
| 2 |
|
| 3 |
import src.generate as generate
|
| 4 |
import src.process as process
|
| 5 |
-
import src.tts as tts
|
| 6 |
|
|
|
|
| 7 |
|
| 8 |
# ------------------- UI printing functions -------------------
|
| 9 |
def clear_all():
|
|
@@ -117,7 +117,9 @@ def transcribe_check(audio_path, target_sentence, model_id, device_pref,
|
|
| 117 |
score_html: HTML string to display the score
|
| 118 |
diff_html: HTML string for displaying the differences between target and user utterance
|
| 119 |
result_html: HTML string describing the results, or an error message
|
|
|
|
| 120 |
"""
|
|
|
|
| 121 |
# Transcribe user input
|
| 122 |
error_msg, user_transcript = get_user_transcript(audio_path,
|
| 123 |
target_sentence, model_id,
|
|
@@ -131,53 +133,12 @@ def transcribe_check(audio_path, target_sentence, model_id, device_pref,
|
|
| 131 |
sentence_match = process.SentenceMatcher(target_sentence,
|
| 132 |
user_transcript,
|
| 133 |
pass_threshold)
|
|
|
|
|
|
|
| 134 |
# Create the output to print out
|
| 135 |
score_html, result_html, diff_html = make_html(sentence_match)
|
| 136 |
-
return user_transcript, score_html, result_html, diff_html
|
| 137 |
|
| 138 |
-
|
| 139 |
-
# ------------------- Voice cloning gate -------------------
|
| 140 |
-
def clone_if_pass(
|
| 141 |
-
audio_path, # ref voice (the same recorded clip)
|
| 142 |
-
target_sentence, # sentence user was supposed to say
|
| 143 |
-
user_transcript, # what ASR heard
|
| 144 |
-
tts_text, # what we want to synthesize (in cloned voice)
|
| 145 |
-
pass_threshold, # must meet or exceed this
|
| 146 |
-
tts_model_id, # e.g., "coqui/XTTS-v2"
|
| 147 |
-
tts_language, # e.g., "en"
|
| 148 |
-
):
|
| 149 |
-
"""
|
| 150 |
-
If user correctly read the target (>= threshold), clone their voice from the
|
| 151 |
-
recorded audio and speak 'tts_text'. Otherwise, refuse.
|
| 152 |
-
"""
|
| 153 |
-
# Basic validations
|
| 154 |
-
if audio_path is None:
|
| 155 |
-
return None, "Record audio first (reference voice is required)."
|
| 156 |
-
if not target_sentence:
|
| 157 |
-
return None, "Generate a target sentence first."
|
| 158 |
-
if not user_transcript:
|
| 159 |
-
return None, "Transcribe first to verify the sentence."
|
| 160 |
-
if not tts_text:
|
| 161 |
-
return None, "Enter the sentence to synthesize."
|
| 162 |
-
|
| 163 |
-
# Recompute pass/fail to avoid relying on UI state
|
| 164 |
-
sm = process.SentenceMatcher(target_sentence, user_transcript,
|
| 165 |
-
pass_threshold)
|
| 166 |
-
if not sm.passed:
|
| 167 |
-
return None, (
|
| 168 |
-
f"❌ Cloning blocked: your reading did not reach the threshold "
|
| 169 |
-
f"({sm.ratio * 100:.1f}% < {int(pass_threshold * 100)}%)."
|
| 170 |
-
)
|
| 171 |
-
|
| 172 |
-
# Run zero-shot cloning
|
| 173 |
-
out = tts.run_tts_clone(audio_path, tts_text, model_id=tts_model_id,
|
| 174 |
-
language=tts_language)
|
| 175 |
-
if isinstance(out, Exception):
|
| 176 |
-
return None, f"Voice cloning failed: {out}"
|
| 177 |
-
sr, wav = out
|
| 178 |
-
# Gradio Audio can take a tuple (sr, np.array)
|
| 179 |
-
return (
|
| 180 |
-
sr, wav), f"✅ Cloned and synthesized with {tts_model_id} ({tts_language})."
|
| 181 |
|
| 182 |
|
| 183 |
# ------------------- UI -------------------
|
|
@@ -233,33 +194,10 @@ with gr.Blocks(title="Say the Sentence (English)") as demo:
|
|
| 233 |
diff_html = gr.HTML(
|
| 234 |
label="Word-level diff (red = expected but missing / green = extra or replacement)")
|
| 235 |
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
#
|
| 239 |
-
|
| 240 |
-
# placeholder="Type the sentence you want the cloned voice to say",
|
| 241 |
-
# )
|
| 242 |
-
# with gr.Row():
|
| 243 |
-
# tts_model_id = gr.Dropdown(
|
| 244 |
-
# choices=[
|
| 245 |
-
# "coqui/XTTS-v2",
|
| 246 |
-
# # add others if you like, e.g. "myshell-ai/MeloTTS"
|
| 247 |
-
# ],
|
| 248 |
-
# value="coqui/XTTS-v2",
|
| 249 |
-
# label="TTS (voice cloning) model",
|
| 250 |
-
# )
|
| 251 |
-
# tts_language = gr.Dropdown(
|
| 252 |
-
# choices=["en", "de", "fr", "es", "it", "pt", "pl", "tr", "ru", "nl",
|
| 253 |
-
# "cs", "ar", "zh"],
|
| 254 |
-
# value="en",
|
| 255 |
-
# label="Language",
|
| 256 |
-
# )
|
| 257 |
-
|
| 258 |
-
# with gr.Row():
|
| 259 |
-
# btn_clone = gr.Button("🔁 Clone voice (if passed)", variant="secondary")
|
| 260 |
-
# with gr.Row():
|
| 261 |
-
# tts_audio = gr.Audio(label="Cloned speech output", interactive=False)
|
| 262 |
-
# clone_status = gr.Label(label="Cloning status")
|
| 263 |
|
| 264 |
# -------- Events --------
|
| 265 |
# Use pre-specified sentence bank by default
|
|
@@ -276,15 +214,9 @@ with gr.Blocks(title="Say the Sentence (English)") as demo:
|
|
| 276 |
btn_check.click(
|
| 277 |
fn=transcribe_check,
|
| 278 |
inputs=[audio, target, model_id, device_pref, pass_threshold],
|
| 279 |
-
outputs=[user_transcript, score_html, result_html, diff_html]
|
| 280 |
)
|
| 281 |
|
| 282 |
-
# btn_clone.click(
|
| 283 |
-
# fn=clone_if_pass,
|
| 284 |
-
# inputs=[audio, target, user_transcript, tts_text, pass_threshold,
|
| 285 |
-
# tts_model_id, tts_language],
|
| 286 |
-
# outputs=[tts_audio, clone_status],
|
| 287 |
-
# )
|
| 288 |
|
| 289 |
if __name__ == "__main__":
|
| 290 |
demo.launch()
|
|
|
|
| 2 |
|
| 3 |
import src.generate as generate
|
| 4 |
import src.process as process
|
|
|
|
| 5 |
|
| 6 |
+
chatterbox_space = gr.load("spaces/ResembleAI/Chatterbox")
|
| 7 |
|
| 8 |
# ------------------- UI printing functions -------------------
|
| 9 |
def clear_all():
|
|
|
|
| 117 |
score_html: HTML string to display the score
|
| 118 |
diff_html: HTML string for displaying the differences between target and user utterance
|
| 119 |
result_html: HTML string describing the results, or an error message
|
| 120 |
+
clone_audio: Bool for whether to allow audio cloning: This makes the audio cloning component visible
|
| 121 |
"""
|
| 122 |
+
clone_audio = False
|
| 123 |
# Transcribe user input
|
| 124 |
error_msg, user_transcript = get_user_transcript(audio_path,
|
| 125 |
target_sentence, model_id,
|
|
|
|
| 133 |
sentence_match = process.SentenceMatcher(target_sentence,
|
| 134 |
user_transcript,
|
| 135 |
pass_threshold)
|
| 136 |
+
if sentence_match.passed:
|
| 137 |
+
clone_audio = True
|
| 138 |
# Create the output to print out
|
| 139 |
score_html, result_html, diff_html = make_html(sentence_match)
|
|
|
|
| 140 |
|
| 141 |
+
return user_transcript, score_html, result_html, diff_html, gr.Row(visible=clone_audio)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
|
| 144 |
# ------------------- UI -------------------
|
|
|
|
| 194 |
diff_html = gr.HTML(
|
| 195 |
label="Word-level diff (red = expected but missing / green = extra or replacement)")
|
| 196 |
|
| 197 |
+
with gr.Row(visible=False) as tts_ui:
|
| 198 |
+
with gr.Row():
|
| 199 |
+
gr.Markdown("## 🔁 Voice cloning (gated)")
|
| 200 |
+
chatterbox_space.render()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
|
| 202 |
# -------- Events --------
|
| 203 |
# Use pre-specified sentence bank by default
|
|
|
|
| 214 |
btn_check.click(
|
| 215 |
fn=transcribe_check,
|
| 216 |
inputs=[audio, target, model_id, device_pref, pass_threshold],
|
| 217 |
+
outputs=[user_transcript, score_html, result_html, diff_html, tts_ui]
|
| 218 |
)
|
| 219 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
|
| 221 |
if __name__ == "__main__":
|
| 222 |
demo.launch()
|