Spaces:
Running on T4
Running on T4
meg-huggingface committed on
Commit ·
9a41dfd
1
Parent(s): f3a2306
Passing in the consent audio to the voice cloning client.
Browse files
app.py
CHANGED
|
@@ -1,16 +1,20 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
# import spaces
|
|
|
|
| 3 |
|
| 4 |
import src.generate as generate
|
| 5 |
import src.process as process
|
| 6 |
|
| 7 |
-
|
|
|
|
| 8 |
|
|
|
|
|
|
|
| 9 |
# ------------------- UI printing functions -------------------
|
| 10 |
def clear_all():
    """Reset every wired UI output field to its initial empty state.

    NOTE(review): returns 8 values; this must match the number of components
    in btn_clear.click(outputs=[...]) one-for-one — verify they line up.
    The trailing None presumably resets an audio component — TODO confirm.
    """
    # target, user_transcript, score_html, diff_html, result_html,
    # tts_text,
    return "", "", "", "", "", "", "", None
|
| 14 |
|
| 15 |
|
| 16 |
def make_result_html(pass_threshold, passed, ratio):
|
|
@@ -119,7 +123,7 @@ def transcribe_check(audio_path, target_sentence, model_id, device_pref,
|
|
| 119 |
score_html: HTML string to display the score
|
| 120 |
diff_html: HTML string for displaying the differences between target and user utterance
|
| 121 |
result_html: HTML string describing the results, or an error message
|
| 122 |
-
clone_audio: Bool for whether to allow audio cloning: This makes the audio cloning
|
| 123 |
"""
|
| 124 |
clone_audio = False
|
| 125 |
# Transcribe user input
|
|
@@ -142,6 +146,18 @@ def transcribe_check(audio_path, target_sentence, model_id, device_pref,
|
|
| 142 |
|
| 143 |
return user_transcript, score_html, result_html, diff_html, gr.Row(visible=clone_audio)
|
| 144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
# ------------------- UI -------------------
|
| 147 |
with gr.Blocks(title="Say the Sentence (English)") as demo:
|
|
@@ -164,8 +180,7 @@ with gr.Blocks(title="Say the Sentence (English)") as demo:
|
|
| 164 |
btn_clear = gr.Button("🧹 Clear")
|
| 165 |
|
| 166 |
with gr.Row():
|
| 167 |
-
|
| 168 |
-
label="Record your voice")
|
| 169 |
|
| 170 |
with gr.Accordion("Advanced settings", open=False):
|
| 171 |
model_id = gr.Dropdown(
|
|
@@ -196,10 +211,31 @@ with gr.Blocks(title="Say the Sentence (English)") as demo:
|
|
| 196 |
diff_html = gr.HTML(
|
| 197 |
label="Word-level diff (red = expected but missing / green = extra or replacement)")
|
| 198 |
|
|
|
|
| 199 |
with gr.Row(visible=False) as tts_ui:
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
|
| 204 |
# -------- Events --------
|
| 205 |
# Use pre-specified sentence bank by default
|
|
@@ -207,18 +243,18 @@ with gr.Blocks(title="Say the Sentence (English)") as demo:
|
|
| 207 |
# Or use LLM generation:
|
| 208 |
# btn_gen.click(fn=generate.gen_sentence_llm, outputs=target)
|
| 209 |
|
|
|
|
| 210 |
btn_clear.click(
|
| 211 |
fn=clear_all,
|
| 212 |
-
outputs=[target, user_transcript, score_html, result_html, diff_html
|
| 213 |
-
# tts_text, clone_status, tts_audio]
|
| 214 |
)
|
| 215 |
|
| 216 |
btn_check.click(
|
| 217 |
fn=transcribe_check,
|
| 218 |
-
inputs=[
|
| 219 |
outputs=[user_transcript, score_html, result_html, diff_html, tts_ui]
|
| 220 |
)
|
| 221 |
|
| 222 |
|
| 223 |
if __name__ == "__main__":
|
| 224 |
-
demo.launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
# import spaces
|
| 3 |
+
from gradio_client import Client, handle_file
|
| 4 |
|
| 5 |
import src.generate as generate
|
| 6 |
import src.process as process
|
| 7 |
|
| 8 |
+
# TODO: Abusing the 'global' notation for now so we can be flexible to multiple clients.
|
| 9 |
+
global client
|
| 10 |
|
| 11 |
+
# TODO: Ideally, instead of the Client method we're using for an external voice cloning app, we use the .load() function and pass in arguments to it directly while displaying the developer's desired UI.
|
| 12 |
+
#chatterbox_space = gr.load("spaces/ResembleAI/Chatterbox")
|
| 13 |
# ------------------- UI printing functions -------------------
|
| 14 |
def clear_all():
    """Reset the UI output fields to their initial empty state.

    Returns:
        A 5-tuple of empty strings, one per component wired in
        btn_clear.click(outputs=[target, user_transcript, score_html,
        result_html, diff_html]).

    Fix: the previous version returned 8 values against 5 wired outputs,
    which makes Gradio raise a "too many output values" error when the
    Clear button is clicked. The count now matches the outputs list.
    """
    # TODO(?): Add tts_text, tts_audio, clone_status (Maybe? Was there before.)
    return "", "", "", "", ""
|
| 18 |
|
| 19 |
|
| 20 |
def make_result_html(pass_threshold, passed, ratio):
|
|
|
|
| 123 |
score_html: HTML string to display the score
|
| 124 |
diff_html: HTML string for displaying the differences between target and user utterance
|
| 125 |
result_html: HTML string describing the results, or an error message
|
| 126 |
+
clone_audio: Bool for whether to allow audio cloning: This makes the audio cloning components visible
|
| 127 |
"""
|
| 128 |
clone_audio = False
|
| 129 |
# Transcribe user input
|
|
|
|
| 146 |
|
| 147 |
return user_transcript, score_html, result_html, diff_html, gr.Row(visible=clone_audio)
|
| 148 |
|
| 149 |
+
def clone_voice(audio_input, text_input):
    """Synthesize `text_input` in the user's voice via the external Chatterbox Space.

    Args:
        audio_input: Filepath of the reference (consent) audio prompt;
            passed through handle_file() for upload to the remote Space.
        text_input: Text to synthesize in the cloned voice.

    Returns:
        The result of client.predict() — presumably a filepath/URL to the
        generated audio suitable for a gr.Audio output; confirm against the
        Chatterbox Space API.

    NOTE(review): assumes the module-global `client` was already created by
    show_tts(); calling this before a consent recording exists would raise.
    """
    # TODO: Note that this is the 'global' hack to pass in the client.
    global client
    # Additional specifications for Chatterbox include:
    # exaggeration_input=0.5,
    # temperature_input=0.8,
    # seed_num_input=0,
    # cfgw_input=0.5,
    # api_name="/generate_tts_audio"
    return client.predict(text_input=text_input,
                          audio_prompt_path_input=handle_file(audio_input))
|
| 160 |
+
|
| 161 |
|
| 162 |
# ------------------- UI -------------------
|
| 163 |
with gr.Blocks(title="Say the Sentence (English)") as demo:
|
|
|
|
| 180 |
btn_clear = gr.Button("🧹 Clear")
|
| 181 |
|
| 182 |
with gr.Row():
|
| 183 |
+
consent_audio = gr.Audio(sources=["microphone"], type="filepath", label="Record your voice", key='consent_audio')
|
|
|
|
| 184 |
|
| 185 |
with gr.Accordion("Advanced settings", open=False):
|
| 186 |
model_id = gr.Dropdown(
|
|
|
|
| 211 |
diff_html = gr.HTML(
|
| 212 |
label="Word-level diff (red = expected but missing / green = extra or replacement)")
|
| 213 |
|
| 214 |
+
# TODO: Ideally this is gr.Blocks, but that seems to have a visibility-change bug.
|
| 215 |
with gr.Row(visible=False) as tts_ui:
|
| 216 |
+
# Using the render decorator so that we can easily pass in the consent audio after it's recorded.
@gr.render(inputs=consent_audio)
def show_tts(audio_input):
    """Render the voice-cloning UI once consent audio has been recorded.

    Re-runs whenever `consent_audio` changes; builds the cloning controls
    only when a recording exists (audio_input is truthy).
    NOTE(review): the with-block nesting below was reconstructed from a
    flattened diff — confirm the Row/Column structure against the live app.
    """
    # TODO: Abusing global, since we can't send a Client as a component to a function.
    global client
    if audio_input:
        # Connect to the external Chatterbox Space used for voice cloning.
        # NOTE(review): this opens a fresh Client on every re-render — consider caching.
        client = Client("ResembleAI/Chatterbox")
        with gr.Row():
            gr.Markdown("# 🔁 Voice cloning")
        with gr.Row():
            with gr.Column():
                gr.Markdown("## Audio input")
                # Prepopulating with the consent audio.
                tts_audio = gr.Audio(audio_input, interactive=True, type="filepath")
        with gr.Row():
            with gr.Column():
                gr.Markdown("## Text input")
                tts_text = gr.Textbox(
                    "Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible.", interactive=True)
        with gr.Row():
            clone_btn = gr.Button("Clone!")
            cloned_audio = gr.Audio()
        clone_btn.click(fn=clone_voice, inputs=[tts_audio, tts_text], outputs=[cloned_audio])
|
| 239 |
|
| 240 |
# -------- Events --------
|
| 241 |
# Use pre-specified sentence bank by default
|
|
|
|
| 243 |
# Or use LLM generation:
|
| 244 |
# btn_gen.click(fn=generate.gen_sentence_llm, outputs=target)
|
| 245 |
|
| 246 |
+
# TODO(?): clearing tts_text, tts_audio, clone_status (not sure what that was)
|
| 247 |
btn_clear.click(
|
| 248 |
fn=clear_all,
|
| 249 |
+
outputs=[target, user_transcript, score_html, result_html, diff_html]
|
|
|
|
| 250 |
)
|
| 251 |
|
| 252 |
btn_check.click(
|
| 253 |
fn=transcribe_check,
|
| 254 |
+
inputs=[consent_audio, target, model_id, device_pref, pass_threshold],
|
| 255 |
outputs=[user_transcript, score_html, result_html, diff_html, tts_ui]
|
| 256 |
)
|
| 257 |
|
| 258 |
|
| 259 |
# Script entry point.
if __name__ == "__main__":
    # show_error=True surfaces server-side exceptions in the browser UI,
    # which helps debug the remote Client calls.
    demo.launch(show_error=True)
|