Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -9,7 +9,8 @@ import os
|
|
| 9 |
|
| 10 |
#from diffusers import StableDiffusionPipeline
|
| 11 |
|
| 12 |
-
|
|
|
|
| 13 |
### ββββββββββββββββββββββββββββββββββββββββ
|
| 14 |
|
| 15 |
title="Draw Me an Insect π /Dessine-moi un insecte π"
|
|
@@ -32,11 +33,11 @@ def get_images(prompt):
|
|
| 32 |
|
| 33 |
def magic_whisper_to_sd(audio, guidance_scale, nb_iterations, seed):
|
| 34 |
|
| 35 |
-
whisper_results =
|
| 36 |
-
prompt = whisper_results[
|
| 37 |
images = get_images(prompt)
|
| 38 |
|
| 39 |
-
return whisper_results[0], whisper_results[1],
|
| 40 |
|
| 41 |
#def diffuse(prompt, guidance_scale, nb_iterations, seed):
|
| 42 |
#
|
|
@@ -75,40 +76,19 @@ def magic_whisper_to_sd(audio, guidance_scale, nb_iterations, seed):
|
|
| 75 |
#
|
| 76 |
# return images
|
| 77 |
|
| 78 |
-
def
|
| 79 |
print("""
|
| 80 |
β
|
| 81 |
Sending audio to Whisper ...
|
| 82 |
β
|
| 83 |
""")
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
date_time_str = now.strftime("%Y-%m-%d %H:%M:%S")
|
| 88 |
-
print('DateTime String:', date_time_str)
|
| 89 |
-
|
| 90 |
-
audio = whisper.load_audio(audio)
|
| 91 |
-
audio = whisper.pad_or_trim(audio)
|
| 92 |
-
|
| 93 |
-
mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
|
| 94 |
-
|
| 95 |
-
_, probs = whisper_model.detect_language(mel)
|
| 96 |
-
|
| 97 |
-
transcript_options = whisper.DecodingOptions(task="transcribe", fp16 = False)
|
| 98 |
-
translate_options = whisper.DecodingOptions(task="translate", fp16 = False)
|
| 99 |
-
|
| 100 |
-
transcription = whisper.decode(whisper_model, mel, transcript_options)
|
| 101 |
-
translation = whisper.decode(whisper_model, mel, translate_options)
|
| 102 |
-
|
| 103 |
-
print("language spoken: " + transcription.language)
|
| 104 |
-
print("transcript: " + transcription.text)
|
| 105 |
print("βββββββββββββββββββββββββββββββββββββββββββ")
|
| 106 |
-
print("translated: " +
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
else:
|
| 110 |
-
tr_flag = flag.flag(transcription.language)
|
| 111 |
-
return tr_flag, transcription.text, translation.text
|
| 112 |
|
| 113 |
### ββββββββββββββββββββββββββββββββββββββββ
|
| 114 |
|
|
@@ -295,8 +275,7 @@ with gr.Blocks(css=css) as demo:
|
|
| 295 |
|
| 296 |
"""
|
| 297 |
)
|
| 298 |
-
|
| 299 |
-
with gr.Tab(label="Record/Enregistrer", elem_id="record_tab"):
|
| 300 |
with gr.Column():
|
| 301 |
record_input = gr.Audio(
|
| 302 |
source="microphone",
|
|
@@ -320,6 +299,7 @@ with gr.Blocks(css=css) as demo:
|
|
| 320 |
audio_u_translate = gr.Button("Check the transcription/Vérifier la transcription π", elem_id="check_btn_2")
|
| 321 |
audio_u_direct_sd = gr.Button("Generate the image right now! / Génerer l'image directement! ποΈ", elem_id="magic_btn_2")
|
| 322 |
|
|
|
|
| 323 |
with gr.Accordion(label="Image generation Settings/Configuration de génération d'image", elem_id="sd_settings", visible=False):
|
| 324 |
with gr.Row():
|
| 325 |
guidance_scale = gr.Slider(2, 15, value = 7, label = 'Guidance Scale')
|
|
@@ -335,21 +315,21 @@ with gr.Blocks(css=css) as demo:
|
|
| 335 |
with gr.Row():
|
| 336 |
|
| 337 |
transcripted_output = gr.Textbox(
|
| 338 |
-
label="Transcription",
|
| 339 |
lines=3,
|
| 340 |
elem_id="transcripted"
|
| 341 |
)
|
| 342 |
-
language_detected_output = gr.Textbox(label="Native language", elem_id="spoken_lang",lines=3)
|
| 343 |
|
| 344 |
with gr.Column():
|
| 345 |
translated_output = gr.Textbox(
|
| 346 |
-
label="
|
| 347 |
lines=4,
|
| 348 |
elem_id="translated"
|
| 349 |
)
|
| 350 |
with gr.Row():
|
| 351 |
clear_btn = gr.Button(value="Clear")
|
| 352 |
-
diffuse_btn = gr.Button(value="
|
| 353 |
|
| 354 |
clear_btn.click(fn=lambda value: gr.update(value=""), inputs=clear_btn, outputs=translated_output)
|
| 355 |
|
|
@@ -407,18 +387,18 @@ with gr.Blocks(css=css) as demo:
|
|
| 407 |
|
| 408 |
""", elem_id="about")
|
| 409 |
|
| 410 |
-
|
| 411 |
inputs = record_input,
|
| 412 |
outputs = [
|
| 413 |
-
language_detected_output,
|
| 414 |
transcripted_output,
|
| 415 |
translated_output
|
| 416 |
])
|
| 417 |
|
| 418 |
-
audio_u_translate.click(
|
| 419 |
inputs = upload_input,
|
| 420 |
outputs = [
|
| 421 |
-
language_detected_output,
|
| 422 |
transcripted_output,
|
| 423 |
translated_output
|
| 424 |
])
|
|
@@ -431,7 +411,7 @@ with gr.Blocks(css=css) as demo:
|
|
| 431 |
seed
|
| 432 |
],
|
| 433 |
outputs = [
|
| 434 |
-
language_detected_output,
|
| 435 |
transcripted_output,
|
| 436 |
translated_output,
|
| 437 |
sd_output
|
|
@@ -445,7 +425,7 @@ with gr.Blocks(css=css) as demo:
|
|
| 445 |
seed
|
| 446 |
],
|
| 447 |
outputs = [
|
| 448 |
-
language_detected_output,
|
| 449 |
transcripted_output,
|
| 450 |
translated_output,
|
| 451 |
sd_output
|
|
@@ -456,7 +436,7 @@ with gr.Blocks(css=css) as demo:
|
|
| 456 |
translated_output
|
| 457 |
],
|
| 458 |
outputs = sd_output
|
| 459 |
-
|
| 460 |
gr.HTML('''
|
| 461 |
<div class="footer">
|
| 462 |
<p> This Space is based on the <a href="https://huggingface.co/spaces/fffiloni/whisper-to-stable-diffusion" target="_blank">Whisper to Stable Diffusion Space</a> created by <a href="https://twitter.com/fffiloni" target="_blank">Sylvain Filoni</a>.
|
|
|
|
| 9 |
|
| 10 |
#from diffusers import StableDiffusionPipeline
|
| 11 |
|
| 12 |
+
whisper = gr.Interface.load(name="spaces/sanchit-gandhi/whisper-large-v2")
|
| 13 |
+
stable_diffusion = gr.Blocks.load(name="spaces/runwayml/stable-diffusion-v1-5")
|
| 14 |
### ββββββββββββββββββββββββββββββββββββββββ
|
| 15 |
|
| 16 |
title="Draw Me an Insect π /Dessine-moi un insecte π"
|
|
|
|
| 33 |
|
| 34 |
def magic_whisper_to_sd(audio, guidance_scale, nb_iterations, seed):
|
| 35 |
|
| 36 |
+
whisper_results = translate_better(audio)
|
| 37 |
+
prompt = whisper_results[1]
|
| 38 |
images = get_images(prompt)
|
| 39 |
|
| 40 |
+
return whisper_results[0], whisper_results[1], images
|
| 41 |
|
| 42 |
#def diffuse(prompt, guidance_scale, nb_iterations, seed):
|
| 43 |
#
|
|
|
|
| 76 |
#
|
| 77 |
# return images
|
| 78 |
|
| 79 |
+
def translate_better(audio):
|
| 80 |
print("""
|
| 81 |
β
|
| 82 |
Sending audio to Whisper ...
|
| 83 |
β
|
| 84 |
""")
|
| 85 |
+
transcribe_text_result = whisper(audio, None, "transcribe", fn_index=0)
|
| 86 |
+
translate_text_result = whisper(audio, None, "translate", fn_index=0)
|
| 87 |
+
print("transcript: " + transcribe_text_result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
print("βββββββββββββββββββββββββββββββββββββββββββ")
|
| 89 |
+
print("translated: " + translate_text_result)
|
| 90 |
+
|
| 91 |
+
return transcribe_text_result, translate_text_result
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
### ββββββββββββββββββββββββββββββββββββββββ
|
| 94 |
|
|
|
|
| 275 |
|
| 276 |
"""
|
| 277 |
)
|
| 278 |
+
with gr.Tab(label="Record/Enregistrer", elem_id="record_tab"):
|
|
|
|
| 279 |
with gr.Column():
|
| 280 |
record_input = gr.Audio(
|
| 281 |
source="microphone",
|
|
|
|
| 299 |
audio_u_translate = gr.Button("Check the transcription/Vérifier la transcription π", elem_id="check_btn_2")
|
| 300 |
audio_u_direct_sd = gr.Button("Generate the image right now! / Génerer l'image directement! ποΈ", elem_id="magic_btn_2")
|
| 301 |
|
| 302 |
+
|
| 303 |
with gr.Accordion(label="Image generation Settings/Configuration de génération d'image", elem_id="sd_settings", visible=False):
|
| 304 |
with gr.Row():
|
| 305 |
guidance_scale = gr.Slider(2, 15, value = 7, label = 'Guidance Scale')
|
|
|
|
| 315 |
with gr.Row():
|
| 316 |
|
| 317 |
transcripted_output = gr.Textbox(
|
| 318 |
+
label="Transcription in your detected spoken language",
|
| 319 |
lines=3,
|
| 320 |
elem_id="transcripted"
|
| 321 |
)
|
| 322 |
+
#language_detected_output = gr.Textbox(label="Native language", elem_id="spoken_lang",lines=3)
|
| 323 |
|
| 324 |
with gr.Column():
|
| 325 |
translated_output = gr.Textbox(
|
| 326 |
+
label="Transcript translated in English by Whisper",
|
| 327 |
lines=4,
|
| 328 |
elem_id="translated"
|
| 329 |
)
|
| 330 |
with gr.Row():
|
| 331 |
clear_btn = gr.Button(value="Clear")
|
| 332 |
+
diffuse_btn = gr.Button(value="OK, Diffuse this prompt !", elem_id="diffuse_btn")
|
| 333 |
|
| 334 |
clear_btn.click(fn=lambda value: gr.update(value=""), inputs=clear_btn, outputs=translated_output)
|
| 335 |
|
|
|
|
| 387 |
|
| 388 |
""", elem_id="about")
|
| 389 |
|
| 390 |
+
audio_r_translate.click(translate_better,
|
| 391 |
inputs = record_input,
|
| 392 |
outputs = [
|
| 393 |
+
#language_detected_output,
|
| 394 |
transcripted_output,
|
| 395 |
translated_output
|
| 396 |
])
|
| 397 |
|
| 398 |
+
audio_u_translate.click(translate_better,
|
| 399 |
inputs = upload_input,
|
| 400 |
outputs = [
|
| 401 |
+
#language_detected_output,
|
| 402 |
transcripted_output,
|
| 403 |
translated_output
|
| 404 |
])
|
|
|
|
| 411 |
seed
|
| 412 |
],
|
| 413 |
outputs = [
|
| 414 |
+
#language_detected_output,
|
| 415 |
transcripted_output,
|
| 416 |
translated_output,
|
| 417 |
sd_output
|
|
|
|
| 425 |
seed
|
| 426 |
],
|
| 427 |
outputs = [
|
| 428 |
+
#language_detected_output,
|
| 429 |
transcripted_output,
|
| 430 |
translated_output,
|
| 431 |
sd_output
|
|
|
|
| 436 |
translated_output
|
| 437 |
],
|
| 438 |
outputs = sd_output
|
| 439 |
+
)
|
| 440 |
gr.HTML('''
|
| 441 |
<div class="footer">
|
| 442 |
<p> This Space is based on the <a href="https://huggingface.co/spaces/fffiloni/whisper-to-stable-diffusion" target="_blank">Whisper to Stable Diffusion Space</a> created by <a href="https://twitter.com/fffiloni" target="_blank">Sylvain Filoni</a>.
|