Spaces:
Runtime error
Runtime error
Fix issue with single uploaded file not being transcribed if too long
Browse files
README.md
CHANGED
|
@@ -4,7 +4,7 @@ emoji: 🐠
|
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: gray
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 3.
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: apache-2.0
|
|
|
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: gray
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 3.12
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: apache-2.0
|
app.py
CHANGED
|
@@ -1,10 +1,12 @@
|
|
| 1 |
import os
|
| 2 |
import json
|
|
|
|
| 3 |
import uuid
|
| 4 |
import tempfile
|
| 5 |
import subprocess
|
| 6 |
import re
|
| 7 |
import time
|
|
|
|
| 8 |
|
| 9 |
import gradio as gr
|
| 10 |
import pytube as pt
|
|
@@ -24,6 +26,8 @@ os.environ[constants.NEMO_ENV_CACHE_DIR] = "/tmp/nemo/"
|
|
| 24 |
|
| 25 |
SAMPLE_RATE = 16000 # Default sample rate for ASR
|
| 26 |
BUFFERED_INFERENCE_DURATION_THRESHOLD = 60.0  # 60 seconds and above will require chunked inference.
|
|
|
|
|
|
|
| 27 |
|
| 28 |
TITLE = "NeMo ASR Inference on Hugging Face"
|
| 29 |
DESCRIPTION = "Demo of all languages supported by NeMo ASR"
|
|
@@ -184,11 +188,14 @@ def convert_audio(audio_filepath):
|
|
| 184 |
return audio_filepath
|
| 185 |
|
| 186 |
out_filename = os.path.join(filedir, filename + '.wav')
|
|
|
|
| 187 |
process = subprocess.Popen(
|
| 188 |
-
['ffmpeg', '-i', audio_filepath, '-ac', '1', '-ar', str(SAMPLE_RATE), out_filename],
|
| 189 |
stdout=subprocess.PIPE,
|
| 190 |
stderr=subprocess.STDOUT,
|
|
|
|
| 191 |
)
|
|
|
|
| 192 |
stdout, stderr = process.communicate()
|
| 193 |
|
| 194 |
if os.path.exists(out_filename):
|
|
@@ -368,6 +375,7 @@ def infer_audio(model_name: str, audio_file: str) -> str:
|
|
| 368 |
|
| 369 |
def transcribe(microphone, audio_file, model_name):
|
| 370 |
|
|
|
|
| 371 |
warn_output = ""
|
| 372 |
if (microphone is not None) and (audio_file is not None):
|
| 373 |
warn_output = (
|
|
@@ -384,15 +392,32 @@ def transcribe(microphone, audio_file, model_name):
|
|
| 384 |
else:
|
| 385 |
audio_data = audio_file
|
| 386 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 387 |
time_diff = None
|
| 388 |
try:
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 394 |
|
| 395 |
except Exception as e:
|
|
|
|
|
|
|
| 396 |
transcriptions = ""
|
| 397 |
warn_output = warn_output
|
| 398 |
|
|
@@ -412,8 +437,6 @@ def transcribe(microphone, audio_file, model_name):
|
|
| 412 |
if transcriptions.startswith("Error:-"):
|
| 413 |
html_output = build_html_output(transcriptions, style="result_item_error")
|
| 414 |
else:
|
| 415 |
-
audio_duration = parse_duration(audio_data)
|
| 416 |
-
|
| 417 |
output = f"Successfully transcribed on {get_device()} ! <br>" f"Transcription Time : {time_diff: 0.3f} s"
|
| 418 |
|
| 419 |
if audio_duration > BUFFERED_INFERENCE_DURATION_THRESHOLD:
|
|
@@ -533,10 +556,11 @@ with demo:
|
|
| 533 |
|
| 534 |
lang_selector, models_in_lang = create_lang_selector_component()
|
| 535 |
|
|
|
|
|
|
|
| 536 |
transcript = gr.components.Label(label='Transcript')
|
| 537 |
audio_html_output = gr.components.HTML()
|
| 538 |
|
| 539 |
-
run = gr.components.Button('Transcribe')
|
| 540 |
run.click(
|
| 541 |
transcribe, inputs=[microphone, file_upload, models_in_lang], outputs=[transcript, audio_html_output]
|
| 542 |
)
|
|
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
+
import shutil
|
| 4 |
import uuid
|
| 5 |
import tempfile
|
| 6 |
import subprocess
|
| 7 |
import re
|
| 8 |
import time
|
| 9 |
+
import traceback
|
| 10 |
|
| 11 |
import gradio as gr
|
| 12 |
import pytube as pt
|
|
|
|
| 26 |
|
| 27 |
SAMPLE_RATE = 16000 # Default sample rate for ASR
|
| 28 |
BUFFERED_INFERENCE_DURATION_THRESHOLD = 60.0  # 60 seconds and above will require chunked inference.
|
| 29 |
+
CHUNK_LEN_IN_SEC = 20.0 # Chunk size
|
| 30 |
+
BUFFER_LEN_IN_SEC = 30.0 # Total buffer size
|
| 31 |
|
| 32 |
TITLE = "NeMo ASR Inference on Hugging Face"
|
| 33 |
DESCRIPTION = "Demo of all languages supported by NeMo ASR"
|
|
|
|
| 188 |
return audio_filepath
|
| 189 |
|
| 190 |
out_filename = os.path.join(filedir, filename + '.wav')
|
| 191 |
+
|
| 192 |
process = subprocess.Popen(
|
| 193 |
+
['ffmpeg', '-y', '-i', audio_filepath, '-ac', '1', '-ar', str(SAMPLE_RATE), out_filename],
|
| 194 |
stdout=subprocess.PIPE,
|
| 195 |
stderr=subprocess.STDOUT,
|
| 196 |
+
close_fds=True,
|
| 197 |
)
|
| 198 |
+
|
| 199 |
stdout, stderr = process.communicate()
|
| 200 |
|
| 201 |
if os.path.exists(out_filename):
|
|
|
|
| 375 |
|
| 376 |
def transcribe(microphone, audio_file, model_name):
|
| 377 |
|
| 378 |
+
audio_data = None
|
| 379 |
warn_output = ""
|
| 380 |
if (microphone is not None) and (audio_file is not None):
|
| 381 |
warn_output = (
|
|
|
|
| 392 |
else:
|
| 393 |
audio_data = audio_file
|
| 394 |
|
| 395 |
+
if audio_data is not None:
|
| 396 |
+
audio_duration = parse_duration(audio_data)
|
| 397 |
+
else:
|
| 398 |
+
audio_duration = None
|
| 399 |
+
|
| 400 |
time_diff = None
|
| 401 |
try:
|
| 402 |
+
with tempfile.TemporaryDirectory() as tempdir:
|
| 403 |
+
filename = os.path.split(audio_data)[-1]
|
| 404 |
+
new_audio_data = os.path.join(tempdir, filename)
|
| 405 |
+
shutil.copy2(audio_data, new_audio_data)
|
| 406 |
+
|
| 407 |
+
if os.path.exists(audio_data):
|
| 408 |
+
os.remove(audio_data)
|
| 409 |
+
|
| 410 |
+
audio_data = new_audio_data
|
| 411 |
+
|
| 412 |
+
# Use HF API for transcription
|
| 413 |
+
start = time.time()
|
| 414 |
+
transcriptions = infer_audio(model_name, audio_data)
|
| 415 |
+
end = time.time()
|
| 416 |
+
time_diff = end - start
|
| 417 |
|
| 418 |
except Exception as e:
|
| 419 |
+
print(traceback.print_exc())
|
| 420 |
+
|
| 421 |
transcriptions = ""
|
| 422 |
warn_output = warn_output
|
| 423 |
|
|
|
|
| 437 |
if transcriptions.startswith("Error:-"):
|
| 438 |
html_output = build_html_output(transcriptions, style="result_item_error")
|
| 439 |
else:
|
|
|
|
|
|
|
| 440 |
output = f"Successfully transcribed on {get_device()} ! <br>" f"Transcription Time : {time_diff: 0.3f} s"
|
| 441 |
|
| 442 |
if audio_duration > BUFFERED_INFERENCE_DURATION_THRESHOLD:
|
|
|
|
| 556 |
|
| 557 |
lang_selector, models_in_lang = create_lang_selector_component()
|
| 558 |
|
| 559 |
+
run = gr.components.Button('Transcribe')
|
| 560 |
+
|
| 561 |
transcript = gr.components.Label(label='Transcript')
|
| 562 |
audio_html_output = gr.components.HTML()
|
| 563 |
|
|
|
|
| 564 |
run.click(
|
| 565 |
transcribe, inputs=[microphone, file_upload, models_in_lang], outputs=[transcript, audio_html_output]
|
| 566 |
)
|