Yuekai Zhang committed · Commit c397ab6 · 1 parent: b0df9b2

add gpu support

Files changed:
- Dockerfile (+6 -1)
- app.py (+115 -0, new file)
- funasr_onnx/utils/__pycache__/__init__.cpython-38.pyc (deleted)
- funasr_onnx/utils/__pycache__/e2e_vad.cpython-38.pyc (deleted)
- funasr_onnx/utils/__pycache__/frontend.cpython-38.pyc (deleted)
- funasr_onnx/utils/__pycache__/postprocess_utils.cpython-38.pyc (deleted)
- funasr_onnx/utils/__pycache__/timestamp_utils.cpython-38.pyc (deleted)
- funasr_onnx/utils/__pycache__/utils.cpython-38.pyc (deleted)
- requirements-gradio.txt (+12 -0, new file)
- requirements.txt (+1 -0)
- transcribe.py (+60 -17)
Dockerfile CHANGED

@@ -1,4 +1,9 @@
 FROM nvcr.io/nvidia/pytorch:22.12-py3
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt-get update && apt-get install -y ffmpeg
 COPY ./ /workspace/
 WORKDIR /workspace/
-RUN pip3 install -r requirements.txt
+RUN pip3 install --no-cache-dir --upgrade -r requirements-gradio.txt
+RUN chmod -R 777 /workspace/*
+
+CMD ["python", "app.py"]
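The image now installs ffmpeg because app.py (below) shells out to it for audio conversion. A minimal sanity check one could run inside the container — a sketch, not part of the commit:

    import shutil

    # app.py calls ffmpeg via os.system, so it must be on PATH inside the image
    assert shutil.which("ffmpeg") is not None, "ffmpeg not found on PATH"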
app.py ADDED

@@ -0,0 +1,115 @@
+from funasr_onnx import Fsmn_vad, Paraformer, CT_Transformer
+from transcribe import get_models, transcribe
+import soundfile
+import gradio as gr
+import pytube as pt
+import datetime
+import os
+
+asr_model, vad_model, punc_model = get_models("./models")
+
+def convert_to_wav(in_filename: str):
+    """Convert the input audio to a 16 kHz wave file and return its samples"""
+    out_filename = in_filename + ".wav"
+    if '.mp3' in in_filename:
+        _ = os.system(f"ffmpeg -y -i '{in_filename}' -acodec pcm_s16le -ac 1 -ar 16000 '{out_filename}'")
+    else:
+        _ = os.system(f"ffmpeg -hide_banner -y -i '{in_filename}' -ar 16000 '{out_filename}'")
+    speech, _ = soundfile.read(out_filename)
+    print(f"load speech shape {speech.shape}")
+    return speech
+
+def file_transcribe(microphone, file_upload):
+    warn_output = ""
+    if (microphone is not None) and (file_upload is not None):
+        warn_output = (
+            "WARNING: You've uploaded an audio file and used the microphone. "
+            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
+        )
+
+    elif (microphone is None) and (file_upload is None):
+        return "ERROR: You have to either use the microphone or upload an audio file"
+
+    file = microphone if microphone is not None else file_upload
+
+    speech = convert_to_wav(file)
+
+    items = []
+    vad_model.vad_scorer.AllResetDetection()
+    for item in transcribe(speech, asr_model, vad_model, punc_model):
+        items.append(item)
+        print(item)
+
+    text = "\n".join(items)
+
+    return warn_output + text
+
+
+def _return_yt_html_embed(yt_url):
+    video_id = yt_url.split("?v=")[-1]
+    HTML_str = (
+        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
+        " </center>"
+    )
+    return HTML_str
+
+
+def youtube_transcribe(yt_url):
+    yt = pt.YouTube(yt_url)
+    html_embed_str = _return_yt_html_embed(yt_url)
+    stream = yt.streams.filter(only_audio=True)[0]
+    filename = "audio.mp3"
+    stream.download(filename=filename)
+
+    speech = convert_to_wav(filename)
+    items = []
+    vad_model.vad_scorer.AllResetDetection()
+    for item in transcribe(speech, asr_model, vad_model, punc_model):
+        items.append(item)
+        print(item)
+
+    text = "\n".join(items)
+    os.system("rm -rf audio.mp3 audio.mp3.wav")
+    return html_embed_str, text
+
+
+def run():
+    gr.close_all()
+    demo = gr.Blocks()
+
+    mf_transcribe = gr.Interface(
+        fn=file_transcribe,
+        inputs=[
+            gr.inputs.Audio(source="microphone", type="filepath", optional=True),
+            gr.inputs.Audio(source="upload", type="filepath", optional=True),
+        ],
+        outputs="text",
+        layout="horizontal",
+        theme="huggingface",
+        title="ParaformerX: Copilot for Audio",
+        description=(
+            "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the pretrained paraformer model to transcribe audio files of arbitrary length."
+        ),
+        allow_flagging="never",
+    )
+
+    yt_transcribe = gr.Interface(
+        fn=youtube_transcribe,
+        inputs=[gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
+        outputs=["html", "text"],
+        layout="horizontal",
+        theme="huggingface",
+        title="Demo: Transcribe YouTube",
+        description=(
+            "Transcribe long-form YouTube videos with the click of a button! Demo uses the pretrained paraformer model to transcribe audio files of arbitrary length."
+        ),
+        allow_flagging="never",
+    )
+
+    with demo:
+        gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])
+
+    demo.launch(server_name="0.0.0.0", server_port=7860, enable_queue=True)
+
+if __name__ == "__main__":
+    run()
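Both entry points call vad_model.vad_scorer.AllResetDetection() before transcribing; the FSMN VAD evidently carries streaming state across calls, so it must be cleared between files. A minimal sketch of the same reset-then-transcribe pattern outside Gradio (the wav filenames are hypothetical 16 kHz mono inputs):

    import soundfile
    from transcribe import get_models, transcribe

    asr_model, vad_model, punc_model = get_models("./models")
    for wav in ["a.wav", "b.wav"]:                 # hypothetical inputs
        vad_model.vad_scorer.AllResetDetection()   # clear VAD state between files
        speech, _ = soundfile.read(wav)
        print("\n".join(transcribe(speech, asr_model, vad_model, punc_model)))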
funasr_onnx/utils/__pycache__/__init__.cpython-38.pyc DELETED (binary file, 164 Bytes)

funasr_onnx/utils/__pycache__/e2e_vad.cpython-38.pyc DELETED (binary file, 16.4 kB)

funasr_onnx/utils/__pycache__/frontend.cpython-38.pyc DELETED (binary file, 6.1 kB)

funasr_onnx/utils/__pycache__/postprocess_utils.cpython-38.pyc DELETED (binary file, 3.84 kB)

funasr_onnx/utils/__pycache__/timestamp_utils.cpython-38.pyc DELETED (binary file, 1.52 kB)

funasr_onnx/utils/__pycache__/utils.cpython-38.pyc DELETED (binary file, 10.8 kB)
requirements-gradio.txt ADDED

@@ -0,0 +1,12 @@
+WeTextProcessing
+onnxruntime-gpu
+onnxruntime
+soundfile
+librosa
+scipy
+numpy
+typeguard==2.13.3
+kaldi-native-fbank
+PyYAML>=5.1.2
+gradio
+pytube
requirements.txt CHANGED

@@ -1,4 +1,5 @@
 WeTextProcessing
+onnxruntime-gpu
 onnxruntime
 soundfile
 librosa
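Note that listing both onnxruntime-gpu and onnxruntime can be fragile: the two wheels install the same onnxruntime package, so whichever wins the install determines the available execution providers. A quick check (a sketch, not part of the commit):

    import onnxruntime as ort

    # the GPU wheel exposes 'CUDAExecutionProvider'; the CPU wheel only 'CPUExecutionProvider'
    print(ort.get_available_providers())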
transcribe.py CHANGED

@@ -3,6 +3,7 @@ from funasr_onnx import Fsmn_vad, Paraformer, CT_Transformer
 import datetime
 from itn.chinese.inverse_normalizer import InverseNormalizer
 import argparse
+import torch
 
 def get_args():
     parser = argparse.ArgumentParser(
@@ -29,14 +30,17 @@ def process_time(milliseconds):
     delta = datetime.timedelta(milliseconds=milliseconds)
     time_str = str(delta)
     time_parts = time_str.split(".")[0].split(":")
-    time_hms = "{:02d}:{:02d}:{:02d}".format(int(time_parts[0]), int(time_parts[1]), int(time_parts[2]))
+    time_hms = "{:02d}:{:02d}:{:02d}:{:03d}".format(int(time_parts[0]), int(time_parts[1]), int(time_parts[2]), int(str(milliseconds)[-3:]))
     return time_hms
 
-def get_models(model_dir):
+def get_models(model_dir, batch_size=16, enable_gpu=False):
     vad_model_dir = model_dir + "/speech_fsmn_vad_zh-cn-16k-common-pytorch"
     asr_model_dir = model_dir + "/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
     punc_model_dir = model_dir + "/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
-    …
+    if torch.cuda.is_available() and enable_gpu:
+        asr_model = Paraformer(asr_model_dir, batch_size=batch_size, device_id=0, plot_timestamp_to="./", pred_bias=0)  # gpu
+    else:
+        asr_model = Paraformer(asr_model_dir, batch_size=1, plot_timestamp_to="./", pred_bias=0)  # cpu
     punc_model = CT_Transformer(punc_model_dir)
     vad_model = Fsmn_vad(vad_model_dir)
     return asr_model, vad_model, punc_model
@@ -47,22 +51,61 @@ def load_audio(wav_path):
 
 def transcribe(speech, asr_model, vad_model=None, punc_model=None, invnormalizer=None):
     if vad_model:
+        vad_model.vad_scorer.AllResetDetection()
         segments_info = vad_model(audio_in=speech)
-        assert len(segments_info) == 1, "only support batch_size 1"
-        …
+        assert len(segments_info) == 1, "only support batch_size 1"
+    if asr_model.batch_size > 1:
+        all_results = []
+        assert torch.cuda.is_available(), "only support batch_size > 1 on gpu"
+        i, end, step = 0, len(segments_info[0]), asr_model.batch_size
+        while i < end:
+            sub_segments_info = segments_info[0][i:i+step]
+            seg_speech_list, duration = [], 0
+            for seg in sub_segments_info:
+                if seg[1] == -1:  # end of speech
+                    seg[1] = len(speech) // 16
+                seg_speech = speech[seg[0]*16:seg[1]*16]
+                duration += (seg[1] - seg[0]) / 1000
+                if duration < 8 * asr_model.batch_size:  # total audio per batch capped at 8 s * batch_size
+                    seg_speech_list.append(seg_speech)
+                    i += 1
+                else:
+                    break
+            assert seg_speech_list
+            result = asr_model(seg_speech_list)
+            all_results.extend(result)
+        assert len(all_results) == len(segments_info[0])
+        for i, seg in enumerate(segments_info[0]):
+            if seg[1] == -1:  # end of speech
+                seg[1] = len(speech) // 16
+            result = all_results[i]['preds'][0]
+            if invnormalizer:
+                try:
+                    result = invnormalizer.normalize(result)
+                except:
+                    print("error in normalization")
+            if punc_model:
+                if result:
+                    result = punc_model(result)
+                    result = result[0]
+            item = f"{process_time(seg[0])}-->{process_time(seg[1])} {result}"
+            yield item
+    else:
+        for seg in segments_info[0]:
+            if seg[1] == -1:  # end of speech
+                seg[1] = len(speech) // 16
+            seg_speech = speech[seg[0]*16:seg[1]*16]
 
-        …
+            result = asr_model(seg_speech)
+            result = result[0]['preds'][0]
+            if invnormalizer:
+                result = invnormalizer.normalize(result)
+            if punc_model:
+                if result:
+                    result = punc_model(result)
+                    result = result[0]
+            item = f"{process_time(seg[0])}-->{process_time(seg[1])} {result}"
+            yield item
 
 if __name__ == "__main__":
     args = get_args()
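A minimal sketch of driving the new GPU path (assumes CUDA is available and the model directories exist under ./models; example.wav is a hypothetical 16 kHz mono file):

    import soundfile
    from transcribe import get_models, transcribe

    # enable_gpu=True selects the batched Paraformer on device_id=0;
    # without CUDA, get_models falls back to batch_size=1 on CPU
    asr_model, vad_model, punc_model = get_models("./models", batch_size=16, enable_gpu=True)
    speech, _ = soundfile.read("example.wav")  # hypothetical 16 kHz mono input
    for line in transcribe(speech, asr_model, vad_model, punc_model):
        print(line)  # "HH:MM:SS:mmm-->HH:MM:SS:mmm <text>" per VAD segment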