rbiojout
/

whisperX-endpoint

Model card Files and versions

xet

Community

raphaelbiojout committed on Nov 17, 2023

Commit

a60c8b9

1 Parent(s): f276e75

Encode Base64

Browse files

Files changed (1) hide show

handler.py +55 -12

handler.py CHANGED Viewed

@@ -4,6 +4,7 @@ import torch
 import os
 import time
 import json
 import subprocess
 import numpy as np
@@ -31,29 +32,22 @@ def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
     """
     ar = f"{sampling_rate}"
     ac = "1"
-    format_for_conversion = "s16le" # was "f32le"
     ffmpeg_command = [
         "ffmpeg",
-        "-nostdin",
-        "-threads",
-        "0",
         "-i",
         "pipe:0",
-        "-f",
-        format_for_conversion,
         "-ac",
         ac,
         "-ar",
         ar,
-        "-acodec",
-        "pcm_s16le",
         "-hide_banner",
         "-loglevel",
         "quiet",
         "pipe:1",
     ]
     try:
         with subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) as ffmpeg_process:
@@ -71,6 +65,51 @@ def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
     return audio
 class EndpointHandler():
     def __init__(self, path=""):
         # load the model
@@ -102,7 +141,11 @@ class EndpointHandler():
                 print(f"key: {x}, value: {data[x]}    ")
             # 1. process input
-            inputs = data.pop("inputs", data)
             audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
             # audio_tensor= torch.from_numpy(audio_nparray)
@@ -110,7 +153,7 @@ class EndpointHandler():
             # 2. transcribe
             device, batch_size, compute_type, whisper_model = whisper_config()
-            transcription = self.model.transcribe(audio_nparray, batch_size=batch_size)
             results.append({"transcription": transcription["segments"]})
             logger.info(transcription["segments"])

 import os
 import time
 import json
+import base64
 import subprocess
 import numpy as np
     """
     ar = f"{sampling_rate}"
     ac = "1"
+    format_for_conversion = "f32le"
     ffmpeg_command = [
         "ffmpeg",
         "-i",
         "pipe:0",
         "-ac",
         ac,
         "-ar",
         ar,
+        "-f",
+        format_for_conversion,
         "-hide_banner",
         "-loglevel",
         "quiet",
         "pipe:1",
     ]
     try:
         with subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) as ffmpeg_process:
     return audio
+# FROM WHISPERX
+def load_audio(file: str, sr: int = SAMPLE_RATE):
+    """
+    Open an audio file and read as mono waveform, resampling as necessary
+    Parameters
+    ----------
+    file: str
+        The audio file to open
+    sr: int
+        The sample rate to resample the audio if necessary
+    Returns
+    -------
+    A NumPy array containing the audio waveform, in float32 dtype.
+    """
+    try:
+        # Launches a subprocess to decode audio while down-mixing and resampling as necessary.
+        # Requires the ffmpeg CLI to be installed.
+        cmd = [
+            "ffmpeg",
+            "-nostdin",
+            "-threads",
+            "0",
+            "-i",
+            file,  # input path handed to ffmpeg as-is
+            "-f",
+            "s16le",  # raw 16-bit PCM output; matches the np.int16 parsing below
+            "-ac",
+            "1",  # one output channel (mono)
+            "-acodec",
+            "pcm_s16le",
+            "-ar",
+            str(sr),  # target sample rate
+            "-",  # emit the decoded audio on stdout so it can be captured
+        ]
+        out = subprocess.run(cmd, capture_output=True, check=True).stdout  # check=True raises CalledProcessError on a non-zero exit
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e  # surface ffmpeg's captured stderr to the caller
+    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0  # int16 samples scaled by 1/32768 into float32
 class EndpointHandler():
     def __init__(self, path=""):
         # load the model
                 print(f"key: {x}, value: {data[x]}    ")
             # 1. process input
+            inputs_encoded = data.pop("inputs", data)
+            inputs = base64.b64decode(inputs_encoded)
+            print(inputs)
             audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
             # audio_tensor= torch.from_numpy(audio_nparray)
             # 2. transcribe
             device, batch_size, compute_type, whisper_model = whisper_config()
+            transcription = self.model.transcribe(audio_nparray, batch_size=batch_size,language="fr")
             results.append({"transcription": transcription["segments"]})
             logger.info(transcription["segments"])