raphaelbiojout committed on
Commit
a60c8b9
·
1 Parent(s): f276e75

Encode Base64

Browse files
Files changed (1) hide show
  1. handler.py +55 -12
handler.py CHANGED
@@ -4,6 +4,7 @@ import torch
4
  import os
5
  import time
6
  import json
 
7
  import subprocess
8
  import numpy as np
9
 
@@ -31,29 +32,22 @@ def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
31
  """
32
  ar = f"{sampling_rate}"
33
  ac = "1"
34
- format_for_conversion = "s16le" # was "f32le"
35
  ffmpeg_command = [
36
  "ffmpeg",
37
- "-nostdin",
38
- "-threads",
39
- "0",
40
  "-i",
41
  "pipe:0",
42
- "-f",
43
- format_for_conversion,
44
  "-ac",
45
  ac,
46
  "-ar",
47
  ar,
48
- "-acodec",
49
- "pcm_s16le",
50
  "-hide_banner",
51
  "-loglevel",
52
  "quiet",
53
  "pipe:1",
54
  ]
55
-
56
-
57
 
58
  try:
59
  with subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) as ffmpeg_process:
@@ -71,6 +65,51 @@ def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
71
  return audio
72
 
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  class EndpointHandler():
75
  def __init__(self, path=""):
76
  # load the model
@@ -102,7 +141,11 @@ class EndpointHandler():
102
  print(f"key: {x}, value: {data[x]} ")
103
 
104
  # 1. process input
105
- inputs = data.pop("inputs", data)
 
 
 
 
106
  audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
107
  # audio_tensor= torch.from_numpy(audio_nparray)
108
 
@@ -110,7 +153,7 @@ class EndpointHandler():
110
 
111
  # 2. transcribe
112
  device, batch_size, compute_type, whisper_model = whisper_config()
113
- transcription = self.model.transcribe(audio_nparray, batch_size=batch_size)
114
  results.append({"transcription": transcription["segments"]})
115
  logger.info(transcription["segments"])
116
 
 
4
  import os
5
  import time
6
  import json
7
+ import base64
8
  import subprocess
9
  import numpy as np
10
 
 
32
  """
33
  ar = f"{sampling_rate}"
34
  ac = "1"
35
+ format_for_conversion = "f32le"
36
  ffmpeg_command = [
37
  "ffmpeg",
 
 
 
38
  "-i",
39
  "pipe:0",
 
 
40
  "-ac",
41
  ac,
42
  "-ar",
43
  ar,
44
+ "-f",
45
+ format_for_conversion,
46
  "-hide_banner",
47
  "-loglevel",
48
  "quiet",
49
  "pipe:1",
50
  ]
 
 
51
 
52
  try:
53
  with subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) as ffmpeg_process:
 
65
  return audio
66
 
67
 
68
+ # FROM WHISPERX
69
def load_audio(file: str, sr: int = SAMPLE_RATE):
    """
    Decode an audio file into a mono float32 waveform at the requested rate.

    Decoding, down-mixing and resampling are delegated to an ``ffmpeg``
    subprocess, so the ffmpeg CLI must be installed.

    Parameters
    ----------
    file: str
        The audio file to open; passed to ffmpeg's ``-i`` option.

    sr: int
        The sample rate to resample to; passed to ffmpeg's ``-ar`` option.

    Returns
    -------
    A one-dimensional NumPy float32 array. ffmpeg emits signed 16-bit
    little-endian PCM, and each sample is divided by 32768.0.

    Raises
    ------
    RuntimeError
        If ffmpeg exits with a non-zero status; the message carries the
        decoded stderr output of ffmpeg.
    """
    # Output options: raw s16le stream, one channel, 16-bit PCM codec, target rate.
    output_options = [
        "-f", "s16le",
        "-ac", "1",
        "-acodec", "pcm_s16le",
        "-ar", str(sr),
    ]
    # The trailing "-" makes ffmpeg write the decoded stream to stdout.
    command = ["ffmpeg", "-nostdin", "-threads", "0", "-i", file, *output_options, "-"]

    try:
        completed = subprocess.run(command, capture_output=True, check=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

    # Reinterpret the raw bytes as int16 samples, then convert to float32.
    pcm_samples = np.frombuffer(completed.stdout, np.int16)
    return pcm_samples.flatten().astype(np.float32) / 32768.0
110
+
111
+
112
+
113
  class EndpointHandler():
114
  def __init__(self, path=""):
115
  # load the model
 
141
  print(f"key: {x}, value: {data[x]} ")
142
 
143
  # 1. process input
144
+ inputs_encoded = data.pop("inputs", data)
145
+ inputs = base64.b64decode(inputs_encoded)
146
+
147
+ print(inputs)
148
+
149
  audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
150
  # audio_tensor= torch.from_numpy(audio_nparray)
151
 
 
153
 
154
  # 2. transcribe
155
  device, batch_size, compute_type, whisper_model = whisper_config()
156
+ transcription = self.model.transcribe(audio_nparray, batch_size=batch_size,language="fr")
157
  results.append({"transcription": transcription["segments"]})
158
  logger.info(transcription["segments"])
159