ericmattmann committed
Commit 3d5a24c · 1 Parent(s): 1366c4e

update to perform only transcription

Files changed (1): handler.py (+162 -157)
handler.py CHANGED
@@ -14,7 +14,7 @@ import json
 import base64
 import numpy as np
 
-DEVNULL = open(os.devnull, 'w')
+DEVNULL = open(os.devnull, "w")
 
 # from transformers.pipelines.audio_utils import ffmpeg_read
 from typing import Dict, List, Any
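
(The quote change above is purely stylistic; DEVNULL exists only to silence ffmpeg's stderr. As a design note, the stdlib constant subprocess.DEVNULL does the same job without keeping a module-level file handle open. A minimal sketch, with an illustrative command:

    import subprocess

    # Discard stderr without managing our own file object.
    p = subprocess.Popen(["ffmpeg", "-version"], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
    out, _ = p.communicate()
)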
@@ -26,39 +26,47 @@ logger = logging.getLogger(__name__)
 
 SAMPLE_RATE = 16000
 
+
 def whisper_config():
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    whisper_model = "large-v2"
-    batch_size = 16 # reduce if low on GPU mem, 16 initially
+    whisper_model = "large-v3"
+    batch_size = 24  # reduce if low on GPU mem, 16 initially
     # change to "int8" if low on GPU mem (may reduce accuracy)
     compute_type = "float16" if device == "cuda" else "int8"
     return device, batch_size, compute_type, whisper_model
 
+
 # From https://gist.github.com/kylemcdonald/85d70bf53e207bab3775
 # load_audio can not detect the input type
 def ffmpeg_load_audio(filename, sr=44100, mono=False, normalize=True, in_type=np.int16, out_type=np.float32):
     channels = 1 if mono else 2
     format_strings = {
-        np.float64: 'f64le',
-        np.float32: 'f32le',
-        np.int16: 's16le',
-        np.int32: 's32le',
-        np.uint32: 'u32le'
+        np.float64: "f64le",
+        np.float32: "f32le",
+        np.int16: "s16le",
+        np.int32: "s32le",
+        np.uint32: "u32le",
     }
     format_string = format_strings[in_type]
     command = [
-        'ffmpeg',
-        '-i', filename,
-        '-f', format_string,
-        '-acodec', 'pcm_' + format_string,
-        '-ar', str(sr),
-        '-ac', str(channels),
-        '-']
+        "ffmpeg",
+        "-i",
+        filename,
+        "-f",
+        format_string,
+        "-acodec",
+        "pcm_" + format_string,
+        "-ar",
+        str(sr),
+        "-ac",
+        str(channels),
+        "-",
+    ]
     p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=DEVNULL, bufsize=4096)
     bytes_per_sample = np.dtype(in_type).itemsize
     frame_size = bytes_per_sample * channels
-    chunk_size = frame_size * sr # read in 1-second chunks
-    raw = b''
+    chunk_size = frame_size * sr  # read in 1-second chunks
+    raw = b""
     with p.stdout as stdout:
         while True:
             data = stdout.read(chunk_size)
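
(The hunk above is mostly a quote-style and one-argument-per-line reformat of the ffmpeg command; the trailing comma and the list close on its own line are the only structural edits. For reference, a hedged usage sketch of ffmpeg_load_audio; the file name and rate are illustrative, and it assumes an ffmpeg binary on PATH plus the rest of the gist's function body:

    audio = ffmpeg_load_audio("sample.wav", sr=16000, mono=True, out_type=np.float32)
    # Raw PCM is read from the ffmpeg pipe in 1-second chunks (chunk_size bytes each).
    print(audio.dtype, len(audio))
)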
@@ -80,6 +88,7 @@ def ffmpeg_load_audio(filename, sr=44100, mono=False, normalize=True, in_type=np
         audio /= np.iinfo(in_type).max
     return audio
 
+
 # FROM HuggingFace
 def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
     """
@@ -167,14 +176,15 @@ def load_audio(file: str, sr: int = SAMPLE_RATE):
 def display_gpu_infos():
     if not torch.cuda.is_available():
         return "NO CUDA"
-
+
     infos = "torch.cuda.current_device(): " + str(torch.cuda.current_device()) + ", "
-    infos = infos + "torch.cuda.device(0): " + str(torch.cuda.device(0)) + ", "
+    infos = infos + "torch.cuda.device(0): " + str(torch.cuda.device(0)) + ", "
     infos = infos + "torch.cuda.device_count(): " + str(torch.cuda.device_count()) + ", "
     infos = infos + "torch.cuda.get_device_name(0): " + str(torch.cuda.get_device_name(0))
     return infos
 
-class EndpointHandler():
+
+class EndpointHandler:
     def __init__(self, path=""):
         # load the model
         device, batch_size, compute_type, whisper_model = whisper_config()
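
(The actual ASR model load in __init__ falls outside this hunk's context lines. A hedged sketch of what it presumably looks like, given that whisper_config() supplies the arguments; the exact call is an assumption, though whisperx.load_model is the usual whisperx entry point:

    import whisperx

    device, batch_size, compute_type, whisper_model = whisper_config()
    # Assumed load call; not shown in the diff itself.
    model = whisperx.load_model(whisper_model, device, compute_type=compute_type)
)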
@@ -182,143 +192,138 @@ class EndpointHandler():
         # 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 ERIC
         # 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 OLD
         logger.info(f"Model {whisper_model} initialized")
-
-        self.diarize_model = whisperx.DiarizationPipeline(
-            "pyannote/speaker-diarization-3.1",
-            use_auth_token="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", device=device)
-
-        logger.info(f"Model for diarization initialized")
-
-
+
+        # self.diarize_model = whisperx.DiarizationPipeline(
+        #     "pyannote/speaker-diarization-3.1", use_auth_token="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", device=device
+        # )
+
+        # logger.info(f"Model for diarization initialized")
+
     def __call__(self, data: Any) -> Dict[str, str]:
-        """
-        Args:
-            data (:obj:):
-                includes the deserialized audio file as bytes
-        Return:
-            A :obj:`dict`:. base64 encoded image
-        """
-        # get the start time
-        st = time.time()
-
-
-        logger.info("--------------- CONFIGURATION ------------------------")
-        device, batch_size, compute_type, whisper_model = whisper_config()
-        logger.info(f"device: {device}, batch_size: {batch_size}, compute_type:{compute_type}, whisper_model: {whisper_model}")
-        logger.info(display_gpu_infos())
-
-        # 1. process input
-        inputs_encoded = data.pop("inputs", data)
-        parameters = data.pop("parameters", None)
-        options = data.pop("options", None)
-
-        # OPTIONS are given as parameters
-        info = False
-        if options and "info" in options.keys() and options['info']:
-            info = True
-
-        alignment = False
-        if options and "alignment" in options.keys() and options['alignment']:
-            alignment = True
-
-        diarization = True
-        if options and "diarization" in options.keys() and not options['diarization']:
-            diarization = False
-
-        language = "fr"
-        if parameters and "language" in parameters.keys():
-            language = parameters["language"]
-
-        inputs = base64.b64decode(inputs_encoded)
-        # make a tmp file
-        with open('/tmp/myfile.tmp', 'wb') as w:
-            w.write(inputs)
-
-        # audio_nparray = ffmpeg_load_audio('/tmp/myfile.tmp', sr=SAMPLE_RATE, mono=True, out_type=np.float32)
-        audio_nparray = load_audio('/tmp/myfile.tmp', sr=SAMPLE_RATE)
-        # clean up
-        os.remove('/tmp/myfile.tmp')
-
-        # audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
-        # audio_tensor= torch.from_numpy(audio_nparray)
-
-        # get the end time
-        et = time.time()
-
-        # get the execution time
-        elapsed_time = et - st
-        logger.info(f"TIME for audio processing : {elapsed_time:.2f} seconds")
-        if info:
-            print(f"TIME for audio processing : {elapsed_time:.2f} seconds")
-
-        # 2. transcribe
-        logger.info("--------------- STARTING TRANSCRIPTION ------------------------")
-        transcription = self.model.transcribe(audio_nparray, batch_size=batch_size,language=language)
-        if info:
-            print(transcription["segments"][0:10000]) # before alignment
-            logger.info(transcription["segments"][0:10000])
-
-        try:
-            first_text = transcription["segments"][0]["text"]
-        except:
-            logger.warning("No transcription")
-            return {"transcription": transcription["segments"]}
-
-        # get the execution time
-        et = time.time()
-        elapsed_time = et - st
-        st = time.time()
-        logger.info(f"TIME for audio transcription : {elapsed_time:.2f} seconds")
-        if info:
-            print(f"TIME for audio transcription : {elapsed_time:.2f} seconds")
-
-        # 3. align
-        if alignment:
-            logger.info("--------------- STARTING ALIGNMENT ------------------------")
-            model_a, metadata = whisperx.load_align_model(
-                language_code=transcription["language"], device=device)
-            transcription = whisperx.align(
-                transcription["segments"], model_a, metadata, audio_nparray, device, return_char_alignments=False)
-            if info:
-                print(transcription["segments"][0:10000])
-                logger.info(transcription["segments"][0:10000])
-
-            # get the execution time
-            et = time.time()
-            elapsed_time = et - st
-            st = time.time()
-            logger.info(f"TIME for alignment : {elapsed_time:.2f} seconds")
-            if info:
-                print(f"TIME for alignment : {elapsed_time:.2f} seconds")
-
-        # 4. Assign speaker labels
-        if diarization:
-            logger.info("--------------- STARTING DIARIZATION ------------------------")
-            # add min/max number of speakers if known
-            diarize_segments = self.diarize_model(audio_nparray)
-            if info:
-                print(diarize_segments)
-                logger.info(diarize_segments)
-            # diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
-
-            transcription = whisperx.assign_word_speakers(diarize_segments, transcription)
-            if info:
-                print(transcription["segments"][0:10000])
-                logger.info(transcription["segments"][0:10000]) # segments are now assigned speaker IDs
-
-            # get the execution time
-            et = time.time()
-            elapsed_time = et - st
-            st = time.time()
-            logger.info(f"TIME for audio diarization : {elapsed_time:.2f} seconds")
-            if info:
-                print(f"TIME for audio diarization : {elapsed_time:.2f} seconds")
-
-        # results_json = json.dumps(results)
-        # return {"results": results_json}
+        """
+        Args:
+            data (:obj:):
+                includes the deserialized audio file as bytes
+        Return:
+            A :obj:`dict`:. base64 encoded image
+        """
+        # get the start time
+        st = time.time()
+
+        logger.info("--------------- CONFIGURATION ------------------------")
+        device, batch_size, compute_type, whisper_model = whisper_config()
+        logger.info(
+            f"device: {device}, batch_size: {batch_size}, compute_type:{compute_type}, whisper_model: {whisper_model}"
+        )
+        logger.info(display_gpu_infos())
+
+        # 1. process input
+        inputs_encoded = data.pop("inputs", data)
+        parameters = data.pop("parameters", None)
+        options = data.pop("options", None)
+
+        # OPTIONS are given as parameters
+        info = False
+        if options and "info" in options.keys() and options["info"]:
+            info = True
+
+        alignment = False
+        if options and "alignment" in options.keys() and options["alignment"]:
+            alignment = True
+
+        diarization = True
+        if options and "diarization" in options.keys() and not options["diarization"]:
+            diarization = False
+
+        language = "fr"
+        if parameters and "language" in parameters.keys():
+            language = parameters["language"]
+
+        inputs = base64.b64decode(inputs_encoded)
+        # make a tmp file
+        with open("/tmp/myfile.tmp", "wb") as w:
+            w.write(inputs)
+
+        # audio_nparray = ffmpeg_load_audio('/tmp/myfile.tmp', sr=SAMPLE_RATE, mono=True, out_type=np.float32)
+        audio_nparray = load_audio("/tmp/myfile.tmp", sr=SAMPLE_RATE)
+        # clean up
+        os.remove("/tmp/myfile.tmp")
+
+        # audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
+        # audio_tensor= torch.from_numpy(audio_nparray)
+
+        # get the end time
+        et = time.time()
+
+        # get the execution time
+        elapsed_time = et - st
+        logger.info(f"TIME for audio processing : {elapsed_time:.2f} seconds")
+        if info:
+            print(f"TIME for audio processing : {elapsed_time:.2f} seconds")
+
+        # 2. transcribe
+        logger.info("--------------- STARTING TRANSCRIPTION ------------------------")
+        transcription = self.model.transcribe(audio_nparray, batch_size=batch_size, language=language)
+        if info:
+            print(transcription["segments"][0:10000])  # before alignment
+            logger.info(transcription["segments"][0:10000])
+
+        try:
+            first_text = transcription["segments"][0]["text"]
+        except:
+            logger.warning("No transcription")
             return {"transcription": transcription["segments"]}
 
-
-
-
-
+        # get the execution time
+        et = time.time()
+        elapsed_time = et - st
+        st = time.time()
+        logger.info(f"TIME for audio transcription : {elapsed_time:.2f} seconds")
+        if info:
+            print(f"TIME for audio transcription : {elapsed_time:.2f} seconds")
+
+        # # 3. align
+        # if alignment:
+        #     logger.info("--------------- STARTING ALIGNMENT ------------------------")
+        #     model_a, metadata = whisperx.load_align_model(language_code=transcription["language"], device=device)
+        #     transcription = whisperx.align(
+        #         transcription["segments"], model_a, metadata, audio_nparray, device, return_char_alignments=False
+        #     )
+        #     if info:
+        #         print(transcription["segments"][0:10000])
+        #         logger.info(transcription["segments"][0:10000])
+
+        # # get the execution time
+        # et = time.time()
+        # elapsed_time = et - st
+        # st = time.time()
+        # logger.info(f"TIME for alignment : {elapsed_time:.2f} seconds")
+        # if info:
+        #     print(f"TIME for alignment : {elapsed_time:.2f} seconds")
+
+        # # 4. Assign speaker labels
+        # if diarization:
+        #     logger.info("--------------- STARTING DIARIZATION ------------------------")
+        #     # add min/max number of speakers if known
+        #     diarize_segments = self.diarize_model(audio_nparray)
+        #     if info:
+        #         print(diarize_segments)
+        #         logger.info(diarize_segments)
+        #     # diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
+
+        #     transcription = whisperx.assign_word_speakers(diarize_segments, transcription)
+        #     if info:
+        #         print(transcription["segments"][0:10000])
+        #         logger.info(transcription["segments"][0:10000])  # segments are now assigned speaker IDs
+
+        #     # get the execution time
+        #     et = time.time()
+        #     elapsed_time = et - st
+        #     st = time.time()
+        #     logger.info(f"TIME for audio diarization : {elapsed_time:.2f} seconds")
+        #     if info:
+        #         print(f"TIME for audio diarization : {elapsed_time:.2f} seconds")
+
+        # results_json = json.dumps(results)
+        # return {"results": results_json}
+        return {"transcription": transcription["segments"]}
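
(After this commit, __call__ still parses the alignment and diarization options, but only the transcription path runs. A hedged client-side sketch of the payload shape the handler expects; the file name is a placeholder:

    import base64
    import json

    with open("meeting.wav", "rb") as f:
        payload = {
            "inputs": base64.b64encode(f.read()).decode("utf-8"),
            "parameters": {"language": "fr"},
            # Still parsed, but alignment/diarization are now no-ops.
            "options": {"info": True, "alignment": False, "diarization": False},
        }
    body = json.dumps(payload)
)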
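
(A matching sketch of calling the handler and reading its result; "text" comes from the code above, while the "start"/"end" timestamp keys are an assumption about whisperx segment dicts:

    handler = EndpointHandler()
    result = handler({"inputs": payload["inputs"], "parameters": {"language": "fr"}})
    for seg in result["transcription"]:
        # Each whisperx segment typically carries timestamps plus the text.
        print(seg.get("start"), seg.get("end"), seg.get("text"))
)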