ericmattmann committed on
Commit
b46a961
·
1 Parent(s): 688d28c

go back to whisperx

Browse files
Files changed (1) hide show
  1. handler.py +13 -25
handler.py CHANGED
@@ -7,12 +7,9 @@ import torch
7
  # process = subprocess.Popen(['pip', 'install', '--force-reinstall', 'onnxruntime-gpu'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
8
  # stdout, stderr = process.communicate()
9
 
10
- from faster_whisper import WhisperModel
11
- from pathlib import Path
12
  import os
13
 
14
- # import nvidia.cublas.lib
15
- # import nvidia.cudnn.lib
16
  import time
17
  import json
18
  import base64
@@ -28,11 +25,6 @@ import logging
28
 
29
  logger = logging.getLogger(__name__)
30
 
31
- # logger.info(Path(nvidia.cublas.lib.__file__).parent)
32
- # logger.info(Path(nvidia.cudnn.lib.__file__).parent)
33
- # os.environ["LD_LIBRARY_PATH"] = ":".join(
34
- # [Path(nvidia.cublas.lib.__file__).parent, Path(nvidia.cudnn.lib.__file__).parent]
35
- # )
36
  SAMPLE_RATE = 16000
37
 
38
 
@@ -196,8 +188,7 @@ class EndpointHandler:
196
  def __init__(self, path=""):
197
  # load the model
198
  device, batch_size, compute_type, whisper_model = whisper_config()
199
-
200
- self.model = WhisperModel(whisper_model, device=device, compute_type=compute_type)
201
  # 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 ERIC
202
  # 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 OLD
203
  logger.info(f"Model {whisper_model} initialized")
@@ -253,13 +244,13 @@ class EndpointHandler:
253
  with open("/tmp/myfile.tmp", "wb") as w:
254
  w.write(inputs)
255
 
256
- # audio_nparray = ffmpeg_load_audio('/tmp/myfile.tmp', sr=SAMPLE_RATE, mono=True, out_type=np.float32)
257
- # audio_nparray = load_audio("/tmp/myfile.tmp", sr=SAMPLE_RATE)
258
  # clean up
259
- # os.remove("/tmp/myfile.tmp")
260
 
261
- # audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
262
- # audio_tensor= torch.from_numpy(audio_nparray)
263
 
264
  # get the end time
265
  et = time.time()
@@ -272,19 +263,16 @@ class EndpointHandler:
272
 
273
  # 2. transcribe
274
  logger.info("--------------- STARTING TRANSCRIPTION ------------------------")
275
- # transcription = self.model.transcribe(audio_nparray, batch_size=batch_size, language=language)
276
- segments, _ = self.model.transcribe("/tmp/myfile.tmp", beam_size=5)
277
  if info:
278
- # print(transcription["segments"][0:10000]) # before alignment
279
- print(segments[0:10000]) # before alignment
280
- # logger.info(transcription["segments"][0:10000])
281
- logger.info("segments"[0:10000])
282
 
283
  try:
284
- first_text = segments[0]["text"]
285
  except:
286
  logger.warning("No transcription")
287
- return {"transcription": segments}
288
 
289
  # get the execution time
290
  et = time.time()
@@ -338,4 +326,4 @@ class EndpointHandler:
338
 
339
  # results_json = json.dumps(results)
340
  # return {"results": results_json}
341
- return {"transcription": [s["text"] for s in segments]}
 
7
  # process = subprocess.Popen(['pip', 'install', '--force-reinstall', 'onnxruntime-gpu'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
8
  # stdout, stderr = process.communicate()
9
 
10
+ import whisperx
 
11
  import os
12
 
 
 
13
  import time
14
  import json
15
  import base64
 
25
 
26
  logger = logging.getLogger(__name__)
27
 
 
 
 
 
 
28
  SAMPLE_RATE = 16000
29
 
30
 
 
188
  def __init__(self, path=""):
189
  # load the model
190
  device, batch_size, compute_type, whisper_model = whisper_config()
191
+ self.model = whisperx.load_model(whisper_model, device=device, compute_type=compute_type)
 
192
  # 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 ERIC
193
  # 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 OLD
194
  logger.info(f"Model {whisper_model} initialized")
 
244
  with open("/tmp/myfile.tmp", "wb") as w:
245
  w.write(inputs)
246
 
247
+ audio_nparray = ffmpeg_load_audio("/tmp/myfile.tmp", sr=SAMPLE_RATE, mono=True, out_type=np.float32)
248
+ audio_nparray = load_audio("/tmp/myfile.tmp", sr=SAMPLE_RATE)
249
  # clean up
250
+ os.remove("/tmp/myfile.tmp")
251
 
252
+ audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
253
+ audio_tensor = torch.from_numpy(audio_nparray)
254
 
255
  # get the end time
256
  et = time.time()
 
263
 
264
  # 2. transcribe
265
  logger.info("--------------- STARTING TRANSCRIPTION ------------------------")
266
+ transcription = self.model.transcribe(audio_nparray, batch_size=batch_size, language=language)
 
267
  if info:
268
+ print(transcription["segments"][0:10000]) # before alignment
269
+ logger.info(transcription["segments"][0:10000])
 
 
270
 
271
  try:
272
+ first_text = transcription["segments"][0]["text"]
273
  except:
274
  logger.warning("No transcription")
275
+ return {"transcription": transcription["segments"]}
276
 
277
  # get the execution time
278
  et = time.time()
 
326
 
327
  # results_json = json.dumps(results)
328
  # return {"results": results_json}
329
+ return {"transcription": [s["text"] for s in transcription["segments"]]}