Commit · b46a961
1 Parent(s): 688d28c
go back to whisperx
Browse files
- handler.py +13 -25
handler.py
CHANGED
|
@@ -7,12 +7,9 @@ import torch
|
|
| 7 |
# process = subprocess.Popen(['pip', 'install', '--force-reinstall', 'onnxruntime-gpu'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
| 8 |
# stdout, stderr = process.communicate()
|
| 9 |
|
| 10 |
-
|
| 11 |
-
from pathlib import Path
|
| 12 |
import os
|
| 13 |
|
| 14 |
-
# import nvidia.cublas.lib
|
| 15 |
-
# import nvidia.cudnn.lib
|
| 16 |
import time
|
| 17 |
import json
|
| 18 |
import base64
|
|
@@ -28,11 +25,6 @@ import logging
|
|
| 28 |
|
| 29 |
logger = logging.getLogger(__name__)
|
| 30 |
|
| 31 |
-
# logger.info(Path(nvidia.cublas.lib.__file__).parent)
|
| 32 |
-
# logger.info(Path(nvidia.cudnn.lib.__file__).parent)
|
| 33 |
-
# os.environ["LD_LIBRARY_PATH"] = ":".join(
|
| 34 |
-
# [Path(nvidia.cublas.lib.__file__).parent, Path(nvidia.cudnn.lib.__file__).parent]
|
| 35 |
-
# )
|
| 36 |
SAMPLE_RATE = 16000
|
| 37 |
|
| 38 |
|
|
@@ -196,8 +188,7 @@ class EndpointHandler:
|
|
| 196 |
def __init__(self, path=""):
|
| 197 |
# load the model
|
| 198 |
device, batch_size, compute_type, whisper_model = whisper_config()
|
| 199 |
-
|
| 200 |
-
self.model = WhisperModel(whisper_model, device=device, compute_type=compute_type)
|
| 201 |
# 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 ERIC
|
| 202 |
# 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 OLD
|
| 203 |
logger.info(f"Model {whisper_model} initialized")
|
|
@@ -253,13 +244,13 @@ class EndpointHandler:
|
|
| 253 |
with open("/tmp/myfile.tmp", "wb") as w:
|
| 254 |
w.write(inputs)
|
| 255 |
|
| 256 |
-
|
| 257 |
-
|
| 258 |
# clean up
|
| 259 |
-
|
| 260 |
|
| 261 |
-
|
| 262 |
-
|
| 263 |
|
| 264 |
# get the end time
|
| 265 |
et = time.time()
|
|
@@ -272,19 +263,16 @@ class EndpointHandler:
|
|
| 272 |
|
| 273 |
# 2. transcribe
|
| 274 |
logger.info("--------------- STARTING TRANSCRIPTION ------------------------")
|
| 275 |
-
|
| 276 |
-
segments, _ = self.model.transcribe("/tmp/myfile.tmp", beam_size=5)
|
| 277 |
if info:
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
# logger.info(transcription["segments"][0:10000])
|
| 281 |
-
logger.info("segments"[0:10000])
|
| 282 |
|
| 283 |
try:
|
| 284 |
-
first_text = segments[0]["text"]
|
| 285 |
except:
|
| 286 |
logger.warning("No transcription")
|
| 287 |
-
return {"transcription": segments}
|
| 288 |
|
| 289 |
# get the execution time
|
| 290 |
et = time.time()
|
|
@@ -338,4 +326,4 @@ class EndpointHandler:
|
|
| 338 |
|
| 339 |
# results_json = json.dumps(results)
|
| 340 |
# return {"results": results_json}
|
| 341 |
-
return {"transcription": [s["text"] for s in segments]}
|
|
|
|
| 7 |
# process = subprocess.Popen(['pip', 'install', '--force-reinstall', 'onnxruntime-gpu'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
| 8 |
# stdout, stderr = process.communicate()
|
| 9 |
|
| 10 |
+
import whisperx
|
|
|
|
| 11 |
import os
|
| 12 |
|
|
|
|
|
|
|
| 13 |
import time
|
| 14 |
import json
|
| 15 |
import base64
|
|
|
|
| 25 |
|
| 26 |
logger = logging.getLogger(__name__)
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
SAMPLE_RATE = 16000
|
| 29 |
|
| 30 |
|
|
|
|
| 188 |
def __init__(self, path=""):
|
| 189 |
# load the model
|
| 190 |
device, batch_size, compute_type, whisper_model = whisper_config()
|
| 191 |
+
self.model = whisperx.load_model(whisper_model, device=device, compute_type=compute_type)
|
|
|
|
| 192 |
# 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 ERIC
|
| 193 |
# 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 OLD
|
| 194 |
logger.info(f"Model {whisper_model} initialized")
|
|
|
|
| 244 |
with open("/tmp/myfile.tmp", "wb") as w:
|
| 245 |
w.write(inputs)
|
| 246 |
|
| 247 |
+
audio_nparray = ffmpeg_load_audio("/tmp/myfile.tmp", sr=SAMPLE_RATE, mono=True, out_type=np.float32)
|
| 248 |
+
audio_nparray = load_audio("/tmp/myfile.tmp", sr=SAMPLE_RATE)
|
| 249 |
# clean up
|
| 250 |
+
os.remove("/tmp/myfile.tmp")
|
| 251 |
|
| 252 |
+
audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
|
| 253 |
+
audio_tensor = torch.from_numpy(audio_nparray)
|
| 254 |
|
| 255 |
# get the end time
|
| 256 |
et = time.time()
|
|
|
|
| 263 |
|
| 264 |
# 2. transcribe
|
| 265 |
logger.info("--------------- STARTING TRANSCRIPTION ------------------------")
|
| 266 |
+
transcription = self.model.transcribe(audio_nparray, batch_size=batch_size, language=language)
|
|
|
|
| 267 |
if info:
|
| 268 |
+
print(transcription["segments"][0:10000]) # before alignment
|
| 269 |
+
logger.info(transcription["segments"][0:10000])
|
|
|
|
|
|
|
| 270 |
|
| 271 |
try:
|
| 272 |
+
first_text = transcription["segments"][0]["text"]
|
| 273 |
except:
|
| 274 |
logger.warning("No transcription")
|
| 275 |
+
return {"transcription": transcription["segments"]}
|
| 276 |
|
| 277 |
# get the execution time
|
| 278 |
et = time.time()
|
|
|
|
| 326 |
|
| 327 |
# results_json = json.dumps(results)
|
| 328 |
# return {"results": results_json}
|
| 329 |
+
return {"transcription": [s["text"] for s in transcription["segments"]]}
|