raphaelbiojout
committed on
Commit
·
c2bb0db
1
Parent(s):
41738b8
update
Browse files - handler.py +47 -12
handler.py
CHANGED
|
@@ -162,10 +162,10 @@ def display_gpu_infos():
|
|
| 162 |
if not torch.cuda.is_available():
|
| 163 |
return "NO CUDA"
|
| 164 |
|
| 165 |
-
infos = "torch.cuda.current_device(): " + str(torch.cuda.current_device()) + "
|
| 166 |
-
infos = infos + "torch.cuda.device(0): " + str(torch.cuda.device(0)) + "
|
| 167 |
-
infos = infos + "torch.cuda.device_count(): " + str(torch.cuda.device_count()) + "
|
| 168 |
-
infos = infos + "torch.cuda.get_device_name(0): " + str(torch.cuda.get_device_name(0))
|
| 169 |
return infos
|
| 170 |
|
| 171 |
class EndpointHandler():
|
|
@@ -192,8 +192,13 @@ class EndpointHandler():
|
|
| 192 |
Return:
|
| 193 |
A :obj:`dict`:. base64 encoded image
|
| 194 |
"""
|
|
|
|
|
|
|
| 195 |
|
| 196 |
-
|
|
|
|
|
|
|
|
|
|
| 197 |
logger.info(display_gpu_infos())
|
| 198 |
|
| 199 |
# 1. process input
|
|
@@ -227,18 +232,36 @@ class EndpointHandler():
|
|
| 227 |
# audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
|
| 228 |
# audio_tensor= torch.from_numpy(audio_nparray)
|
| 229 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
# 2. transcribe
|
| 231 |
-
device, batch_size, compute_type, whisper_model = whisper_config()
|
| 232 |
logger.info("--------------- STARTING TRANSCRIPTION ------------------------")
|
| 233 |
-
logger.info(f"device: {device}, batch_size: {batch_size}, compute_type:{compute_type}, whisper_model: {whisper_model}")
|
| 234 |
transcription = self.model.transcribe(audio_nparray, batch_size=batch_size,language=language)
|
| 235 |
if info:
|
| 236 |
print(transcription["segments"][0:10000]) # before alignment
|
| 237 |
logger.info(transcription["segments"][0:10000])
|
| 238 |
-
|
|
|
|
|
|
|
|
|
|
| 239 |
logger.warning("No transcription")
|
| 240 |
return {"transcription": transcription["segments"]}
|
| 241 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
# 3. align
|
| 243 |
if alignment:
|
| 244 |
logger.info("--------------- STARTING ALIGNMENT ------------------------")
|
|
@@ -249,6 +272,14 @@ class EndpointHandler():
|
|
| 249 |
if info:
|
| 250 |
print(transcription["segments"][0:10000])
|
| 251 |
logger.info(transcription["segments"][0:10000])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
|
| 253 |
# 4. Assign speaker labels
|
| 254 |
logger.info("--------------- STARTING DIARIZATION ------------------------")
|
|
@@ -263,11 +294,15 @@ class EndpointHandler():
|
|
| 263 |
if info:
|
| 264 |
print(transcription["segments"][0:10000])
|
| 265 |
logger.info(transcription["segments"][0:10000]) # segments are now assigned speaker IDs
|
| 266 |
-
|
| 267 |
-
if torch.cuda.is_available():
|
| 268 |
-
logger.info("--------------- GPU AFTER ------------------------")
|
| 269 |
-
logger.info(display_gpu_infos())
|
| 270 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
# results_json = json.dumps(results)
|
| 272 |
# return {"results": results_json}
|
| 273 |
return {"transcription": transcription["segments"]}
|
|
|
|
| 162 |
if not torch.cuda.is_available():
|
| 163 |
return "NO CUDA"
|
| 164 |
|
| 165 |
+
infos = "torch.cuda.current_device(): " + str(torch.cuda.current_device()) + ", "
|
| 166 |
+
infos = infos + "torch.cuda.device(0): " + str(torch.cuda.device(0)) + ", "
|
| 167 |
+
infos = infos + "torch.cuda.device_count(): " + str(torch.cuda.device_count()) + ", "
|
| 168 |
+
infos = infos + "torch.cuda.get_device_name(0): " + str(torch.cuda.get_device_name(0))
|
| 169 |
return infos
|
| 170 |
|
| 171 |
class EndpointHandler():
|
|
|
|
| 192 |
Return:
|
| 193 |
A :obj:`dict`:. base64 encoded image
|
| 194 |
"""
|
| 195 |
+
# get the start time
|
| 196 |
+
st = time.time()
|
| 197 |
|
| 198 |
+
|
| 199 |
+
logger.info("--------------- CONFIGURATION ------------------------")
|
| 200 |
+
device, batch_size, compute_type, whisper_model = whisper_config()
|
| 201 |
+
logger.info(f"device: {device}, batch_size: {batch_size}, compute_type:{compute_type}, whisper_model: {whisper_model}")
|
| 202 |
logger.info(display_gpu_infos())
|
| 203 |
|
| 204 |
# 1. process input
|
|
|
|
| 232 |
# audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
|
| 233 |
# audio_tensor= torch.from_numpy(audio_nparray)
|
| 234 |
|
| 235 |
+
# get the end time
|
| 236 |
+
et = time.time()
|
| 237 |
+
|
| 238 |
+
# get the execution time
|
| 239 |
+
elapsed_time = et - st
|
| 240 |
+
logger.info(f"TIME for audio processing : {elapsed_time:.2f} seconds")
|
| 241 |
+
if info:
|
| 242 |
+
print(f"TIME for audio processing : {elapsed_time:.2f} seconds")
|
| 243 |
+
|
| 244 |
# 2. transcribe
|
|
|
|
| 245 |
logger.info("--------------- STARTING TRANSCRIPTION ------------------------")
|
|
|
|
| 246 |
transcription = self.model.transcribe(audio_nparray, batch_size=batch_size,language=language)
|
| 247 |
if info:
|
| 248 |
print(transcription["segments"][0:10000]) # before alignment
|
| 249 |
logger.info(transcription["segments"][0:10000])
|
| 250 |
+
|
| 251 |
+
try:
|
| 252 |
+
first_text = transcription["segments"][0]["text"]
|
| 253 |
+
except:
|
| 254 |
logger.warning("No transcription")
|
| 255 |
return {"transcription": transcription["segments"]}
|
| 256 |
|
| 257 |
+
# get the execution time
|
| 258 |
+
et = time.time()
|
| 259 |
+
elapsed_time = et - st
|
| 260 |
+
st = time.time()
|
| 261 |
+
logger.info(f"TIME for audio transcription : {elapsed_time:.2f} seconds")
|
| 262 |
+
if info:
|
| 263 |
+
print(f"TIME for audio transcription : {elapsed_time:.2f} seconds")
|
| 264 |
+
|
| 265 |
# 3. align
|
| 266 |
if alignment:
|
| 267 |
logger.info("--------------- STARTING ALIGNMENT ------------------------")
|
|
|
|
| 272 |
if info:
|
| 273 |
print(transcription["segments"][0:10000])
|
| 274 |
logger.info(transcription["segments"][0:10000])
|
| 275 |
+
|
| 276 |
+
# get the execution time
|
| 277 |
+
et = time.time()
|
| 278 |
+
elapsed_time = et - st
|
| 279 |
+
st = time.time()
|
| 280 |
+
logger.info(f"TIME for alignment : {elapsed_time:.2f} seconds")
|
| 281 |
+
if info:
|
| 282 |
+
print(f"TIME for alignment : {elapsed_time:.2f} seconds")
|
| 283 |
|
| 284 |
# 4. Assign speaker labels
|
| 285 |
logger.info("--------------- STARTING DIARIZATION ------------------------")
|
|
|
|
| 294 |
if info:
|
| 295 |
print(transcription["segments"][0:10000])
|
| 296 |
logger.info(transcription["segments"][0:10000]) # segments are now assigned speaker IDs
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
|
| 298 |
+
# get the execution time
|
| 299 |
+
et = time.time()
|
| 300 |
+
elapsed_time = et - st
|
| 301 |
+
st = time.time()
|
| 302 |
+
logger.info(f"TIME for audio diarization : {elapsed_time:.2f} seconds")
|
| 303 |
+
if info:
|
| 304 |
+
print(f"TIME for audio diarization : {elapsed_time:.2f} seconds")
|
| 305 |
+
|
| 306 |
# results_json = json.dumps(results)
|
| 307 |
# return {"results": results_json}
|
| 308 |
return {"transcription": transcription["segments"]}
|