# (Hub page metadata, not code — preserved as a comment so the file parses)
# lingdoc — Update handler.py — commit ce0bcc2 (verified)
# handler.py (for handling asr with faster_whisper)
from faster_whisper import WhisperModel, BatchedInferencePipeline
from typing import Any, Dict, List
from transformers.pipelines.audio_utils import ffmpeg_read
class EndpointHandler:
    """Inference Endpoints handler for ASR with faster-whisper.

    Loads a distilled Whisper v3 model onto CUDA once at startup and serves
    transcription requests, optionally through the batched inference pipeline.
    """

    def __init__(self, path=""):
        # `path` is passed by the endpoint framework but unused here: the
        # model is resolved by name, not from the repository path.
        self.model_size = "distil-large-v3"  # the distilled whisper v3 model
        self.model = WhisperModel(
            self.model_size, device="cuda", compute_type="float16"
        )
        # Batched pipeline wraps the same underlying model (no extra VRAM copy).
        self.batched_model = BatchedInferencePipeline(model=self.model)

    def __call__(self, data: Dict[str, Any]) -> str:
        """Transcribe one audio payload.

        Args:
            data: request dict containing
                - 'inputs': the raw (base64-decoded by the framework) audio bytes
                - 'batched' (optional, default True): use the batched pipeline
                - 'parameters' (optional): extra kwargs forwarded to transcribe()

        Returns:
            The transcribed segments' text joined with single spaces.
        """
        # Process input: fall back to the whole payload if 'inputs' is absent.
        inputs = data.pop("inputs", data)
        # Decode/resample the encoded audio to 16 kHz mono float PCM.
        audio_nparray = ffmpeg_read(inputs, 16000)

        # Retrieve custom arguments.
        batched = data.pop("batched", True)  # default is True if not specified
        params = data.pop("parameters", {})  # all parameters for the model

        if batched:
            segments, _info = self.batched_model.transcribe(audio_nparray, **params)
        else:
            # Bug fix: the sequential path previously ignored `params` and
            # hardcoded beam_size=5. Keep beam_size=5 as the default but let
            # user-supplied parameters override it, matching the batched path.
            kwargs = {"beam_size": 5, **params}
            segments, _info = self.model.transcribe(audio_nparray, **kwargs)

        # `segments` is a lazy generator; materializing it here runs inference.
        return " ".join(segment.text for segment in segments)