| | import json |
| | import whisper |
| | from typing import Dict |
| |
|
| | from huggingface_hub import hf_hub_download |
| | from whisper import load_model, transcribe |
| | from transformers.pipelines.audio_utils import ffmpeg_read |
| |
|
| | SAMPLE_RATE = 16000 |
| |
|
| |
|
class EndpointHandler:
    """Hugging Face Inference Endpoint handler wrapping OpenAI Whisper.

    Loads the Whisper ``large-v3`` checkpoint once at startup and exposes
    word-level English transcription through ``__call__``.
    """

    def __init__(self, path: str = "") -> None:
        # NOTE(review): the repository `path` HF passes in is ignored and the
        # checkpoint name is hard-coded — confirm this is intentional.
        self.model = whisper.load_model("large-v3")

    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        """Transcribe an audio payload with word-level timestamps.

        Args:
            data: dict whose ``"inputs"`` key holds the raw audio file as
                bytes (any container ffmpeg can decode). If the key is
                absent, ``data`` itself is used as the payload.

        Returns:
            A dict with a single key ``"text"`` whose value is a JSON string
            encoding a list of segments, each a dict with ``start``, ``end``,
            ``text``, and ``words`` (a list of ``{start, end, word}`` dicts).
        """
        inputs = data.pop("inputs", data)
        # Decode the raw bytes into a waveform resampled to 16 kHz, the
        # sample rate Whisper expects.
        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)

        pred_out = transcribe(
            self.model,
            audio=audio_nparray,
            language="en",
            word_timestamps=True,
        )

        # Project each segment down to the fields the client consumes.
        result = [
            {
                "start": segment["start"],
                "end": segment["end"],
                "text": segment["text"],
                "words": [
                    {"start": word["start"], "end": word["end"], "word": word["word"]}
                    for word in segment["words"]
                ],
            }
            for segment in pred_out["segments"]
        ]
        return {"text": json.dumps(result, indent=2, ensure_ascii=False)}