| | from typing import Dict, Any |
| |
|
| | import torch |
| | from transformers import pipeline |
| | from transformers.pipelines.audio_utils import ffmpeg_read |
| |
|
| |
|
class EndpointHandler:
    """Custom Inference Endpoints handler around a Whisper ASR pipeline.

    The model is loaded once at construction time; each call transcribes
    raw encoded audio bytes, with decoding forced to Vietnamese
    transcription.
    """

    def __init__(self, asr_model_path: str = "vphu123/whisper-endpoint"):
        """Build the speech-recognition pipeline.

        Args:
            asr_model_path: Hub id or local path of the Whisper checkpoint.
        """
        # First GPU when available, else CPU.
        device = 0 if torch.cuda.is_available() else "cpu"
        self.pipe = pipeline(
            task="automatic-speech-recognition",
            model=asr_model_path,
            # Long inputs are split into 30 s chunks, so per-chunk
            # generation never needs a huge token budget.
            chunk_length_s=30,
            device=device,
            # Whisper's decoder supports at most 448 target positions.
            # The previous value of 10000 exceeds that limit and makes
            # generation raise on recent transformers versions; 440
            # leaves headroom for the forced decoder prompt tokens.
            max_new_tokens=440,
        )
        # Force Vietnamese transcription regardless of the language the
        # model would otherwise auto-detect.
        self.pipe.model.config.forced_decoder_ids = (
            self.pipe.tokenizer.get_decoder_prompt_ids(
                language="vi", task="transcribe"
            )
        )

    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        """Transcribe one request.

        Args:
            data: Request payload. ``data["inputs"]`` holds the raw
                encoded audio bytes (any container/codec ffmpeg can
                decode). If the key is absent, the payload itself is
                used as the audio bytes.

        Returns:
            ``{"text": <transcription>}``.
        """
        inputs = data.pop("inputs", data)
        # Decode and resample to 16 kHz mono float32, the sampling rate
        # Whisper expects. The pipeline consumes the numpy array directly,
        # so no tensor conversion is needed here (the original built an
        # unused torch tensor).
        audio_nparray = ffmpeg_read(inputs, 16000)
        result = self.pipe(audio_nparray)
        return {"text": result["text"]}