S-Fry
/

large

Automatic Speech Recognition

hf-asr-leaderboard

Model card Files Files and versions

S-Fry committed on Apr 27, 2023

Commit

e4b911e

·

1 Parent(s): d9964f9

Update handler.py

Files changed (1) hide show

handler.py +23 -28

handler.py CHANGED Viewed

@@ -1,35 +1,30 @@
 from typing import  Dict
-from transformers import WhisperProcessor, WhisperForConditionalGeneration
-from transformers.pipelines.audio_utils import ffmpeg_read
-#import Torch
-#from datasets import load_dataset
-SAMPLE_RATE = 16000
 class EndpointHandler():
     def __init__(self, path=""):
-        # load the model
-        self.processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
-        self.model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
-        self.classifier = AudioClassificationPipeline(model=self.model, processor=self.processor, device=0)
-        self.forced_decoder_ids = self.processor.get_decoder_prompt_ids(language="Danish", task="transcribe")
     def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
-        """
-        Args:
-            data (:obj:):
-                includes the deserialized audio file as bytes
-        Return:
-            A :obj:`dict`:. base64 encoded image
-        """
-        # process input
         inputs = data.pop("inputs", data)
         audio_nparray = ffmpeg_read(inputs, sample_rate=SAMPLE_RATE)
-        audio_tensor= torch.from_numpy(audio_nparray)
-        # run inference pipeline
-        result = self.classifier(audio_nparray)
-        # postprocess the prediction
-        return {"txt": result[0]["transcription"]}

+import torch
 from typing import  Dict
+from transformers import pipeline
+from datasets import load_dataset
+SAMPLE_RATE=16000
 class EndpointHandler():
     def __init__(self, path=""):
+        device = "cuda:0" if torch.cuda.is_available() else "cpu"
+        pipe = pipeline(
+          "automatic-speech-recognition",
+          model="openai/whisper-large",
+          chunk_length_s=30,
+          device=device,
+        )
     def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
+        #ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        #sample = ds[0]["audio"]
         inputs = data.pop("inputs", data)
         audio_nparray = ffmpeg_read(inputs, sample_rate=SAMPLE_RATE)
+        audio_tensor = torch.from_numpy(audio_nparray)
+        prediction = pipe(audio_nparray, return_timestamps=True)
+        return {"text": prediction[0]["transcription"]}
+        # we can also return timestamps for the predictions
+        #prediction = pipe(sample, return_timestamps=True)["chunks"]
+        #[{'text': ' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.',
+        #  'timestamp': (0.0, 5.44)}]