Add VAD to asr

Files changed:
- README.md (+6 -0)
- fireredasr/data/asr_feat.py (+15 -0)
- fireredasr_axmodel.py (+232 -170)
- fireredasr_onnx.py (+529 -0)
- test_ax_model.py (+45 -76)
- test_wer.py (+115 -113)
README.md CHANGED

@@ -19,6 +19,12 @@ license: apache-2.0
 
 ## Install dependencies
 
+### Audio backend
+
+```
+sudo apt install libsndfile1
+```
+
 ### Python
 
 Tested with Python 3.12; using [Miniconda](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-aarch64.sh) is recommended
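The new code path in `fireredasr_axmodel.py` also imports `silero_vad`, which this README hunk does not cover. A minimal sanity check for the two new dependencies (a sketch; the PyPI package name `silero-vad` is an assumption, not stated in the diff):

```python
# Verify the audio backend and the VAD model load (sketch, not part of the
# commit). libsndfile1 comes from apt; silero-vad is assumed to come from pip.
import torchaudio

torchaudio.set_audio_backend("soundfile")  # raises if libsndfile1 is missing

from silero_vad import load_silero_vad

vad = load_silero_vad()
print("audio backend and Silero VAD OK")
```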
fireredasr/data/asr_feat.py CHANGED

@@ -18,6 +18,7 @@ class ASRFeatExtractor:
         durs = []
         for wav_path in wav_paths:
             sample_rate, wav_np = kaldiio.load_mat(wav_path)
+
             dur = wav_np.shape[0] / sample_rate
             fbank = self.fbank((sample_rate, wav_np))
             if self.cmvn is not None:
@@ -28,6 +29,20 @@
         lengths = torch.tensor([feat.size(0) for feat in feats]).long()
         feats_pad = self.pad_feat(feats, 0.0)
         return feats_pad, lengths, durs
+
+    def run_chunk(self, wav_np, sample_rate):
+        feats = []
+
+        dur = wav_np.shape[0] / sample_rate
+        fbank = self.fbank((sample_rate, wav_np))
+        if self.cmvn is not None:
+            fbank = self.cmvn(fbank)
+        fbank = torch.from_numpy(fbank).float()
+        feats.append(fbank)
+
+        lengths = torch.tensor([feat.size(0) for feat in feats]).long()
+        feats_pad = self.pad_feat(feats, 0.0)
+        return feats_pad, lengths, dur
 
     def pad_feat(self, xs, pad_value):
         # type: (List[Tensor], int) -> Tensor
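`run_chunk` mirrors the path-based `__call__` above but takes an in-memory waveform, which is what the VAD pipeline feeds it chunk by chunk. A minimal usage sketch (the cmvn path is assumed from the test defaults; the int16/16 kHz input convention is taken from the new `transcribe`):

```python
# Sketch: fbank features for one in-memory chunk, as the new transcribe() does
# (it converts each chunk to int16 before calling run_chunk).
import torch
from fireredasr.data.asr_feat import ASRFeatExtractor

extractor = ASRFeatExtractor("axmodel/cmvn.ark")  # path assumed from test args
chunk = torch.zeros(16000, dtype=torch.int16)     # 1 s of silence at 16 kHz
feats_pad, lengths, dur = extractor.run_chunk(chunk, 16000)
print(feats_pad.shape, lengths, dur)              # (1, T, 80) fbank, dur = 1.0
```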
fireredasr_axmodel.py CHANGED

@@ -9,9 +9,19 @@ from torch import Tensor
 from typing import Tuple, List, Dict
 import os
 import time
+import torchaudio
+
+try:
+    torchaudio.set_audio_backend("soundfile")
+except Exception as e:
+    print("Please run apt install libsndfile1 first")
+    raise e
+
+from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
 
 INF = 1e10
 
+
 def to_numpy(tensor):
     if isinstance(tensor, np.ndarray):
         return tensor
@@ -19,12 +29,12 @@ def to_numpy(tensor):
         return tensor.detach().cpu().numpy()
     else:
         return tensor.cpu().numpy()
 
 
 def set_finished_beam_score_to_zero(scores, is_finished):
     NB, B = scores.size()
     is_finished = is_finished.float()
-    mask_score = torch.tensor([0.0] + [-INF]*(B-1)).float()
+    mask_score = torch.tensor([0.0] + [-INF] * (B - 1)).float()
     mask_score = mask_score.view(1, B).repeat(NB, 1)
     return scores * (1 - is_finished) + mask_score * is_finished
 
@@ -36,21 +46,21 @@ def set_finished_beam_y_to_eos(ys, is_finished, eos_id):
 
 class FireRedASRAxModel:
     def __init__(
         self,
         encoder_path: str,
         decoder_loop_path: str,
         cmvn_file: str,
         dict_file: str,
         spm_model_path: str,
-        providers=[
+        providers=["AxEngineExecutionProvider"],
         decode_max_len=128,
-        audio_dur=10
+        audio_dur=10,
     ):
         # NOTE: max decode length, following whisper's setting
         # FireRedASR-AED supports speech up to 60s
         # ref: https://github.com/FireRedTeam/FireRedASR?tab=readme-ov-file#input-length-limitations
         self.decode_max_len = decode_max_len
-
+        self.sample_rate = 16000
         self.decoder_hidden_dim = 1280
         self.audio_dur = audio_dur
         self.max_feat_len = self.calc_feat_len(audio_dur)
@@ -59,47 +69,35 @@ class FireRedASRAxModel:
         self.sos_id = 3
         self.eos_id = 4
         self.pad_id = 2
 
         self.feature_extractor = ASRFeatExtractor(cmvn_file)
         self.tokenizer = ChineseCharEnglishSpmTokenizer(dict_file, spm_model_path)
 
         self.init_encoder(encoder_path, providers)
         self.init_decoder_loop(decoder_loop_path, providers)
         self.pe = self.init_pe(decoder_loop_path)
 
+        self.vad_model = load_silero_vad()
+
     def init_encoder(self, encoder_path, providers=None):
-        self.encoder = axe.InferenceSession(
-            encoder_path,
-            providers=providers
-        )
+        self.encoder = axe.InferenceSession(encoder_path, providers=providers)
 
     def init_decoder_loop(self, decoder_path, providers=None):
-        self.decoder_loop = axe.InferenceSession(
-            decoder_path,
-            providers=providers
-        )
+        self.decoder_loop = axe.InferenceSession(decoder_path, providers=providers)
 
     def init_pe(self, decoder_path):
         decoder_path = os.path.dirname(decoder_path)
         decoder_path = os.path.join(decoder_path, "pe.npy")
 
         return np.load(decoder_path)
 
     def run_encoder(
+        self, input: np.ndarray, input_length: np.ndarray
     ) -> Tuple[Tensor, Tensor, Tensor]:
         n_layer_cross_k, n_layer_cross_v, cross_attn_mask = self.encoder.run(
-            None,
-            {
-                "encoder_input": input,
-                "encoder_input_lengths": input_length
-            }
-        )
-        return (
-            n_layer_cross_k,
-            n_layer_cross_v,
-            cross_attn_mask
-        )
+            None, {"encoder_input": input, "encoder_input_lengths": input_length}
+        )
+        return (n_layer_cross_k, n_layer_cross_v, cross_attn_mask)
 
     def decode_loop_one_token(
         self,
@@ -110,9 +108,13 @@ class FireRedASRAxModel:
         n_layer_cross_v_cache: np.ndarray,
         pe: np.ndarray,
         self_attn_mask: np.ndarray,
-        cross_attn_mask: np.ndarray
+        cross_attn_mask: np.ndarray,
     ) -> Tuple[Tensor, Tensor, Tensor]:
+        (
+            logits,
+            out_n_layer_self_k_cache,
+            out_n_layer_self_v_cache,
+        ) = self.decoder_loop.run(
             None,
             {
                 "tokens": tokens,
@@ -123,52 +125,50 @@ class FireRedASRAxModel:
                 "pe": pe,
                 "self_attn_mask": self_attn_mask,
                 "cross_attn_mask": cross_attn_mask,
-            }
+            },
         )
-        return (
-            logits,
-            out_n_layer_self_k_cache,
-            out_n_layer_self_v_cache
-        )
+        return (logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache)
 
     def run_decoder(
-        self,
-        n_layer_cross_k,
-        n_layer_cross_v,
-        cross_attn_mask,
-        beam_size,
-        nbest
+        self, n_layer_cross_k, n_layer_cross_v, cross_attn_mask, beam_size, nbest
     ):
 
         num_layer, batch_size, Ti, encoder_out_dim = n_layer_cross_k.shape
         encoder_out_length = cross_attn_mask.shape[-1]
 
         cross_attn_mask = torch.from_numpy(cross_attn_mask).to(torch.float32)
+        cross_attn_mask = (
+            cross_attn_mask.unsqueeze(1)
+            .repeat(1, beam_size, 1, 1)
+            .view(beam_size * batch_size, -1, encoder_out_length)
+        )
 
         n_layer_cross_k = torch.from_numpy(n_layer_cross_k)
         n_layer_cross_v = torch.from_numpy(n_layer_cross_v)
+        n_layer_cross_k = (
+            n_layer_cross_k.unsqueeze(2)
+            .repeat(1, 1, beam_size, 1, 1)
+            .view(num_layer, beam_size * batch_size, Ti, encoder_out_dim)
+        )
+        n_layer_cross_v = (
+            n_layer_cross_v.unsqueeze(2)
+            .repeat(1, 1, beam_size, 1, 1)
+            .view(num_layer, beam_size * batch_size, Ti, encoder_out_dim)
+        )
 
+        prediction_tokens = (
+            torch.ones(beam_size * batch_size, 1).fill_(self.sos_id).long()
+        )
         tokens = prediction_tokens
         offset = torch.zeros(1, dtype=torch.int64)
         n_layer_self_k_cache, n_layer_self_v_cache = self.get_initialized_self_cache(
            batch_size, beam_size
         )
 
-        scores = torch.tensor([0.0] + [-INF]*(beam_size - 1)).float()
+        scores = torch.tensor([0.0] + [-INF] * (beam_size - 1)).float()
         scores = scores.repeat(batch_size).view(batch_size * beam_size, 1)
         is_finished = torch.zeros_like(scores)
 
         self_attn_mask = np.zeros((batch_size * beam_size, 1, 1), dtype=np.float32)
 
         for i in range(self.decode_max_len):
@@ -180,95 +180,111 @@ class FireRedASRAxModel:
             n_layer_cross_v = to_numpy(n_layer_cross_v)
             cross_attn_mask = to_numpy(cross_attn_mask)
 
+            self_attn_mask = np.zeros(
+                (batch_size * beam_size, 1, self.decode_max_len), dtype=np.float32
+            )
+            self_attn_mask[:, :, : self.decode_max_len - offset[0] - 1] = -np.inf
+
+            (
+                logits,
+                n_layer_self_k_cache,
+                n_layer_self_v_cache,
+            ) = self.decode_loop_one_token(
+                to_numpy(tokens),
+                to_numpy(n_layer_self_k_cache),
+                to_numpy(n_layer_self_v_cache),
+                to_numpy(n_layer_cross_k),
+                to_numpy(n_layer_cross_v),
+                self.pe[offset],
+                self_attn_mask,
+                to_numpy(cross_attn_mask),
+            )
+
             offset += 1
             logits = torch.from_numpy(logits)
 
             logits = logits.squeeze(1)
             t_scores = F.log_softmax(logits, dim=-1)
             t_topB_scores, t_topB_ys = torch.topk(t_scores, k=beam_size, dim=1)
             t_topB_scores = set_finished_beam_score_to_zero(t_topB_scores, is_finished)
             t_topB_ys = set_finished_beam_y_to_eos(t_topB_ys, is_finished, self.eos_id)
 
             scores = scores + t_topB_scores
 
             scores = scores.view(batch_size, beam_size * beam_size)
             scores, topB_score_ids = torch.topk(scores, k=beam_size, dim=1)
             scores = scores.view(-1, 1)
 
             topB_row_number_in_each_B_rows_of_ys = torch.div(
+                topB_score_ids, beam_size
+            ).view(batch_size * beam_size)
+            stride = beam_size * torch.arange(batch_size).view(batch_size, 1).repeat(
+                1, beam_size
+            ).view(batch_size * beam_size)
+            topB_row_number_in_ys = (
+                topB_row_number_in_each_B_rows_of_ys.long() + stride.long()
+            )
+
             prediction_tokens = prediction_tokens[topB_row_number_in_ys]
             t_ys = torch.gather(
+                t_topB_ys.view(batch_size, beam_size * beam_size),
+                dim=1,
+                index=topB_score_ids,
             ).view(beam_size * batch_size, 1)
 
             tokens = t_ys
 
             prediction_tokens = torch.cat((prediction_tokens, t_ys), dim=1)
 
             n_layer_self_k_cache = torch.from_numpy(n_layer_self_k_cache)
             n_layer_self_v_cache = torch.from_numpy(n_layer_self_v_cache)
 
             for i, self_k_cache in enumerate(n_layer_self_k_cache):
                 n_layer_self_k_cache[i] = n_layer_self_k_cache[i][topB_row_number_in_ys]
 
             for i, self_v_cache in enumerate(n_layer_self_v_cache):
                 n_layer_self_v_cache[i] = n_layer_self_v_cache[i][topB_row_number_in_ys]
 
             is_finished = t_ys.eq(self.eos_id)
             if is_finished.sum().item() == beam_size * batch_size:
                 break
 
         scores = scores.view(batch_size, beam_size)
         prediction_valid_token_lengths = torch.sum(
+            torch.ne(prediction_tokens.view(batch_size, beam_size, -1), self.eos_id),
+            dim=-1,
         ).int()
 
         nbest_scores, nbest_ids = torch.topk(scores, k=nbest, dim=1)
+        index = (
+            nbest_ids + beam_size * torch.arange(batch_size).view(batch_size, 1).long()
+        )
+        nbest_prediction_tokens = prediction_tokens.view(batch_size * beam_size, -1)[
+            index.view(-1)
+        ]
+        nbest_prediction_tokens = nbest_prediction_tokens.view(
+            batch_size, nbest_ids.size(1), -1
+        )
         nbest_prediction_valid_token_lengths = prediction_valid_token_lengths.view(
+            batch_size * beam_size
+        )[index.view(-1)].view(batch_size, -1)
+
+        # batch_size is always 1
+        i_best_hyps: List[Dict[str, torch.Tensor]] = []
+        for j, score in enumerate(nbest_scores[0]):
+            hyp = {
+                "token_ids": nbest_prediction_tokens[
+                    0, j, 1 : nbest_prediction_valid_token_lengths[0, j]
+                ],
+                "score": score,
+            }
+            i_best_hyps.append(hyp)
+
+        return i_best_hyps
+
+    def get_initialized_self_cache(
+        self, batch_size, beam_size
+    ) -> Tuple[Tensor, Tensor]:
         n_layer_self_k_cache = torch.zeros(
             self.num_decoder_blocks,
             batch_size * beam_size,
@@ -282,55 +298,101 @@ class FireRedASRAxModel:
             self.decoder_hidden_dim,
         )
         return n_layer_self_k_cache, n_layer_self_v_cache
 
     def calc_feat_len(self, audio_dur):
         import math
 
-        sample_rate = 16000
+        sample_rate = self.sample_rate
         frame_length = 25 * sample_rate / 1000
         frame_shift = 10 * sample_rate / 1000
         length = math.floor((audio_dur * sample_rate - frame_length) / frame_shift) + 1
         return length
 
+    def collect_chunks(self, wav, speech_timestamps, audio_dur, sample_rate):
+        max_chunk_samples = int(audio_dur * sample_rate)
+        chunks = []
+        for ts in speech_timestamps:
+            start, end = ts["start"], ts["end"]
+            cur_chunk = wav[start:end]
+            if (
+                len(chunks) > 0
+                and chunks[-1].shape[0] + cur_chunk.shape[0] < max_chunk_samples
+            ):
+                chunks[-1] = torch.concat([chunks[-1], cur_chunk], dim=0)
+            else:
+                if cur_chunk.shape[0] > max_chunk_samples:
+                    # greedy split if one chunk is too big
+                    chunks.append(cur_chunk[:max_chunk_samples])
+                    chunks.append(cur_chunk[max_chunk_samples:])
+                else:
+                    chunks.append(cur_chunk)
+        return chunks
+
+    def transcribe(
+        self, batch_wav_path: List[str], beam_size: int = 1, nbest: int = 1
+    ) -> List[Dict]:
+
+        # Run VAD, greedily split audio to fit audio_dur
+        try:
+            wav = read_audio(batch_wav_path[0], sampling_rate=self.sample_rate)
+        except Exception as e:
+            print("Please run apt install libsndfile1 first")
+            raise e
+
+        max_chunk_samples = int(self.sample_rate * self.audio_dur)
+        if wav.shape[0] < max_chunk_samples:
+            chunks = [wav]
+        else:
+            speech_timestamps = get_speech_timestamps(
+                wav,
+                self.vad_model,
+                return_seconds=False,  # keep timestamps as sample indices
+            )
+            chunks = self.collect_chunks(
+                wav, speech_timestamps, self.audio_dur, self.sample_rate
+            )
+            # print(f"Split to {len(chunks)} chunks")
+
+        transcribe_durations = 0
+        wav_durations = []
+        tokens = []
+        for chunk in chunks:
+            chunk = (chunk.clamp(-1, 1) * 32768).to(torch.int16)
+            feats, lengths, wav_duration = self.feature_extractor.run_chunk(
+                chunk, self.sample_rate
+            )
+
+            wav_durations.append(wav_duration)
+
+            if feats.shape[1] < self.max_feat_len:
+                feats = np.concatenate(
+                    [
+                        feats,
+                        np.zeros(
+                            (1, self.max_feat_len - feats.shape[1], 80),
+                            dtype=np.float32,
+                        ),
+                    ],
+                    axis=1,
+                )
+            feats = feats[:, : self.max_feat_len, :]
+            lengths = torch.minimum(lengths, torch.tensor(self.max_feat_len))
+
+            feats = to_numpy(feats)
+            lengths = to_numpy(lengths).astype(np.int32)
+
+            start_time = time.time()
+            n_layer_cross_k, n_layer_cross_v, cross_attn_mask = self.run_encoder(
+                to_numpy(feats), to_numpy(lengths)
+            )
+            # print(f"run encoder take {(time.time() - start_time) * 1000}ms")
+            nbest_hyps = self.run_decoder(
+                n_layer_cross_k, n_layer_cross_v, cross_attn_mask, beam_size, nbest
+            )
+            tokens.extend([int(id) for id in nbest_hyps[0]["token_ids"].cpu()])
+
+            transcribe_durations += time.time() - start_time
+
+        text = self.tokenizer.detokenize(tokens)
+
+        return {"text": text}, wav_durations, transcribe_durations
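The core of this change is the new VAD front end in `transcribe`: audio longer than `audio_dur` (10 s by default, matching the encoder's fixed feature length) is segmented with Silero VAD, and the speech segments are greedily packed into chunks of at most 10 s that are transcribed one by one. A standalone sketch of that chunking step (the wav filename is hypothetical; the oversize-segment split from `collect_chunks` is omitted for brevity):

```python
# Sketch of the VAD chunking added by this commit (mirrors collect_chunks,
# minus the split of a single segment longer than the limit).
import torch
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps

SAMPLE_RATE, AUDIO_DUR = 16000, 10        # defaults from FireRedASRAxModel
limit = SAMPLE_RATE * AUDIO_DUR

vad = load_silero_vad()
wav = read_audio("test.wav", sampling_rate=SAMPLE_RATE)  # hypothetical file

if wav.shape[0] < limit:
    chunks = [wav]                        # short audio bypasses the VAD
else:
    ts = get_speech_timestamps(wav, vad, return_seconds=False)  # sample indices
    chunks = []
    for t in ts:                          # greedily pack speech segments
        seg = wav[t["start"] : t["end"]]
        if chunks and chunks[-1].shape[0] + seg.shape[0] < limit:
            chunks[-1] = torch.concat([chunks[-1], seg], dim=0)
        else:
            chunks.append(seg)
print(f"split into {len(chunks)} chunks of at most {AUDIO_DUR} s")
```

Packing segments back-to-back drops the silence between them, so each fixed-length encoder pass carries more speech; the trade-off is that token positions can no longer be mapped back to timestamps in the original audio.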
fireredasr_onnx.py ADDED

@@ -0,0 +1,529 @@
+from fireredasr.data.asr_feat import ASRFeatExtractor
+from fireredasr.tokenizer.aed_tokenizer import ChineseCharEnglishSpmTokenizer
+
+import onnxruntime as ort
+import torch
+import torch.nn.functional as F
+import numpy as np
+from torch import Tensor
+from typing import Tuple, List, Dict
+import argparse
+import os
+import time
+import logging
+
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+logger_stream_hander = logging.StreamHandler()
+logger_stream_hander.setLevel("INFO")
+logger.addHandler(logger_stream_hander)
+
+
+INF = 1e10
+
+
+def to_numpy(tensor):
+    if isinstance(tensor, np.ndarray):
+        return tensor
+    if tensor.requires_grad:
+        return tensor.detach().cpu().numpy()
+    else:
+        return tensor.cpu().numpy()
+
+
+def set_finished_beam_score_to_zero(scores, is_finished):
+    NB, B = scores.size()
+    is_finished = is_finished.float()
+    mask_score = torch.tensor([0.0] + [-INF] * (B - 1)).float()
+    mask_score = mask_score.view(1, B).repeat(NB, 1)
+    return scores * (1 - is_finished) + mask_score * is_finished
+
+
+def set_finished_beam_y_to_eos(ys, is_finished, eos_id):
+    is_finished = is_finished.long()
+    return ys * (1 - is_finished) + eos_id * is_finished
+
+
+class FireRedASROnnxModel:
+    def __init__(
+        self,
+        encoder_path: str,
+        decoder_path: str,
+        cmvn_file: str,
+        dict_file: str,
+        spm_model_path: str,
+        providers=["CUDAExecutionProvider"],
+        decode_max_len=128,
+        audio_dur=10,
+    ):
+        session_opts = ort.SessionOptions()
+        session_opts.inter_op_num_threads = 1
+        session_opts.intra_op_num_threads = 1
+        # session_opts.log_severity_level = 1
+        self.session_opts = session_opts
+
+        # NOTE: max decode length, following whisper's setting
+        # FireRedASR-AED supports speech up to 60s
+        # ref: https://github.com/FireRedTeam/FireRedASR?tab=readme-ov-file#input-length-limitations
+        self.decode_max_len = decode_max_len
+
+        self.decoder_hidden_dim = 1280
+        self.num_decoder_blocks = 16
+        self.blank_id = 0
+        self.sos_id = 3
+        self.eos_id = 4
+        self.pad_id = 2
+
+        self.feature_extractor = ASRFeatExtractor(cmvn_file)
+        self.tokenizer = ChineseCharEnglishSpmTokenizer(dict_file, spm_model_path)
+        self.encoder = None
+        self.decoder = None
+        self.audio_dur = audio_dur
+
+        self.init_encoder(encoder_path, providers)
+        self.init_decoder_main(decoder_path, providers)
+        self.init_decoder_loop(decoder_path, providers)
+        self.pe = self.init_pe(decoder_path)
+
+    def init_encoder(self, encoder_path, providers=None):
+        start_time = time.time()
+        self.encoder = ort.InferenceSession(
+            encoder_path, sess_options=self.session_opts, providers=providers
+        )
+        end_time = time.time()
+        logger.info(f"load encoder cost {end_time - start_time} seconds")
+
+    def init_decoder(self, decoder_path, providers=None):
+        start_time = time.time()
+        self.decoder = ort.InferenceSession(
+            decoder_path, sess_options=self.session_opts, providers=providers
+        )
+        end_time = time.time()
+        logger.info(f"load decoder cost {end_time - start_time} seconds")
+
+    def init_decoder_main(self, decoder_path, providers=None):
+        decoder_path = os.path.dirname(decoder_path)
+        decoder_path = os.path.join(decoder_path, "decoder_main.onnx")
+        start_time = time.time()
+        self.decoder_main = ort.InferenceSession(
+            decoder_path, sess_options=self.session_opts, providers=providers
+        )
+        end_time = time.time()
+        logger.info(f"load decoder_main cost {end_time - start_time} seconds")
+
+        input_names = [i.name for i in self.decoder_main.get_inputs()]
+        print(f"decoder_main.input_names: {input_names}")
+
+    def init_decoder_loop(self, decoder_path, providers=None):
+        decoder_path = os.path.dirname(decoder_path)
+        decoder_path = os.path.join(decoder_path, "decoder_loop.onnx")
+
+        start_time = time.time()
+        self.decoder_loop = ort.InferenceSession(
+            decoder_path, sess_options=self.session_opts, providers=providers
+        )
+        end_time = time.time()
+        logger.info(f"load decoder_loop cost {end_time - start_time} seconds")
+
+        input_names = [i.name for i in self.decoder_loop.get_inputs()]
+        print(f"decoder_loop.input_names: {input_names}")
+
+    def init_pe(self, decoder_path):
+        decoder_path = os.path.dirname(decoder_path)
+        decoder_path = os.path.join(decoder_path, "pe.npy")
+
+        return np.load(decoder_path)
+
+    def run_encoder(
+        self, input: np.ndarray, input_length: np.ndarray
+    ) -> Tuple[Tensor, Tensor, Tensor]:
+        n_layer_cross_k, n_layer_cross_v, cross_attn_mask = self.encoder.run(
+            None,
+            {
+                self.encoder.get_inputs()[0].name: input,
+                self.encoder.get_inputs()[1].name: input_length,
+            },
+        )
+        return (n_layer_cross_k, n_layer_cross_v, cross_attn_mask)
+
+    def decode_one_token(
+        self,
+        tokens: np.ndarray,
+        n_layer_self_k_cache: np.ndarray,
+        n_layer_self_v_cache: np.ndarray,
+        n_layer_cross_k_cache: np.ndarray,
+        n_layer_cross_v_cache: np.ndarray,
+        offset: np.ndarray,
+        self_attn_mask: np.ndarray,
+        cross_attn_mask: np.ndarray,
+    ) -> Tuple[Tensor, Tensor, Tensor]:
+        # print("decode:")
+        # print(f"tokens.shape: {tokens.shape}")
+        # print(f"n_layer_self_k_cache.shape: {n_layer_self_k_cache.shape}")
+        # print(f"n_layer_self_v_cache.shape: {n_layer_self_v_cache.shape}")
+        # print(f"n_layer_cross_k_cache.shape: {n_layer_cross_k_cache.shape}")
+        # print(f"n_layer_cross_v_cache.shape: {n_layer_cross_v_cache.shape}")
+        # print(f"offset.shape: {offset.shape}")
+        # print(f"self_attn_mask.shape: {self_attn_mask.shape}")
+        # print(f"cross_attn_mask.shape: {cross_attn_mask.shape}")
+        # print(f"self_attn_mask: {self_attn_mask}")
+
+        logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache = self.decoder.run(
+            None,
+            {
+                self.decoder.get_inputs()[0].name: tokens,
+                self.decoder.get_inputs()[1].name: n_layer_self_k_cache,
+                self.decoder.get_inputs()[2].name: n_layer_self_v_cache,
+                self.decoder.get_inputs()[3].name: n_layer_cross_k_cache,
+                self.decoder.get_inputs()[4].name: n_layer_cross_v_cache,
+                self.decoder.get_inputs()[5].name: offset,
+                self.decoder.get_inputs()[6].name: self_attn_mask,
+                self.decoder.get_inputs()[7].name: cross_attn_mask,
+            },
+        )
+        return (logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache)
+
+    def decode_main_one_token(
+        self,
+        tokens: np.ndarray,
+        n_layer_self_k_cache: np.ndarray,
+        n_layer_self_v_cache: np.ndarray,
+        n_layer_cross_k_cache: np.ndarray,
+        n_layer_cross_v_cache: np.ndarray,
+        pe: np.ndarray,
+        self_attn_mask: np.ndarray,
+        cross_attn_mask: np.ndarray,
+    ) -> Tuple[Tensor, Tensor, Tensor]:
+        # print("decode_main:")
+        # print(f"tokens.shape: {tokens.shape}")
+        # print(f"n_layer_self_k_cache.shape: {n_layer_self_k_cache.shape}")
+        # print(f"n_layer_self_v_cache.shape: {n_layer_self_v_cache.shape}")
+        # print(f"n_layer_cross_k_cache.shape: {n_layer_cross_k_cache.shape}")
+        # print(f"n_layer_cross_v_cache.shape: {n_layer_cross_v_cache.shape}")
+        # print(f"pe.shape: {pe.shape}")
+        # print(f"self_attn_mask.shape: {self_attn_mask.shape}")
+        # print(f"cross_attn_mask.shape: {cross_attn_mask.shape}")
+
+        (
+            logits,
+            out_n_layer_self_k_cache,
+            out_n_layer_self_v_cache,
+        ) = self.decoder_main.run(
+            None,
+            {
+                self.decoder_main.get_inputs()[0].name: tokens,
+                # self.decoder_main.get_inputs()[1].name: n_layer_self_k_cache,
+                self.decoder_main.get_inputs()[1].name: n_layer_cross_k_cache,
+                self.decoder_main.get_inputs()[2].name: n_layer_cross_v_cache,
+                # self.decoder_main.get_inputs()[3].name: pe,
+                # self.decoder_main.get_inputs()[4].name: self_attn_mask,
+                self.decoder_main.get_inputs()[3].name: cross_attn_mask,
+                # self.decoder_main.get_inputs()[7].name: cross_attn_mask,
+            },
+        )
+        return (logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache)
+
+    def decode_loop_one_token(
+        self,
+        tokens: np.ndarray,
+        n_layer_self_k_cache: np.ndarray,
+        n_layer_self_v_cache: np.ndarray,
+        n_layer_cross_k_cache: np.ndarray,
+        n_layer_cross_v_cache: np.ndarray,
+        pe: np.ndarray,
+        self_attn_mask: np.ndarray,
+        cross_attn_mask: np.ndarray,
+    ) -> Tuple[Tensor, Tensor, Tensor]:
+        # print("decode_loop:")
+        # print(f"tokens.shape: {tokens.shape}")
+        # print(f"n_layer_self_k_cache.shape: {n_layer_self_k_cache.shape}")
+        # print(f"n_layer_self_v_cache.shape: {n_layer_self_v_cache.shape}")
+        # print(f"n_layer_cross_k_cache.shape: {n_layer_cross_k_cache.shape}")
+        # print(f"n_layer_cross_v_cache.shape: {n_layer_cross_v_cache.shape}")
+        # print(f"pe.shape: {pe.shape}")
+        # print(f"self_attn_mask.shape: {self_attn_mask.shape}")
+        # print(f"cross_attn_mask.shape: {cross_attn_mask.shape}")
+
+        (
+            logits,
+            out_n_layer_self_k_cache,
+            out_n_layer_self_v_cache,
+        ) = self.decoder_loop.run(
+            None,
+            {
+                self.decoder_loop.get_inputs()[0].name: tokens,
+                self.decoder_loop.get_inputs()[1].name: n_layer_self_k_cache,
+                self.decoder_loop.get_inputs()[2].name: n_layer_self_v_cache,
+                self.decoder_loop.get_inputs()[3].name: n_layer_cross_k_cache,
+                self.decoder_loop.get_inputs()[4].name: n_layer_cross_v_cache,
+                self.decoder_loop.get_inputs()[5].name: pe,
+                self.decoder_loop.get_inputs()[6].name: self_attn_mask,
+                self.decoder_loop.get_inputs()[7].name: cross_attn_mask,
+            },
+        )
+        return (logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache)
+
+    def run_decoder(
+        self, n_layer_cross_k, n_layer_cross_v, cross_attn_mask, beam_size, nbest
+    ):
+
+        num_layer, batch_size, Ti, encoder_out_dim = n_layer_cross_k.shape
+        encoder_out_length = cross_attn_mask.shape[-1]
+
+        cross_attn_mask = torch.from_numpy(cross_attn_mask).to(torch.float32)
+        cross_attn_mask = (
+            cross_attn_mask.unsqueeze(1)
+            .repeat(1, beam_size, 1, 1)
+            .view(beam_size * batch_size, -1, encoder_out_length)
+        )
+
+        n_layer_cross_k = torch.from_numpy(n_layer_cross_k)
+        n_layer_cross_v = torch.from_numpy(n_layer_cross_v)
+        n_layer_cross_k = (
+            n_layer_cross_k.unsqueeze(2)
+            .repeat(1, 1, beam_size, 1, 1)
+            .view(num_layer, beam_size * batch_size, Ti, encoder_out_dim)
+        )
+        n_layer_cross_v = (
+            n_layer_cross_v.unsqueeze(2)
+            .repeat(1, 1, beam_size, 1, 1)
+            .view(num_layer, beam_size * batch_size, Ti, encoder_out_dim)
+        )
+
+        prediction_tokens = (
+            torch.ones(beam_size * batch_size, 1).fill_(self.sos_id).long()
+        )
+        tokens = prediction_tokens
+        offset = torch.zeros(1, dtype=torch.int64)
+        n_layer_self_k_cache, n_layer_self_v_cache = self.get_initialized_self_cache(
+            batch_size, beam_size
+        )
+
+        scores = torch.tensor([0.0] + [-INF] * (beam_size - 1)).float()
+        scores = scores.repeat(batch_size).view(batch_size * beam_size, 1)
+        is_finished = torch.zeros_like(scores)
+
+        # self_attn_mask = torch.zeros(
+        #     batch_size * beam_size,
+        #     1, 1
+        # )
+
+        results = [self.sos_id]
+        for i in range(self.decode_max_len):
+
+            # ==== ORIGIN ====
+            # self_attn_mask = torch.empty(
+            #     batch_size * beam_size,
+            #     prediction_tokens.shape[-1], prediction_tokens.shape[-1]
+            # ).fill_(-np.inf).triu_(1)
+            # self_attn_mask = self_attn_mask[:, -1:, :]
+            # self_attn_mask = to_numpy(self_attn_mask)
+
+            # logits, n_layer_self_k_cache, n_layer_self_v_cache = self.decode_one_token(
+            #     to_numpy(tokens),
+            #     to_numpy(n_layer_self_k_cache),
+            #     to_numpy(n_layer_self_v_cache),
+            #     to_numpy(n_layer_cross_k),
+            #     to_numpy(n_layer_cross_v),
+            #     to_numpy(offset),
+            #     to_numpy(self_attn_mask),
+            #     to_numpy(cross_attn_mask)
+            # )
+            # ==== ORIGIN ====
+
+            tokens = to_numpy(tokens)
+            n_layer_self_k_cache = to_numpy(n_layer_self_k_cache)
+            n_layer_self_v_cache = to_numpy(n_layer_self_v_cache)
+            n_layer_cross_k = to_numpy(n_layer_cross_k)
+            n_layer_cross_v = to_numpy(n_layer_cross_v)
+            cross_attn_mask = to_numpy(cross_attn_mask)
+
+            self_attn_mask = np.zeros(
+                (batch_size * beam_size, 1, self.decode_max_len), dtype=np.float32
+            )
+            self_attn_mask[:, :, : self.decode_max_len - offset[0] - 1] = -np.inf
+
+            if i == 0:
+                (
+                    logits,
+                    n_layer_self_k_cache,
+                    n_layer_self_v_cache,
+                ) = self.decode_main_one_token(
+                    to_numpy(tokens),
+                    to_numpy(n_layer_self_k_cache),
+                    to_numpy(n_layer_self_v_cache),
+                    to_numpy(n_layer_cross_k),
+                    to_numpy(n_layer_cross_v),
+                    self.pe[0],
+                    self_attn_mask,
+                    to_numpy(cross_attn_mask),
+                )
+            else:
+                (
+                    logits,
+                    n_layer_self_k_cache,
+                    n_layer_self_v_cache,
+                ) = self.decode_loop_one_token(
+                    to_numpy(tokens),
+                    to_numpy(n_layer_self_k_cache),
+                    to_numpy(n_layer_self_v_cache),
+                    to_numpy(n_layer_cross_k),
+                    to_numpy(n_layer_cross_v),
+                    self.pe[offset],
+                    self_attn_mask,
+                    to_numpy(cross_attn_mask),
+                )
+
+            # logits, n_layer_self_k_cache, n_layer_self_v_cache = self.decode_loop_one_token(
+            #     to_numpy(tokens),
+            #     to_numpy(n_layer_self_k_cache),
+            #     to_numpy(n_layer_self_v_cache),
+            #     to_numpy(n_layer_cross_k),
+            #     to_numpy(n_layer_cross_v),
+            #     self.pe[offset],
+            #     self_attn_mask,
+            #     to_numpy(cross_attn_mask)
+            # )
+
+            offset += 1
+            logits = torch.from_numpy(logits)
+
+            logits = logits.squeeze(1)
+            t_scores = F.log_softmax(logits, dim=-1)
+            t_topB_scores, t_topB_ys = torch.topk(t_scores, k=beam_size, dim=1)
+            t_topB_scores = set_finished_beam_score_to_zero(t_topB_scores, is_finished)
+            t_topB_ys = set_finished_beam_y_to_eos(t_topB_ys, is_finished, self.eos_id)
+
+            scores = scores + t_topB_scores
+
+            scores = scores.view(batch_size, beam_size * beam_size)
+            scores, topB_score_ids = torch.topk(scores, k=beam_size, dim=1)
+            scores = scores.view(-1, 1)
+
+            topB_row_number_in_each_B_rows_of_ys = torch.div(
+                topB_score_ids, beam_size
+            ).view(batch_size * beam_size)
+            stride = beam_size * torch.arange(batch_size).view(batch_size, 1).repeat(
+                1, beam_size
+            ).view(batch_size * beam_size)
+            topB_row_number_in_ys = (
+                topB_row_number_in_each_B_rows_of_ys.long() + stride.long()
+            )
+
+            prediction_tokens = prediction_tokens[topB_row_number_in_ys]
+            t_ys = torch.gather(
+                t_topB_ys.view(batch_size, beam_size * beam_size),
+                dim=1,
+                index=topB_score_ids,
+            ).view(beam_size * batch_size, 1)
+
+            tokens = t_ys
+
+            prediction_tokens = torch.cat((prediction_tokens, t_ys), dim=1)
+
+            n_layer_self_k_cache = torch.from_numpy(n_layer_self_k_cache)
+            n_layer_self_v_cache = torch.from_numpy(n_layer_self_v_cache)
+
+            for i, self_k_cache in enumerate(n_layer_self_k_cache):
+                n_layer_self_k_cache[i] = n_layer_self_k_cache[i][topB_row_number_in_ys]
+
+            for i, self_v_cache in enumerate(n_layer_self_v_cache):
+                n_layer_self_v_cache[i] = n_layer_self_v_cache[i][topB_row_number_in_ys]
+
+            is_finished = t_ys.eq(self.eos_id)
+            if is_finished.sum().item() == beam_size * batch_size:
+                break
+
+        scores = scores.view(batch_size, beam_size)
+        prediction_valid_token_lengths = torch.sum(
+            torch.ne(prediction_tokens.view(batch_size, beam_size, -1), self.eos_id),
+            dim=-1,
+        ).int()
+
+        nbest_scores, nbest_ids = torch.topk(scores, k=nbest, dim=1)
+        index = (
+            nbest_ids + beam_size * torch.arange(batch_size).view(batch_size, 1).long()
+        )
+        nbest_prediction_tokens = prediction_tokens.view(batch_size * beam_size, -1)[
+            index.view(-1)
+        ]
+        nbest_prediction_tokens = nbest_prediction_tokens.view(
+            batch_size, nbest_ids.size(1), -1
+        )
+        nbest_prediction_valid_token_lengths = prediction_valid_token_lengths.view(
+            batch_size * beam_size
+        )[index.view(-1)].view(batch_size, -1)
+        nbest_hyps: List[List[Dict[str, torch.Tensor]]] = []
+        for i in range(batch_size):
+            i_best_hyps: List[Dict[str, torch.Tensor]] = []
+            for j, score in enumerate(nbest_scores[i]):
+                hyp = {
+                    "token_ids": nbest_prediction_tokens[
+                        i, j, 1 : nbest_prediction_valid_token_lengths[i, j]
+                    ],
+                    "score": score,
+                }
+                i_best_hyps.append(hyp)
+            nbest_hyps.append(i_best_hyps)
+
+        return nbest_hyps
+
+    def get_initialized_self_cache(
+        self, batch_size, beam_size
+    ) -> Tuple[Tensor, Tensor]:
+        n_layer_self_k_cache = torch.zeros(
+            self.num_decoder_blocks,
+            batch_size * beam_size,
+            self.decode_max_len,
+            self.decoder_hidden_dim,
+        )
+        n_layer_self_v_cache = torch.zeros(
+            self.num_decoder_blocks,
+            batch_size * beam_size,
+            self.decode_max_len,
+            self.decoder_hidden_dim,
+        )
+        return n_layer_self_k_cache, n_layer_self_v_cache
+
+    def calc_feat_len(self, audio_dur):
+        import math
+
+        sample_rate = 16000
+        frame_length = 25 * sample_rate / 1000
+        frame_shift = 10 * sample_rate / 1000
+        length = math.floor((audio_dur * sample_rate - frame_length) / frame_shift) + 1
+        return length
+
+    def transcribe(
+        self, batch_wav_path: List[str], beam_size: int = 1, nbest: int = 1
+    ) -> List[Dict]:
+        feats, lengths, wav_durations = self.feature_extractor(batch_wav_path)
+        maxlen = self.calc_feat_len(self.audio_dur)
+        if feats.shape[1] < maxlen:
+            feats = np.concatenate(
+                [feats, np.zeros((1, maxlen - feats.shape[1], 80), dtype=np.float32)],
+                axis=1,
+            )
+        feats = feats[:, :maxlen, :]
+        lengths = torch.minimum(lengths, torch.tensor(maxlen))
+
+        feats = to_numpy(feats)
+        lengths = to_numpy(lengths)
+
+        start_time = time.time()
+        n_layer_cross_k, n_layer_cross_v, cross_attn_mask = self.run_encoder(
+            to_numpy(feats), to_numpy(lengths)
+        )
+        nbest_hyps = self.run_decoder(
+            n_layer_cross_k, n_layer_cross_v, cross_attn_mask, beam_size, nbest
+        )
+        transcribe_durations = time.time() - start_time
+        results: List[Dict] = []
+        for wav, hyp in zip(batch_wav_path, nbest_hyps):
+            hyp = hyp[0]
+            hyp_ids = [int(id) for id in hyp["token_ids"].cpu()]
+            score = hyp["score"].item()
+            text = self.tokenizer.detokenize(hyp_ids)
+            results.append({"wav": wav, "text": text, "score": score})
+
+        return results, wav_durations, transcribe_durations
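`fireredasr_onnx.py` is the CPU/GPU reference implementation of the same two-stage decoder (a prefill-style `decoder_main` for the first token, then `decoder_loop` with self-attention KV caches). A usage sketch (file names are assumptions inferred from the loaders, which derive `decoder_main.onnx`, `decoder_loop.onnx`, and `pe.npy` from the directory of the decoder path):

```python
# Sketch: run the ONNX pipeline end to end (paths are assumed, not from the diff).
from fireredasr_onnx import FireRedASROnnxModel

model = FireRedASROnnxModel(
    encoder_path="onnx/encoder.onnx",
    decoder_path="onnx/decoder_loop.onnx",  # decoder_main.onnx / pe.npy live beside it
    cmvn_file="onnx/cmvn.ark",
    dict_file="onnx/dict.txt",
    spm_model_path="onnx/train_bpe1000.model",
    providers=["CPUExecutionProvider"],     # or CUDAExecutionProvider (the default)
)
results, wav_durations, elapsed = model.transcribe(["test.wav"], beam_size=3, nbest=1)
print(results[0]["text"], results[0]["score"])
```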
test_ax_model.py
CHANGED
|
@@ -11,79 +11,47 @@ logger_stream_hander = logging.StreamHandler()
|
|
| 11 |
logger_stream_hander.setLevel("INFO")
|
| 12 |
logger.addHandler(logger_stream_hander)
|
| 13 |
|
| 14 |
-
|
| 15 |
def parse_args():
|
| 16 |
parser = argparse.ArgumentParser(description="FireRedASRAxModel Test")
|
| 17 |
parser.add_argument(
|
| 18 |
-
"--encoder",
|
| 19 |
-
type=str,
|
| 20 |
default="axmodel/encoder.axmodel",
|
| 21 |
-
help="Path to axmodel encoder"
|
| 22 |
)
|
| 23 |
parser.add_argument(
|
| 24 |
-
"--decoder_loop",
|
| 25 |
-
type=str,
|
| 26 |
default="axmodel/decoder_loop.axmodel",
|
| 27 |
-
help="Path to axmodel decoder loop"
|
| 28 |
)
|
| 29 |
parser.add_argument(
|
| 30 |
-
"--cmvn",
|
| 31 |
-
type=str,
|
| 32 |
-
default="axmodel/cmvn.ark",
|
| 33 |
-
help="Path to cmvn"
|
| 34 |
)
|
| 35 |
parser.add_argument(
|
| 36 |
-
"--dict",
|
| 37 |
-
type=str,
|
| 38 |
-
default="axmodel/dict.txt",
|
| 39 |
-
help="Path to dict"
|
| 40 |
)
|
| 41 |
parser.add_argument(
|
| 42 |
"--spm_model",
|
| 43 |
type=str,
|
| 44 |
default="axmodel/train_bpe1000.model",
|
| 45 |
-
help="Path to spm model"
|
| 46 |
-
)
|
| 47 |
-
parser.add_argument(
|
| 48 |
-
"--wavlist",
|
| 49 |
-
type=str,
|
| 50 |
-
default="wavlist.txt",
|
| 51 |
-
help="File to wav path list"
|
| 52 |
-
)
|
| 53 |
-
parser.add_argument(
|
| 54 |
-
"--hypo",
|
| 55 |
-
type=str,
|
| 56 |
-
default="hypo_axmodel.txt",
|
| 57 |
-
help="File of hypos"
|
| 58 |
-
)
|
| 59 |
-
parser.add_argument(
|
| 60 |
-
"--beam_size",
|
| 61 |
-
type=int,
|
| 62 |
-
default=3,
|
| 63 |
-
help=""
|
| 64 |
-
)
|
| 65 |
-
parser.add_argument(
|
| 66 |
-
"--nbest",
|
| 67 |
-
type=int,
|
| 68 |
-
default=1,
|
| 69 |
-
help=""
|
| 70 |
)
|
| 71 |
parser.add_argument(
|
| 72 |
-
"--
|
| 73 |
-
type=int,
|
| 74 |
-
default=128,
|
| 75 |
-
help="max token len"
|
| 76 |
)
|
| 77 |
parser.add_argument(
|
| 78 |
-
"--
|
| 79 |
-
type=int,
|
| 80 |
-
default=10,
|
| 81 |
-
help="max audio len"
|
| 82 |
)
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
return parser.parse_args()


 def parse_wavlist(wavlist: str):
     wavpaths = []
     with open(wavlist) as f:

@@ -93,24 +61,24 @@ def parse_wavlist(wavlist: str):
             print(f"{line} doesn't exist.")
             continue
         wavpaths.append(line)

     return wavpaths


 def main():
     args = parse_args()
     print(args)
-    model = FireRedASRAxModel(
-        …
+
+    model = FireRedASRAxModel(
+        args.encoder,
+        args.decoder_loop,
+        args.cmvn,
+        args.dict,
+        args.spm_model,
+        decode_max_len=args.decode_max_len,
+        audio_dur=args.max_dur,
+    )
+
     wf = open(args.hypo, "wt")
     wavlist = parse_wavlist(args.wavlist)

@@ -118,9 +86,10 @@ def main():
     total_transcribe_durations = 0
     for wav in wavlist:
         batch_wav = [wav]
-        …
-            batch_wav, args.beam_size, args.nbest
-        …
+        result, wav_durations, transcribe_durations = model.transcribe(
+            batch_wav, args.beam_size, args.nbest
+        )
+
         wav_durations = sum(wav_durations)
         total_wav_durations += wav_durations
         total_transcribe_durations += transcribe_durations

@@ -129,19 +98,19 @@ def main():
     logger.info(f"Transcribe Durations: {transcribe_durations}")
     rtf = transcribe_durations / wav_durations
     logger.info(f"(Real time factor) RTF: {rtf}")
-    …
+
+    text = result["text"]
+    logger.info(f"text: {text}")
+    logger.info("")
+    wf.write(f"{text}\n")
+
     logger.info(f"total wav durations: {total_wav_durations}")
     logger.info(f"total transcribe durations: {total_transcribe_durations}")
     avg_ref = total_transcribe_durations / total_wav_durations
     logger.info(f"AVG RTF: {avg_ref}")
+
     wf.close()

+
 if __name__ == "__main__":
     main()
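
The rewritten loop treats `model.transcribe` as returning a `(result, wav_durations, transcribe_durations)` triple, with the hypothesis under `result["text"]`, and computes RTF as processing time over audio time. A minimal sketch of that contract (`example.wav` is a placeholder path, not a file shipped with the repo):

```
# One-file version of the loop above; arguments are positional as in the diff.
result, wav_durations, transcribe_durations = model.transcribe(["example.wav"], 3, 1)
rtf = transcribe_durations / sum(wav_durations)  # < 1.0 means faster than real time
print(result["text"], rtf)
```
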
test_wer.py
CHANGED
@@ -10,57 +10,57 @@ def setup_logging():
     # Get the directory this script lives in
     script_dir = os.path.dirname(os.path.abspath(__file__))
     log_file = os.path.join(script_dir, "test_wer.log")

     # Configure the log format
-    log_format = …
-    date_format = …
+    log_format = "%(asctime)s - %(levelname)s - %(message)s"
+    date_format = "%Y-%m-%d %H:%M:%S"

     # Create the logger
     logger = logging.getLogger()
     logger.setLevel(logging.INFO)

     # Remove any existing handlers
     for handler in logger.handlers[:]:
         logger.removeHandler(handler)

     # Create the file handler
-    file_handler = logging.FileHandler(log_file, mode=…
+    file_handler = logging.FileHandler(log_file, mode="a", encoding="utf-8")
     file_handler.setLevel(logging.INFO)
     file_formatter = logging.Formatter(log_format, date_format)
     file_handler.setFormatter(file_formatter)

     # Create the console handler
     console_handler = logging.StreamHandler()
     console_handler.setLevel(logging.INFO)
     console_formatter = logging.Formatter(log_format, date_format)
     console_handler.setFormatter(console_formatter)

     # Attach the handlers to the logger
     logger.addHandler(file_handler)
     logger.addHandler(console_handler)

     return logger


 class AIShellDataset:
-    def __init__(self, gt_path: str, voice_dir=…
+    def __init__(self, gt_path: str, voice_dir="wav"):
         """
         Initialize the dataset

         Args:
             json_path: path to the voice.json file
         """
         self.gt_path = gt_path
         self.dataset_dir = os.path.dirname(gt_path)
         self.voice_dir = os.path.join(self.dataset_dir, voice_dir)

         # Check that the required file and directory exist
         assert os.path.exists(gt_path), f"gt file does not exist: {gt_path}"
         assert os.path.exists(self.voice_dir), f"directory does not exist: {self.voice_dir}"

         # Load the data
         self.data = []
-        with open(gt_path, …
+        with open(gt_path, "r", encoding="utf-8") as f:
             for line in f:
                 line = line.strip()
                 audio_path, gt = line.split(" ")

@@ -70,50 +70,50 @@ class AIShellDataset:
         # Use logging instead of print
         logger = logging.getLogger()
         logger.info(f"Loaded {len(self.data)} entries")

     def __iter__(self):
         """Return the iterator"""
         self.index = 0
         return self

     def __next__(self):
         """Return the next data item"""
         if self.index >= len(self.data):
             raise StopIteration

         item = self.data[self.index]
         audio_path = item["audio_path"]
         ground_truth = item["gt"]

         self.index += 1
         return audio_path, ground_truth

     def __len__(self):
         """Return the dataset size"""
         return len(self.data)


 class CommonVoiceDataset:
     """Common Voice dataset parser"""

     def __init__(self, tsv_path: str):
         """
         Initialize the dataset

         Args:
             json_path: path to the voice.json file
         """
         self.tsv_path = tsv_path
         self.dataset_dir = os.path.dirname(tsv_path)
         self.voice_dir = os.path.join(self.dataset_dir, "clips")

         # Check that the required file and directory exist
         assert os.path.exists(tsv_path), f"{tsv_path} file does not exist: {tsv_path}"
         assert os.path.exists(self.voice_dir), f"voice directory does not exist: {self.voice_dir}"

         # Load the JSON data
         self.data = []
-        with open(tsv_path, …
+        with open(tsv_path, "r", encoding="utf-8") as f:
             f.readline()
             for line in f:
                 line = line.strip()

@@ -122,107 +122,100 @@ class CommonVoiceDataset:
                 gt = splits[2]
                 audio_path = os.path.join(self.voice_dir, audio_path)
                 self.data.append({"audio_path": audio_path, "gt": gt})

         # Use logging instead of print
         logger = logging.getLogger()
         logger.info(f"Loaded {len(self.data)} entries")

     def __iter__(self):
         """Return the iterator"""
         self.index = 0
         return self

     def __next__(self):
         """Return the next data item"""
         if self.index >= len(self.data):
             raise StopIteration

         item = self.data[self.index]
         audio_path = item["audio_path"]
         ground_truth = item["gt"]

         self.index += 1
         return audio_path, ground_truth

     def __len__(self):
         """Return the dataset size"""
         return len(self.data)


 def get_args():
-    parser = argparse.ArgumentParser(
-        prog="whisper",
-        description="Test WER on dataset"
-    )
-    parser.add_argument("--dataset", "-d", type=str, required=True, choices=["aishell", "common_voice"], help="Test dataset")
-    parser.add_argument("--gt_path", "-g", type=str, required=True, help="Test dataset ground truth file")
-    parser.add_argument("--max_num", type=int, default=-1, required=False, help="Maximum test data num")
-    parser.add_argument("--language", "-l", type=str, required=False, default="zh", help="Target language, support en, zh, ja, and others. See languages.py for more options.")
-    parser.add_argument(
-        "--…
-    )
-    parser.add_argument(
-        "--…
-    )
-    parser.add_argument(
-        "--…
-        type=str,
-        default="axmodel/decoder_loop.axmodel",
-        help="Path to axmodel decoder loop"
-    )
-    parser.add_argument(
-        "--…
-        type=str,
-        …
-    )
-    parser.add_argument(
-        "--…
-        type=str,
-        default="axmodel/…
-        help="Path to …
-    )
-    parser.add_argument(
-        "--…
-        type=str,
-        default="axmodel/…
-        help="Path to …
-    )
-    parser.add_argument(
-        "--…
-        type=str,
-        default="…
-        help="…
-    )
-    parser.add_argument(
-        "--…
-        type=str,
-        default="hypo_axmodel.txt",
-        help="File of hypos"
-    )
-    parser.add_argument(
-        "--…
-        type=int,
-        default=3,
-        help=""
-    )
-    parser.add_argument(
-        "--…
-        type=…
-        default=…
-        help=""
-    )
-    parser.add_argument(
-        "--…
-        type=int,
-        default=128,
-        help=""
-    )
+    parser = argparse.ArgumentParser(prog="whisper", description="Test WER on dataset")
+    parser.add_argument(
+        "--dataset",
+        "-d",
+        type=str,
+        required=True,
+        choices=["aishell", "common_voice"],
+        help="Test dataset",
+    )
+    parser.add_argument(
+        "--gt_path",
+        "-g",
+        type=str,
+        required=True,
+        help="Test dataset ground truth file",
+    )
+    parser.add_argument(
+        "--max_num", type=int, default=-1, required=False, help="Maximum test data num"
+    )
+    parser.add_argument(
+        "--language",
+        "-l",
+        type=str,
+        required=False,
+        default="zh",
+        help="Target language, support en, zh, ja, and others. See languages.py for more options.",
+    )
+    parser.add_argument(
+        "--encoder",
+        type=str,
+        default="axmodel/encoder.axmodel",
+        help="Path to onnx encoder",
+    )
+    parser.add_argument(
+        "--decoder_main",
+        type=str,
+        default="axmodel/decoder_main.axmodel",
+        help="Path to axmodel decoder main",
+    )
+    parser.add_argument(
+        "--decoder_loop",
+        type=str,
+        default="axmodel/decoder_loop.axmodel",
+        help="Path to axmodel decoder loop",
+    )
+    parser.add_argument(
+        "--cmvn", type=str, default="axmodel/cmvn.ark", help="Path to cmvn"
+    )
+    parser.add_argument(
+        "--dict", type=str, default="axmodel/dict.txt", help="Path to dict"
+    )
+    parser.add_argument(
+        "--spm_model",
+        type=str,
+        default="axmodel/train_bpe1000.model",
+        help="Path to spm model",
+    )
+    parser.add_argument(
+        "--wavlist", type=str, default="wavlist.txt", help="File to wav path list"
+    )
+    parser.add_argument(
+        "--hypo", type=str, default="hypo_axmodel.txt", help="File of hypos"
+    )
+    parser.add_argument("--beam_size", type=int, default=3, help="")
+    parser.add_argument("--nbest", type=int, default=1, help="")
+    parser.add_argument("--max_len", type=int, default=128, help="")
     return parser.parse_args()
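
Both dataset classes yield `(audio_path, ground_truth)` pairs through the same iterator protocol, so the evaluation loop below is dataset-agnostic. A sketch of driving the AIShell parser (the transcript path is hypothetical; the ground-truth file is expected to hold `<wav_path> <text>` lines next to a `wav/` directory):

```
# Hypothetical layout: aishell_test/transcript.txt plus aishell_test/wav/.
dataset = AIShellDataset("aishell_test/transcript.txt")
print(len(dataset))  # number of loaded utterances
for audio_path, ground_truth in dataset:
    print(audio_path, ground_truth)
```
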
@@ -235,42 +228,42 @@ def print_args(args):


 def min_distance(word1: str, word2: str) -> int:

     row = len(word1) + 1
     column = len(word2) + 1

-    cache = […
+    cache = [[0] * column for i in range(row)]

     for i in range(row):
         for j in range(column):

-            if i ==0 and j ==0:
+            if i == 0 and j == 0:
                 cache[i][j] = 0
-            elif i == 0 and j!=0:
+            elif i == 0 and j != 0:
                 cache[i][j] = j
-            elif j == 0 and i!=0:
+            elif j == 0 and i != 0:
                 cache[i][j] = i
             else:
-                if word1[i-1] == word2[j-1]:
-                    cache[i][j] = cache[i-1][j-1]
+                if word1[i - 1] == word2[j - 1]:
+                    cache[i][j] = cache[i - 1][j - 1]
                 else:
-                    replace = cache[i-1][j-1] + 1
-                    insert = cache[i][j-1] + 1
-                    remove = cache[i-1][j] + 1
+                    replace = cache[i - 1][j - 1] + 1
+                    insert = cache[i][j - 1] + 1
+                    remove = cache[i - 1][j] + 1

                     cache[i][j] = min(replace, insert, remove)

-    return cache[row-1][column-1]
+    return cache[row - 1][column - 1]


 def remove_punctuation(text):
     # Define a regex pattern matching all punctuation,
     # covering common ASCII punctuation as well as Chinese punctuation
-    pattern = r…
+    pattern = r"[^\w\s]|_"

     # Replace every matched punctuation mark with an empty string via re.sub
-    cleaned_text = re.sub(pattern, …
+    cleaned_text = re.sub(pattern, "", text)

     return cleaned_text
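
`min_distance` is a textbook dynamic-programming Levenshtein distance over characters, so the per-utterance number the script logs as WER is effectively a character error rate. A helper in that spirit (the name and normalization here are assumptions; the actual computation happens outside the hunks shown):

```
# Hypothetical helper built on the min_distance defined above:
# edit distance normalized by reference length, as a percentage.
def character_error_rate(reference: str, hypothesis: str) -> float:
    return min_distance(reference, hypothesis) / max(len(reference), 1) * 100

print(character_error_rate("abcd", "abce"))  # 25.0
```
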
@@ -292,16 +285,25 @@ def main():
     max_num = args.max_num

     # Load model
-    model = FireRedASRAxModel(
-        …
-    )
-
+    model = FireRedASRAxModel(
+        args.encoder,
+        args.decoder_loop,
+        args.cmvn,
+        args.dict,
+        args.spm_model,
+        decode_max_len=args.max_len,
+        audio_dur=10,
+    )
+    # model = FireRedASROnnxModel(
+    #     args.encoder,
+    #     args.decoder,
+    #     args.cmvn,
+    #     args.dict,
+    #     args.spm_model,
+    #     decode_max_len=args.max_len,
+    #     audio_dur=10
+    # )
+    # model = FireRedAsr.from_pretrained("aed", "model_convert/pretrained_models/FireRedASR-AED-L")

     # Iterate over dataset
     references = []

@@ -313,10 +315,9 @@ def main():
     for n, (audio_path, reference) in enumerate(dataset):
         batch_uttid = [os.path.splitext(os.path.basename(audio_path))[0]]
         batch_wav = [audio_path]
-        results, _, _ = model.transcribe(
-            batch_wav, args.beam_size, args.nbest)
+        results, _, _ = model.transcribe(batch_wav, args.beam_size, args.nbest)

-        hypothesis = results[…
+        hypothesis = results["text"]

         hypothesis = remove_punctuation(hypothesis)
         reference = remove_punctuation(reference)

@@ -330,7 +331,7 @@ def main():

         hyp.append(hypothesis)
         references.append(reference)

         line_content = f"({n+1}/{max_data_num}) {os.path.basename(audio_path)} gt: {reference} predict: {hypothesis} WER: {character_error_rate}%"
         wer_file.write(line_content + "\n")
         logger.info(line_content)

@@ -344,5 +345,6 @@ def main():
     wer_file.write(f"Total WER: {total_character_error_rate}%")
     wer_file.close()

+
 if __name__ == "__main__":
     main()
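
The `Total WER` line aggregates over all accumulated `(reference, hypothesis)` pairs; the aggregation itself sits outside the hunks above. A sketch consistent with the per-line logging (the pooled formula is an assumption):

```
# Hypothetical corpus-level pooling over the lists built in the loop:
# total edit errors divided by total reference characters.
total_errors = sum(min_distance(r, h) for r, h in zip(references, hyp))
total_chars = sum(len(r) for r in references)
total_character_error_rate = round(total_errors / total_chars * 100, 2)
print(f"Total WER: {total_character_error_rate}%")
```
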