yangrongzhao committed on
Commit
f21b604
·
1 Parent(s): 1d01163

Add model convert

Browse files
model_convert/model_wrapper.py ADDED
@@ -0,0 +1,431 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch import Tensor
4
+
5
+ from fireredasr.models.module.conformer_encoder import ConformerEncoder
6
+ from fireredasr.models.module.transformer_decoder import (
7
+ TransformerDecoder,
8
+ DecoderLayer,
9
+ DecoderMultiHeadAttention,
10
+ DecoderScaledDotProductAttention,
11
+ PositionalEncoding
12
+ )
13
+
14
+
15
+ def DecoderScaledDotProductAttentionForward(
16
+ self: DecoderScaledDotProductAttention,
17
+ q: Tensor,
18
+ k: Tensor,
19
+ v: Tensor,
20
+ mask: Tensor
21
+ ):
22
+ attn = torch.matmul(q, k.transpose(2, 3)) / self.temperature
23
+ if mask is not None:
24
+ # mask is such as [[[0, 0, 0, 0, ..., -inf, -inf]]]
25
+ attn = attn + mask
26
+ attn = torch.softmax(attn, dim=-1)
27
+ else:
28
+ attn = torch.softmax(attn, dim=-1)
29
+ output = torch.matmul(attn, v)
30
+ return output
31
+
32
+ DecoderScaledDotProductAttention.forward = DecoderScaledDotProductAttentionForward
33
+
34
+
35
+ """
36
+ The purpose of this is to allow the exported onnx model
37
+ to only need to pass in the token id of the decoding result
38
+ of the previous time step when performing decoding inference at each time step,
39
+ rather than the token id of all previous time steps.
40
+ """
41
+ def PositionalEncodingForward(
42
+ self: PositionalEncoding,
43
+ offset: Tensor
44
+ ):
45
+ return self.pe[:, :offset].clone().detach()[:, -1]
46
+
47
+ PositionalEncoding.forward = PositionalEncodingForward
48
+
49
+
50
+ """
51
+ NOTE(Lianghu): Why do that?
52
+
53
+ When exporting the onnx model using original padding_position_is_0 funciton,
54
+ the dynamic batch does not work properly for the exported onnx model.
55
+
56
+ The code in the original padding_position_is_0 function is as follows:
57
+ ```py
58
+ def padding_position_is_0(...):
59
+ N, T = padded_input.size()[:2]
60
+ mask = torch.ones((N, T)).to(padded_input.device)
61
+ ...
62
+ ```
63
+
64
+ Because when exporting onnx, N and T are considered constants.
65
+ Should be N = padded_input.size(0) and T = padded_input.size(1).
66
+ """
67
+ def padding_position_is_0(self: ConformerEncoder,
68
+ padded_input: Tensor,
69
+ input_lengths: Tensor):
70
+ N = padded_input.size(0)
71
+ T = padded_input.size(1)
72
+ seq_range = torch.arange(T, device=padded_input.device).unsqueeze(0) # shape: (1, T)
73
+ input_lengths_exp = input_lengths.unsqueeze(1) # shape: (N, 1)
74
+ mask = seq_range < input_lengths_exp # shape: (N, T)
75
+ mask = mask.unsqueeze(dim=1)
76
+ return mask.to(torch.uint8)
77
+
78
+
79
+ ConformerEncoder.padding_position_is_0 = padding_position_is_0
80
+
81
+ class AudioEncoderTensorCache(nn.Module):
82
+ def __init__(self,
83
+ encoder: ConformerEncoder,
84
+ decoder: TransformerDecoder):
85
+ super().__init__()
86
+ self.encoder = encoder
87
+ self.decoder = decoder
88
+
89
+ def forward(self, input: Tensor, input_length: Tensor):
90
+ encoder_output, _, encoder_mask = self.encoder(input, input_length)
91
+
92
+ n_layer_cross_k_list = []
93
+ n_layer_cross_v_list = []
94
+
95
+ for layer in self.decoder.layer_stack:
96
+ # layer: DecoderLayer
97
+ n_layer_cross_k_list.append(layer.cross_attn.w_ks(encoder_output))
98
+ n_layer_cross_v_list.append(layer.cross_attn.w_vs(encoder_output))
99
+
100
+ encoder_mask = encoder_mask.to(torch.float32)
101
+ encoder_mask[encoder_mask == 0] = -torch.inf
102
+ encoder_mask[encoder_mask == 1] = 0.0
103
+
104
+ return (torch.stack(n_layer_cross_k_list),
105
+ torch.stack(n_layer_cross_v_list),
106
+ encoder_mask)
107
+
108
+
109
+ class DecoderMultiHeadSelfAttention(nn.Module):
110
+ def __init__(self, multiHeadSelfAttention: DecoderMultiHeadAttention, loop: bool = False):
111
+ super().__init__()
112
+ self.multiHeadSelfAttention = multiHeadSelfAttention
113
+ self.loop = loop
114
+
115
+ def forward(self,
116
+ x: Tensor,
117
+ k_cache: Tensor,
118
+ v_cache: Tensor,
119
+ mask: Tensor):
120
+ bs = x.size(0)
121
+
122
+ # 当前时间步为 t
123
+ # k_cache 和 v_cache 是 时间步 [0: t-1] 的 self_attn_k 和 self_attn_v 的缓存
124
+ q = self.multiHeadSelfAttention.w_qs(x)
125
+ k = self.multiHeadSelfAttention.w_ks(x)
126
+ v = self.multiHeadSelfAttention.w_vs(x)
127
+
128
+ k_cache[:, -k.shape[1] :, :] = k
129
+ v_cache[:, -v.shape[1] :, :] = v
130
+ # if self.loop:
131
+ # k_cache = torch.cat([k_cache[:, 1:, :], k], 1)
132
+ # v_cache = torch.cat([v_cache[:, 1:, :], v], 1)
133
+ # else:
134
+ # k_cache = k
135
+ # v_cache = v
136
+
137
+ q = q.view(bs, -1, self.multiHeadSelfAttention.n_head, self.multiHeadSelfAttention.d_k)
138
+ k = k_cache.view(bs, -1, self.multiHeadSelfAttention.n_head, self.multiHeadSelfAttention.d_k)
139
+ v = v_cache.view(bs, -1, self.multiHeadSelfAttention.n_head, self.multiHeadSelfAttention.d_k)
140
+ k = k.view(bs, -1, self.multiHeadSelfAttention.n_head, self.multiHeadSelfAttention.d_k)
141
+ v = v.view(bs, -1, self.multiHeadSelfAttention.n_head, self.multiHeadSelfAttention.d_k)
142
+ q = q.transpose(1, 2)
143
+ k = k.transpose(1, 2)
144
+ v = v.transpose(1, 2)
145
+
146
+ if mask is not None:
147
+ mask = mask.unsqueeze(1)
148
+
149
+ output = self.multiHeadSelfAttention.attention(q, k, v, mask)
150
+ output = output.transpose(1, 2).contiguous().view(bs, -1, self.multiHeadSelfAttention.d_model)
151
+ output = self.multiHeadSelfAttention.fc(output)
152
+ output = self.multiHeadSelfAttention.dropout(output)
153
+
154
+ return output, k_cache, v_cache
155
+
156
+
157
+ class DecoderMultiHeadSelfAttentionV2(nn.Module):
158
+ def __init__(self, multiHeadSelfAttention: DecoderMultiHeadAttention, loop: bool = False):
159
+ super().__init__()
160
+ self.multiHeadSelfAttention = multiHeadSelfAttention
161
+ self.loop = loop
162
+
163
+ def forward(self,
164
+ x: Tensor,
165
+ k_cache: Tensor,
166
+ v_cache: Tensor,
167
+ mask: Tensor):
168
+ bs = x.size(0)
169
+
170
+ # 当前时间步为 t
171
+ # k_cache 和 v_cache 是 时间步 [0: t-1] 的 self_attn_k 和 self_attn_v 的缓存
172
+ q = self.multiHeadSelfAttention.w_qs(x)
173
+ k = self.multiHeadSelfAttention.w_ks(x)
174
+ v = self.multiHeadSelfAttention.w_vs(x)
175
+
176
+ # k_cache[:, -k.shape[1] :, :] = k
177
+ # v_cache[:, -v.shape[1] :, :] = v
178
+ if self.loop:
179
+ k_cache = torch.cat([k_cache[:, 1:, :], k], 1)
180
+ v_cache = torch.cat([v_cache[:, 1:, :], v], 1)
181
+ else:
182
+ k_cache = k
183
+ v_cache = v
184
+
185
+ q = q.view(bs, -1, self.multiHeadSelfAttention.n_head, self.multiHeadSelfAttention.d_k)
186
+ k = k_cache.view(bs, -1, self.multiHeadSelfAttention.n_head, self.multiHeadSelfAttention.d_k)
187
+ v = v_cache.view(bs, -1, self.multiHeadSelfAttention.n_head, self.multiHeadSelfAttention.d_k)
188
+ k = k.view(bs, -1, self.multiHeadSelfAttention.n_head, self.multiHeadSelfAttention.d_k)
189
+ v = v.view(bs, -1, self.multiHeadSelfAttention.n_head, self.multiHeadSelfAttention.d_k)
190
+ q = q.transpose(1, 2)
191
+ k = k.transpose(1, 2)
192
+ v = v.transpose(1, 2)
193
+
194
+ if mask is not None:
195
+ mask = mask.unsqueeze(1)
196
+
197
+ output = self.multiHeadSelfAttention.attention(q, k, v, mask)
198
+ output = output.transpose(1, 2).contiguous().view(bs, -1, self.multiHeadSelfAttention.d_model)
199
+ output = self.multiHeadSelfAttention.fc(output)
200
+ output = self.multiHeadSelfAttention.dropout(output)
201
+
202
+ return output, k_cache, v_cache
203
+
204
+
205
+ class DecoderMultiHeadCrossAttention(nn.Module):
206
+ def __init__(self, multiHeadCrossAttention: DecoderMultiHeadAttention):
207
+ super().__init__()
208
+ self.multiHeadCrossAttention = multiHeadCrossAttention
209
+
210
+ def forward(self,
211
+ x: Tensor,
212
+ k: Tensor,
213
+ v: Tensor,
214
+ mask: Tensor):
215
+ bs = x.size(0)
216
+ x = self.multiHeadCrossAttention.w_qs(x)
217
+ x = x.view(bs, -1, self.multiHeadCrossAttention.n_head, self.multiHeadCrossAttention.d_k)
218
+ k = k.view(bs, -1, self.multiHeadCrossAttention.n_head, self.multiHeadCrossAttention.d_k)
219
+ v = v.view(bs, -1, self.multiHeadCrossAttention.n_head, self.multiHeadCrossAttention.d_k)
220
+
221
+ x = x.transpose(1, 2)
222
+ k = k.transpose(1, 2)
223
+ v = v.transpose(1, 2)
224
+
225
+ if mask is not None:
226
+ mask = mask.unsqueeze(1)
227
+
228
+ output = self.multiHeadCrossAttention.attention(x, k, v, mask)
229
+ output = output.transpose(1, 2).contiguous().view(bs, -1, self.multiHeadCrossAttention.d_model)
230
+ output = self.multiHeadCrossAttention.fc(output)
231
+ output = self.multiHeadCrossAttention.dropout(output)
232
+
233
+ return output
234
+
235
+
236
+ class ResidualAttentionBlockTensorCache(nn.Module):
237
+ def __init__(self, decoder_layer: DecoderLayer, loop: bool = False):
238
+ super().__init__()
239
+ self.original_decoder_layer = decoder_layer
240
+ self.self_attn = DecoderMultiHeadSelfAttention(decoder_layer.self_attn, loop)
241
+ self.cross_attn = DecoderMultiHeadCrossAttention(decoder_layer.cross_attn)
242
+
243
+ def forward(self,
244
+ x: Tensor,
245
+ self_k_cache: Tensor,
246
+ self_v_cache: Tensor,
247
+ cross_k: Tensor,
248
+ cross_v: Tensor,
249
+ self_attn_mask: Tensor,
250
+ cross_attn_mask: Tensor):
251
+ # q.shape (B, 1, dim)
252
+ x_self_attn_norm = self.original_decoder_layer.self_attn_norm(x)
253
+ self_attn_x, self_k_cache_updated, self_v_cache_updated = self.self_attn(
254
+ x_self_attn_norm, self_k_cache, self_v_cache, self_attn_mask)
255
+
256
+ x = x + self_attn_x
257
+
258
+ residual = x
259
+ x_cross_attn_norm = self.original_decoder_layer.cross_attn_norm(x)
260
+ x_cross_attn = self.cross_attn(x_cross_attn_norm, cross_k, cross_v, cross_attn_mask)
261
+ x = residual + x_cross_attn
262
+
263
+ x = x + self.original_decoder_layer.mlp(self.original_decoder_layer.mlp_norm(x))
264
+
265
+ return x, self_k_cache_updated, self_v_cache_updated
266
+
267
+
268
+ class ResidualAttentionBlockTensorCacheV2(nn.Module):
269
+ def __init__(self, decoder_layer: DecoderLayer, loop: bool = False):
270
+ super().__init__()
271
+ self.original_decoder_layer = decoder_layer
272
+ self.self_attn = DecoderMultiHeadSelfAttentionV2(decoder_layer.self_attn, loop)
273
+ self.cross_attn = DecoderMultiHeadCrossAttention(decoder_layer.cross_attn)
274
+
275
+ def forward(self,
276
+ x: Tensor,
277
+ self_k_cache: Tensor,
278
+ self_v_cache: Tensor,
279
+ cross_k: Tensor,
280
+ cross_v: Tensor,
281
+ self_attn_mask: Tensor,
282
+ cross_attn_mask: Tensor):
283
+ # q.shape (B, 1, dim)
284
+ x_self_attn_norm = self.original_decoder_layer.self_attn_norm(x)
285
+ self_attn_x, self_k_cache_updated, self_v_cache_updated = self.self_attn(
286
+ x_self_attn_norm, self_k_cache, self_v_cache, self_attn_mask)
287
+
288
+ x = x + self_attn_x
289
+
290
+ residual = x
291
+ x_cross_attn_norm = self.original_decoder_layer.cross_attn_norm(x)
292
+ x_cross_attn = self.cross_attn(x_cross_attn_norm, cross_k, cross_v, cross_attn_mask)
293
+ x = residual + x_cross_attn
294
+
295
+ x = x + self.original_decoder_layer.mlp(self.original_decoder_layer.mlp_norm(x))
296
+
297
+ return x, self_k_cache_updated, self_v_cache_updated
298
+
299
+
300
+ class TextDecoderTensorCache(nn.Module):
301
+ def __init__(self, decoder: TransformerDecoder):
302
+ super().__init__()
303
+ self.decoder = decoder
304
+
305
+ self.blocks = []
306
+ for original_layer in self.decoder.layer_stack:
307
+ self.blocks.append(
308
+ ResidualAttentionBlockTensorCache(original_layer))
309
+
310
+ def forward(self,
311
+ tokens: Tensor,
312
+ n_layer_self_k_cache: Tensor,
313
+ n_layer_self_v_cache: Tensor,
314
+ n_layer_cross_k: Tensor,
315
+ n_layer_cross_v: Tensor,
316
+ offset: Tensor,
317
+ self_attn_mask: Tensor,
318
+ cross_attn_mask: Tensor):
319
+ """
320
+ TODO(Lianghu): Integrate self_attn_mask into the model inference process
321
+ instead of passing it in through an external interface.
322
+ """
323
+ x = self.decoder.dropout(
324
+ self.decoder.tgt_word_emb(tokens) * self.decoder.scale +
325
+ self.decoder.positional_encoding(offset + 1)
326
+ )
327
+
328
+ i = 0
329
+ for block in self.blocks:
330
+ self_k_cache = n_layer_self_k_cache[i, :, : offset[0] + tokens.shape[-1], :]
331
+ self_v_cache = n_layer_self_v_cache[i, :, : offset[0] + tokens.shape[-1], :]
332
+ x, self_k_cache, self_v_cache = block(
333
+ x,
334
+ self_k_cache,
335
+ self_v_cache,
336
+ n_layer_cross_k[i],
337
+ n_layer_cross_v[i],
338
+ self_attn_mask,
339
+ cross_attn_mask
340
+ )
341
+ n_layer_self_k_cache[i, :, : offset[0] + tokens.shape[-1], :] = self_k_cache
342
+ n_layer_self_v_cache[i, :, : offset[0] + tokens.shape[-1], :] = self_v_cache
343
+ i += 1
344
+
345
+ output = self.decoder.layer_norm_out(x)
346
+ logits = self.decoder.tgt_word_prj(output)
347
+
348
+ return logits, n_layer_self_k_cache, n_layer_self_v_cache
349
+
350
+
351
+ class TextDecoderTensorCacheV2(nn.Module):
352
+ def __init__(self, decoder: TransformerDecoder, loop: bool = False):
353
+ super().__init__()
354
+ self.decoder = decoder
355
+ self.loop = loop
356
+
357
+ self.blocks = []
358
+ for original_layer in self.decoder.layer_stack:
359
+ self.blocks.append(
360
+ ResidualAttentionBlockTensorCacheV2(original_layer, loop))
361
+
362
+ def forward(self,
363
+ tokens: Tensor,
364
+ n_layer_self_k_cache: Tensor,
365
+ n_layer_self_v_cache: Tensor,
366
+ n_layer_cross_k: Tensor,
367
+ n_layer_cross_v: Tensor,
368
+ positional_embedding: Tensor,
369
+ self_attn_mask: Tensor,
370
+ cross_attn_mask: Tensor):
371
+ """
372
+ TODO(Lianghu): Integrate self_attn_mask into the model inference process
373
+ instead of passing it in through an external interface.
374
+ """
375
+ x = self.decoder.dropout(
376
+ self.decoder.tgt_word_emb(tokens) * self.decoder.scale +
377
+ positional_embedding
378
+ )
379
+ # if self.loop:
380
+ # x = self.decoder.dropout(
381
+ # self.decoder.tgt_word_emb(tokens) * self.decoder.scale +
382
+ # positional_embedding
383
+ # )
384
+ # else:
385
+ # x = self.decoder.dropout(
386
+ # self.decoder.tgt_word_emb(tokens) * self.decoder.scale +
387
+ # self.decoder.positional_encoding.pe[:, : tokens.shape[-1]]
388
+ # )
389
+
390
+ i = 0
391
+ self_k_cache_out = []
392
+ self_v_cache_out = []
393
+ for block in self.blocks:
394
+ self_k_cache = n_layer_self_k_cache[i, :, :, :]
395
+ self_v_cache = n_layer_self_v_cache[i, :, :, :]
396
+ if self.loop:
397
+ x, self_k_cache, self_v_cache = block(
398
+ x,
399
+ self_k_cache,
400
+ self_v_cache,
401
+ n_layer_cross_k[i],
402
+ n_layer_cross_v[i],
403
+ self_attn_mask,
404
+ cross_attn_mask
405
+ )
406
+ self_k_cache_out.append(self_k_cache.unsqueeze(0))
407
+ self_v_cache_out.append(self_v_cache.unsqueeze(0))
408
+ else:
409
+ n_audio, n_text_ctx, ntext_state = self_k_cache.shape
410
+
411
+ x, self_k_cache, self_v_cache = block(
412
+ x,
413
+ self_k_cache,
414
+ self_v_cache,
415
+ n_layer_cross_k[i],
416
+ n_layer_cross_v[i],
417
+ self_attn_mask,
418
+ cross_attn_mask
419
+ )
420
+ self_k_cache_out.append(torch.cat((torch.zeros([n_audio, n_text_ctx - self_k_cache.shape[1], ntext_state]).to(self_k_cache.device), self_k_cache), 1).unsqueeze(0))
421
+ self_v_cache_out.append(torch.cat((torch.zeros([n_audio, n_text_ctx - self_v_cache.shape[1], ntext_state]).to(self_v_cache.device), self_v_cache), 1).unsqueeze(0))
422
+
423
+ i += 1
424
+
425
+ n_layer_self_k_cache = torch.cat(self_k_cache_out, 0)
426
+ n_layer_self_v_cache = torch.cat(self_v_cache_out, 0)
427
+
428
+ output = self.decoder.layer_norm_out(x)
429
+ logits = self.decoder.tgt_word_prj(output)
430
+
431
+ return logits, n_layer_self_k_cache, n_layer_self_v_cache
model_convert/to_onnx.py ADDED
@@ -0,0 +1,525 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import model_wrapper
2
+ from fireredasr.models.fireredasr import FireRedAsrAed
3
+
4
+ import torch
5
+ import onnx
6
+ import onnxruntime
7
+ from onnxruntime.quantization import QuantType, quantize_dynamic
8
+ import onnxslim
9
+ from onnx.external_data_helper import convert_model_to_external_data
10
+ import numpy as np
11
+ import math
12
+ import kaldiio
13
+
14
+ import os
15
+ import argparse
16
+ from typing import Dict, Any
17
+
18
+ def to_numpy(tensor):
19
+ if tensor.requires_grad:
20
+ return tensor.detach().cpu().numpy()
21
+ else:
22
+ return tensor.cpu().numpy()
23
+
24
+
25
+ def load_model(model_path):
26
+ package = torch.load(model_path,
27
+ map_location=lambda storage,
28
+ loc: storage, weights_only=False)
29
+ model = FireRedAsrAed.from_args(package["args"])
30
+ model.load_state_dict(package["model_state_dict"], strict=True)
31
+ return model, package["args"]
32
+
33
+
34
+ def read_kaldi_cmvn(kaldi_cmvn_file):
35
+ assert os.path.exists(kaldi_cmvn_file)
36
+ stats = kaldiio.load_mat(kaldi_cmvn_file)
37
+ assert stats.shape[0] == 2
38
+ dim = stats.shape[-1] - 1
39
+ count = stats[0, dim]
40
+ assert count >= 1
41
+ floor = 1e-20
42
+ means = []
43
+ inverse_std_variences = []
44
+ for d in range(dim):
45
+ mean = stats[0, d] / count
46
+ means.append(mean.item())
47
+ varience = (stats[1, d] / count) - mean*mean
48
+ if varience < floor:
49
+ varience = floor
50
+ istd = 1.0 / math.sqrt(varience)
51
+ inverse_std_variences.append(istd)
52
+ return means, inverse_std_variences
53
+
54
+
55
+ def add_meta_data(filename: str, meta_data: Dict[str, Any]):
56
+ """Add meta data to an ONNX model. It is changed in-place.
57
+
58
+ Args:
59
+ filename:
60
+ Filename of the ONNX model to be changed.
61
+ meta_data:
62
+ Key-value pairs.
63
+ """
64
+ model = onnx.load(filename)
65
+
66
+ while len(model.metadata_props):
67
+ model.metadata_props.pop()
68
+
69
+ for key, value in meta_data.items():
70
+ meta = model.metadata_props.add()
71
+ meta.key = key
72
+ meta.value = str(value)
73
+
74
+ onnx.save(model, filename)
75
+
76
+
77
+ def calc_feat_len(audio_dur):
78
+ import math
79
+ sample_rate = 16000
80
+ frame_length = 25 * sample_rate / 1000
81
+ frame_shift = 10 * sample_rate / 1000
82
+ length = math.floor((audio_dur * sample_rate - frame_length) / frame_shift) + 1
83
+ return length
84
+
85
+
86
+ def export_encoder(fireredasr_model, args, model_args):
87
+ encoder = model_wrapper.AudioEncoderTensorCache(
88
+ fireredasr_model.encoder,
89
+ fireredasr_model.decoder)
90
+ encoder.eval()
91
+
92
+ # forge encoder input
93
+ encoder_input = torch.randn(1, calc_feat_len(10), 80)
94
+ encoder_input_lengths = torch.tensor([100], dtype=torch.int64)
95
+
96
+ n_layer_cross_k, n_layer_cross_v, cross_attn_mask = encoder(
97
+ encoder_input,
98
+ encoder_input_lengths
99
+ )
100
+
101
+ if not os.path.exists(args.encoder):
102
+ os.makedirs(args.encoder)
103
+ onnx_encoder_file = os.path.join(args.encoder, "encoder.onnx")
104
+
105
+ with torch.no_grad():
106
+ torch.onnx.export(
107
+ encoder,
108
+ (encoder_input, encoder_input_lengths),
109
+ onnx_encoder_file,
110
+ export_params=True,
111
+ do_constant_folding=True,
112
+ opset_version=16,
113
+ verbose=False,
114
+ input_names=["encoder_input",
115
+ "encoder_input_lengths"],
116
+ output_names=["n_layer_cross_k",
117
+ "n_layer_cross_v",
118
+ "cross_attn_mask"],
119
+ # dynamic_axes={
120
+ # "encoder_input": {
121
+ # 0: "batch_size",
122
+ # 1: "input_length"
123
+ # },
124
+ # "encoder_input_lengths": {
125
+ # 0: "batch_size"
126
+ # },
127
+ # "n_layer_cross_k": {
128
+ # 1: "batch_size",
129
+ # 2: "length"
130
+ # },
131
+ # "n_layer_cross_v": {
132
+ # 1: "batch_size",
133
+ # 2: "length"
134
+ # },
135
+ # "cross_attn_mask": {
136
+ # 0: "batch_size",
137
+ # 2: "length"
138
+ # }
139
+ # },
140
+ external_data=True
141
+ )
142
+
143
+ external_filename = os.path.basename(onnx_encoder_file).split(".onnx")[0]
144
+ model = onnx.load(onnx_encoder_file)
145
+ convert_model_to_external_data(
146
+ model,
147
+ all_tensors_to_one_file=True,
148
+ location=f"./{external_filename}.data",
149
+ size_threshold=0,
150
+ convert_attribute=False
151
+ )
152
+
153
+ onnx.save_model(
154
+ model,
155
+ onnx_encoder_file,
156
+ save_as_external_data=True,
157
+ all_tensors_to_one_file=True,
158
+ location=f"./{external_filename}.data",
159
+ size_threshold=0
160
+ )
161
+
162
+ onnx.checker.check_model(onnx_encoder_file, True)
163
+ ort_session = onnxruntime.InferenceSession(onnx_encoder_file)
164
+ onnx_encoder_input = to_numpy(encoder_input)
165
+ onxx_encoder_input_lengths = to_numpy(encoder_input_lengths)
166
+ ort_inputs = {ort_session.get_inputs()[0].name: onnx_encoder_input,
167
+ ort_session.get_inputs()[1].name: onxx_encoder_input_lengths}
168
+ ort_outputs = ort_session.run(None, ort_inputs)
169
+
170
+ try:
171
+ np.testing.assert_allclose(to_numpy(n_layer_cross_k), ort_outputs[0], rtol=1e-03, atol=1e-05)
172
+ except AssertionError as e:
173
+ print(e)
174
+ try:
175
+ np.testing.assert_allclose(to_numpy(n_layer_cross_v), ort_outputs[1], rtol=1e-03, atol=1e-05)
176
+ except AssertionError as e:
177
+ print(e)
178
+ try:
179
+ np.testing.assert_allclose(to_numpy(cross_attn_mask), ort_outputs[2], rtol=1e-03, atol=1e-05)
180
+ except AssertionError as e:
181
+ print(e)
182
+
183
+ print("export onnx encoder done.")
184
+
185
+ # Generate int8 quantization models
186
+ # See https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html#data-type-selection
187
+ print("Generate int8 quantization models")
188
+
189
+ if not os.path.exists(args.encoder_int8):
190
+ os.mkdir(args.encoder_int8)
191
+ onnx_encoder_int8_file = "encoder_int8.onnx"
192
+ onnx_encoder_int8_file = os.path.join(args.encoder_int8, onnx_encoder_int8_file)
193
+ quantize_dynamic(
194
+ model_input=onnx_encoder_file,
195
+ model_output=onnx_encoder_int8_file,
196
+ op_types_to_quantize=["MatMul"],
197
+ weight_type=QuantType.QInt8,
198
+ )
199
+
200
+ cmvn_mean, cmvn_inv_stddev = read_kaldi_cmvn(args.cmvn)
201
+ cmvn_mean = [str(m) for m in cmvn_mean]
202
+ cmvn_inv_stddev = [str(istd) for istd in cmvn_inv_stddev]
203
+
204
+ encoder_meta_data = {
205
+ "model_type": "FireRedAsrAED-L",
206
+ "maintainer": "LiangHu",
207
+ "feat_dim": model_args.idim,
208
+ "feat_type": "fbank",
209
+ "num_decoder_layers": model_args.n_layers_dec,
210
+ "num_head": model_args.n_head,
211
+ "head_dim": model_args.d_model // model_args.n_head,
212
+ "max_len": 448,
213
+ "sos": model_args.sos_id,
214
+ "eos": model_args.eos_id,
215
+ "cmvn_mean": ','.join(cmvn_mean),
216
+ "cmvn_inv_stddev": ','.join(cmvn_inv_stddev)
217
+ }
218
+
219
+ # add_meta_data(onnx_encoder_file, encoder_meta_data)
220
+ add_meta_data(onnx_encoder_int8_file, encoder_meta_data)
221
+
222
+ return n_layer_cross_k, n_layer_cross_v, cross_attn_mask
223
+
224
+
225
+ def export_decoder(fireredasr_model, args,
226
+ n_layer_cross_k,
227
+ n_layer_cross_v,
228
+ cross_attn_mask):
229
+ beam_size = 3
230
+
231
+ decoder = model_wrapper.TextDecoderTensorCache(
232
+ fireredasr_model.decoder)
233
+ decoder.eval()
234
+
235
+ num_layer, batch_size, Ti, encoder_out_dim = n_layer_cross_k.shape
236
+ encoder_out_length = cross_attn_mask.size(-1)
237
+
238
+ # preparing for batch beam search
239
+ cross_attn_mask = cross_attn_mask.unsqueeze(1).repeat(
240
+ 1, beam_size, 1, 1).view(beam_size * batch_size, -1, encoder_out_length)
241
+ n_layer_cross_k = n_layer_cross_k.unsqueeze(2).repeat(
242
+ 1, 1, beam_size, 1, 1
243
+ ).view(num_layer, beam_size * batch_size, Ti, encoder_out_dim)
244
+ n_layer_cross_v = n_layer_cross_v.unsqueeze(2).repeat(
245
+ 1, 1, beam_size, 1, 1
246
+ ).view(num_layer, beam_size * batch_size, Ti, encoder_out_dim)
247
+ tokens = torch.ones(beam_size * batch_size, 1).fill_(decoder.decoder.sos_id).long()
248
+
249
+ n_layer_self_k_cache = torch.zeros(
250
+ (
251
+ len(decoder.blocks),
252
+ batch_size * beam_size,
253
+ 448,
254
+ 1280
255
+ )
256
+ )
257
+ n_layer_self_v_cache = torch.zeros(
258
+ (
259
+ len(decoder.blocks),
260
+ batch_size * beam_size,
261
+ 448,
262
+ 1280
263
+ )
264
+ )
265
+ offset = torch.zeros(1, dtype=torch.int64)
266
+ self_attn_mask = torch.empty(batch_size * beam_size,
267
+ tokens.shape[-1], tokens.shape[-1]
268
+ ).fill_(-np.inf).triu_(1) # fill_(-np.inf)
269
+ self_attn_mask = self_attn_mask[:, -1:, :]
270
+
271
+ logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache = decoder(
272
+ tokens,
273
+ n_layer_self_k_cache,
274
+ n_layer_self_v_cache,
275
+ n_layer_cross_k,
276
+ n_layer_cross_v,
277
+ offset,
278
+ self_attn_mask,
279
+ cross_attn_mask
280
+ )
281
+
282
+ if not os.path.exists(args.decoder):
283
+ os.makedirs(args.decoder)
284
+ onnx_decoder_file = os.path.join(args.decoder, "decoder.onnx")
285
+
286
+ with torch.no_grad():
287
+ torch.onnx.export(
288
+ decoder,
289
+ (tokens,
290
+ n_layer_self_k_cache,
291
+ n_layer_self_v_cache,
292
+ n_layer_cross_k,
293
+ n_layer_cross_v,
294
+ offset,
295
+ self_attn_mask,
296
+ cross_attn_mask),
297
+ onnx_decoder_file,
298
+ export_params=True,
299
+ opset_version=13,
300
+ verbose=False,
301
+ input_names=["tokens",
302
+ "in_n_layer_self_k_cache",
303
+ "in_n_layer_self_v_cache",
304
+ "n_layer_cross_k",
305
+ "n_layer_cross_v",
306
+ "offset",
307
+ "self_attn_mask",
308
+ "cross_attn_mask"],
309
+ output_names=["logits",
310
+ "out_n_layer_self_k_cache",
311
+ "out_n_layer_self_v_cache"],
312
+ dynamic_axes={
313
+ "tokens": {0: "n_audio", 1: "n_tokens"},
314
+ "in_n_layer_self_k_cache": {1: "n_audio"},
315
+ "in_n_layer_self_v_cache": {1: "n_audio"},
316
+ "n_layer_cross_k": {1: "n_audio", 2: "T"},
317
+ "n_layer_cross_v": {1: "n_audio", 2: "T"},
318
+ "self_attn_mask": {0: "n_audio", 2: "T"},
319
+ "cross_attn_mask": {0: "n_audio", 2: "T"},
320
+ },
321
+ external_data=True
322
+ )
323
+
324
+ onnx.checker.check_model(onnx_decoder_file)
325
+ ort_session = onnxruntime.InferenceSession(onnx_decoder_file)
326
+
327
+ onnx_tokens = to_numpy(tokens)
328
+ onnx_n_layer_self_k_cache = to_numpy(n_layer_self_k_cache)
329
+ onnx_n_layer_self_v_cache = to_numpy(n_layer_self_v_cache)
330
+ onnx_n_layer_cross_k = to_numpy(n_layer_cross_k)
331
+ onnx_n_layer_cross_v = to_numpy(n_layer_cross_v)
332
+ onnx_offset = to_numpy(offset)
333
+ onnx_self_attn_mask = to_numpy(self_attn_mask)
334
+ onnx_cross_attn_mask = to_numpy(cross_attn_mask)
335
+
336
+ ort_inputs = {ort_session.get_inputs()[0].name: onnx_tokens,
337
+ ort_session.get_inputs()[1].name: onnx_n_layer_self_k_cache,
338
+ ort_session.get_inputs()[2].name: onnx_n_layer_self_v_cache,
339
+ ort_session.get_inputs()[3].name: onnx_n_layer_cross_k,
340
+ ort_session.get_inputs()[4].name: onnx_n_layer_cross_v,
341
+ ort_session.get_inputs()[5].name: onnx_offset,
342
+ ort_session.get_inputs()[6].name: onnx_self_attn_mask,
343
+ ort_session.get_inputs()[7].name: onnx_cross_attn_mask}
344
+ ort_outputs = ort_session.run(None, ort_inputs)
345
+
346
+ try:
347
+ np.testing.assert_allclose(to_numpy(logits), ort_outputs[0], rtol=1e-03, atol=1e-05)
348
+ except AssertionError as e:
349
+ print(e)
350
+ try:
351
+ np.testing.assert_allclose(to_numpy(out_n_layer_self_k_cache), ort_outputs[1], rtol=1e-03, atol=1e-05)
352
+ except AssertionError as e:
353
+ print(e)
354
+ try:
355
+ np.testing.assert_allclose(to_numpy(out_n_layer_self_v_cache), ort_outputs[2], rtol=1e-03, atol=1e-05)
356
+ except AssertionError as e:
357
+ print(e)
358
+
359
+ print("export onnx decoder done.")
360
+
361
+ if not os.path.exists(args.decoder_int8):
362
+ os.mkdir(args.decoder_int8)
363
+ onnx_decoder_int8_file = "decoder_int8.onnx"
364
+ onnx_decoder_int8_file = os.path.join(args.decoder_int8, onnx_decoder_int8_file)
365
+ quantize_dynamic(
366
+ model_input=onnx_decoder_file,
367
+ model_output=onnx_decoder_int8_file,
368
+ op_types_to_quantize=["MatMul"],
369
+ weight_type=QuantType.QInt8,
370
+ )
371
+
372
+ # decoder main
373
+ decoder = model_wrapper.TextDecoderTensorCacheV2(
374
+ fireredasr_model.decoder, loop=False)
375
+ decoder.eval()
376
+
377
+ self_attn_mask = torch.empty(batch_size * beam_size,
378
+ tokens.shape[-1], tokens.shape[-1]
379
+ ).fill_(-np.inf).triu_(1) # fill_(-np.inf)
380
+ self_attn_mask = self_attn_mask[:, -1:, :]
381
+
382
+ pe = decoder.decoder.positional_encoding.pe[0]
383
+
384
+ onnx_decoder_file = os.path.join(args.decoder, "decoder_main.onnx")
385
+
386
+ with torch.no_grad():
387
+ torch.onnx.export(
388
+ decoder,
389
+ (tokens,
390
+ n_layer_self_k_cache,
391
+ n_layer_self_v_cache,
392
+ n_layer_cross_k,
393
+ n_layer_cross_v,
394
+ pe[0],
395
+ self_attn_mask,
396
+ cross_attn_mask),
397
+ onnx_decoder_file,
398
+ export_params=True,
399
+ opset_version=13,
400
+ verbose=False,
401
+ input_names=["tokens",
402
+ "in_n_layer_self_k_cache",
403
+ "in_n_layer_self_v_cache",
404
+ "n_layer_cross_k",
405
+ "n_layer_cross_v",
406
+ "pe",
407
+ "self_attn_mask",
408
+ "cross_attn_mask"],
409
+ output_names=["logits",
410
+ "out_n_layer_self_k_cache",
411
+ "out_n_layer_self_v_cache"],
412
+ # dynamic_axes={
413
+ # "tokens": {0: "n_audio", 1: "n_tokens"},
414
+ # "in_n_layer_self_k_cache": {1: "n_audio"},
415
+ # "in_n_layer_self_v_cache": {1: "n_audio"},
416
+ # "n_layer_cross_k": {1: "n_audio", 2: "T"},
417
+ # "n_layer_cross_v": {1: "n_audio", 2: "T"},
418
+ # "self_attn_mask": {0: "n_audio", 2: "T"},
419
+ # "cross_attn_mask": {0: "n_audio", 2: "T"},
420
+ # },
421
+ external_data=True
422
+ )
423
+ print(f"Export decoder_main to {onnx_decoder_file}")
424
+
425
+ # decoder loop
426
+ decoder = model_wrapper.TextDecoderTensorCacheV2(
427
+ fireredasr_model.decoder, loop=True)
428
+ decoder.eval()
429
+
430
+ pe = decoder.decoder.positional_encoding.pe[0]
431
+ pe_file = os.path.join(args.decoder, "pe.npy")
432
+ np.save(pe_file, pe.numpy())
433
+
434
+ onnx_decoder_file = os.path.join(args.decoder, "decoder_loop.onnx")
435
+
436
+ with torch.no_grad():
437
+ torch.onnx.export(
438
+ decoder,
439
+ (tokens,
440
+ n_layer_self_k_cache,
441
+ n_layer_self_v_cache,
442
+ n_layer_cross_k,
443
+ n_layer_cross_v,
444
+ pe[0],
445
+ self_attn_mask,
446
+ cross_attn_mask),
447
+ onnx_decoder_file,
448
+ export_params=True,
449
+ opset_version=13,
450
+ verbose=False,
451
+ input_names=["tokens",
452
+ "in_n_layer_self_k_cache",
453
+ "in_n_layer_self_v_cache",
454
+ "n_layer_cross_k",
455
+ "n_layer_cross_v",
456
+ "pe",
457
+ "self_attn_mask",
458
+ "cross_attn_mask"],
459
+ output_names=["logits",
460
+ "out_n_layer_self_k_cache",
461
+ "out_n_layer_self_v_cache"],
462
+ # dynamic_axes={
463
+ # "tokens": {0: "n_audio", 1: "n_tokens"},
464
+ # "in_n_layer_self_k_cache": {1: "n_audio"},
465
+ # "in_n_layer_self_v_cache": {1: "n_audio"},
466
+ # "n_layer_cross_k": {1: "n_audio", 2: "T"},
467
+ # "n_layer_cross_v": {1: "n_audio", 2: "T"},
468
+ # "self_attn_mask": {0: "n_audio", 2: "T"},
469
+ # "cross_attn_mask": {0: "n_audio", 2: "T"},
470
+ # },
471
+ external_data=True
472
+ )
473
+ print(f"Export decoder_loop to {onnx_decoder_file}")
474
+
475
+
476
+ def parse_args():
477
+ parser = argparse.ArgumentParser(description="export FireRedASR-AED torch model to onnx")
478
+ parser.add_argument(
479
+ "--model",
480
+ type=str,
481
+ required=True,
482
+ help="Path to FireRedASR-AED torch model"
483
+ )
484
+ parser.add_argument(
485
+ "--encoder",
486
+ type=str,
487
+ required=True,
488
+ help="Dir to the exported onnx encoder"
489
+ )
490
+ parser.add_argument(
491
+ "--decoder",
492
+ type=str,
493
+ required=True,
494
+ help="Dir to the exported onnx decoder"
495
+ )
496
+ parser.add_argument(
497
+ "--encoder_int8",
498
+ type=str,
499
+ required=True,
500
+ help="Dir to the exported onnx encoder after int8 quantization"
501
+ )
502
+ parser.add_argument(
503
+ "--decoder_int8",
504
+ type=str,
505
+ required=True,
506
+ help="Dir to the exported onnx encoder after int8 quantization"
507
+ )
508
+ parser.add_argument(
509
+ "--cmvn",
510
+ type=str,
511
+ required=True,
512
+ help="cmvn.ark file"
513
+ )
514
+ return parser.parse_args()
515
+
516
+
517
+ def main():
518
+ args = parse_args()
519
+ fireredasr_model, model_args = load_model(args.model)
520
+ n_layer_cross_k, n_layer_cross_v, cross_attn_mask = export_encoder(fireredasr_model, args, model_args)
521
+ export_decoder(fireredasr_model, args, n_layer_cross_k, n_layer_cross_v, cross_attn_mask)
522
+
523
+
524
+ if __name__ == "__main__":
525
+ main()
test_ax_model.py ADDED
@@ -0,0 +1,657 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fireredasr.data.asr_feat import ASRFeatExtractor
2
+ from fireredasr.tokenizer.aed_tokenizer import ChineseCharEnglishSpmTokenizer
3
+
4
+ import onnxruntime as ort
5
+ import axengine as axe
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import numpy as np
9
+ from torch import Tensor
10
+ from typing import Tuple, List, Dict
11
+ import argparse
12
+ import os
13
+ import time
14
+ import logging
15
+
16
+ logger = logging.getLogger()
17
+ logger.setLevel(logging.INFO)
18
+ logger_stream_hander = logging.StreamHandler()
19
+ logger_stream_hander.setLevel("INFO")
20
+ logger.addHandler(logger_stream_hander)
21
+
22
+
23
+ INF = 1e10
24
+
25
+
26
def to_numpy(tensor):
    """Return *tensor* as a numpy array; numpy inputs pass through unchanged.

    Gradient-tracking torch tensors are detached first, since ``.numpy()``
    refuses to operate on tensors that require grad.
    """
    if isinstance(tensor, np.ndarray):
        return tensor
    data = tensor.detach() if tensor.requires_grad else tensor
    return data.cpu().numpy()
33
+
34
+
35
def set_finished_beam_score_to_zero(scores, is_finished):
    """Freeze the candidate scores of beams that have already emitted EOS.

    For a finished beam the first candidate gets score 0 (so the running
    total stops changing) and the other beam_size - 1 candidates get -INF
    (so they can never be selected). Unfinished beams keep their scores.
    """
    num_rows, beam = scores.size()
    finished = is_finished.float()
    frozen = torch.tensor([0.0] + [-INF] * (beam - 1)).float()
    frozen = frozen.view(1, beam).repeat(num_rows, 1)
    return scores * (1 - finished) + frozen * finished
41
+
42
+
43
def set_finished_beam_y_to_eos(ys, is_finished, eos_id):
    """Force beams that have finished to keep emitting *eos_id*.

    Unfinished beams keep their predicted token ids unchanged.
    """
    done = is_finished.long()
    return ys * (1 - done) + eos_id * done
46
+
47
+
48
class FireRedASROnnxModel:
    """Beam-search ASR pipeline running exported FireRedASR-AED models on AXEngine.

    The encoder and two decoder variants — a prefill "main" model (first step)
    and an incremental "loop" model (subsequent steps, with KV caches) — are
    loaded as .axmodel files. Feature extraction and tokenization reuse the
    original fireredasr Python components.
    """

    def __init__(
        self,
        encoder_path: str,
        decoder_path: str,
        cmvn_file: str,
        dict_file: str,
        spm_model_path: str,
        # NOTE(review): mutable default argument; harmless here because it is
        # only read, but a tuple or None default would be safer.
        providers=['AXCLRTExecutionProvider', 'AxEngineExecutionProvider']
    ):
        session_opts = ort.SessionOptions()
        session_opts.inter_op_num_threads = 1
        session_opts.intra_op_num_threads = 1
        # session_opts.log_severity_level = 1
        # Kept only for the (currently disabled) ONNX Runtime decoder path.
        self.session_opts = session_opts

        # Max decode length follows whisper's setting; FireRedASR-AED supports
        # audio of at most 60 s.
        # ref: https://github.com/FireRedTeam/FireRedASR?tab=readme-ov-file#input-length-limitations
        self.decode_max_len = 448

        # Decoder geometry and special-token ids — must match the exported model.
        self.decoder_hidden_dim = 1280
        self.num_decoder_blocks = 16
        self.blank_id = 0
        self.sos_id = 3
        self.eos_id = 4
        self.pad_id = 2

        self.feature_extractor = ASRFeatExtractor(cmvn_file)
        self.tokenizer = ChineseCharEnglishSpmTokenizer(dict_file, spm_model_path)
        self.encoder = None
        self.decoder = None

        self.init_encoder(encoder_path, providers)
        # self.init_decoder(decoder_path, providers)  # single-model ORT decoder disabled
        self.init_decoder_main(decoder_path, providers)
        self.init_decoder_loop(decoder_path, providers)
        # Precomputed positional encodings, indexed by decode step offset.
        self.pe = self.init_pe(decoder_path)

    def init_encoder(self, encoder_path, providers=None):
        """Load the encoder .axmodel and log how long loading took."""
        start_time = time.time()
        self.encoder = axe.InferenceSession(
            encoder_path,
            # sess_options=self.session_opts,
            providers=providers
        )
        end_time = time.time()
        logger.info(f"load encoder cost {end_time - start_time} seconds")

    def init_decoder(self, decoder_path, providers=None):
        """Load a single-file ONNX decoder via onnxruntime.

        NOTE(review): not called from __init__ (commented out there), so
        self.decoder stays None and decode_one_token would fail if used.
        """
        start_time = time.time()
        self.decoder = ort.InferenceSession(
            decoder_path,
            # sess_options=self.session_opts,
            providers=providers
        )
        end_time = time.time()
        logger.info(f"load decoder cost {end_time - start_time} seconds")

    def init_decoder_main(self, decoder_path, providers=None):
        """Load decoder_main.axmodel (prefill step) from the decoder's directory."""
        decoder_path = os.path.dirname(decoder_path)
        decoder_path = os.path.join(decoder_path, "decoder_main.axmodel")
        start_time = time.time()
        self.decoder_main = axe.InferenceSession(
            decoder_path,
            # sess_options=self.session_opts,
            providers=providers
        )
        end_time = time.time()
        logger.info(f"load decoder_main cost {end_time - start_time} seconds")

    def init_decoder_loop(self, decoder_path, providers=None):
        """Load decoder_loop.axmodel (incremental step) from the decoder's directory."""
        decoder_path = os.path.dirname(decoder_path)
        decoder_path = os.path.join(decoder_path, "decoder_loop.axmodel")

        start_time = time.time()
        self.decoder_loop = axe.InferenceSession(
            decoder_path,
            # sess_options=self.session_opts,
            providers=providers
        )
        end_time = time.time()
        logger.info(f"load decoder_loop cost {end_time - start_time} seconds")

    def init_pe(self, decoder_path):
        """Load the precomputed positional-encoding table (pe.npy) next to the decoder."""
        decoder_path = os.path.dirname(decoder_path)
        decoder_path = os.path.join(decoder_path, "pe.npy")

        return np.load(decoder_path)

    def run_encoder(self, input: np.ndarray,
                    input_length: np.ndarray
                    # Annotation fixed: axengine returns numpy arrays, not torch Tensors.
                    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Run the encoder; return per-layer cross-attention K/V and the cross mask."""
        n_layer_cross_k, n_layer_cross_v, cross_attn_mask = self.encoder.run(
            None,
            {
                "encoder_input": input,
                "encoder_input_lengths": input_length
            }
        )
        return (
            n_layer_cross_k,
            n_layer_cross_v,
            cross_attn_mask
        )

    def decode_one_token(
        self,
        tokens: np.ndarray,
        n_layer_self_k_cache: np.ndarray,
        n_layer_self_v_cache: np.ndarray,
        n_layer_cross_k_cache: np.ndarray,
        n_layer_cross_v_cache: np.ndarray,
        offset: np.ndarray,
        self_attn_mask: np.ndarray,
        cross_attn_mask: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """One decode step through the single-file ORT decoder (debug path).

        NOTE(review): requires init_decoder() to have run; __init__ currently
        skips it, so this method is effectively unused here.
        """
        print("decode:")
        print(f"tokens.shape: {tokens.shape}")
        print(f"n_layer_self_k_cache.shape: {n_layer_self_k_cache.shape}")
        print(f"n_layer_self_v_cache.shape: {n_layer_self_v_cache.shape}")
        print(f"n_layer_cross_k_cache.shape: {n_layer_cross_k_cache.shape}")
        print(f"n_layer_cross_v_cache.shape: {n_layer_cross_v_cache.shape}")
        print(f"offset.shape: {offset.shape}")
        print(f"self_attn_mask.shape: {self_attn_mask.shape}")
        print(f"cross_attn_mask.shape: {cross_attn_mask.shape}")

        # Inputs are bound positionally via the model's declared input order.
        logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache = self.decoder.run(
            None,
            {
                self.decoder.get_inputs()[0].name: tokens,
                self.decoder.get_inputs()[1].name: n_layer_self_k_cache,
                self.decoder.get_inputs()[2].name: n_layer_self_v_cache,
                self.decoder.get_inputs()[3].name: n_layer_cross_k_cache,
                self.decoder.get_inputs()[4].name: n_layer_cross_v_cache,
                self.decoder.get_inputs()[5].name: offset,
                self.decoder.get_inputs()[6].name: self_attn_mask,
                self.decoder.get_inputs()[7].name: cross_attn_mask,
            }
        )
        return (
            logits,
            out_n_layer_self_k_cache,
            out_n_layer_self_v_cache
        )

    def decode_main_one_token(
        self,
        tokens: np.ndarray,
        n_layer_self_k_cache: np.ndarray,
        n_layer_self_v_cache: np.ndarray,
        n_layer_cross_k_cache: np.ndarray,
        n_layer_cross_v_cache: np.ndarray,
        pe: np.ndarray,
        self_attn_mask: np.ndarray,
        cross_attn_mask: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """First (prefill) decode step via decoder_main.axmodel.

        The self-attention KV-cache inputs are accepted for signature symmetry
        with decode_loop_one_token but are NOT fed to the model — decoder_main
        produces the initial caches itself.
        """
        logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache = self.decoder_main.run(
            None,
            {
                "tokens": tokens,
                "n_layer_cross_k": n_layer_cross_k_cache,
                "n_layer_cross_v": n_layer_cross_v_cache,
                "pe": pe,
                "self_attn_mask": self_attn_mask,
                "cross_attn_mask": cross_attn_mask,
            }
        )
        return (
            logits,
            out_n_layer_self_k_cache,
            out_n_layer_self_v_cache
        )

    def decode_loop_one_token(
        self,
        tokens: np.ndarray,
        n_layer_self_k_cache: np.ndarray,
        n_layer_self_v_cache: np.ndarray,
        n_layer_cross_k_cache: np.ndarray,
        n_layer_cross_v_cache: np.ndarray,
        pe: np.ndarray,
        self_attn_mask: np.ndarray,
        cross_attn_mask: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Incremental decode step via decoder_loop.axmodel, consuming and
        returning the per-layer self-attention KV caches."""
        logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache = self.decoder_loop.run(
            None,
            {
                "tokens": tokens,
                "in_n_layer_self_k_cache": n_layer_self_k_cache,
                "in_n_layer_self_v_cache": n_layer_self_v_cache,
                "n_layer_cross_k": n_layer_cross_k_cache,
                "n_layer_cross_v": n_layer_cross_v_cache,
                "pe": pe,
                "self_attn_mask": self_attn_mask,
                "cross_attn_mask": cross_attn_mask,
            }
        )
        return (
            logits,
            out_n_layer_self_k_cache,
            out_n_layer_self_v_cache
        )

    def run_decoder(
        self,
        n_layer_cross_k,
        n_layer_cross_v,
        cross_attn_mask,
        beam_size,
        nbest
    ):
        """Autoregressive beam search over the two-phase axmodel decoder.

        Args:
            n_layer_cross_k/v: (num_layer, batch, Ti, dim) encoder cross K/V.
            cross_attn_mask: additive cross-attention mask from the encoder.
            beam_size: beam width; nbest: hypotheses to return per utterance.

        Returns:
            nbest_hyps: per-utterance list of {"token_ids", "score"} dicts.
        """

        num_layer, batch_size, Ti, encoder_out_dim = n_layer_cross_k.shape
        encoder_out_length = cross_attn_mask.shape[-1]

        # Tile the cross-attention mask beam_size times: every beam of an
        # utterance attends to the same encoder output.
        cross_attn_mask = torch.from_numpy(cross_attn_mask).to(torch.float32)
        cross_attn_mask = cross_attn_mask.unsqueeze(1).repeat(
            1, beam_size, 1, 1
        ).view(beam_size * batch_size, -1, encoder_out_length)

        # Same tiling for the per-layer cross K/V caches.
        n_layer_cross_k = torch.from_numpy(n_layer_cross_k)
        n_layer_cross_v = torch.from_numpy(n_layer_cross_v)
        n_layer_cross_k = n_layer_cross_k.unsqueeze(2).repeat(
            1, 1, beam_size, 1, 1
        ).view(num_layer, beam_size * batch_size, Ti, encoder_out_dim)
        n_layer_cross_v = n_layer_cross_v.unsqueeze(2).repeat(
            1, 1, beam_size, 1, 1
        ).view(num_layer, beam_size * batch_size, Ti, encoder_out_dim)

        # Every beam starts with a single SOS token.
        prediction_tokens = torch.ones(
            beam_size * batch_size, 1).fill_(self.sos_id).long()
        tokens = prediction_tokens
        offset = torch.zeros(1, dtype=torch.int64)
        n_layer_self_k_cache, n_layer_self_v_cache = self.get_initialized_self_cache(
            batch_size, beam_size
        )

        # Only beam 0 starts "alive" (score 0); the rest get -INF so the first
        # expansion cannot pick duplicates.
        scores = torch.tensor([0.0] + [-INF]*(beam_size - 1)).float()
        scores = scores.repeat(batch_size).view(batch_size * beam_size, 1)
        is_finished = torch.zeros_like(scores)

        # Decoding one token at a time: the causal self-attention mask is a
        # single all-zero (i.e. "attend to everything cached") row.
        self_attn_mask = np.zeros((batch_size * beam_size, 1, 1), dtype=np.float32)

        # NOTE(review): `results` is never used below — dead variable.
        results = [self.sos_id]
        for i in range(self.decode_max_len):

            tokens = to_numpy(tokens).astype(np.int32)
            n_layer_self_k_cache = to_numpy(n_layer_self_k_cache)
            n_layer_self_v_cache = to_numpy(n_layer_self_v_cache)
            n_layer_cross_k = to_numpy(n_layer_cross_k)
            n_layer_cross_v = to_numpy(n_layer_cross_v)
            cross_attn_mask = to_numpy(cross_attn_mask)

            # Step 0 uses the prefill model (which creates the self caches);
            # every later step uses the incremental loop model.
            # self.pe[offset] selects the positional encoding for this step.
            if i == 0:
                logits, n_layer_self_k_cache, n_layer_self_v_cache = self.decode_main_one_token(
                    to_numpy(tokens),
                    to_numpy(n_layer_self_k_cache),
                    to_numpy(n_layer_self_v_cache),
                    to_numpy(n_layer_cross_k),
                    to_numpy(n_layer_cross_v),
                    self.pe[offset],
                    self_attn_mask,
                    to_numpy(cross_attn_mask)
                )
            else:
                logits, n_layer_self_k_cache, n_layer_self_v_cache = self.decode_loop_one_token(
                    to_numpy(tokens),
                    to_numpy(n_layer_self_k_cache),
                    to_numpy(n_layer_self_v_cache),
                    to_numpy(n_layer_cross_k),
                    to_numpy(n_layer_cross_v),
                    self.pe[offset],
                    self_attn_mask,
                    to_numpy(cross_attn_mask)
                )

            offset += 1
            logits = torch.from_numpy(logits)

            # (NB, 1, V) -> (NB, V); expand each beam into beam_size candidates.
            logits = logits.squeeze(1)
            t_scores = F.log_softmax(logits, dim=-1)
            t_topB_scores, t_topB_ys = torch.topk(t_scores, k=beam_size, dim=1)
            t_topB_scores = set_finished_beam_score_to_zero(t_topB_scores, is_finished)
            t_topB_ys = set_finished_beam_y_to_eos(t_topB_ys, is_finished, self.eos_id)

            scores = scores + t_topB_scores

            # Prune beam_size * beam_size candidates back down to beam_size.
            scores = scores.view(batch_size, beam_size * beam_size)
            scores, topB_score_ids = torch.topk(scores, k=beam_size, dim=1)
            scores = scores.view(-1, 1)

            # Map each surviving candidate back to the beam row it came from.
            topB_row_number_in_each_B_rows_of_ys = torch.div(
                topB_score_ids, beam_size).view(batch_size * beam_size)
            stride = beam_size * torch.arange(batch_size).view(
                batch_size, 1).repeat(1, beam_size).view(batch_size * beam_size)
            topB_row_number_in_ys = topB_row_number_in_each_B_rows_of_ys.long() + stride.long()

            # Reorder histories, pick the surviving tokens, and extend.
            prediction_tokens = prediction_tokens[topB_row_number_in_ys]
            t_ys = torch.gather(
                t_topB_ys.view(batch_size, beam_size * beam_size),
                dim=1, index=topB_score_ids
            ).view(beam_size * batch_size, 1)

            tokens = t_ys

            prediction_tokens = torch.cat((prediction_tokens, t_ys), dim=1)

            # Reorder the self-attention KV caches to follow the surviving beams.
            # NOTE(review): the inner `i` shadows the step counter; safe only
            # because nothing after these loops reads the step index.
            n_layer_self_k_cache = torch.from_numpy(n_layer_self_k_cache)
            n_layer_self_v_cache = torch.from_numpy(n_layer_self_v_cache)

            for i, self_k_cache in enumerate(n_layer_self_k_cache):
                n_layer_self_k_cache[i] = n_layer_self_k_cache[i][topB_row_number_in_ys]

            for i, self_v_cache in enumerate(n_layer_self_v_cache):
                n_layer_self_v_cache[i] = n_layer_self_v_cache[i][topB_row_number_in_ys]

            # Stop once every beam of every utterance has emitted EOS.
            is_finished = t_ys.eq(self.eos_id)
            if is_finished.sum().item() == beam_size * batch_size:
                break

        # Valid length per beam = count of non-EOS tokens (includes leading SOS).
        scores = scores.view(batch_size, beam_size)
        prediction_valid_token_lengths = torch.sum(
            torch.ne(
                prediction_tokens.view(batch_size, beam_size, -1),
                self.eos_id),
            dim=-1
        ).int()

        # Select the nbest beams per utterance and package the hypotheses.
        nbest_scores, nbest_ids = torch.topk(scores, k=nbest, dim=1)
        index = nbest_ids + beam_size * torch.arange(batch_size).view(batch_size, 1).long()
        nbest_prediction_tokens = prediction_tokens.view(batch_size * beam_size, -1)[index.view(-1)]
        nbest_prediction_tokens = nbest_prediction_tokens.view(batch_size, nbest_ids.size(1), -1)
        nbest_prediction_valid_token_lengths = prediction_valid_token_lengths.view(
            batch_size * beam_size)[index.view(-1)].view(batch_size, -1)
        nbest_hyps: List[List[Dict[str, torch.Tensor]]] = []
        for i in range(batch_size):
            i_best_hyps: List[Dict[str, torch.Tensor]] = []
            for j, score in enumerate(nbest_scores[i]):
                hyp = {
                    # Slice [1:length] drops the leading SOS token.
                    "token_ids": nbest_prediction_tokens[i, j, 1:nbest_prediction_valid_token_lengths[i, j]],
                    "score": score
                }
                i_best_hyps.append(hyp)
            nbest_hyps.append(i_best_hyps)

        return nbest_hyps

    def get_initialized_self_cache(self,
                                   batch_size,
                                   beam_size
                                   ) -> Tuple[Tensor, Tensor]:
        """Return zero-filled self-attention K/V caches sized for the full
        (num_blocks, batch*beam, max_len, hidden) decode run."""
        n_layer_self_k_cache = torch.zeros(
            self.num_decoder_blocks,
            batch_size * beam_size,
            self.decode_max_len,
            self.decoder_hidden_dim,
        )
        n_layer_self_v_cache = torch.zeros(
            self.num_decoder_blocks,
            batch_size * beam_size,
            self.decode_max_len,
            self.decoder_hidden_dim,
        )
        return n_layer_self_k_cache, n_layer_self_v_cache

    def calc_feat_len(self, audio_dur):
        """Number of fbank frames for *audio_dur* seconds at 16 kHz
        (25 ms window, 10 ms shift)."""
        import math
        sample_rate = 16000
        frame_length = 25 * sample_rate / 1000
        frame_shift = 10 * sample_rate / 1000
        length = math.floor((audio_dur * sample_rate - frame_length) / frame_shift) + 1
        return length

    def transcribe(self,
                   batch_wav_path: List[str],
                   beam_size: int = 1,
                   nbest: int = 1
                   # Annotation fixed: three values are returned, not just results.
                   # NOTE(review): wav_durations' element type comes from
                   # ASRFeatExtractor — assumed List[float], confirm.
                   ) -> Tuple[List[Dict], List[float], float]:
        """Transcribe a batch of wav files; return (results, wav_durations,
        transcribe_seconds)."""
        feats, lengths, wav_durations = self.feature_extractor(batch_wav_path)
        print(f"feats.shape: {feats.shape}")
        # Pad/truncate features to a fixed 10 s window — presumably because the
        # compiled axmodel has a static input shape (confirm against export).
        maxlen = self.calc_feat_len(10)
        if feats.shape[1] < maxlen:
            feats = np.concatenate([feats, np.zeros((1, maxlen - feats.shape[1], 80), dtype=np.float32)], axis=1)
        feats = feats[:, :maxlen, :]

        feats = to_numpy(feats)
        lengths = to_numpy(lengths).astype(np.int32)

        start_time = time.time()
        n_layer_cross_k, n_layer_cross_v, cross_attn_mask = self.run_encoder(
            to_numpy(feats),
            to_numpy(lengths)
        )
        nbest_hyps = self.run_decoder(n_layer_cross_k,
                                      n_layer_cross_v,
                                      cross_attn_mask,
                                      beam_size,
                                      nbest,
                                      )
        transcribe_durations = time.time() - start_time
        results: List[Dict] = []
        for wav, hyp in zip(batch_wav_path, nbest_hyps):
            # Only the best hypothesis per utterance is detokenized.
            hyp = hyp[0]
            hyp_ids = [int(id) for id in hyp["token_ids"].cpu()]
            score = hyp["score"].item()
            text = self.tokenizer.detokenize(hyp_ids)
            results.append(
                {
                    "wav": wav,
                    "text": text,
                    "score": score
                }
            )

        return results, wav_durations, transcribe_durations
538
+
539
+
540
def parse_args(argv=None):
    """Parse command-line arguments for the axmodel ASR test driver.

    Args:
        argv: Optional list of argument strings (defaults to sys.argv[1:]);
            exposed so the parser can be driven programmatically/in tests.

    Returns:
        argparse.Namespace with model/resource paths and decoding options.
    """
    parser = argparse.ArgumentParser(description="FireRedASROnnxModel Test")
    parser.add_argument(
        "--encoder",
        type=str,
        default="axmodel/encoder.axmodel",
        help="Path to onnx encoder"
    )
    parser.add_argument(
        "--decoder",
        type=str,
        default="axmodel/decoder_main.axmodel",
        help="Path to onnx decoder"
    )
    parser.add_argument(
        "--cmvn",
        type=str,
        default="axmodel/cmvn.ark",
        help="Path to cmvn"
    )
    parser.add_argument(
        "--dict",
        type=str,
        default="axmodel/dict.txt",
        help="Path to dict"
    )
    parser.add_argument(
        "--spm_model",
        type=str,
        default="axmodel/train_bpe1000.model",
        help="Path to spm model"
    )
    parser.add_argument(
        "--wavlist",
        type=str,
        default="wavlist.txt",
        help="File to wav path list"
    )
    parser.add_argument(
        "--hypo",
        type=str,
        default="hypo_axmodel.txt",
        help="File of hypos"
    )
    parser.add_argument(
        "--beam_size",
        type=int,
        default=3,
        # FIX: original help strings were empty.
        help="Beam width used during decoding"
    )
    parser.add_argument(
        "--nbest",
        type=int,
        default=1,
        help="Number of hypotheses to keep per utterance"
    )

    return parser.parse_args(argv)
598
+
599
+
600
def parse_wavlist(wavlist: str):
    """Read a text file listing one wav path per line.

    Blank lines are skipped silently; paths that do not exist are reported
    on stdout and skipped.

    Args:
        wavlist: Path to the list file.

    Returns:
        List of existing wav paths, in file order.
    """
    wavpaths = []
    with open(wavlist) as f:
        for line in f:
            line = line.strip()
            # FIX: skip empty/trailing lines instead of reporting
            # "' doesn't exist." for them.
            if not line:
                continue
            if not os.path.exists(line):
                print(f"{line} doesn't exist.")
                continue
            wavpaths.append(line)

    return wavpaths
611
+
612
+
613
def main():
    """Transcribe every wav in --wavlist with the AX models, write hypotheses
    to --hypo, and log per-file and aggregate real-time factors (RTF)."""
    args = parse_args()
    print(args)

    onnx_model = FireRedASROnnxModel(args.encoder,
                                     args.decoder,
                                     args.cmvn,
                                     args.dict,
                                     args.spm_model,
                                     )

    wavlist = parse_wavlist(args.wavlist)

    total_wav_durations = 0
    total_transcribe_durations = 0
    # FIX: context manager guarantees the hypo file is closed (and flushed)
    # even if transcription raises part-way through.
    with open(args.hypo, "wt") as wf:
        for wav in wavlist:
            batch_wav = [wav]
            results, wav_durations, transcribe_durations = onnx_model.transcribe(
                batch_wav, args.beam_size, args.nbest)

            wav_durations = sum(wav_durations)
            total_wav_durations += wav_durations
            total_transcribe_durations += transcribe_durations
            logger.info(f"{batch_wav}")
            logger.info(f"Durations: {wav_durations}")
            logger.info(f"Transcribe Durations: {transcribe_durations}")
            rtf = transcribe_durations / wav_durations
            logger.info(f"(Real time factor) RTF: {rtf}")
            for result in results:
                logger.info(f"wav: {result['wav']}")
                logger.info(f"text: {result['text']}")
                logger.info(f"score: {result['score']}")
                logger.info("")
                wf.write(f"{result['text']} ({result['wav']})\n")

    logger.info(f"total wav durations: {total_wav_durations}")
    logger.info(f"total transcribe durations: {total_transcribe_durations}")
    # FIX: an empty wavlist used to raise ZeroDivisionError here.
    if total_wav_durations > 0:
        avg_rtf = total_transcribe_durations / total_wav_durations
        logger.info(f"AVG RTF: {avg_rtf}")


if __name__ == "__main__":
    main()
test_decoder.py ADDED
@@ -0,0 +1,640 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fireredasr.data.asr_feat import ASRFeatExtractor
2
+ from fireredasr.tokenizer.aed_tokenizer import ChineseCharEnglishSpmTokenizer
3
+
4
+ import onnxruntime as ort
5
+ # import axengine as axe
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import numpy as np
9
+ from torch import Tensor
10
+ from typing import Tuple, List, Dict
11
+ import argparse
12
+ import os
13
+ import time
14
+ import logging
15
+
16
+ logger = logging.getLogger()
17
+ logger.setLevel(logging.INFO)
18
+ logger_stream_hander = logging.StreamHandler()
19
+ logger_stream_hander.setLevel("INFO")
20
+ logger.addHandler(logger_stream_hander)
21
+
22
+
23
+ INF = 1e10
24
+
25
+
26
def to_numpy(tensor):
    """Coerce *tensor* to a numpy array.

    numpy arrays are returned as-is; torch tensors are moved to CPU, with a
    detach() first when they track gradients (``.numpy()`` rejects those).
    """
    if isinstance(tensor, np.ndarray):
        return tensor
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()
33
+
34
+
35
def set_finished_beam_score_to_zero(scores, is_finished):
    """Stop score accumulation for beams that have already ended.

    A finished beam contributes [0, -INF, ..., -INF] — the first candidate
    freezes the total and the rest become unselectable — while unfinished
    beams keep their raw candidate scores.
    """
    rows, width = scores.size()
    done = is_finished.float()
    freeze_row = torch.tensor([0.0] + [-INF] * (width - 1)).float()
    freeze_row = freeze_row.view(1, width).repeat(rows, 1)
    return scores * (1 - done) + freeze_row * done
41
+
42
+
43
def set_finished_beam_y_to_eos(ys, is_finished, eos_id):
    """Replace predictions of finished beams with *eos_id*; others unchanged."""
    mask = is_finished.long()
    return ys * (1 - mask) + eos_id * mask
46
+
47
+
48
+ class FireRedASROnnxModel:
49
+ def __init__(
50
+ self,
51
+ encoder_path: str,
52
+ decoder_path: str,
53
+ cmvn_file: str,
54
+ dict_file: str,
55
+ spm_model_path: str,
56
+ providers=['CPUExecutionProvider']
57
+ ):
58
+ session_opts = ort.SessionOptions()
59
+ session_opts.inter_op_num_threads = 1
60
+ session_opts.intra_op_num_threads = 1
61
+ # session_opts.log_severity_level = 1
62
+ self.session_opts = session_opts
63
+
64
+ # NOTE: 参考whisper设置的最大的解码长度
65
+ # FireRedASR-AED 模型支持的最长语音为 60s
66
+ # ref: https://github.com/FireRedTeam/FireRedASR?tab=readme-ov-file#input-length-limitations
67
+ self.decode_max_len = 448
68
+
69
+ self.decoder_hidden_dim = 1280
70
+ self.num_decoder_blocks = 16
71
+ self.blank_id = 0
72
+ self.sos_id = 3
73
+ self.eos_id = 4
74
+ self.pad_id = 2
75
+
76
+ self.feature_extractor = ASRFeatExtractor(cmvn_file)
77
+ self.tokenizer = ChineseCharEnglishSpmTokenizer(dict_file, spm_model_path)
78
+ self.encoder = None
79
+ self.decoder = None
80
+
81
+ # self.init_encoder(encoder_path, providers)
82
+ # self.init_decoder(decoder_path, providers)
83
+ self.init_decoder_main(decoder_path, providers)
84
+ self.init_decoder_loop(decoder_path, providers)
85
+ self.pe = self.init_pe(decoder_path)
86
+
87
+ # def init_encoder(self, encoder_path, providers=None):
88
+ # start_time = time.time()
89
+ # self.encoder = axe.InferenceSession(
90
+ # encoder_path,
91
+ # # sess_options=self.session_opts,
92
+ # providers=providers
93
+ # )
94
+ # end_time = time.time()
95
+ # logger.info(f"load encoder cost {end_time - start_time} seconds")
96
+
97
+ def init_decoder(self, decoder_path, providers=None):
98
+ start_time = time.time()
99
+ self.decoder = ort.InferenceSession(
100
+ decoder_path,
101
+ sess_options=self.session_opts,
102
+ providers=providers
103
+ )
104
+ end_time = time.time()
105
+ logger.info(f"load decoder cost {end_time - start_time} seconds")
106
+
107
+ def init_decoder_main(self, decoder_path, providers=None):
108
+ decoder_path = os.path.dirname(decoder_path)
109
+ decoder_path = os.path.join(decoder_path, "decoder_main.onnx")
110
+ start_time = time.time()
111
+ self.decoder_main = ort.InferenceSession(
112
+ decoder_path,
113
+ sess_options=self.session_opts,
114
+ providers=providers
115
+ )
116
+ end_time = time.time()
117
+ logger.info(f"load decoder_main cost {end_time - start_time} seconds")
118
+
119
+ input_names = [i.name for i in self.decoder_main.get_inputs()]
120
+ print(f"decoder_main.input_names: {input_names}")
121
+
122
+ def init_decoder_loop(self, decoder_path, providers=None):
123
+ decoder_path = os.path.dirname(decoder_path)
124
+ decoder_path = os.path.join(decoder_path, "decoder_loop.onnx")
125
+
126
+ start_time = time.time()
127
+ self.decoder_loop = ort.InferenceSession(
128
+ decoder_path,
129
+ sess_options=self.session_opts,
130
+ providers=providers
131
+ )
132
+ end_time = time.time()
133
+ logger.info(f"load decoder_loop cost {end_time - start_time} seconds")
134
+
135
+ input_names = [i.name for i in self.decoder_loop.get_inputs()]
136
+ print(f"decoder_loop.input_names: {input_names}")
137
+
138
+ def init_pe(self, decoder_path):
139
+ decoder_path = os.path.dirname(decoder_path)
140
+ decoder_path = os.path.join(decoder_path, "pe.npy")
141
+
142
+ return np.load(decoder_path)
143
+
144
+ def run_encoder(self, input: np.ndarray,
145
+ input_length: np.ndarray
146
+ ) -> Tuple[Tensor, Tensor, Tensor]:
147
+ n_layer_cross_k, n_layer_cross_v, cross_attn_mask = self.encoder.run(
148
+ None,
149
+ {
150
+ "encoder_input": input,
151
+ "encoder_input_lengths": input_length.astype(np.int32)
152
+ }
153
+ )
154
+ return (
155
+ n_layer_cross_k,
156
+ n_layer_cross_v,
157
+ cross_attn_mask
158
+ )
159
+
160
    def decode_one_token(
        self,
        tokens: np.ndarray,
        n_layer_self_k_cache: np.ndarray,
        n_layer_self_v_cache: np.ndarray,
        n_layer_cross_k_cache: np.ndarray,
        n_layer_cross_v_cache: np.ndarray,
        offset: np.ndarray,
        self_attn_mask: np.ndarray,
        cross_attn_mask: np.ndarray
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Run the single-graph decoder for one autoregressive step.

        Inputs are fed by positional order of the exported ONNX graph:
        tokens, self-attention K/V caches, cross-attention K/V, the cache
        write offset, and the two attention masks.

        Returns (logits, updated self K cache, updated self V cache).
        """
        # Debug tracing of all input shapes (left active in this variant).
        print("decode:")
        print(f"tokens.shape: {tokens.shape}")
        print(f"n_layer_self_k_cache.shape: {n_layer_self_k_cache.shape}")
        print(f"n_layer_self_v_cache.shape: {n_layer_self_v_cache.shape}")
        print(f"n_layer_cross_k_cache.shape: {n_layer_cross_k_cache.shape}")
        print(f"n_layer_cross_v_cache.shape: {n_layer_cross_v_cache.shape}")
        print(f"offset.shape: {offset.shape}")
        print(f"self_attn_mask.shape: {self_attn_mask.shape}")
        print(f"cross_attn_mask.shape: {cross_attn_mask.shape}")

        logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache = self.decoder.run(
            None,
            {
                self.decoder.get_inputs()[0].name: tokens,
                self.decoder.get_inputs()[1].name: n_layer_self_k_cache,
                self.decoder.get_inputs()[2].name: n_layer_self_v_cache,
                self.decoder.get_inputs()[3].name: n_layer_cross_k_cache,
                self.decoder.get_inputs()[4].name: n_layer_cross_v_cache,
                self.decoder.get_inputs()[5].name: offset,
                self.decoder.get_inputs()[6].name: self_attn_mask,
                self.decoder.get_inputs()[7].name: cross_attn_mask,
            }
        )
        return (
            logits,
            out_n_layer_self_k_cache,
            out_n_layer_self_v_cache
        )
200
+
201
    def decode_main_one_token(
        self,
        tokens: np.ndarray,
        n_layer_self_k_cache: np.ndarray,
        n_layer_self_v_cache: np.ndarray,
        n_layer_cross_k_cache: np.ndarray,
        n_layer_cross_v_cache: np.ndarray,
        pe: np.ndarray,
        self_attn_mask: np.ndarray,
        cross_attn_mask: np.ndarray
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Run the "main" decoder model for the first decode step.

        NOTE(review): the exported decoder_main graph takes no self-attention
        cache inputs, so ``n_layer_self_k_cache``/``n_layer_self_v_cache`` are
        accepted but never fed; the fresh caches the graph emits are returned
        instead. Hence input indices 1..5 map to cross-K, cross-V, positional
        encoding, and the two masks.

        Returns (logits, new self K cache, new self V cache).
        """
        logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache = self.decoder_main.run(
            None,
            {
                self.decoder_main.get_inputs()[0].name: tokens,
                self.decoder_main.get_inputs()[1].name: n_layer_cross_k_cache,
                self.decoder_main.get_inputs()[2].name: n_layer_cross_v_cache,
                self.decoder_main.get_inputs()[3].name: pe,
                self.decoder_main.get_inputs()[4].name: self_attn_mask,
                self.decoder_main.get_inputs()[5].name: cross_attn_mask,
            }
        )
        return (
            logits,
            out_n_layer_self_k_cache,
            out_n_layer_self_v_cache
        )
240
+
241
    def decode_loop_one_token(
        self,
        tokens: np.ndarray,
        n_layer_self_k_cache: np.ndarray,
        n_layer_self_v_cache: np.ndarray,
        n_layer_cross_k_cache: np.ndarray,
        n_layer_cross_v_cache: np.ndarray,
        pe: np.ndarray,
        self_attn_mask: np.ndarray,
        cross_attn_mask: np.ndarray
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Run the "loop" decoder model for every decode step after the first.

        Unlike :meth:`decode_main_one_token`, this graph consumes the running
        self-attention K/V caches (input indices 1 and 2) in addition to the
        cross-attention K/V, positional encoding slice, and masks.

        Returns (logits, updated self K cache, updated self V cache).
        """
        logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache = self.decoder_loop.run(
            None,
            {
                self.decoder_loop.get_inputs()[0].name: tokens,
                self.decoder_loop.get_inputs()[1].name: n_layer_self_k_cache,
                self.decoder_loop.get_inputs()[2].name: n_layer_self_v_cache,
                self.decoder_loop.get_inputs()[3].name: n_layer_cross_k_cache,
                self.decoder_loop.get_inputs()[4].name: n_layer_cross_v_cache,
                self.decoder_loop.get_inputs()[5].name: pe,
                self.decoder_loop.get_inputs()[6].name: self_attn_mask,
                self.decoder_loop.get_inputs()[7].name: cross_attn_mask,
            }
        )
        return (
            logits,
            out_n_layer_self_k_cache,
            out_n_layer_self_v_cache
        )
280
+
281
    def run_decoder(
        self,
        n_layer_cross_k,
        n_layer_cross_v,
        cross_attn_mask,
        beam_size,
        nbest
    ):
        """Autoregressive beam-search decode over precomputed encoder outputs.

        Args:
            n_layer_cross_k / n_layer_cross_v: per-layer cross-attention K/V,
                shape (num_layer, batch, Ti, encoder_out_dim), as numpy arrays.
            cross_attn_mask: encoder padding mask; last dim is encoder length.
            beam_size: beams kept per utterance.
            nbest: hypotheses returned per utterance (assumes nbest <= beam_size).

        Returns:
            Per utterance, a list of ``{"token_ids", "score"}`` dicts with the
            leading sos stripped and trailing eos trimmed.
        """
        num_layer, batch_size, Ti, encoder_out_dim = n_layer_cross_k.shape
        encoder_out_length = cross_attn_mask.shape[-1]

        # Tile encoder outputs and mask beam_size times so every beam row
        # attends over its own utterance's encoder states.
        cross_attn_mask = torch.from_numpy(cross_attn_mask).to(torch.float32)
        cross_attn_mask = cross_attn_mask.unsqueeze(1).repeat(
            1, beam_size, 1, 1
        ).view(beam_size * batch_size, -1, encoder_out_length)

        n_layer_cross_k = torch.from_numpy(n_layer_cross_k)
        n_layer_cross_v = torch.from_numpy(n_layer_cross_v)
        n_layer_cross_k = n_layer_cross_k.unsqueeze(2).repeat(
            1, 1, beam_size, 1, 1
        ).view(num_layer, beam_size * batch_size, Ti, encoder_out_dim)
        n_layer_cross_v = n_layer_cross_v.unsqueeze(2).repeat(
            1, 1, beam_size, 1, 1
        ).view(num_layer, beam_size * batch_size, Ti, encoder_out_dim)

        # Every beam starts from <sos>; scores start at [0, -INF, ...] per
        # utterance so the first expansion only grows from beam 0.
        prediction_tokens = torch.ones(
            beam_size * batch_size, 1).fill_(self.sos_id).long()
        tokens = prediction_tokens
        offset = torch.zeros(1, dtype=torch.int64)
        n_layer_self_k_cache, n_layer_self_v_cache = self.get_initialized_self_cache(
            batch_size, beam_size
        )

        scores = torch.tensor([0.0] + [-INF]*(beam_size - 1)).float()
        scores = scores.repeat(batch_size).view(batch_size * beam_size, 1)
        is_finished = torch.zeros_like(scores)

        # With KV caches only the newest position is queried per step, so the
        # self-attention mask is a single all-zero (fully visible) row per beam.
        self_attn_mask = np.zeros((batch_size * beam_size, 1, 1), dtype=np.float32)

        results = [self.sos_id]
        for i in range(self.decode_max_len):
            tokens = to_numpy(tokens)
            n_layer_self_k_cache = to_numpy(n_layer_self_k_cache)
            n_layer_self_v_cache = to_numpy(n_layer_self_v_cache)
            n_layer_cross_k = to_numpy(n_layer_cross_k)
            n_layer_cross_v = to_numpy(n_layer_cross_v)
            cross_attn_mask = to_numpy(cross_attn_mask)

            # Step 0 uses the "main" graph (no self-cache inputs); subsequent
            # steps use the "loop" graph which consumes the running KV caches.
            # self.pe[offset] selects the positional-encoding slice for the
            # current decode position (offset is a 1-element int64 tensor).
            if i == 0:
                logits, n_layer_self_k_cache, n_layer_self_v_cache = self.decode_main_one_token(
                    to_numpy(tokens),
                    to_numpy(n_layer_self_k_cache),
                    to_numpy(n_layer_self_v_cache),
                    to_numpy(n_layer_cross_k),
                    to_numpy(n_layer_cross_v),
                    self.pe[offset],
                    self_attn_mask,
                    to_numpy(cross_attn_mask)
                )
            else:
                logits, n_layer_self_k_cache, n_layer_self_v_cache = self.decode_loop_one_token(
                    to_numpy(tokens),
                    to_numpy(n_layer_self_k_cache),
                    to_numpy(n_layer_self_v_cache),
                    to_numpy(n_layer_cross_k),
                    to_numpy(n_layer_cross_v),
                    self.pe[offset],
                    self_attn_mask,
                    to_numpy(cross_attn_mask)
                )

            offset += 1
            logits = torch.from_numpy(logits)

            # Per-beam top-B expansion, then re-rank the B*B candidates of
            # each utterance down to B surviving beams.
            logits = logits.squeeze(1)
            t_scores = F.log_softmax(logits, dim=-1)
            t_topB_scores, t_topB_ys = torch.topk(t_scores, k=beam_size, dim=1)
            # Finished beams contribute a frozen score row and a forced <eos>.
            t_topB_scores = set_finished_beam_score_to_zero(t_topB_scores, is_finished)
            t_topB_ys = set_finished_beam_y_to_eos(t_topB_ys, is_finished, self.eos_id)

            scores = scores + t_topB_scores

            scores = scores.view(batch_size, beam_size * beam_size)
            scores, topB_score_ids = torch.topk(scores, k=beam_size, dim=1)
            scores = scores.view(-1, 1)

            # Map candidate ids back to the parent-beam rows they came from.
            topB_row_number_in_each_B_rows_of_ys = torch.div(
                topB_score_ids, beam_size).view(batch_size * beam_size)
            stride = beam_size * torch.arange(batch_size).view(
                batch_size, 1).repeat(1, beam_size).view(batch_size * beam_size)
            topB_row_number_in_ys = topB_row_number_in_each_B_rows_of_ys.long() + stride.long()

            prediction_tokens = prediction_tokens[topB_row_number_in_ys]
            t_ys = torch.gather(
                t_topB_ys.view(batch_size, beam_size * beam_size),
                dim=1, index=topB_score_ids
            ).view(beam_size * batch_size, 1)

            tokens = t_ys

            prediction_tokens = torch.cat((prediction_tokens, t_ys), dim=1)

            n_layer_self_k_cache = torch.from_numpy(n_layer_self_k_cache)
            n_layer_self_v_cache = torch.from_numpy(n_layer_self_v_cache)

            # Reorder the KV caches to follow the surviving beams.
            # NOTE(review): these loops shadow the outer loop variable ``i``;
            # harmless because ``for`` rebinds it each step, but confusing.
            for i, self_k_cache in enumerate(n_layer_self_k_cache):
                n_layer_self_k_cache[i] = n_layer_self_k_cache[i][topB_row_number_in_ys]

            for i, self_v_cache in enumerate(n_layer_self_v_cache):
                n_layer_self_v_cache[i] = n_layer_self_v_cache[i][topB_row_number_in_ys]

            is_finished = t_ys.eq(self.eos_id)
            if is_finished.sum().item() == beam_size * batch_size:
                break

        # Collect n-best hypotheses; valid length counts non-eos tokens
        # (includes the leading sos, which is stripped below via [1:...]).
        scores = scores.view(batch_size, beam_size)
        prediction_valid_token_lengths = torch.sum(
            torch.ne(
                prediction_tokens.view(batch_size, beam_size, -1),
                self.eos_id),
            dim=-1
        ).int()

        nbest_scores, nbest_ids = torch.topk(scores, k=nbest, dim=1)
        index = nbest_ids + beam_size * torch.arange(batch_size).view(batch_size, 1).long()
        nbest_prediction_tokens = prediction_tokens.view(batch_size * beam_size, -1)[index.view(-1)]
        nbest_prediction_tokens = nbest_prediction_tokens.view(batch_size, nbest_ids.size(1), -1)
        nbest_prediction_valid_token_lengths = prediction_valid_token_lengths.view(
            batch_size * beam_size)[index.view(-1)].view(batch_size, -1)
        nbest_hyps: List[List[Dict[str, torch.Tensor]]] = []
        for i in range(batch_size):
            i_best_hyps: List[Dict[str, torch.Tensor]] = []
            for j, score in enumerate(nbest_scores[i]):
                hyp = {
                    "token_ids": nbest_prediction_tokens[i, j, 1:nbest_prediction_valid_token_lengths[i, j]],
                    "score": score
                }
                i_best_hyps.append(hyp)
            nbest_hyps.append(i_best_hyps)

        return nbest_hyps
446
+
447
+ def get_initialized_self_cache(self,
448
+ batch_size,
449
+ beam_size
450
+ ) -> Tuple[Tensor, Tensor]:
451
+ n_layer_self_k_cache = torch.zeros(
452
+ self.num_decoder_blocks,
453
+ batch_size * beam_size,
454
+ self.decode_max_len,
455
+ self.decoder_hidden_dim,
456
+ )
457
+ n_layer_self_v_cache = torch.zeros(
458
+ self.num_decoder_blocks,
459
+ batch_size * beam_size,
460
+ self.decode_max_len,
461
+ self.decoder_hidden_dim,
462
+ )
463
+ return n_layer_self_k_cache, n_layer_self_v_cache
464
+
465
+ def calc_feat_len(self, audio_dur):
466
+ import math
467
+ sample_rate = 16000
468
+ frame_length = 25 * sample_rate / 1000
469
+ frame_shift = 10 * sample_rate / 1000
470
+ length = math.floor((audio_dur * sample_rate - frame_length) / frame_shift) + 1
471
+ return length
472
+
473
    def transcribe(self,
                   batch_wav_path: List[str],
                   beam_size: int = 1,
                   nbest: int = 1
                   ) -> List[Dict]:
        """Decode one batch of wav files using *precomputed* encoder outputs.

        NOTE(review): despite the annotation, this returns a 3-tuple
        ``(results, wav_durations, transcribe_durations)``.
        NOTE(review): the encoder is NOT run here — per-layer cross K/V and
        mask are loaded from ``encoder_output/<wav basename>/``; confirm those
        .npy files were produced beforehand.
        """
        feats, lengths, wav_durations = self.feature_extractor(batch_wav_path)
        print(f"feats.shape: {feats.shape}")
        # Pad or truncate features to exactly 10 s worth of frames; feature
        # dim is assumed to be 80 (fbank) — TODO confirm against extractor.
        maxlen = self.calc_feat_len(10)
        if feats.shape[1] < maxlen:
            feats = np.concatenate([feats, np.zeros((1, maxlen - feats.shape[1], 80), dtype=np.float32)], axis=1)
        feats = feats[:, :maxlen, :]

        encoder_data_path = os.path.join("encoder_output", os.path.basename(batch_wav_path[0]))

        n_layer_cross_k = np.load(os.path.join(encoder_data_path, "n_layer_cross_k.npy"))
        n_layer_cross_v = np.load(os.path.join(encoder_data_path, "n_layer_cross_v.npy"))
        cross_attn_mask = np.load(os.path.join(encoder_data_path, "cross_attn_mask.npy"))

        # Only the decoder's wall time is reported as "transcribe" duration.
        start_time = time.time()

        nbest_hyps = self.run_decoder(n_layer_cross_k,
                                      n_layer_cross_v,
                                      cross_attn_mask,
                                      beam_size,
                                      nbest
                                      )
        transcribe_durations = time.time() - start_time
        results: List[Dict] = []
        for wav, hyp in zip(batch_wav_path, nbest_hyps):
            # Only the best (first) hypothesis per utterance is kept.
            hyp = hyp[0]
            hyp_ids = [int(id) for id in hyp["token_ids"].cpu()]
            score = hyp["score"].item()
            text = self.tokenizer.detokenize(hyp_ids)
            results.append(
                {
                    "wav": wav,
                    "text": text,
                    "score": score
                }
            )

        return results, wav_durations, transcribe_durations
522
+
523
+
524
def parse_args():
    """Build and parse the command-line options for the test driver."""
    parser = argparse.ArgumentParser(description="FireRedASROnnxModel Test")
    # (flag, default, help) for all plain string options.
    str_options = [
        ("--encoder", "axmodel/encoder.axmodel", "Path to onnx encoder"),
        ("--decoder", "onnx_decoder/decoder_main.onnx", "Path to onnx decoder"),
        ("--cmvn", "axmodel/cmvn.ark", "Path to cmvn"),
        ("--dict", "axmodel/dict.txt", "Path to dict"),
        ("--spm_model", "axmodel/train_bpe1000.model", "Path to spm model"),
        ("--wavlist", "wavlist.txt", "File to wav path list"),
        ("--hypo", "hypo_encoder.txt", "File of hypos"),
    ]
    for flag, default, help_text in str_options:
        parser.add_argument(flag, type=str, default=default, help=help_text)
    parser.add_argument("--beam_size", type=int, default=3, help="")
    parser.add_argument("--nbest", type=int, default=1, help="")
    return parser.parse_args()
582
+
583
+
584
def parse_wavlist(wavlist: str):
    """Read wav paths (one per line) from *wavlist*, skipping missing files."""
    wavpaths = []
    with open(wavlist) as f:
        for raw in f:
            line = raw.strip()
            if os.path.exists(line):
                wavpaths.append(line)
            else:
                # Warn about — but tolerate — stale entries.
                print(f"{line} doesn't exist.")
    return wavpaths
595
+
596
+
597
def main():
    """Entry point: transcribe every wav in --wavlist, log RTF stats, and
    write hypotheses to --hypo in "text (wav)" format."""
    args = parse_args()
    print(args)

    onnx_model = FireRedASROnnxModel(args.encoder,
                                     args.decoder,
                                     args.cmvn,
                                     args.dict,
                                     args.spm_model)

    wavlist = parse_wavlist(args.wavlist)

    total_wav_durations = 0
    total_transcribe_durations = 0
    # "with" guarantees the hypothesis file is flushed/closed even if a
    # transcription raises (the original leaked the handle on error).
    with open(args.hypo, "wt") as wf:
        for wav in wavlist:
            batch_wav = [wav]
            results, wav_durations, transcribe_durations = onnx_model.transcribe(
                batch_wav, args.beam_size, args.nbest)

            wav_durations = sum(wav_durations)
            total_wav_durations += wav_durations
            total_transcribe_durations += transcribe_durations
            logger.info(f"{batch_wav}")
            logger.info(f"Durations: {wav_durations}")
            logger.info(f"Transcribe Durations: {transcribe_durations}")
            rtf = transcribe_durations / wav_durations
            logger.info(f"(Real time factor) RTF: {rtf}")
            for result in results:
                logger.info(f"wav: {result['wav']}")
                logger.info(f"text: {result['text']}")
                logger.info(f"score: {result['score']}")
                logger.info("")
                wf.write(f"{result['text']} ({result['wav']})\n")

    logger.info(f"total wav durations: {total_wav_durations}")
    logger.info(f"total transcribe durations: {total_transcribe_durations}")
    # Guard against ZeroDivisionError when the wavlist is empty.
    if total_wav_durations > 0:
        avg_ref = total_transcribe_durations / total_wav_durations
        logger.info(f"AVG RTF: {avg_ref}")
test_encoder.py ADDED
@@ -0,0 +1,646 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fireredasr.data.asr_feat import ASRFeatExtractor
2
+ from fireredasr.tokenizer.aed_tokenizer import ChineseCharEnglishSpmTokenizer
3
+
4
+ import onnxruntime as ort
5
+ import axengine as axe
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import numpy as np
9
+ from torch import Tensor
10
+ from typing import Tuple, List, Dict
11
+ import argparse
12
+ import os
13
+ import time
14
+ import logging
15
+
16
+ logger = logging.getLogger()
17
+ logger.setLevel(logging.INFO)
18
+ logger_stream_hander = logging.StreamHandler()
19
+ logger_stream_hander.setLevel("INFO")
20
+ logger.addHandler(logger_stream_hander)
21
+
22
+
23
+ INF = 1e10
24
+
25
+
26
def to_numpy(tensor):
    """Return *tensor* as a NumPy array; ndarrays pass through unchanged.

    Gradient-tracking torch tensors are detached first, since ``.numpy()``
    refuses to operate on tensors that require grad.
    """
    if isinstance(tensor, np.ndarray):
        return tensor
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()
33
+
34
+
35
def set_finished_beam_score_to_zero(scores, is_finished):
    """Freeze the expansion scores of finished beams.

    For each finished row, the per-beam scores are replaced by
    ``[0, -INF, ..., -INF]`` so only the first (eos) continuation survives
    re-ranking; active rows are returned unchanged.
    """
    num_rows, beam = scores.size()
    finished = is_finished.float()
    frozen_row = torch.tensor([0.0] + [-INF] * (beam - 1)).float()
    frozen_row = frozen_row.view(1, beam).repeat(num_rows, 1)
    return scores * (1 - finished) + frozen_row * finished
41
+
42
+
43
def set_finished_beam_y_to_eos(ys, is_finished, eos_id):
    """Force the token ids of finished beams to *eos_id*; leave active beams as-is."""
    finished = is_finished.long()
    return eos_id * finished + ys * (1 - finished)
46
+
47
+
48
+ class FireRedASROnnxModel:
49
    def __init__(
        self,
        encoder_path: str,
        decoder_path: str,
        cmvn_file: str,
        dict_file: str,
        spm_model_path: str,
        # NOTE(review): mutable default argument — safe only because it is
        # never mutated; a tuple or None sentinel would be more robust.
        providers=['AXCLRTExecutionProvider', 'AxEngineExecutionProvider']
    ):
        """FireRedASR AED inference wrapper (AxEngine encoder, ONNX decoders).

        Args:
            encoder_path: path to the AxEngine encoder model.
            decoder_path: path to the ONNX decoder (also anchors pe.npy lookup).
            cmvn_file: CMVN stats for feature normalization.
            dict_file / spm_model_path: tokenizer assets.
            providers: execution providers for the encoder session.
        """
        session_opts = ort.SessionOptions()
        session_opts.inter_op_num_threads = 1
        session_opts.intra_op_num_threads = 1
        self.session_opts = session_opts

        # NOTE: max decode length follows whisper's setting; FireRedASR-AED
        # supports speech up to 60 s.
        # ref: https://github.com/FireRedTeam/FireRedASR?tab=readme-ov-file#input-length-limitations
        self.decode_max_len = 448

        # Decoder geometry and special token ids used by beam search.
        self.decoder_hidden_dim = 1280
        self.num_decoder_blocks = 16
        self.blank_id = 0
        self.sos_id = 3
        self.eos_id = 4
        self.pad_id = 2

        self.feature_extractor = ASRFeatExtractor(cmvn_file)
        self.tokenizer = ChineseCharEnglishSpmTokenizer(dict_file, spm_model_path)
        self.encoder = None
        self.decoder = None

        # Only the encoder session is loaded in this test; the decoder
        # sessions (init_decoder / _main / _loop) are intentionally disabled.
        self.init_encoder(encoder_path, providers)
        self.pe = self.init_pe(decoder_path)
86
+
87
    def init_encoder(self, encoder_path, providers=None):
        """Load the encoder as an AxEngine InferenceSession and log the load time."""
        start_time = time.time()
        self.encoder = axe.InferenceSession(
            encoder_path,
            # sess_options are not passed to the AxEngine session here.
            providers=providers
        )
        end_time = time.time()
        logger.info(f"load encoder cost {end_time - start_time} seconds")
96
+
97
    def init_decoder(self, decoder_path, providers=None):
        """Load the single-graph ONNX decoder on CPU.

        NOTE(review): *providers* is ignored; the session is pinned to
        CPUExecutionProvider.
        """
        start_time = time.time()
        self.decoder = ort.InferenceSession(
            decoder_path,
            sess_options=self.session_opts,
            providers=['CPUExecutionProvider']
        )
        end_time = time.time()
        logger.info(f"load decoder cost {end_time - start_time} seconds")
106
+
107
    def init_decoder_main(self, decoder_path, providers=None):
        """Load ``decoder_main.onnx`` (first-step decoder) from the directory
        containing *decoder_path*, on CPU; prints the graph's input names."""
        decoder_path = os.path.dirname(decoder_path)
        decoder_path = os.path.join(decoder_path, "decoder_main.onnx")
        start_time = time.time()
        self.decoder_main = ort.InferenceSession(
            decoder_path,
            sess_options=self.session_opts,
            providers=['CPUExecutionProvider']
        )
        end_time = time.time()
        logger.info(f"load decoder_main cost {end_time - start_time} seconds")

        input_names = [i.name for i in self.decoder_main.get_inputs()]
        print(f"decoder_main.input_names: {input_names}")
121
+
122
    def init_decoder_loop(self, decoder_path, providers=None):
        """Load ``decoder_loop.onnx`` (steady-state decoder) from the directory
        containing *decoder_path*, on CPU; prints the graph's input names."""
        decoder_path = os.path.dirname(decoder_path)
        decoder_path = os.path.join(decoder_path, "decoder_loop.onnx")

        start_time = time.time()
        self.decoder_loop = ort.InferenceSession(
            decoder_path,
            sess_options=self.session_opts,
            providers=['CPUExecutionProvider']
        )
        end_time = time.time()
        logger.info(f"load decoder_loop cost {end_time - start_time} seconds")

        input_names = [i.name for i in self.decoder_loop.get_inputs()]
        print(f"decoder_loop.input_names: {input_names}")
137
+
138
    def init_pe(self, decoder_path):
        """Load the precomputed positional-encoding table.

        NOTE(review): *decoder_path* is ignored in this variant — pe.npy is
        always read from the hard-coded "axmodel" directory (the sibling
        model_wrapper.py derives it from *decoder_path* instead).
        """
        decoder_path = os.path.join("axmodel", "pe.npy")

        return np.load(decoder_path)
142
+
143
    def run_encoder(self, input: np.ndarray,
                    input_length: np.ndarray
                    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Run the encoder session once over a padded feature batch.

        Args:
            input: fbank features for the "encoder_input" graph input.
            input_length: per-utterance frame counts; cast to int32 for the graph.

        Returns:
            (n_layer_cross_k, n_layer_cross_v, cross_attn_mask) in the order
            the exported encoder graph produces them.
        """
        n_layer_cross_k, n_layer_cross_v, cross_attn_mask = self.encoder.run(
            None,
            {
                "encoder_input": input,
                "encoder_input_lengths": input_length.astype(np.int32)
            }
        )
        return (
            n_layer_cross_k,
            n_layer_cross_v,
            cross_attn_mask
        )
158
+
159
    def decode_one_token(
        self,
        tokens: np.ndarray,
        n_layer_self_k_cache: np.ndarray,
        n_layer_self_v_cache: np.ndarray,
        n_layer_cross_k_cache: np.ndarray,
        n_layer_cross_v_cache: np.ndarray,
        offset: np.ndarray,
        self_attn_mask: np.ndarray,
        cross_attn_mask: np.ndarray
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Run the single-graph decoder for one autoregressive step.

        Inputs are fed by positional order of the exported ONNX graph:
        tokens, self-attention K/V caches, cross-attention K/V, the cache
        write offset, and the two attention masks.

        Returns (logits, updated self K cache, updated self V cache).
        """
        logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache = self.decoder.run(
            None,
            {
                self.decoder.get_inputs()[0].name: tokens,
                self.decoder.get_inputs()[1].name: n_layer_self_k_cache,
                self.decoder.get_inputs()[2].name: n_layer_self_v_cache,
                self.decoder.get_inputs()[3].name: n_layer_cross_k_cache,
                self.decoder.get_inputs()[4].name: n_layer_cross_v_cache,
                self.decoder.get_inputs()[5].name: offset,
                self.decoder.get_inputs()[6].name: self_attn_mask,
                self.decoder.get_inputs()[7].name: cross_attn_mask,
            }
        )
        return (
            logits,
            out_n_layer_self_k_cache,
            out_n_layer_self_v_cache
        )
199
+
200
    def decode_main_one_token(
        self,
        tokens: np.ndarray,
        n_layer_self_k_cache: np.ndarray,
        n_layer_self_v_cache: np.ndarray,
        n_layer_cross_k_cache: np.ndarray,
        n_layer_cross_v_cache: np.ndarray,
        pe: np.ndarray,
        self_attn_mask: np.ndarray,
        cross_attn_mask: np.ndarray
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Run the "main" decoder model for the first decode step.

        NOTE(review): the exported decoder_main graph takes no self-attention
        cache inputs, so the self-cache arguments are accepted but never fed;
        the fresh caches the graph emits are returned instead. Hence input
        indices 1..5 map to cross-K, cross-V, positional encoding, and masks.

        Returns (logits, new self K cache, new self V cache).
        """
        logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache = self.decoder_main.run(
            None,
            {
                self.decoder_main.get_inputs()[0].name: tokens,
                self.decoder_main.get_inputs()[1].name: n_layer_cross_k_cache,
                self.decoder_main.get_inputs()[2].name: n_layer_cross_v_cache,
                self.decoder_main.get_inputs()[3].name: pe,
                self.decoder_main.get_inputs()[4].name: self_attn_mask,
                self.decoder_main.get_inputs()[5].name: cross_attn_mask,
            }
        )
        return (
            logits,
            out_n_layer_self_k_cache,
            out_n_layer_self_v_cache
        )
239
+
240
    def decode_loop_one_token(
        self,
        tokens: np.ndarray,
        n_layer_self_k_cache: np.ndarray,
        n_layer_self_v_cache: np.ndarray,
        n_layer_cross_k_cache: np.ndarray,
        n_layer_cross_v_cache: np.ndarray,
        pe: np.ndarray,
        self_attn_mask: np.ndarray,
        cross_attn_mask: np.ndarray
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Run the "loop" decoder model for steps after the first.

        Unlike :meth:`decode_main_one_token`, this graph consumes the running
        self-attention K/V caches (input indices 1 and 2) in addition to the
        cross-attention K/V, positional-encoding slice, and the two masks.

        Returns (logits, updated self K cache, updated self V cache).
        """
        logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache = self.decoder_loop.run(
            None,
            {
                self.decoder_loop.get_inputs()[0].name: tokens,
                self.decoder_loop.get_inputs()[1].name: n_layer_self_k_cache,
                self.decoder_loop.get_inputs()[2].name: n_layer_self_v_cache,
                self.decoder_loop.get_inputs()[3].name: n_layer_cross_k_cache,
                self.decoder_loop.get_inputs()[4].name: n_layer_cross_v_cache,
                self.decoder_loop.get_inputs()[5].name: pe,
                self.decoder_loop.get_inputs()[6].name: self_attn_mask,
                self.decoder_loop.get_inputs()[7].name: cross_attn_mask,
            }
        )
        return (
            logits,
            out_n_layer_self_k_cache,
            out_n_layer_self_v_cache
        )
279
+
280
    def run_decoder(
        self,
        n_layer_cross_k,
        n_layer_cross_v,
        cross_attn_mask,
        beam_size,
        nbest
    ):
        """Autoregressive beam-search decode over encoder outputs.

        This variant drives the *single-graph* decoder
        (:meth:`decode_one_token`) with an explicit causal mask rebuilt each
        step; the split main/loop path is disabled here.

        Args:
            n_layer_cross_k / n_layer_cross_v: per-layer cross-attention K/V,
                shape (num_layer, batch, Ti, encoder_out_dim), as numpy arrays.
            cross_attn_mask: encoder padding mask; last dim is encoder length.
            beam_size: beams kept per utterance.
            nbest: hypotheses returned per utterance (assumes nbest <= beam_size).

        Returns:
            Per utterance, a list of ``{"token_ids", "score"}`` dicts with the
            leading sos stripped and trailing eos trimmed.
        """
        num_layer, batch_size, Ti, encoder_out_dim = n_layer_cross_k.shape
        encoder_out_length = cross_attn_mask.shape[-1]

        # Tile encoder outputs and mask beam_size times so every beam row
        # attends over its own utterance's encoder states.
        cross_attn_mask = torch.from_numpy(cross_attn_mask).to(torch.float32)
        cross_attn_mask = cross_attn_mask.unsqueeze(1).repeat(
            1, beam_size, 1, 1
        ).view(beam_size * batch_size, -1, encoder_out_length)

        n_layer_cross_k = torch.from_numpy(n_layer_cross_k)
        n_layer_cross_v = torch.from_numpy(n_layer_cross_v)
        n_layer_cross_k = n_layer_cross_k.unsqueeze(2).repeat(
            1, 1, beam_size, 1, 1
        ).view(num_layer, beam_size * batch_size, Ti, encoder_out_dim)
        n_layer_cross_v = n_layer_cross_v.unsqueeze(2).repeat(
            1, 1, beam_size, 1, 1
        ).view(num_layer, beam_size * batch_size, Ti, encoder_out_dim)

        # Every beam starts from <sos>; scores start at [0, -INF, ...] per
        # utterance so the first expansion only grows from beam 0.
        prediction_tokens = torch.ones(
            beam_size * batch_size, 1).fill_(self.sos_id).long()
        tokens = prediction_tokens
        offset = torch.zeros(1, dtype=torch.int64)
        n_layer_self_k_cache, n_layer_self_v_cache = self.get_initialized_self_cache(
            batch_size, beam_size
        )

        scores = torch.tensor([0.0] + [-INF]*(beam_size - 1)).float()
        scores = scores.repeat(batch_size).view(batch_size * beam_size, 1)
        is_finished = torch.zeros_like(scores)

        # Initial (unused) placeholder; the loop rebuilds the mask each step.
        self_attn_mask = np.zeros((batch_size * beam_size, 1, 1), dtype=np.float32)

        results = [self.sos_id]
        for i in range(self.decode_max_len):

            # Causal mask over all positions decoded so far; only the last
            # row (the query for the newest token) is kept.
            self_attn_mask = torch.empty(
                batch_size * beam_size,
                prediction_tokens.shape[-1], prediction_tokens.shape[-1]
            ).fill_(-np.inf).triu_(1)
            self_attn_mask = self_attn_mask[:, -1:, :]
            self_attn_mask = to_numpy(self_attn_mask)

            logits, n_layer_self_k_cache, n_layer_self_v_cache = self.decode_one_token(
                to_numpy(tokens),
                to_numpy(n_layer_self_k_cache),
                to_numpy(n_layer_self_v_cache),
                to_numpy(n_layer_cross_k),
                to_numpy(n_layer_cross_v),
                to_numpy(offset),
                to_numpy(self_attn_mask),
                to_numpy(cross_attn_mask)
            )

            tokens = to_numpy(tokens)
            n_layer_self_k_cache = to_numpy(n_layer_self_k_cache)
            n_layer_self_v_cache = to_numpy(n_layer_self_v_cache)
            n_layer_cross_k = to_numpy(n_layer_cross_k)
            n_layer_cross_v = to_numpy(n_layer_cross_v)
            cross_attn_mask = to_numpy(cross_attn_mask)

            offset += 1
            logits = torch.from_numpy(logits)

            # Per-beam top-B expansion, then re-rank the B*B candidates of
            # each utterance down to B surviving beams.
            logits = logits.squeeze(1)
            t_scores = F.log_softmax(logits, dim=-1)
            t_topB_scores, t_topB_ys = torch.topk(t_scores, k=beam_size, dim=1)
            # Finished beams contribute a frozen score row and a forced <eos>.
            t_topB_scores = set_finished_beam_score_to_zero(t_topB_scores, is_finished)
            t_topB_ys = set_finished_beam_y_to_eos(t_topB_ys, is_finished, self.eos_id)

            scores = scores + t_topB_scores

            scores = scores.view(batch_size, beam_size * beam_size)
            scores, topB_score_ids = torch.topk(scores, k=beam_size, dim=1)
            scores = scores.view(-1, 1)

            # Map candidate ids back to the parent-beam rows they came from.
            topB_row_number_in_each_B_rows_of_ys = torch.div(
                topB_score_ids, beam_size).view(batch_size * beam_size)
            stride = beam_size * torch.arange(batch_size).view(
                batch_size, 1).repeat(1, beam_size).view(batch_size * beam_size)
            topB_row_number_in_ys = topB_row_number_in_each_B_rows_of_ys.long() + stride.long()

            prediction_tokens = prediction_tokens[topB_row_number_in_ys]
            t_ys = torch.gather(
                t_topB_ys.view(batch_size, beam_size * beam_size),
                dim=1, index=topB_score_ids
            ).view(beam_size * batch_size, 1)

            tokens = t_ys

            prediction_tokens = torch.cat((prediction_tokens, t_ys), dim=1)

            n_layer_self_k_cache = torch.from_numpy(n_layer_self_k_cache)
            n_layer_self_v_cache = torch.from_numpy(n_layer_self_v_cache)

            # Reorder the KV caches to follow the surviving beams.
            # NOTE(review): these loops shadow the outer loop variable ``i``;
            # harmless because ``for`` rebinds it each step, but confusing.
            for i, self_k_cache in enumerate(n_layer_self_k_cache):
                n_layer_self_k_cache[i] = n_layer_self_k_cache[i][topB_row_number_in_ys]

            for i, self_v_cache in enumerate(n_layer_self_v_cache):
                n_layer_self_v_cache[i] = n_layer_self_v_cache[i][topB_row_number_in_ys]

            is_finished = t_ys.eq(self.eos_id)
            if is_finished.sum().item() == beam_size * batch_size:
                break

        # Collect n-best hypotheses; valid length counts non-eos tokens
        # (includes the leading sos, which is stripped below via [1:...]).
        scores = scores.view(batch_size, beam_size)
        prediction_valid_token_lengths = torch.sum(
            torch.ne(
                prediction_tokens.view(batch_size, beam_size, -1),
                self.eos_id),
            dim=-1
        ).int()

        nbest_scores, nbest_ids = torch.topk(scores, k=nbest, dim=1)
        index = nbest_ids + beam_size * torch.arange(batch_size).view(batch_size, 1).long()
        nbest_prediction_tokens = prediction_tokens.view(batch_size * beam_size, -1)[index.view(-1)]
        nbest_prediction_tokens = nbest_prediction_tokens.view(batch_size, nbest_ids.size(1), -1)
        nbest_prediction_valid_token_lengths = prediction_valid_token_lengths.view(
            batch_size * beam_size)[index.view(-1)].view(batch_size, -1)
        nbest_hyps: List[List[Dict[str, torch.Tensor]]] = []
        for i in range(batch_size):
            i_best_hyps: List[Dict[str, torch.Tensor]] = []
            for j, score in enumerate(nbest_scores[i]):
                hyp = {
                    "token_ids": nbest_prediction_tokens[i, j, 1:nbest_prediction_valid_token_lengths[i, j]],
                    "score": score
                }
                i_best_hyps.append(hyp)
            nbest_hyps.append(i_best_hyps)

        return nbest_hyps
445
+
446
+ def get_initialized_self_cache(self,
447
+ batch_size,
448
+ beam_size
449
+ ) -> Tuple[Tensor, Tensor]:
450
+ n_layer_self_k_cache = torch.zeros(
451
+ self.num_decoder_blocks,
452
+ batch_size * beam_size,
453
+ self.decode_max_len,
454
+ self.decoder_hidden_dim,
455
+ )
456
+ n_layer_self_v_cache = torch.zeros(
457
+ self.num_decoder_blocks,
458
+ batch_size * beam_size,
459
+ self.decode_max_len,
460
+ self.decoder_hidden_dim,
461
+ )
462
+ return n_layer_self_k_cache, n_layer_self_v_cache
463
+
464
+ def calc_feat_len(self, audio_dur):
465
+ import math
466
+ sample_rate = 16000
467
+ frame_length = 25 * sample_rate / 1000
468
+ frame_shift = 10 * sample_rate / 1000
469
+ length = math.floor((audio_dur * sample_rate - frame_length) / frame_shift) + 1
470
+ return length
471
+
472
    def transcribe(self,
                   batch_wav_path: List[str],
                   beam_size: int = 1,
                   nbest: int = 1
                   ) -> None:
        """Extract features, run the axmodel encoder, and dump its outputs.

        This variant only exercises the encoder: its three outputs are saved
        as .npy files under encoder_output/<wav basename>/. The beam-search
        decoding and result formatting are kept disabled below, so nothing is
        returned. `beam_size` / `nbest` are accepted for signature parity but
        unused here.
        """
        feats, lengths, wav_durations = self.feature_extractor(batch_wav_path)
        print(f"feats.shape: {feats.shape}")
        # Pad or trim the feature matrix to exactly 10 s worth of frames —
        # the compiled encoder expects a fixed input shape.
        maxlen = self.calc_feat_len(10)
        if feats.shape[1] < maxlen:
            feats = np.concatenate([feats, np.zeros((1, maxlen - feats.shape[1], 80), dtype=np.float32)], axis=1)
        feats = feats[:, :maxlen, :]

        # Outputs are grouped per input wav by basename.
        encoder_data_path = os.path.join("encoder_output", os.path.basename(batch_wav_path[0]))
        # decoder_data_path = os.path.join("calib_dataset", "decoder", os.path.basename(batch_wav_path[0]))
        os.makedirs(encoder_data_path, exist_ok=True)
        # os.makedirs(decoder_data_path, exist_ok=True)

        feats = to_numpy(feats)
        lengths = to_numpy(lengths)

        start_time = time.time()
        n_layer_cross_k, n_layer_cross_v, cross_attn_mask = self.run_encoder(
            to_numpy(feats),
            to_numpy(lengths)
        )

        # Persist encoder outputs for offline inspection / calibration.
        for name, npy in zip(["n_layer_cross_k", "n_layer_cross_v", "cross_attn_mask"], [n_layer_cross_k, n_layer_cross_v, cross_attn_mask]):
            file_path = os.path.join(encoder_data_path, name + ".npy")
            np.save(file_path, npy)

        # (Decoding, RTF accounting and hypothesis formatting are intentionally
        # disabled in this export variant; test_onnx_model.py has the full
        # pipeline.)
528
+
529
+
530
def parse_args():
    """Build and parse command-line options for the axmodel test driver."""
    parser = argparse.ArgumentParser(description="FireRedASROnnxModel Test")
    # (flag, default, help) for every plain string option, in display order.
    string_options = [
        ("--encoder", "axmodel/encoder.axmodel", "Path to onnx encoder"),
        ("--decoder", "onnx_decoder/decoder.onnx", "Path to onnx decoder"),
        ("--cmvn", "axmodel/cmvn.ark", "Path to cmvn"),
        ("--dict", "axmodel/dict.txt", "Path to dict"),
        ("--spm_model", "axmodel/train_bpe1000.model", "Path to spm model"),
        ("--wavlist", "wavlist.txt", "File to wav path list"),
        ("--hypo", "hypo_axmodel.txt", "File of hypos"),
    ]
    for flag, default, help_text in string_options:
        parser.add_argument(flag, type=str, default=default, help=help_text)
    parser.add_argument("--beam_size", type=int, default=3, help="")
    parser.add_argument("--nbest", type=int, default=1, help="")
    return parser.parse_args()
588
+
589
+
590
def parse_wavlist(wavlist: str):
    """Read wav paths (one per line) from `wavlist`, keeping only existing files.

    Blank lines are now skipped silently — previously a blank line fell
    through to `os.path.exists("")` and printed a confusing
    `" doesn't exist."` warning. Paths that do not exist are reported and
    dropped.

    Returns:
        list[str]: the existing paths, in file order.
    """
    wavpaths = []
    with open(wavlist) as f:
        for line in f:
            path = line.strip()
            if not path:
                continue  # ignore empty lines instead of warning about ""
            if not os.path.exists(path):
                print(f"{path} doesn't exist.")
                continue
            wavpaths.append(path)
    return wavpaths
601
+
602
+
603
def main():
    """Driver: run the axmodel encoder over every wav in --wavlist.

    This export variant only exercises `transcribe` (which dumps encoder
    outputs to disk); the RTF accounting and hypothesis writing from the
    ONNX test script are kept disabled.
    """
    args = parse_args()
    print(args)

    onnx_model = FireRedASROnnxModel(args.encoder,
                                     args.decoder,
                                     args.cmvn,
                                     args.dict,
                                     args.spm_model)

    # NOTE(review): wf is opened but nothing is written in this variant —
    # consider a `with` block or dropping it once decoding stays disabled.
    wf = open(args.hypo, "wt")
    wavlist = parse_wavlist(args.wavlist)

    total_wav_durations = 0
    total_transcribe_durations = 0
    for wav in wavlist:
        batch_wav = [wav]  # one-wav batches
        onnx_model.transcribe(batch_wav, args.beam_size, args.nbest)

        # (Per-file duration/RTF logging and hypothesis writing are
        # intentionally disabled here; see test_onnx_model.py.)

    wf.close()


if __name__ == "__main__":
    main()
test_onnx_model.py ADDED
@@ -0,0 +1,684 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fireredasr.data.asr_feat import ASRFeatExtractor
2
+ from fireredasr.tokenizer.aed_tokenizer import ChineseCharEnglishSpmTokenizer
3
+
4
+ import onnxruntime as ort
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import numpy as np
8
+ from torch import Tensor
9
+ from typing import Tuple, List, Dict
10
+ import argparse
11
+ import os
12
+ import time
13
+ import logging
14
+
15
# Module-level logging: root logger at INFO with a stream handler.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Guard against piling up duplicate handlers if this module is re-imported
# (also fixes the "hander" typo in the handler variable name).
if not logger.handlers:
    logger_stream_handler = logging.StreamHandler()
    logger_stream_handler.setLevel("INFO")
    logger.addHandler(logger_stream_handler)
20
+
21
+
22
# Large finite stand-in for infinity, used to mask out beam-search scores.
INF = 1e10
23
+
24
+
25
def to_numpy(tensor):
    """Convert a torch tensor to a NumPy array; NumPy arrays pass through unchanged."""
    if isinstance(tensor, np.ndarray):
        return tensor
    detached = tensor.detach() if tensor.requires_grad else tensor
    return detached.cpu().numpy()
32
+
33
+
34
def set_finished_beam_score_to_zero(scores, is_finished):
    """For finished beams, replace the top-k continuation scores with
    [0, -INF, ..., -INF] so exactly one (EOS) continuation survives pruning;
    live beams keep their scores unchanged."""
    nb, b = scores.size()
    finished = is_finished.float()
    frozen_row = torch.full((1, b), -INF)
    frozen_row[0, 0] = 0.0
    frozen = frozen_row.expand(nb, b)
    return scores * (1 - finished) + frozen * finished
40
+
41
+
42
def set_finished_beam_y_to_eos(ys, is_finished, eos_id):
    """Force the next token of every finished beam to EOS; leave live beams as-is."""
    finished = is_finished.long()
    live = 1 - finished
    return ys * live + eos_id * finished
45
+
46
+
47
+ class FireRedASROnnxModel:
48
    def __init__(
        self,
        encoder_path: str,
        decoder_path: str,
        cmvn_file: str,
        dict_file: str,
        spm_model_path: str,
        providers=["CPUExecutionProvider"]  # NOTE(review): mutable default — safe only while never mutated; consider a None sentinel.
    ):
        """Load feature extractor, tokenizer, and all ONNX Runtime sessions.

        Args:
            encoder_path: path to the encoder ONNX model.
            decoder_path: path to the monolithic decoder ONNX model; the
                split decoder_main.onnx / decoder_loop.onnx and pe.npy are
                looked up in the same directory.
            cmvn_file: CMVN stats for the feature extractor.
            dict_file / spm_model_path: tokenizer resources.
            providers: ONNX Runtime execution providers.
        """
        session_opts = ort.SessionOptions()
        # Single-threaded sessions for deterministic, comparable timings.
        session_opts.inter_op_num_threads = 1
        session_opts.intra_op_num_threads = 1
        # session_opts.log_severity_level = 1
        self.session_opts = session_opts

        # NOTE: maximum decode length follows Whisper's setting.
        # FireRedASR-AED supports input audio up to 60 s.
        # ref: https://github.com/FireRedTeam/FireRedASR?tab=readme-ov-file#input-length-limitations
        self.decode_max_len = 448

        # Decoder geometry and special token ids (must match the exported model).
        self.decoder_hidden_dim = 1280
        self.num_decoder_blocks = 16
        self.blank_id = 0
        self.sos_id = 3
        self.eos_id = 4
        self.pad_id = 2

        self.feature_extractor = ASRFeatExtractor(cmvn_file)
        self.tokenizer = ChineseCharEnglishSpmTokenizer(dict_file, spm_model_path)
        self.encoder = None
        self.decoder = None

        self.init_encoder(encoder_path, providers)
        self.init_decoder(decoder_path, providers)
        self.init_decoder_main(decoder_path, providers)
        self.init_decoder_loop(decoder_path, providers)
        self.pe = self.init_pe(decoder_path)
85
+
86
    def init_encoder(self, encoder_path, providers=None):
        """Create the ONNX Runtime session for the encoder and log load time."""
        start_time = time.time()
        self.encoder = ort.InferenceSession(
            encoder_path,
            sess_options=self.session_opts,
            providers=providers
        )
        end_time = time.time()
        logger.info(f"load encoder cost {end_time - start_time} seconds")
95
+
96
    def init_decoder(self, decoder_path, providers=None):
        """Create the ONNX Runtime session for the monolithic decoder and log load time."""
        start_time = time.time()
        self.decoder = ort.InferenceSession(
            decoder_path,
            sess_options=self.session_opts,
            providers=providers
        )
        end_time = time.time()
        logger.info(f"load decoder cost {end_time - start_time} seconds")
105
+
106
+ def init_decoder_main(self, decoder_path, providers=None):
107
+ decoder_path = os.path.dirname(decoder_path)
108
+ decoder_path = os.path.join(decoder_path, "decoder_main.onnx")
109
+ start_time = time.time()
110
+ self.decoder_main = ort.InferenceSession(
111
+ decoder_path,
112
+ sess_options=self.session_opts,
113
+ providers=providers
114
+ )
115
+ end_time = time.time()
116
+ logger.info(f"load decoder_main cost {end_time - start_time} seconds")
117
+
118
+ input_names = [i.name for i in self.decoder_main.get_inputs()]
119
+ print(f"decoder_main.input_names: {input_names}")
120
+
121
+ def init_decoder_loop(self, decoder_path, providers=None):
122
+ decoder_path = os.path.dirname(decoder_path)
123
+ decoder_path = os.path.join(decoder_path, "decoder_loop.onnx")
124
+
125
+ start_time = time.time()
126
+ self.decoder_loop = ort.InferenceSession(
127
+ decoder_path,
128
+ sess_options=self.session_opts,
129
+ providers=providers
130
+ )
131
+ end_time = time.time()
132
+ logger.info(f"load decoder_loop cost {end_time - start_time} seconds")
133
+
134
+ input_names = [i.name for i in self.decoder_loop.get_inputs()]
135
+ print(f"decoder_loop.input_names: {input_names}")
136
+
137
+ def init_pe(self, decoder_path):
138
+ decoder_path = os.path.dirname(decoder_path)
139
+ decoder_path = os.path.join(decoder_path, "pe.npy")
140
+
141
+ return np.load(decoder_path)
142
+
143
    def run_encoder(self, input: np.ndarray,
                    input_length: np.ndarray
                    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Run the ONNX encoder once.

        Args:
            input: feature batch fed to the first encoder input.
            input_length: valid-length batch fed to the second encoder input.

        Returns:
            (n_layer_cross_k, n_layer_cross_v, cross_attn_mask) — per-layer
            cross-attention K/V stacks plus the encoder attention mask, all as
            NumPy arrays straight from ONNX Runtime.
        """
        n_layer_cross_k, n_layer_cross_v, cross_attn_mask = self.encoder.run(
            None,
            {
                self.encoder.get_inputs()[0].name: input,
                self.encoder.get_inputs()[1].name: input_length
            }
        )
        return (
            n_layer_cross_k,
            n_layer_cross_v,
            cross_attn_mask
        )
158
+
159
    def decode_one_token(
        self,
        tokens: np.ndarray,
        n_layer_self_k_cache: np.ndarray,
        n_layer_self_v_cache: np.ndarray,
        n_layer_cross_k_cache: np.ndarray,
        n_layer_cross_v_cache: np.ndarray,
        offset: np.ndarray,
        self_attn_mask: np.ndarray,
        cross_attn_mask: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Run one autoregressive step through the monolithic ONNX decoder.

        Inputs are bound positionally to the session's declared input order,
        so the argument order here must match the exported model. Returns
        (logits, updated self-K cache, updated self-V cache) as NumPy arrays.
        """
        logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache = self.decoder.run(
            None,
            {
                self.decoder.get_inputs()[0].name: tokens,
                self.decoder.get_inputs()[1].name: n_layer_self_k_cache,
                self.decoder.get_inputs()[2].name: n_layer_self_v_cache,
                self.decoder.get_inputs()[3].name: n_layer_cross_k_cache,
                self.decoder.get_inputs()[4].name: n_layer_cross_v_cache,
                self.decoder.get_inputs()[5].name: offset,
                self.decoder.get_inputs()[6].name: self_attn_mask,
                self.decoder.get_inputs()[7].name: cross_attn_mask,
            }
        )
        return (
            logits,
            out_n_layer_self_k_cache,
            out_n_layer_self_v_cache
        )
199
+
200
    def decode_main_one_token(
        self,
        tokens: np.ndarray,
        n_layer_self_k_cache: np.ndarray,
        n_layer_self_v_cache: np.ndarray,
        n_layer_cross_k_cache: np.ndarray,
        n_layer_cross_v_cache: np.ndarray,
        pe: np.ndarray,
        self_attn_mask: np.ndarray,
        cross_attn_mask: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """First-step decode through decoder_main.onnx.

        decoder_main takes no incoming self-attention caches — the two cache
        parameters exist only for signature parity with
        decode_loop_one_token and are NOT fed to the session; the model
        produces the initial caches itself. Returns (logits, new self-K
        cache, new self-V cache) as NumPy arrays.
        """
        logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache = self.decoder_main.run(
            None,
            {
                self.decoder_main.get_inputs()[0].name: tokens,
                self.decoder_main.get_inputs()[1].name: n_layer_cross_k_cache,
                self.decoder_main.get_inputs()[2].name: n_layer_cross_v_cache,
                self.decoder_main.get_inputs()[3].name: pe,
                self.decoder_main.get_inputs()[4].name: self_attn_mask,
                self.decoder_main.get_inputs()[5].name: cross_attn_mask,
            }
        )
        return (
            logits,
            out_n_layer_self_k_cache,
            out_n_layer_self_v_cache
        )
239
+
240
    def decode_loop_one_token(
        self,
        tokens: np.ndarray,
        n_layer_self_k_cache: np.ndarray,
        n_layer_self_v_cache: np.ndarray,
        n_layer_cross_k_cache: np.ndarray,
        n_layer_cross_v_cache: np.ndarray,
        pe: np.ndarray,
        self_attn_mask: np.ndarray,
        cross_attn_mask: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """One cached autoregressive step through decoder_loop.onnx.

        Unlike decoder_main, this model consumes the running self-attention
        K/V caches together with the positional-encoding slice `pe` for the
        current offset. Inputs are bound positionally to the session's
        declared input order. Returns (logits, updated self-K cache, updated
        self-V cache) as NumPy arrays.
        """
        logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache = self.decoder_loop.run(
            None,
            {
                self.decoder_loop.get_inputs()[0].name: tokens,
                self.decoder_loop.get_inputs()[1].name: n_layer_self_k_cache,
                self.decoder_loop.get_inputs()[2].name: n_layer_self_v_cache,
                self.decoder_loop.get_inputs()[3].name: n_layer_cross_k_cache,
                self.decoder_loop.get_inputs()[4].name: n_layer_cross_v_cache,
                self.decoder_loop.get_inputs()[5].name: pe,
                self.decoder_loop.get_inputs()[6].name: self_attn_mask,
                self.decoder_loop.get_inputs()[7].name: cross_attn_mask,
            }
        )
        return (
            logits,
            out_n_layer_self_k_cache,
            out_n_layer_self_v_cache
        )
279
+
280
    def run_decoder(
        self,
        n_layer_cross_k,
        n_layer_cross_v,
        cross_attn_mask,
        beam_size,
        nbest,
        decoder_data_path
    ):
        """Batched beam search driven by the decoder_loop ONNX session.

        Args:
            n_layer_cross_k / n_layer_cross_v: stacked per-layer encoder K/V,
                NumPy, shape (num_layer, batch, Ti, encoder_out_dim).
            cross_attn_mask: additive encoder attention mask, NumPy.
            beam_size: beam width per utterance.
            nbest: number of hypotheses returned per utterance.
            decoder_data_path: directory for calibration dumps — currently
                unused; the dump code below is disabled.

        Returns:
            list (one entry per utterance) of `nbest` dicts with keys
            "token_ids" (leading SOS stripped, EOS tail excluded) and "score".
        """
        num_layer, batch_size, Ti, encoder_out_dim = n_layer_cross_k.shape
        encoder_out_length = cross_attn_mask.shape[-1]

        # Tile encoder outputs and mask across the beam dimension so each
        # beam row has its own copy: (..., batch, ...) -> (..., beam*batch, ...).
        cross_attn_mask = torch.from_numpy(cross_attn_mask).to(torch.float32)
        cross_attn_mask = cross_attn_mask.unsqueeze(1).repeat(
            1, beam_size, 1, 1
        ).view(beam_size * batch_size, -1, encoder_out_length)

        n_layer_cross_k = torch.from_numpy(n_layer_cross_k)
        n_layer_cross_v = torch.from_numpy(n_layer_cross_v)
        n_layer_cross_k = n_layer_cross_k.unsqueeze(2).repeat(
            1, 1, beam_size, 1, 1
        ).view(num_layer, beam_size * batch_size, Ti, encoder_out_dim)
        n_layer_cross_v = n_layer_cross_v.unsqueeze(2).repeat(
            1, 1, beam_size, 1, 1
        ).view(num_layer, beam_size * batch_size, Ti, encoder_out_dim)

        # Every beam starts from SOS. Only the first beam of each utterance
        # carries score 0; the others get -INF so step 1 expands one beam.
        prediction_tokens = torch.ones(
            beam_size * batch_size, 1).fill_(self.sos_id).long()
        tokens = prediction_tokens
        offset = torch.zeros(1, dtype=torch.int64)
        n_layer_self_k_cache, n_layer_self_v_cache = self.get_initialized_self_cache(
            batch_size, beam_size
        )

        scores = torch.tensor([0.0] + [-INF]*(beam_size - 1)).float()
        scores = scores.repeat(batch_size).view(batch_size * beam_size, 1)
        is_finished = torch.zeros_like(scores)

        # With KV caching only the newest position is decoded each step, so a
        # (beam*batch, 1, 1) all-zero additive mask suffices.
        self_attn_mask = np.zeros((batch_size * beam_size, 1, 1), dtype=np.float32)

        # NOTE(review): `results` appears unused below — confirm before removing.
        results = [self.sos_id]
        for i in range(self.decode_max_len):
            # (Disabled alternatives kept out of the way: the monolithic
            # decoder call, per-step calibration npy dumps, and the
            # decoder_main/decoder_loop split where step 0 used decoder_main.)
            logits, n_layer_self_k_cache, n_layer_self_v_cache = self.decode_loop_one_token(
                to_numpy(tokens),
                to_numpy(n_layer_self_k_cache),
                to_numpy(n_layer_self_v_cache),
                to_numpy(n_layer_cross_k),
                to_numpy(n_layer_cross_v),
                self.pe[offset],
                self_attn_mask,
                to_numpy(cross_attn_mask)
            )

            offset += 1
            logits = torch.from_numpy(logits)

            logits = logits.squeeze(1)
            t_scores = F.log_softmax(logits, dim=-1)
            t_topB_scores, t_topB_ys = torch.topk(t_scores, k=beam_size, dim=1)
            # Finished beams contribute exactly one zero-score EOS continuation.
            t_topB_scores = set_finished_beam_score_to_zero(t_topB_scores, is_finished)
            t_topB_ys = set_finished_beam_y_to_eos(t_topB_ys, is_finished, self.eos_id)

            scores = scores + t_topB_scores

            # Prune beam*beam candidates back down to beam per utterance.
            scores = scores.view(batch_size, beam_size * beam_size)
            scores, topB_score_ids = torch.topk(scores, k=beam_size, dim=1)
            scores = scores.view(-1, 1)

            # Map each surviving candidate back to the beam row it came from.
            topB_row_number_in_each_B_rows_of_ys = torch.div(
                topB_score_ids, beam_size).view(batch_size * beam_size)
            stride = beam_size * torch.arange(batch_size).view(
                batch_size, 1).repeat(1, beam_size).view(batch_size * beam_size)
            topB_row_number_in_ys = topB_row_number_in_each_B_rows_of_ys.long() + stride.long()

            # Re-gather token histories for the surviving beams, then append
            # the newly chosen tokens.
            prediction_tokens = prediction_tokens[topB_row_number_in_ys]
            t_ys = torch.gather(
                t_topB_ys.view(batch_size, beam_size * beam_size),
                dim=1, index=topB_score_ids
            ).view(beam_size * batch_size, 1)

            tokens = t_ys

            prediction_tokens = torch.cat((prediction_tokens, t_ys), dim=1)

            # Reorder the self-attention caches to follow the surviving beams.
            # NOTE(review): these inner loops shadow the outer loop index `i`;
            # harmless since `i` is not read afterwards, but confusing.
            n_layer_self_k_cache = torch.from_numpy(n_layer_self_k_cache)
            n_layer_self_v_cache = torch.from_numpy(n_layer_self_v_cache)

            for i, self_k_cache in enumerate(n_layer_self_k_cache):
                n_layer_self_k_cache[i] = n_layer_self_k_cache[i][topB_row_number_in_ys]

            for i, self_v_cache in enumerate(n_layer_self_v_cache):
                n_layer_self_v_cache[i] = n_layer_self_v_cache[i][topB_row_number_in_ys]

            # Stop early once every beam of every utterance emitted EOS.
            is_finished = t_ys.eq(self.eos_id)
            if is_finished.sum().item() == beam_size * batch_size:
                break

        # Valid length = count of non-EOS tokens (this includes leading SOS,
        # which is stripped when hypotheses are sliced below).
        scores = scores.view(batch_size, beam_size)
        prediction_valid_token_lengths = torch.sum(
            torch.ne(
                prediction_tokens.view(batch_size, beam_size, -1),
                self.eos_id),
            dim=-1
        ).int()

        nbest_scores, nbest_ids = torch.topk(scores, k=nbest, dim=1)
        index = nbest_ids + beam_size * torch.arange(batch_size).view(batch_size, 1).long()
        nbest_prediction_tokens = prediction_tokens.view(batch_size * beam_size, -1)[index.view(-1)]
        nbest_prediction_tokens = nbest_prediction_tokens.view(batch_size, nbest_ids.size(1), -1)
        nbest_prediction_valid_token_lengths = prediction_valid_token_lengths.view(
            batch_size * beam_size)[index.view(-1)].view(batch_size, -1)
        nbest_hyps: List[List[Dict[str, torch.Tensor]]] = []
        for i in range(batch_size):
            i_best_hyps: List[Dict[str, torch.Tensor]] = []
            for j, score in enumerate(nbest_scores[i]):
                hyp = {
                    "token_ids": nbest_prediction_tokens[i, j, 1:nbest_prediction_valid_token_lengths[i, j]],
                    "score": score
                }
                i_best_hyps.append(hyp)
            nbest_hyps.append(i_best_hyps)

        return nbest_hyps
466
+
467
+ def get_initialized_self_cache(self,
468
+ batch_size,
469
+ beam_size
470
+ ) -> Tuple[Tensor, Tensor]:
471
+ n_layer_self_k_cache = torch.zeros(
472
+ self.num_decoder_blocks,
473
+ batch_size * beam_size,
474
+ self.decode_max_len,
475
+ self.decoder_hidden_dim,
476
+ )
477
+ n_layer_self_v_cache = torch.zeros(
478
+ self.num_decoder_blocks,
479
+ batch_size * beam_size,
480
+ self.decode_max_len,
481
+ self.decoder_hidden_dim,
482
+ )
483
+ return n_layer_self_k_cache, n_layer_self_v_cache
484
+
485
+ def calc_feat_len(self, audio_dur):
486
+ import math
487
+ sample_rate = 16000
488
+ frame_length = 25 * sample_rate / 1000
489
+ frame_shift = 10 * sample_rate / 1000
490
+ length = math.floor((audio_dur * sample_rate - frame_length) / frame_shift) + 1
491
+ return length
492
+
493
    def transcribe(self,
                   batch_wav_path: List[str],
                   beam_size: int = 1,
                   nbest: int = 1
                   ) -> Tuple[List[Dict], List, float]:
        """Run the full ONNX pipeline (features -> encoder -> beam search).

        Returns:
            (results, wav_durations, transcribe_durations):
            results is one dict per wav with "wav", "text" and "score" for the
            single best hypothesis; transcribe_durations is wall-clock seconds
            for encoder + decoder.
            # NOTE(review): wav_durations is whatever the feature extractor
            # returns — presumably per-wav durations in seconds (main() sums
            # it); confirm against ASRFeatExtractor.
        """
        feats, lengths, wav_durations = self.feature_extractor(batch_wav_path)
        print(f"feats.shape: {feats.shape}")
        # Pad or trim the feature matrix to exactly 10 s worth of frames —
        # the exported model expects a fixed input shape.
        maxlen = self.calc_feat_len(10)
        if feats.shape[1] < maxlen:
            feats = np.concatenate([feats, np.zeros((1, maxlen - feats.shape[1], 80), dtype=np.float32)], axis=1)
        feats = feats[:, :maxlen, :]

        # Calibration-dump paths; directory creation is currently disabled.
        # encoder_data_path = os.path.join("calib_dataset", "encoder", os.path.basename(batch_wav_path[0]))
        decoder_data_path = os.path.join("calib_dataset", "decoder", os.path.basename(batch_wav_path[0]))
        # os.makedirs(encoder_data_path, exist_ok=True)
        # os.makedirs(decoder_data_path, exist_ok=True)

        feats = to_numpy(feats)
        lengths = to_numpy(lengths)

        start_time = time.time()
        n_layer_cross_k, n_layer_cross_v, cross_attn_mask = self.run_encoder(
            to_numpy(feats),
            to_numpy(lengths)
        )
        nbest_hyps = self.run_decoder(n_layer_cross_k,
                                      n_layer_cross_v,
                                      cross_attn_mask,
                                      beam_size,
                                      nbest,
                                      decoder_data_path)
        transcribe_durations = time.time() - start_time
        results: List[Dict] = []
        for wav, hyp in zip(batch_wav_path, nbest_hyps):
            hyp = hyp[0]  # keep only the best hypothesis per utterance
            hyp_ids = [int(id) for id in hyp["token_ids"].cpu()]
            score = hyp["score"].item()
            text = self.tokenizer.detokenize(hyp_ids)
            results.append(
                {
                    "wav": wav,
                    "text": text,
                    "score": score
                }
            )

        return results, wav_durations, transcribe_durations
544
+
545
+
546
def parse_args():
    """Build and parse command-line options for the ONNX test driver."""
    parser = argparse.ArgumentParser(description="FireRedASROnnxModel Test")
    # (flag, default, help) for every plain string option, in display order.
    string_options = [
        ("--encoder", "onnx_encoder/encoder.onnx", "Path to onnx encoder"),
        ("--decoder", "onnx_decoder/decoder.onnx", "Path to onnx decoder"),
        ("--cmvn", "axmodel/cmvn.ark", "Path to cmvn"),
        ("--dict", "axmodel/dict.txt", "Path to dict"),
        ("--spm_model", "axmodel/train_bpe1000.model", "Path to spm model"),
        ("--wavlist", "wavlist.txt", "File to wav path list"),
        ("--hypo", "hypo_onnx.txt", "File of hypos"),
    ]
    for flag, default, help_text in string_options:
        parser.add_argument(flag, type=str, default=default, help=help_text)
    parser.add_argument("--beam_size", type=int, default=3, help="")
    parser.add_argument("--nbest", type=int, default=1, help="")
    parser.add_argument(
        "--provider",
        default="CPUExecutionProvider",
        choices=['CUDAExecutionProvider', 'CPUExecutionProvider']
    )
    return parser.parse_args()
609
+
610
+
611
def parse_wavlist(wavlist: str):
    """Read wav paths (one per line) from `wavlist`, keeping only existing files.

    Blank lines are now skipped silently — previously a blank line fell
    through to `os.path.exists("")` and printed a confusing
    `" doesn't exist."` warning. Paths that do not exist are reported and
    dropped.

    Returns:
        list[str]: the existing paths, in file order.
    """
    wavpaths = []
    with open(wavlist) as f:
        for line in f:
            path = line.strip()
            if not path:
                continue  # ignore empty lines instead of warning about ""
            if not os.path.exists(path):
                print(f"{path} doesn't exist.")
                continue
            wavpaths.append(path)
    return wavpaths
622
+
623
+
624
def main():
    """Transcribe every wav in --wavlist with the ONNX pipeline.

    Per file: logs duration, transcription time, and real-time factor (RTF),
    and appends "text (wav)" lines to --hypo. Finally logs the average RTF
    over the whole list.
    """
    args = parse_args()
    print(args)

    onnx_model = FireRedASROnnxModel(args.encoder,
                                     args.decoder,
                                     args.cmvn,
                                     args.dict,
                                     args.spm_model,
                                     [args.provider])

    # NOTE(review): consider `with open(...)` so the file is closed on error.
    wf = open(args.hypo, "wt")
    wavlist = parse_wavlist(args.wavlist)

    total_wav_durations = 0
    total_transcribe_durations = 0
    for wav in wavlist:
        batch_wav = [wav]  # one-wav batches
        results, wav_durations, transcribe_durations = onnx_model.transcribe(
            batch_wav, args.beam_size, args.nbest)

        wav_durations = sum(wav_durations)
        total_wav_durations += wav_durations
        total_transcribe_durations += transcribe_durations
        logger.info(f"{batch_wav}")
        logger.info(f"Durations: {wav_durations}")
        logger.info(f"Transcribe Durations: {transcribe_durations}")
        # RTF < 1 means faster than real time.
        rtf = transcribe_durations / wav_durations
        logger.info(f"(Real time factor) RTF: {rtf}")
        for result in results:
            logger.info(f"wav: {result['wav']}")
            logger.info(f"text: {result['text']}")
            logger.info(f"score: {result['score']}")
            logger.info("")
            wf.write(f"{result['text']} ({result['wav']})\n")

    logger.info(f"total wav durations: {total_wav_durations}")
    logger.info(f"total transcribe durations: {total_transcribe_durations}")
    avg_ref = total_transcribe_durations / total_wav_durations
    logger.info(f"AVG RTF: {avg_ref}")

    wf.close()

    # (Removed disabled helper code that tarred the calibration npy dumps.)


if __name__ == "__main__":
    main()
wavlist.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ wav/TEST_NET_Y0000000000_-KTKHdZ2fb8_S00000.wav
2
+ wav/TEST_MEETING_T0000000001_S00000.wav
3
+ wav/IT0011W0001.wav
4
+ wav/BAC009S0764W0121.wav