Delete unnecessary files

Browse files

Files changed (5) hide show

model_convert/model_wrapper.py +0 -431
model_convert/to_onnx.py +0 -525
test_decoder.py +0 -640
test_encoder.py +0 -646
test_onnx_model.py +0 -684

model_convert/model_wrapper.py DELETED Viewed

@@ -1,431 +0,0 @@
-import torch
-import torch.nn as nn
-from torch import Tensor
-from fireredasr.models.module.conformer_encoder import ConformerEncoder
-from fireredasr.models.module.transformer_decoder import (
-    TransformerDecoder,
-    DecoderLayer,
-    DecoderMultiHeadAttention,
-    DecoderScaledDotProductAttention,
-    PositionalEncoding
-)
-def DecoderScaledDotProductAttentionForward(
-    self: DecoderScaledDotProductAttention,
-    q: Tensor,
-    k: Tensor,
-    v: Tensor,
-    mask: Tensor
-):
-    attn = torch.matmul(q, k.transpose(2, 3)) / self.temperature
-    if mask is not None:
-        # mask is such as [[[0, 0, 0, 0, ..., -inf, -inf]]]
-        attn = attn + mask
-        attn = torch.softmax(attn, dim=-1)
-    else:
-        attn = torch.softmax(attn, dim=-1)
-    output = torch.matmul(attn, v)
-    return output
-DecoderScaledDotProductAttention.forward = DecoderScaledDotProductAttentionForward
-"""
-The purpose of this is to let the exported ONNX model
-receive only the token id decoded at the previous time step
-when performing decoding inference at each time step,
-rather than the token ids of all previous time steps.
-"""
-def PositionalEncodingForward(
-    self: PositionalEncoding,
-    offset: Tensor
-):
-    return self.pe[:, :offset].clone().detach()[:, -1]
-PositionalEncoding.forward = PositionalEncodingForward
-"""
-NOTE(Lianghu): Why do this?
-When exporting the ONNX model using the original padding_position_is_0 function,
-dynamic batching does not work properly in the exported ONNX model.
-The code in the original padding_position_is_0 function is as follows:
-```py
-def padding_position_is_0(...):
-    N, T = padded_input.size()[:2]
-    mask = torch.ones((N, T)).to(padded_input.device)
-    ...
-```
-This happens because, when exporting to ONNX, N and T are treated as constants.
-The code should instead use N = padded_input.size(0) and T = padded_input.size(1).
-"""
-def padding_position_is_0(self: ConformerEncoder,
-                          padded_input: Tensor,
-                          input_lengths: Tensor):
-    N = padded_input.size(0)
-    T = padded_input.size(1)
-    seq_range = torch.arange(T, device=padded_input.device).unsqueeze(0)  # shape: (1, T)
-    input_lengths_exp = input_lengths.unsqueeze(1)  # shape: (N, 1)
-    mask = seq_range < input_lengths_exp  # shape: (N, T)
-    mask = mask.unsqueeze(dim=1)
-    return mask.to(torch.uint8)
-ConformerEncoder.padding_position_is_0 = padding_position_is_0
-class AudioEncoderTensorCache(nn.Module):
-    def __init__(self,
-                 encoder: ConformerEncoder,
-                 decoder: TransformerDecoder):
-        super().__init__()
-        self.encoder = encoder
-        self.decoder = decoder
-    def forward(self, input: Tensor, input_length: Tensor):
-        encoder_output, _, encoder_mask = self.encoder(input, input_length)
-        n_layer_cross_k_list = []
-        n_layer_cross_v_list = []
-        for layer in self.decoder.layer_stack:
-            # layer: DecoderLayer
-            n_layer_cross_k_list.append(layer.cross_attn.w_ks(encoder_output))
-            n_layer_cross_v_list.append(layer.cross_attn.w_vs(encoder_output))
-        encoder_mask = encoder_mask.to(torch.float32)
-        encoder_mask[encoder_mask == 0] = -torch.inf
-        encoder_mask[encoder_mask == 1] = 0.0
-        return (torch.stack(n_layer_cross_k_list),
-                torch.stack(n_layer_cross_v_list),
-                encoder_mask)
-class DecoderMultiHeadSelfAttention(nn.Module):
-    def __init__(self, multiHeadSelfAttention: DecoderMultiHeadAttention, loop: bool = False):
-        super().__init__()
-        self.multiHeadSelfAttention = multiHeadSelfAttention
-        self.loop = loop
-    def forward(self,
-                x: Tensor,
-                k_cache: Tensor,
-                v_cache: Tensor,
-                mask: Tensor):
-        bs = x.size(0)
-        # 当前时间步为 t
-        # k_cache 和 v_cache 是 时间步 [0: t-1] 的 self_attn_k 和 self_attn_v 的缓存
-        q = self.multiHeadSelfAttention.w_qs(x)
-        k = self.multiHeadSelfAttention.w_ks(x)
-        v = self.multiHeadSelfAttention.w_vs(x)
-        k_cache[:, -k.shape[1] :, :] = k
-        v_cache[:, -v.shape[1] :, :] = v
-        # if self.loop:
-        #     k_cache = torch.cat([k_cache[:, 1:, :], k], 1)
-        #     v_cache = torch.cat([v_cache[:, 1:, :], v], 1)
-        # else:
-        #     k_cache = k
-        #     v_cache = v
-        q = q.view(bs, -1, self.multiHeadSelfAttention.n_head, self.multiHeadSelfAttention.d_k)
-        k = k_cache.view(bs, -1, self.multiHeadSelfAttention.n_head, self.multiHeadSelfAttention.d_k)
-        v = v_cache.view(bs, -1, self.multiHeadSelfAttention.n_head, self.multiHeadSelfAttention.d_k)
-        k = k.view(bs, -1, self.multiHeadSelfAttention.n_head, self.multiHeadSelfAttention.d_k)
-        v = v.view(bs, -1, self.multiHeadSelfAttention.n_head, self.multiHeadSelfAttention.d_k)
-        q = q.transpose(1, 2)
-        k = k.transpose(1, 2)
-        v = v.transpose(1, 2)
-        if mask is not None:
-            mask = mask.unsqueeze(1)
-        output = self.multiHeadSelfAttention.attention(q, k, v, mask)
-        output = output.transpose(1, 2).contiguous().view(bs, -1, self.multiHeadSelfAttention.d_model)
-        output = self.multiHeadSelfAttention.fc(output)
-        output = self.multiHeadSelfAttention.dropout(output)
-        return output, k_cache, v_cache
-class DecoderMultiHeadSelfAttentionV2(nn.Module):
-    def __init__(self, multiHeadSelfAttention: DecoderMultiHeadAttention, loop: bool = False):
-        super().__init__()
-        self.multiHeadSelfAttention = multiHeadSelfAttention
-        self.loop = loop
-    def forward(self,
-                x: Tensor,
-                k_cache: Tensor,
-                v_cache: Tensor,
-                mask: Tensor):
-        bs = x.size(0)
-        # 当前时间步为 t
-        # k_cache 和 v_cache 是 时间步 [0: t-1] 的 self_attn_k 和 self_attn_v 的缓存
-        q = self.multiHeadSelfAttention.w_qs(x)
-        k = self.multiHeadSelfAttention.w_ks(x)
-        v = self.multiHeadSelfAttention.w_vs(x)
-        # k_cache[:, -k.shape[1] :, :] = k
-        # v_cache[:, -v.shape[1] :, :] = v
-        if self.loop:
-            k_cache = torch.cat([k_cache[:, 1:, :], k], 1)
-            v_cache = torch.cat([v_cache[:, 1:, :], v], 1)
-        else:
-            k_cache = k
-            v_cache = v
-        q = q.view(bs, -1, self.multiHeadSelfAttention.n_head, self.multiHeadSelfAttention.d_k)
-        k = k_cache.view(bs, -1, self.multiHeadSelfAttention.n_head, self.multiHeadSelfAttention.d_k)
-        v = v_cache.view(bs, -1, self.multiHeadSelfAttention.n_head, self.multiHeadSelfAttention.d_k)
-        k = k.view(bs, -1, self.multiHeadSelfAttention.n_head, self.multiHeadSelfAttention.d_k)
-        v = v.view(bs, -1, self.multiHeadSelfAttention.n_head, self.multiHeadSelfAttention.d_k)
-        q = q.transpose(1, 2)
-        k = k.transpose(1, 2)
-        v = v.transpose(1, 2)
-        if mask is not None:
-            mask = mask.unsqueeze(1)
-        output = self.multiHeadSelfAttention.attention(q, k, v, mask)
-        output = output.transpose(1, 2).contiguous().view(bs, -1, self.multiHeadSelfAttention.d_model)
-        output = self.multiHeadSelfAttention.fc(output)
-        output = self.multiHeadSelfAttention.dropout(output)
-        return output, k_cache, v_cache
-class DecoderMultiHeadCrossAttention(nn.Module):
-    def __init__(self, multiHeadCrossAttention: DecoderMultiHeadAttention):
-        super().__init__()
-        self.multiHeadCrossAttention = multiHeadCrossAttention
-    def forward(self,
-                x: Tensor,
-                k: Tensor,
-                v: Tensor,
-                mask: Tensor):
-        bs = x.size(0)
-        x = self.multiHeadCrossAttention.w_qs(x)
-        x = x.view(bs, -1, self.multiHeadCrossAttention.n_head, self.multiHeadCrossAttention.d_k)
-        k = k.view(bs, -1, self.multiHeadCrossAttention.n_head, self.multiHeadCrossAttention.d_k)
-        v = v.view(bs, -1, self.multiHeadCrossAttention.n_head, self.multiHeadCrossAttention.d_k)
-        x = x.transpose(1, 2)
-        k = k.transpose(1, 2)
-        v = v.transpose(1, 2)
-        if mask is not None:
-            mask = mask.unsqueeze(1)
-        output = self.multiHeadCrossAttention.attention(x, k, v, mask)
-        output = output.transpose(1, 2).contiguous().view(bs, -1, self.multiHeadCrossAttention.d_model)
-        output = self.multiHeadCrossAttention.fc(output)
-        output = self.multiHeadCrossAttention.dropout(output)
-        return output
-class ResidualAttentionBlockTensorCache(nn.Module):
-    def __init__(self, decoder_layer: DecoderLayer, loop: bool = False):
-        super().__init__()
-        self.original_decoder_layer = decoder_layer
-        self.self_attn = DecoderMultiHeadSelfAttention(decoder_layer.self_attn, loop)
-        self.cross_attn = DecoderMultiHeadCrossAttention(decoder_layer.cross_attn)
-    def forward(self,
-                x: Tensor,
-                self_k_cache: Tensor,
-                self_v_cache: Tensor,
-                cross_k: Tensor,
-                cross_v: Tensor,
-                self_attn_mask: Tensor,
-                cross_attn_mask: Tensor):
-        # q.shape (B, 1, dim)
-        x_self_attn_norm = self.original_decoder_layer.self_attn_norm(x)
-        self_attn_x, self_k_cache_updated, self_v_cache_updated = self.self_attn(
-            x_self_attn_norm, self_k_cache, self_v_cache, self_attn_mask)
-        x = x + self_attn_x
-        residual = x
-        x_cross_attn_norm = self.original_decoder_layer.cross_attn_norm(x)
-        x_cross_attn = self.cross_attn(x_cross_attn_norm, cross_k, cross_v, cross_attn_mask)
-        x = residual + x_cross_attn
-        x = x + self.original_decoder_layer.mlp(self.original_decoder_layer.mlp_norm(x))
-        return x, self_k_cache_updated, self_v_cache_updated
-class ResidualAttentionBlockTensorCacheV2(nn.Module):
-    def __init__(self, decoder_layer: DecoderLayer, loop: bool = False):
-        super().__init__()
-        self.original_decoder_layer = decoder_layer
-        self.self_attn = DecoderMultiHeadSelfAttentionV2(decoder_layer.self_attn, loop)
-        self.cross_attn = DecoderMultiHeadCrossAttention(decoder_layer.cross_attn)
-    def forward(self,
-                x: Tensor,
-                self_k_cache: Tensor,
-                self_v_cache: Tensor,
-                cross_k: Tensor,
-                cross_v: Tensor,
-                self_attn_mask: Tensor,
-                cross_attn_mask: Tensor):
-        # q.shape (B, 1, dim)
-        x_self_attn_norm = self.original_decoder_layer.self_attn_norm(x)
-        self_attn_x, self_k_cache_updated, self_v_cache_updated = self.self_attn(
-            x_self_attn_norm, self_k_cache, self_v_cache, self_attn_mask)
-        x = x + self_attn_x
-        residual = x
-        x_cross_attn_norm = self.original_decoder_layer.cross_attn_norm(x)
-        x_cross_attn = self.cross_attn(x_cross_attn_norm, cross_k, cross_v, cross_attn_mask)
-        x = residual + x_cross_attn
-        x = x + self.original_decoder_layer.mlp(self.original_decoder_layer.mlp_norm(x))
-        return x, self_k_cache_updated, self_v_cache_updated
-class TextDecoderTensorCache(nn.Module):
-    def __init__(self, decoder: TransformerDecoder):
-        super().__init__()
-        self.decoder = decoder
-        self.blocks = []
-        for original_layer in self.decoder.layer_stack:
-            self.blocks.append(
-                ResidualAttentionBlockTensorCache(original_layer))
-    def forward(self,
-                tokens: Tensor,
-                n_layer_self_k_cache: Tensor,
-                n_layer_self_v_cache: Tensor,
-                n_layer_cross_k: Tensor,
-                n_layer_cross_v: Tensor,
-                offset: Tensor,
-                self_attn_mask: Tensor,
-                cross_attn_mask: Tensor):
-        """
-        TODO(Lianghu): Integrate self_attn_mask into the model inference process
-              instead of passing it in through an external interface.
-        """
-        x = self.decoder.dropout(
-            self.decoder.tgt_word_emb(tokens) * self.decoder.scale +
-            self.decoder.positional_encoding(offset + 1)
-        )
-        i = 0
-        for block in self.blocks:
-            self_k_cache = n_layer_self_k_cache[i, :, : offset[0] + tokens.shape[-1], :]
-            self_v_cache = n_layer_self_v_cache[i, :, : offset[0] + tokens.shape[-1], :]
-            x, self_k_cache, self_v_cache = block(
-                x,
-                self_k_cache,
-                self_v_cache,
-                n_layer_cross_k[i],
-                n_layer_cross_v[i],
-                self_attn_mask,
-                cross_attn_mask
-            )
-            n_layer_self_k_cache[i, :, : offset[0] + tokens.shape[-1], :] = self_k_cache
-            n_layer_self_v_cache[i, :, : offset[0] + tokens.shape[-1], :] = self_v_cache
-            i += 1
-        output = self.decoder.layer_norm_out(x)
-        logits = self.decoder.tgt_word_prj(output)
-        return logits, n_layer_self_k_cache, n_layer_self_v_cache
-class TextDecoderTensorCacheV2(nn.Module):
-    def __init__(self, decoder: TransformerDecoder, loop: bool = False):
-        super().__init__()
-        self.decoder = decoder
-        self.loop = loop
-        self.blocks = []
-        for original_layer in self.decoder.layer_stack:
-            self.blocks.append(
-                ResidualAttentionBlockTensorCacheV2(original_layer, loop))
-    def forward(self,
-                tokens: Tensor,
-                n_layer_self_k_cache: Tensor,
-                n_layer_self_v_cache: Tensor,
-                n_layer_cross_k: Tensor,
-                n_layer_cross_v: Tensor,
-                positional_embedding: Tensor,
-                self_attn_mask: Tensor,
-                cross_attn_mask: Tensor):
-        """
-        TODO(Lianghu): Integrate self_attn_mask into the model inference process
-              instead of passing it in through an external interface.
-        """
-        x = self.decoder.dropout(
-            self.decoder.tgt_word_emb(tokens) * self.decoder.scale +
-            positional_embedding
-        )
-        # if self.loop:
-        #     x = self.decoder.dropout(
-        #         self.decoder.tgt_word_emb(tokens) * self.decoder.scale +
-        #         positional_embedding
-        #     )
-        # else:
-        #     x = self.decoder.dropout(
-        #         self.decoder.tgt_word_emb(tokens) * self.decoder.scale +
-        #         self.decoder.positional_encoding.pe[:, : tokens.shape[-1]]
-        #     )
-        i = 0
-        self_k_cache_out = []
-        self_v_cache_out = []
-        for block in self.blocks:
-            self_k_cache = n_layer_self_k_cache[i, :, :, :]
-            self_v_cache = n_layer_self_v_cache[i, :, :, :]
-            if self.loop:
-                x, self_k_cache, self_v_cache = block(
-                    x,
-                    self_k_cache,
-                    self_v_cache,
-                    n_layer_cross_k[i],
-                    n_layer_cross_v[i],
-                    self_attn_mask,
-                    cross_attn_mask
-                )
-                self_k_cache_out.append(self_k_cache.unsqueeze(0))
-                self_v_cache_out.append(self_v_cache.unsqueeze(0))
-            else:
-                n_audio, n_text_ctx, ntext_state = self_k_cache.shape
-                x, self_k_cache, self_v_cache = block(
-                    x,
-                    self_k_cache,
-                    self_v_cache,
-                    n_layer_cross_k[i],
-                    n_layer_cross_v[i],
-                    self_attn_mask,
-                    cross_attn_mask
-                )
-                self_k_cache_out.append(torch.cat((torch.zeros([n_audio, n_text_ctx - self_k_cache.shape[1], ntext_state]).to(self_k_cache.device), self_k_cache), 1).unsqueeze(0))
-                self_v_cache_out.append(torch.cat((torch.zeros([n_audio, n_text_ctx - self_v_cache.shape[1], ntext_state]).to(self_v_cache.device), self_v_cache), 1).unsqueeze(0))
-            i += 1
-        n_layer_self_k_cache = torch.cat(self_k_cache_out, 0)
-        n_layer_self_v_cache = torch.cat(self_v_cache_out, 0)
-        output = self.decoder.layer_norm_out(x)
-        logits = self.decoder.tgt_word_prj(output)
-        return logits, n_layer_self_k_cache, n_layer_self_v_cache

model_convert/to_onnx.py DELETED Viewed

@@ -1,525 +0,0 @@
-import model_wrapper
-from fireredasr.models.fireredasr import FireRedAsrAed
-import torch
-import onnx
-import onnxruntime
-from onnxruntime.quantization import QuantType, quantize_dynamic
-import onnxslim
-from onnx.external_data_helper import convert_model_to_external_data
-import numpy as np
-import math
-import kaldiio
-import os
-import argparse
-from typing import Dict, Any
-def to_numpy(tensor):
-    if tensor.requires_grad:
-        return tensor.detach().cpu().numpy()
-    else:
-        return tensor.cpu().numpy()
-def load_model(model_path):
-    package = torch.load(model_path,
-                         map_location=lambda storage,
-                         loc: storage, weights_only=False)
-    model = FireRedAsrAed.from_args(package["args"])
-    model.load_state_dict(package["model_state_dict"], strict=True)
-    return model, package["args"]
-def read_kaldi_cmvn(kaldi_cmvn_file):
-    assert os.path.exists(kaldi_cmvn_file)
-    stats = kaldiio.load_mat(kaldi_cmvn_file)
-    assert stats.shape[0] == 2
-    dim = stats.shape[-1] - 1
-    count = stats[0, dim]
-    assert count >= 1
-    floor = 1e-20
-    means = []
-    inverse_std_variences = []
-    for d in range(dim):
-        mean = stats[0, d] / count
-        means.append(mean.item())
-        varience = (stats[1, d] / count) - mean*mean
-        if varience < floor:
-            varience = floor
-        istd = 1.0 / math.sqrt(varience)
-        inverse_std_variences.append(istd)
-    return means, inverse_std_variences
-def add_meta_data(filename: str, meta_data: Dict[str, Any]):
-    """Add meta data to an ONNX model. It is changed in-place.
-    Args:
-      filename:
-        Filename of the ONNX model to be changed.
-      meta_data:
-        Key-value pairs.
-    """
-    model = onnx.load(filename)
-    while len(model.metadata_props):
-        model.metadata_props.pop()
-    for key, value in meta_data.items():
-        meta = model.metadata_props.add()
-        meta.key = key
-        meta.value = str(value)
-    onnx.save(model, filename)
-def calc_feat_len(audio_dur):
-    import math
-    sample_rate = 16000
-    frame_length = 25 * sample_rate / 1000
-    frame_shift = 10 * sample_rate / 1000
-    length = math.floor((audio_dur * sample_rate - frame_length) / frame_shift) + 1
-    return length
-def export_encoder(fireredasr_model, args, model_args):
-    encoder = model_wrapper.AudioEncoderTensorCache(
-        fireredasr_model.encoder,
-        fireredasr_model.decoder)
-    encoder.eval()
-    # forge encoder input
-    encoder_input = torch.randn(1, calc_feat_len(10), 80)
-    encoder_input_lengths = torch.tensor([100], dtype=torch.int64)
-    n_layer_cross_k, n_layer_cross_v, cross_attn_mask = encoder(
-        encoder_input,
-        encoder_input_lengths
-    )
-    if not os.path.exists(args.encoder):
-        os.makedirs(args.encoder)
-    onnx_encoder_file = os.path.join(args.encoder, "encoder.onnx")
-    with torch.no_grad():
-        torch.onnx.export(
-            encoder,
-            (encoder_input, encoder_input_lengths),
-            onnx_encoder_file,
-            export_params=True,
-            do_constant_folding=True,
-            opset_version=16,
-            verbose=False,
-            input_names=["encoder_input",
-                        "encoder_input_lengths"],
-            output_names=["n_layer_cross_k",
-                        "n_layer_cross_v",
-                        "cross_attn_mask"],
-            # dynamic_axes={
-            #     "encoder_input": {
-            #         0: "batch_size",
-            #         1: "input_length"
-            #     },
-            #     "encoder_input_lengths": {
-            #         0: "batch_size"
-            #     },
-            #     "n_layer_cross_k": {
-            #         1: "batch_size",
-            #         2: "length"
-            #     },
-            #     "n_layer_cross_v": {
-            #         1: "batch_size",
-            #         2: "length"
-            #     },
-            #     "cross_attn_mask": {
-            #         0: "batch_size",
-            #         2: "length"
-            #     }
-            # },
-            external_data=True
-        )
-    external_filename = os.path.basename(onnx_encoder_file).split(".onnx")[0]
-    model = onnx.load(onnx_encoder_file)
-    convert_model_to_external_data(
-        model,
-        all_tensors_to_one_file=True,
-        location=f"./{external_filename}.data",
-        size_threshold=0,
-        convert_attribute=False
-    )
-    onnx.save_model(
-        model,
-        onnx_encoder_file,
-        save_as_external_data=True,
-        all_tensors_to_one_file=True,
-        location=f"./{external_filename}.data",
-        size_threshold=0
-    )
-    onnx.checker.check_model(onnx_encoder_file, True)
-    ort_session = onnxruntime.InferenceSession(onnx_encoder_file)
-    onnx_encoder_input = to_numpy(encoder_input)
-    onxx_encoder_input_lengths = to_numpy(encoder_input_lengths)
-    ort_inputs = {ort_session.get_inputs()[0].name: onnx_encoder_input,
-                  ort_session.get_inputs()[1].name: onxx_encoder_input_lengths}
-    ort_outputs = ort_session.run(None, ort_inputs)
-    try:
-        np.testing.assert_allclose(to_numpy(n_layer_cross_k), ort_outputs[0], rtol=1e-03, atol=1e-05)
-    except AssertionError as e:
-        print(e)
-    try:
-        np.testing.assert_allclose(to_numpy(n_layer_cross_v), ort_outputs[1], rtol=1e-03, atol=1e-05)
-    except AssertionError as e:
-        print(e)
-    try:
-        np.testing.assert_allclose(to_numpy(cross_attn_mask), ort_outputs[2], rtol=1e-03, atol=1e-05)
-    except AssertionError as e:
-        print(e)
-    print("export onnx encoder done.")
-    # Generate int8 quantization models
-    # See https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html#data-type-selection
-    print("Generate int8 quantization models")
-    if not os.path.exists(args.encoder_int8):
-        os.mkdir(args.encoder_int8)
-    onnx_encoder_int8_file = "encoder_int8.onnx"
-    onnx_encoder_int8_file = os.path.join(args.encoder_int8, onnx_encoder_int8_file)
-    quantize_dynamic(
-        model_input=onnx_encoder_file,
-        model_output=onnx_encoder_int8_file,
-        op_types_to_quantize=["MatMul"],
-        weight_type=QuantType.QInt8,
-    )
-    cmvn_mean, cmvn_inv_stddev = read_kaldi_cmvn(args.cmvn)
-    cmvn_mean = [str(m) for m in cmvn_mean]
-    cmvn_inv_stddev = [str(istd) for istd in cmvn_inv_stddev]
-    encoder_meta_data = {
-        "model_type": "FireRedAsrAED-L",
-        "maintainer": "LiangHu",
-        "feat_dim": model_args.idim,
-        "feat_type": "fbank",
-        "num_decoder_layers": model_args.n_layers_dec,
-        "num_head": model_args.n_head,
-        "head_dim": model_args.d_model // model_args.n_head,
-        "max_len": 448,
-        "sos": model_args.sos_id,
-        "eos": model_args.eos_id,
-        "cmvn_mean": ','.join(cmvn_mean),
-        "cmvn_inv_stddev": ','.join(cmvn_inv_stddev)
-    }
-    # add_meta_data(onnx_encoder_file, encoder_meta_data)
-    add_meta_data(onnx_encoder_int8_file, encoder_meta_data)
-    return n_layer_cross_k, n_layer_cross_v, cross_attn_mask
-def export_decoder(fireredasr_model, args,
-                   n_layer_cross_k,
-                   n_layer_cross_v,
-                   cross_attn_mask):
-    beam_size = 3
-    decoder = model_wrapper.TextDecoderTensorCache(
-        fireredasr_model.decoder)
-    decoder.eval()
-    num_layer, batch_size, Ti, encoder_out_dim = n_layer_cross_k.shape
-    encoder_out_length = cross_attn_mask.size(-1)
-    # preparing for batch beam search
-    cross_attn_mask = cross_attn_mask.unsqueeze(1).repeat(
-        1, beam_size, 1, 1).view(beam_size * batch_size, -1, encoder_out_length)
-    n_layer_cross_k = n_layer_cross_k.unsqueeze(2).repeat(
-        1, 1, beam_size, 1, 1
-    ).view(num_layer, beam_size * batch_size, Ti, encoder_out_dim)
-    n_layer_cross_v = n_layer_cross_v.unsqueeze(2).repeat(
-        1, 1, beam_size, 1, 1
-    ).view(num_layer, beam_size * batch_size, Ti, encoder_out_dim)
-    tokens = torch.ones(beam_size * batch_size, 1).fill_(decoder.decoder.sos_id).long()
-    n_layer_self_k_cache = torch.zeros(
-        (
-            len(decoder.blocks),
-            batch_size * beam_size,
-            448,
-            1280
-        )
-    )
-    n_layer_self_v_cache = torch.zeros(
-        (
-            len(decoder.blocks),
-            batch_size * beam_size,
-            448,
-            1280
-        )
-    )
-    offset = torch.zeros(1, dtype=torch.int64)
-    self_attn_mask = torch.empty(batch_size * beam_size,
-                                 tokens.shape[-1], tokens.shape[-1]
-                                 ).fill_(-np.inf).triu_(1) # fill_(-np.inf)
-    self_attn_mask = self_attn_mask[:, -1:, :]
-    logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache = decoder(
-        tokens,
-        n_layer_self_k_cache,
-        n_layer_self_v_cache,
-        n_layer_cross_k,
-        n_layer_cross_v,
-        offset,
-        self_attn_mask,
-        cross_attn_mask
-    )
-    if not os.path.exists(args.decoder):
-        os.makedirs(args.decoder)
-    onnx_decoder_file = os.path.join(args.decoder, "decoder.onnx")
-    with torch.no_grad():
-        torch.onnx.export(
-            decoder,
-            (tokens,
-            n_layer_self_k_cache,
-            n_layer_self_v_cache,
-            n_layer_cross_k,
-            n_layer_cross_v,
-            offset,
-            self_attn_mask,
-            cross_attn_mask),
-            onnx_decoder_file,
-            export_params=True,
-            opset_version=13,
-            verbose=False,
-            input_names=["tokens",
-                        "in_n_layer_self_k_cache",
-                        "in_n_layer_self_v_cache",
-                        "n_layer_cross_k",
-                        "n_layer_cross_v",
-                        "offset",
-                        "self_attn_mask",
-                        "cross_attn_mask"],
-            output_names=["logits",
-                        "out_n_layer_self_k_cache",
-                        "out_n_layer_self_v_cache"],
-            dynamic_axes={
-                "tokens": {0: "n_audio", 1: "n_tokens"},
-                "in_n_layer_self_k_cache": {1: "n_audio"},
-                "in_n_layer_self_v_cache": {1: "n_audio"},
-                "n_layer_cross_k": {1: "n_audio", 2: "T"},
-                "n_layer_cross_v": {1: "n_audio", 2: "T"},
-                "self_attn_mask": {0: "n_audio", 2: "T"},
-                "cross_attn_mask": {0: "n_audio", 2: "T"},
-            },
-            external_data=True
-        )
-    onnx.checker.check_model(onnx_decoder_file)
-    ort_session = onnxruntime.InferenceSession(onnx_decoder_file)
-    onnx_tokens = to_numpy(tokens)
-    onnx_n_layer_self_k_cache = to_numpy(n_layer_self_k_cache)
-    onnx_n_layer_self_v_cache = to_numpy(n_layer_self_v_cache)
-    onnx_n_layer_cross_k = to_numpy(n_layer_cross_k)
-    onnx_n_layer_cross_v = to_numpy(n_layer_cross_v)
-    onnx_offset = to_numpy(offset)
-    onnx_self_attn_mask = to_numpy(self_attn_mask)
-    onnx_cross_attn_mask = to_numpy(cross_attn_mask)
-    ort_inputs = {ort_session.get_inputs()[0].name: onnx_tokens,
-                  ort_session.get_inputs()[1].name: onnx_n_layer_self_k_cache,
-                  ort_session.get_inputs()[2].name: onnx_n_layer_self_v_cache,
-                  ort_session.get_inputs()[3].name: onnx_n_layer_cross_k,
-                  ort_session.get_inputs()[4].name: onnx_n_layer_cross_v,
-                  ort_session.get_inputs()[5].name: onnx_offset,
-                  ort_session.get_inputs()[6].name: onnx_self_attn_mask,
-                  ort_session.get_inputs()[7].name: onnx_cross_attn_mask}
-    ort_outputs = ort_session.run(None, ort_inputs)
-    try:
-        np.testing.assert_allclose(to_numpy(logits), ort_outputs[0], rtol=1e-03, atol=1e-05)
-    except AssertionError as e:
-        print(e)
-    try:
-        np.testing.assert_allclose(to_numpy(out_n_layer_self_k_cache), ort_outputs[1], rtol=1e-03, atol=1e-05)
-    except AssertionError as e:
-        print(e)
-    try:
-        np.testing.assert_allclose(to_numpy(out_n_layer_self_v_cache), ort_outputs[2], rtol=1e-03, atol=1e-05)
-    except AssertionError as e:
-        print(e)
-    print("export onnx decoder done.")
-    if not os.path.exists(args.decoder_int8):
-        os.mkdir(args.decoder_int8)
-    onnx_decoder_int8_file = "decoder_int8.onnx"
-    onnx_decoder_int8_file = os.path.join(args.decoder_int8, onnx_decoder_int8_file)
-    quantize_dynamic(
-        model_input=onnx_decoder_file,
-        model_output=onnx_decoder_int8_file,
-        op_types_to_quantize=["MatMul"],
-        weight_type=QuantType.QInt8,
-    )
-    # decoder main
-    decoder = model_wrapper.TextDecoderTensorCacheV2(
-        fireredasr_model.decoder, loop=False)
-    decoder.eval()
-    self_attn_mask = torch.empty(batch_size * beam_size,
-                                 tokens.shape[-1], tokens.shape[-1]
-                                 ).fill_(-np.inf).triu_(1) # fill_(-np.inf)
-    self_attn_mask = self_attn_mask[:, -1:, :]
-    pe = decoder.decoder.positional_encoding.pe[0]
-    onnx_decoder_file = os.path.join(args.decoder, "decoder_main.onnx")
-    with torch.no_grad():
-        torch.onnx.export(
-            decoder,
-            (tokens,
-            n_layer_self_k_cache,
-            n_layer_self_v_cache,
-            n_layer_cross_k,
-            n_layer_cross_v,
-            pe[0],
-            self_attn_mask,
-            cross_attn_mask),
-            onnx_decoder_file,
-            export_params=True,
-            opset_version=13,
-            verbose=False,
-            input_names=["tokens",
-                        "in_n_layer_self_k_cache",
-                        "in_n_layer_self_v_cache",
-                        "n_layer_cross_k",
-                        "n_layer_cross_v",
-                        "pe",
-                        "self_attn_mask",
-                        "cross_attn_mask"],
-            output_names=["logits",
-                        "out_n_layer_self_k_cache",
-                        "out_n_layer_self_v_cache"],
-            # dynamic_axes={
-            #     "tokens": {0: "n_audio", 1: "n_tokens"},
-            #     "in_n_layer_self_k_cache": {1: "n_audio"},
-            #     "in_n_layer_self_v_cache": {1: "n_audio"},
-            #     "n_layer_cross_k": {1: "n_audio", 2: "T"},
-            #     "n_layer_cross_v": {1: "n_audio", 2: "T"},
-            #     "self_attn_mask": {0: "n_audio", 2: "T"},
-            #     "cross_attn_mask": {0: "n_audio", 2: "T"},
-            # },
-            external_data=True
-        )
-    print(f"Export decoder_main to {onnx_decoder_file}")
-    # decoder loop
-    decoder = model_wrapper.TextDecoderTensorCacheV2(
-        fireredasr_model.decoder, loop=True)
-    decoder.eval()
-    pe = decoder.decoder.positional_encoding.pe[0]
-    pe_file = os.path.join(args.decoder, "pe.npy")
-    np.save(pe_file, pe.numpy())
-    onnx_decoder_file = os.path.join(args.decoder, "decoder_loop.onnx")
-    with torch.no_grad():
-        torch.onnx.export(
-            decoder,
-            (tokens,
-            n_layer_self_k_cache,
-            n_layer_self_v_cache,
-            n_layer_cross_k,
-            n_layer_cross_v,
-            pe[0],
-            self_attn_mask,
-            cross_attn_mask),
-            onnx_decoder_file,
-            export_params=True,
-            opset_version=13,
-            verbose=False,
-            input_names=["tokens",
-                        "in_n_layer_self_k_cache",
-                        "in_n_layer_self_v_cache",
-                        "n_layer_cross_k",
-                        "n_layer_cross_v",
-                        "pe",
-                        "self_attn_mask",
-                        "cross_attn_mask"],
-            output_names=["logits",
-                        "out_n_layer_self_k_cache",
-                        "out_n_layer_self_v_cache"],
-            # dynamic_axes={
-            #     "tokens": {0: "n_audio", 1: "n_tokens"},
-            #     "in_n_layer_self_k_cache": {1: "n_audio"},
-            #     "in_n_layer_self_v_cache": {1: "n_audio"},
-            #     "n_layer_cross_k": {1: "n_audio", 2: "T"},
-            #     "n_layer_cross_v": {1: "n_audio", 2: "T"},
-            #     "self_attn_mask": {0: "n_audio", 2: "T"},
-            #     "cross_attn_mask": {0: "n_audio", 2: "T"},
-            # },
-            external_data=True
-        )
-    print(f"Export decoder_loop to {onnx_decoder_file}")
-def parse_args():
-    parser = argparse.ArgumentParser(description="export FireRedASR-AED torch model to onnx")
-    parser.add_argument(
-        "--model",
-        type=str,
-        required=True,
-        help="Path to FireRedASR-AED torch model"
-    )
-    parser.add_argument(
-        "--encoder",
-        type=str,
-        required=True,
-        help="Dir to the exported onnx encoder"
-    )
-    parser.add_argument(
-        "--decoder",
-        type=str,
-        required=True,
-        help="Dir to the exported onnx decoder"
-    )
-    parser.add_argument(
-        "--encoder_int8",
-        type=str,
-        required=True,
-        help="Dir to the exported onnx encoder after int8 quantization"
-    )
-    parser.add_argument(
-        "--decoder_int8",
-        type=str,
-        required=True,
-        help="Dir to the exported onnx encoder after int8 quantization"
-    )
-    parser.add_argument(
-        "--cmvn",
-        type=str,
-        required=True,
-        help="cmvn.ark file"
-    )
-    return parser.parse_args()
-def main():
-    args = parse_args()
-    fireredasr_model, model_args = load_model(args.model)
-    n_layer_cross_k, n_layer_cross_v, cross_attn_mask = export_encoder(fireredasr_model, args, model_args)
-    export_decoder(fireredasr_model, args, n_layer_cross_k, n_layer_cross_v, cross_attn_mask)
-if __name__ == "__main__":
-    main()

test_decoder.py DELETED Viewed

@@ -1,640 +0,0 @@
-from fireredasr.data.asr_feat import ASRFeatExtractor
-from fireredasr.tokenizer.aed_tokenizer import ChineseCharEnglishSpmTokenizer
-import onnxruntime as ort
-# import axengine as axe
-import torch
-import torch.nn.functional as F
-import numpy as np
-from torch import Tensor
-from typing import Tuple, List, Dict
-import argparse
-import os
-import time
-import logging
-# Configure the root logger: INFO level, with one stream handler that also
-# filters at INFO.
-logger = logging.getLogger()
-logger.setLevel(logging.INFO)
-logger_stream_hander = logging.StreamHandler()
-logger_stream_hander.setLevel("INFO")
-logger.addHandler(logger_stream_hander)
-# Large finite magnitude; -INF is used below as the score of beam slots that
-# must never be selected.
-INF = 1e10
-def to_numpy(tensor):
-    """Return `tensor` as a NumPy array.
-
-    NumPy arrays are passed through untouched. Anything else is treated as a
-    torch tensor: detached first when it requires grad, moved to CPU, and
-    converted with `.numpy()`.
-    """
-    if not isinstance(tensor, np.ndarray):
-        source = tensor.detach() if tensor.requires_grad else tensor
-        tensor = source.cpu().numpy()
-    return tensor
-def set_finished_beam_score_to_zero(scores, is_finished):
-    """Replace the candidate scores of finished rows with [0, -INF, -INF, ...].
-
-    Args:
-        scores: 2-D tensor of candidate scores, one row per hypothesis.
-        is_finished: flag tensor multiplied against `scores`; rows where it is
-            1 get the fixed pattern, rows where it is 0 keep their scores.
-            (assumes a shape broadcastable to `scores` - TODO confirm)
-
-    Returns:
-        A tensor shaped like `scores`.
-    """
-    num_rows, beam = scores.size()
-    finished = is_finished.float()
-    # Only the first candidate of a finished row stays selectable.
-    frozen_row = torch.tensor([0.0] + [-INF] * (beam - 1)).float()
-    frozen = frozen_row.view(1, beam).repeat(num_rows, 1)
-    unfinished = 1 - finished
-    return scores * unfinished + frozen * finished
-def set_finished_beam_y_to_eos(ys, is_finished, eos_id):
-    """Overwrite the token ids of finished rows with `eos_id`.
-
-    `is_finished` is cast to long and used as a 0/1 multiplier: rows flagged 1
-    become `eos_id`, rows flagged 0 keep their values from `ys`.
-    """
-    finished = is_finished.long()
-    unfinished = 1 - finished
-    return ys * unfinished + eos_id * finished
-class FireRedASROnnxModel:
-    def __init__(
-        self,
-        encoder_path: str,
-        decoder_path: str,
-        cmvn_file: str,
-        dict_file: str,
-        spm_model_path: str,
-        providers=['CPUExecutionProvider']
-    ):
-        """Set the decoding constants and load the decoder-side resources.
-
-        Only the decoder_main / decoder_loop sessions and the positional
-        encoding table are loaded. `encoder_path` is accepted but not used
-        here, and `self.encoder` / `self.decoder` are left as None.
-        """
-        # NOTE: maximum decoding length, chosen with reference to Whisper.
-        # The longest audio supported by the FireRedASR-AED model is 60s.
-        # ref: https://github.com/FireRedTeam/FireRedASR?tab=readme-ov-file#input-length-limitations
-        self.decode_max_len = 448
-        self.decoder_hidden_dim, self.num_decoder_blocks = 1280, 16
-        # Special token ids.
-        self.blank_id, self.pad_id, self.sos_id, self.eos_id = 0, 2, 3, 4
-        # Single-threaded ONNX Runtime options shared by the sessions below.
-        ort_options = ort.SessionOptions()
-        ort_options.intra_op_num_threads = 1
-        ort_options.inter_op_num_threads = 1
-        self.session_opts = ort_options
-        self.feature_extractor = ASRFeatExtractor(cmvn_file)
-        self.tokenizer = ChineseCharEnglishSpmTokenizer(dict_file, spm_model_path)
-        self.encoder = self.decoder = None
-        self.init_decoder_main(decoder_path, providers)
-        self.init_decoder_loop(decoder_path, providers)
-        self.pe = self.init_pe(decoder_path)
-    # def init_encoder(self, encoder_path, providers=None):
-    #     start_time = time.time()
-    #     self.encoder = axe.InferenceSession(
-    #         encoder_path,
-    #         # sess_options=self.session_opts,
-    #         providers=providers
-    #     )
-    #     end_time = time.time()
-    #     logger.info(f"load encoder cost {end_time - start_time} seconds")
-    def init_decoder(self, decoder_path, providers=None):
-        """Open an ONNX Runtime session for `decoder_path` as `self.decoder`.
-
-        Uses the shared `self.session_opts` and logs how long loading took.
-        """
-        began = time.time()
-        self.decoder = ort.InferenceSession(
-            decoder_path, sess_options=self.session_opts, providers=providers
-        )
-        elapsed = time.time() - began
-        logger.info(f"load decoder cost {elapsed} seconds")
-    def init_decoder_main(self, decoder_path, providers=None):
-        """Load decoder_main.onnx from the directory containing `decoder_path`.
-
-        The session is stored as `self.decoder_main`; the load time is logged
-        and the session's input names are printed.
-        """
-        main_graph = os.path.join(os.path.dirname(decoder_path), "decoder_main.onnx")
-        began = time.time()
-        self.decoder_main = ort.InferenceSession(
-            main_graph, sess_options=self.session_opts, providers=providers
-        )
-        elapsed = time.time() - began
-        logger.info(f"load decoder_main cost {elapsed} seconds")
-        print(f"decoder_main.input_names: {[meta.name for meta in self.decoder_main.get_inputs()]}")
-    def init_decoder_loop(self, decoder_path, providers=None):
-        """Load decoder_loop.onnx from the directory containing `decoder_path`.
-
-        The session is stored as `self.decoder_loop`; the load time is logged
-        and the session's input names are printed.
-        """
-        loop_graph = os.path.join(os.path.dirname(decoder_path), "decoder_loop.onnx")
-        began = time.time()
-        self.decoder_loop = ort.InferenceSession(
-            loop_graph, sess_options=self.session_opts, providers=providers
-        )
-        elapsed = time.time() - began
-        logger.info(f"load decoder_loop cost {elapsed} seconds")
-        print(f"decoder_loop.input_names: {[meta.name for meta in self.decoder_loop.get_inputs()]}")
-    def init_pe(self, decoder_path):
-        """Load the array stored in pe.npy next to `decoder_path`."""
-        pe_file = os.path.join(os.path.dirname(decoder_path), "pe.npy")
-        return np.load(pe_file)
-    def run_encoder(self, input: np.ndarray,
-                    input_length: np.ndarray
-    ) -> Tuple[Tensor, Tensor, Tensor]:
-        """Run the encoder session and return its three outputs.
-
-        `input` is fed as "encoder_input" and `input_length`, cast to int32,
-        as "encoder_input_lengths". The outputs are returned as
-        (n_layer_cross_k, n_layer_cross_v, cross_attn_mask).
-
-        NOTE: `__init__` leaves `self.encoder` as None, so a session must be
-        assigned before this method is called.
-        """
-        feeds = {
-            "encoder_input": input,
-            "encoder_input_lengths": input_length.astype(np.int32),
-        }
-        cross_k, cross_v, cross_mask = self.encoder.run(None, feeds)
-        return cross_k, cross_v, cross_mask
-    def decode_one_token(
-        self,
-        tokens: np.ndarray,
-        n_layer_self_k_cache: np.ndarray,
-        n_layer_self_v_cache: np.ndarray,
-        n_layer_cross_k_cache: np.ndarray,
-        n_layer_cross_v_cache: np.ndarray,
-        offset: np.ndarray,
-        self_attn_mask: np.ndarray,
-        cross_attn_mask: np.ndarray
-    ) -> Tuple[Tensor, Tensor, Tensor]:
-        """Run one step of the single-graph decoder `self.decoder`.
-
-        The eight arguments are bound, in order, to the session's first eight
-        inputs; each argument's shape is printed first. Returns the session's
-        three outputs: (logits, updated self K cache, updated self V cache).
-        """
-        labelled_feeds = (
-            ("tokens", tokens),
-            ("n_layer_self_k_cache", n_layer_self_k_cache),
-            ("n_layer_self_v_cache", n_layer_self_v_cache),
-            ("n_layer_cross_k_cache", n_layer_cross_k_cache),
-            ("n_layer_cross_v_cache", n_layer_cross_v_cache),
-            ("offset", offset),
-            ("self_attn_mask", self_attn_mask),
-            ("cross_attn_mask", cross_attn_mask),
-        )
-        print("decode:")
-        for label, array in labelled_feeds:
-            print(f"{label}.shape: {array.shape}")
-        # Inputs are bound by position, using the names the session reports.
-        session_inputs = self.decoder.get_inputs()
-        feeds = {
-            session_inputs[position].name: array
-            for position, (_, array) in enumerate(labelled_feeds)
-        }
-        logits, next_self_k, next_self_v = self.decoder.run(None, feeds)
-        return logits, next_self_k, next_self_v
-    def decode_main_one_token(
-        self,
-        tokens: np.ndarray,
-        n_layer_self_k_cache: np.ndarray,
-        n_layer_self_v_cache: np.ndarray,
-        n_layer_cross_k_cache: np.ndarray,
-        n_layer_cross_v_cache: np.ndarray,
-        pe: np.ndarray,
-        self_attn_mask: np.ndarray,
-        cross_attn_mask: np.ndarray
-    ) -> Tuple[Tensor, Tensor, Tensor]:
-        """Run one step of `self.decoder_main`.
-
-        Only six values are fed, bound by position to the session's first six
-        inputs: tokens, cross K, cross V, pe, self-attention mask and
-        cross-attention mask. The two self-cache arguments are accepted but
-        not passed to the session. Returns the session's three outputs:
-        (logits, self K cache, self V cache).
-        """
-        ordered_feeds = (
-            tokens,
-            n_layer_cross_k_cache,
-            n_layer_cross_v_cache,
-            pe,
-            self_attn_mask,
-            cross_attn_mask,
-        )
-        session_inputs = self.decoder_main.get_inputs()
-        feeds = {
-            session_inputs[position].name: array
-            for position, array in enumerate(ordered_feeds)
-        }
-        logits, next_self_k, next_self_v = self.decoder_main.run(None, feeds)
-        return logits, next_self_k, next_self_v
-    def decode_loop_one_token(
-        self,
-        tokens: np.ndarray,
-        n_layer_self_k_cache: np.ndarray,
-        n_layer_self_v_cache: np.ndarray,
-        n_layer_cross_k_cache: np.ndarray,
-        n_layer_cross_v_cache: np.ndarray,
-        pe: np.ndarray,
-        self_attn_mask: np.ndarray,
-        cross_attn_mask: np.ndarray
-    ) -> Tuple[Tensor, Tensor, Tensor]:
-        """Run one step of `self.decoder_loop`.
-
-        The eight arguments are bound, in order, to the session's first eight
-        inputs. Returns the session's three outputs:
-        (logits, updated self K cache, updated self V cache).
-        """
-        ordered_feeds = (
-            tokens,
-            n_layer_self_k_cache,
-            n_layer_self_v_cache,
-            n_layer_cross_k_cache,
-            n_layer_cross_v_cache,
-            pe,
-            self_attn_mask,
-            cross_attn_mask,
-        )
-        session_inputs = self.decoder_loop.get_inputs()
-        feeds = {
-            session_inputs[position].name: array
-            for position, array in enumerate(ordered_feeds)
-        }
-        logits, next_self_k, next_self_v = self.decoder_loop.run(None, feeds)
-        return logits, next_self_k, next_self_v
-    def run_decoder(
-        self,
-        n_layer_cross_k,
-        n_layer_cross_v,
-        cross_attn_mask,
-        beam_size,
-        nbest
-    ):
-        """Beam-search decode from precomputed encoder cross-attention tensors.
-
-        Args:
-            n_layer_cross_k: NumPy array unpacked as
-                (num_layer, batch_size, Ti, encoder_out_dim).
-            n_layer_cross_v: NumPy array viewed with the same layout as
-                `n_layer_cross_k`.
-            cross_attn_mask: NumPy array whose last dimension is the encoder
-                output length. (assumes (batch, 1, T) - TODO confirm)
-            beam_size: number of hypotheses kept per utterance.
-            nbest: number of hypotheses returned per utterance.
-
-        Returns:
-            One list per utterance holding `nbest` dicts with keys
-            "token_ids" (tensor without the leading sos token) and "score".
-        """
-        num_layer, batch_size, Ti, encoder_out_dim = n_layer_cross_k.shape
-        encoder_out_length = cross_attn_mask.shape[-1]
-        # Replicate the encoder-side tensors once per beam so that every
-        # hypothesis row has its own copy.
-        cross_attn_mask = torch.from_numpy(cross_attn_mask).to(torch.float32)
-        cross_attn_mask = cross_attn_mask.unsqueeze(1).repeat(
-            1, beam_size, 1, 1
-        ).view(beam_size * batch_size, -1, encoder_out_length)
-        n_layer_cross_k = torch.from_numpy(n_layer_cross_k)
-        n_layer_cross_v = torch.from_numpy(n_layer_cross_v)
-        n_layer_cross_k = n_layer_cross_k.unsqueeze(2).repeat(
-            1, 1, beam_size, 1, 1
-        ).view(num_layer, beam_size * batch_size, Ti, encoder_out_dim)
-        n_layer_cross_v = n_layer_cross_v.unsqueeze(2).repeat(
-            1, 1, beam_size, 1, 1
-        ).view(num_layer, beam_size * batch_size, Ti, encoder_out_dim)
-        # Every hypothesis starts with the sos token.
-        prediction_tokens = torch.ones(
-            beam_size * batch_size, 1).fill_(self.sos_id).long()
-        tokens = prediction_tokens
-        # Decoding step counter; used to index the positional-encoding table.
-        offset = torch.zeros(1, dtype=torch.int64)
-        n_layer_self_k_cache, n_layer_self_v_cache = self.get_initialized_self_cache(
-            batch_size, beam_size
-        )
-        # Only the first beam of each utterance starts at score 0; the others
-        # start at -INF.
-        scores = torch.tensor([0.0] + [-INF]*(beam_size - 1)).float()
-        scores = scores.repeat(batch_size).view(batch_size * beam_size, 1)
-        is_finished = torch.zeros_like(scores)
-        # self_attn_mask = torch.zeros(
-        #     batch_size * beam_size,
-        #     1, 1
-        # )
-        # An all-zero mask of shape (rows, 1, 1) is passed unchanged at every step.
-        self_attn_mask = np.zeros((batch_size * beam_size, 1, 1), dtype=np.float32)
-        # NOTE(review): `results` is not read again in this method.
-        results = [self.sos_id]
-        for i in range(self.decode_max_len):
-            # self_attn_mask = torch.empty(
-            #     batch_size * beam_size,
-            #     prediction_tokens.shape[-1], prediction_tokens.shape[-1]
-            # ).fill_(-np.inf).triu_(1)
-            # self_attn_mask = self_attn_mask[:, -1:, :]
-            # self_attn_mask = to_numpy(self_attn_mask)
-            # logits, n_layer_self_k_cache, n_layer_self_v_cache = self.decode_one_token(
-            #     to_numpy(tokens),
-            #     to_numpy(n_layer_self_k_cache),
-            #     to_numpy(n_layer_self_v_cache),
-            #     to_numpy(n_layer_cross_k),
-            #     to_numpy(n_layer_cross_v),
-            #     to_numpy(offset),
-            #     to_numpy(self_attn_mask),
-            #     to_numpy(cross_attn_mask)
-            # )
-            # The ONNX sessions are fed NumPy arrays; convert any torch state.
-            tokens = to_numpy(tokens)
-            n_layer_self_k_cache = to_numpy(n_layer_self_k_cache)
-            n_layer_self_v_cache = to_numpy(n_layer_self_v_cache)
-            n_layer_cross_k = to_numpy(n_layer_cross_k)
-            n_layer_cross_v = to_numpy(n_layer_cross_v)
-            cross_attn_mask = to_numpy(cross_attn_mask)
-            # The first step runs decoder_main; every later step runs
-            # decoder_loop, which also receives the self-attention caches.
-            if i == 0:
-                logits, n_layer_self_k_cache, n_layer_self_v_cache = self.decode_main_one_token(
-                    to_numpy(tokens),
-                    to_numpy(n_layer_self_k_cache),
-                    to_numpy(n_layer_self_v_cache),
-                    to_numpy(n_layer_cross_k),
-                    to_numpy(n_layer_cross_v),
-                    self.pe[offset],
-                    self_attn_mask,
-                    to_numpy(cross_attn_mask)
-                )
-            else:
-                logits, n_layer_self_k_cache, n_layer_self_v_cache = self.decode_loop_one_token(
-                    to_numpy(tokens),
-                    to_numpy(n_layer_self_k_cache),
-                    to_numpy(n_layer_self_v_cache),
-                    to_numpy(n_layer_cross_k),
-                    to_numpy(n_layer_cross_v),
-                    self.pe[offset],
-                    self_attn_mask,
-                    to_numpy(cross_attn_mask)
-                )
-            offset += 1
-            logits = torch.from_numpy(logits)
-            logits = logits.squeeze(1)
-            # Per-row top-B candidates for this step.
-            t_scores = F.log_softmax(logits, dim=-1)
-            t_topB_scores, t_topB_ys = torch.topk(t_scores, k=beam_size, dim=1)
-            # Finished rows contribute a single candidate: eos with score 0.
-            t_topB_scores = set_finished_beam_score_to_zero(t_topB_scores, is_finished)
-            t_topB_ys = set_finished_beam_y_to_eos(t_topB_ys, is_finished, self.eos_id)
-            # Accumulate, then pick the best B of the B*B candidates per utterance.
-            scores = scores + t_topB_scores
-            scores = scores.view(batch_size, beam_size * beam_size)
-            scores, topB_score_ids = torch.topk(scores, k=beam_size, dim=1)
-            scores = scores.view(-1, 1)
-            # Map each winning candidate back to the row it extends. torch.div
-            # is called without a rounding mode; the .long() below drops the
-            # fractional part.
-            topB_row_number_in_each_B_rows_of_ys = torch.div(
-                topB_score_ids, beam_size).view(batch_size * beam_size)
-            stride = beam_size * torch.arange(batch_size).view(
-                batch_size, 1).repeat(1, beam_size).view(batch_size * beam_size)
-            topB_row_number_in_ys = topB_row_number_in_each_B_rows_of_ys.long() + stride.long()
-            # Reorder the histories to follow the surviving beams and append
-            # the newly chosen token.
-            prediction_tokens = prediction_tokens[topB_row_number_in_ys]
-            t_ys = torch.gather(
-                t_topB_ys.view(batch_size, beam_size * beam_size),
-                dim=1, index=topB_score_ids
-            ).view(beam_size * batch_size, 1)
-            tokens = t_ys
-            prediction_tokens = torch.cat((prediction_tokens, t_ys), dim=1)
-            # Reorder the per-layer self-attention caches the same way.
-            # NOTE: these loops reuse the name `i`; the outer `for` rebinds it
-            # at the start of the next step.
-            n_layer_self_k_cache = torch.from_numpy(n_layer_self_k_cache)
-            n_layer_self_v_cache = torch.from_numpy(n_layer_self_v_cache)
-            for i, self_k_cache in enumerate(n_layer_self_k_cache):
-                n_layer_self_k_cache[i] = n_layer_self_k_cache[i][topB_row_number_in_ys]
-            for i, self_v_cache in enumerate(n_layer_self_v_cache):
-                n_layer_self_v_cache[i] = n_layer_self_v_cache[i][topB_row_number_in_ys]
-            # Stop once every row has just emitted eos.
-            is_finished = t_ys.eq(self.eos_id)
-            if is_finished.sum().item() == beam_size * batch_size:
-                break
-        scores = scores.view(batch_size, beam_size)
-        # Count of non-eos tokens per hypothesis (the leading sos is counted).
-        prediction_valid_token_lengths = torch.sum(
-            torch.ne(
-                prediction_tokens.view(batch_size, beam_size, -1),
-                self.eos_id),
-            dim=-1
-        ).int()
-        # Select the nbest hypotheses per utterance and gather their tokens
-        # and lengths from the flattened (batch * beam) layout.
-        nbest_scores, nbest_ids = torch.topk(scores, k=nbest, dim=1)
-        index = nbest_ids + beam_size * torch.arange(batch_size).view(batch_size, 1).long()
-        nbest_prediction_tokens = prediction_tokens.view(batch_size * beam_size, -1)[index.view(-1)]
-        nbest_prediction_tokens = nbest_prediction_tokens.view(batch_size, nbest_ids.size(1), -1)
-        nbest_prediction_valid_token_lengths = prediction_valid_token_lengths.view(
-            batch_size * beam_size)[index.view(-1)].view(batch_size, -1)
-        nbest_hyps: List[List[Dict[str, torch.Tensor]]] = []
-        for i in range(batch_size):
-            i_best_hyps: List[Dict[str, torch.Tensor]] = []
-            for j, score in enumerate(nbest_scores[i]):
-                # The slice starts at 1 to drop sos and stops at the valid length.
-                hyp = {
-                    "token_ids": nbest_prediction_tokens[i, j, 1:nbest_prediction_valid_token_lengths[i, j]],
-                    "score": score
-                }
-                i_best_hyps.append(hyp)
-            nbest_hyps.append(i_best_hyps)
-        return nbest_hyps
-    def get_initialized_self_cache(self,
-                                   batch_size,
-                                   beam_size
-                                   ) -> Tuple[Tensor, Tensor]:
-        """Return two separate all-zero self-attention caches (K, V).
-
-        Each has shape (num_decoder_blocks, batch_size * beam_size,
-        decode_max_len, decoder_hidden_dim).
-        """
-        cache_shape = (
-            self.num_decoder_blocks,
-            batch_size * beam_size,
-            self.decode_max_len,
-            self.decoder_hidden_dim,
-        )
-        return torch.zeros(cache_shape), torch.zeros(cache_shape)
-    def calc_feat_len(self, audio_dur):
-        """Return the frame count for `audio_dur` seconds of 16 kHz audio.
-
-        Uses a 25 ms window and a 10 ms shift:
-        floor((samples - window) / shift) + 1.
-        """
-        import math
-        sample_rate = 16000
-        samples_per_window = 25 * sample_rate / 1000
-        samples_per_shift = 10 * sample_rate / 1000
-        usable_samples = audio_dur * sample_rate - samples_per_window
-        return math.floor(usable_samples / samples_per_shift) + 1
-    def transcribe(self,
-                   batch_wav_path: List[str],
-                   beam_size: int = 1,
-                   nbest: int = 1
-                ) -> Tuple[List[Dict], List[float], float]:
-        """Decode cached encoder outputs for the given wav files.
-
-        The encoder is not run here. Its outputs are loaded from
-        encoder_output/<basename of the first wav>/{n_layer_cross_k,
-        n_layer_cross_v, cross_attn_mask}.npy and passed to run_decoder.
-        Features are still extracted (their shape is printed) and padded or
-        cut to the 10-second frame count, but the result is not used further.
-
-        Args:
-            batch_wav_path: wav file paths; only the first selects the cached
-                encoder outputs.
-            beam_size: beam width forwarded to run_decoder.
-            nbest: n-best size forwarded to run_decoder; only the top
-                hypothesis of each utterance is reported.
-
-        Returns:
-            (results, wav_durations, transcribe_durations): `results` holds one
-            {"wav", "text", "score"} dict per utterance, `wav_durations` is
-            whatever the feature extractor returned (presumably per-wav
-            seconds - TODO confirm), and `transcribe_durations` is the
-            wall-clock time spent in run_decoder.
-        """
-        feats, lengths, wav_durations = self.feature_extractor(batch_wav_path)
-        print(f"feats.shape: {feats.shape}")
-        maxlen = self.calc_feat_len(10)
-        if feats.shape[1] < maxlen:
-            feats = np.concatenate([feats, np.zeros((1, maxlen - feats.shape[1], 80), dtype=np.float32)], axis=1)
-        feats = feats[:, :maxlen, :]
-        encoder_data_path = os.path.join("encoder_output", os.path.basename(batch_wav_path[0]))
-        n_layer_cross_k = np.load(os.path.join(encoder_data_path, "n_layer_cross_k.npy"))
-        n_layer_cross_v = np.load(os.path.join(encoder_data_path, "n_layer_cross_v.npy"))
-        cross_attn_mask = np.load(os.path.join(encoder_data_path, "cross_attn_mask.npy"))
-        start_time = time.time()
-        nbest_hyps = self.run_decoder(n_layer_cross_k,
-                                      n_layer_cross_v,
-                                      cross_attn_mask,
-                                      beam_size,
-                                      nbest
-                                      )
-        transcribe_durations = time.time() - start_time
-        results: List[Dict] = []
-        for wav, hyps in zip(batch_wav_path, nbest_hyps):
-            best = hyps[0]
-            # `token_id` replaces a loop variable that shadowed the builtin `id`.
-            hyp_ids = [int(token_id) for token_id in best["token_ids"].cpu()]
-            results.append(
-                {
-                    "wav": wav,
-                    "text": self.tokenizer.detokenize(hyp_ids),
-                    "score": best["score"].item()
-                }
-            )
-        return results, wav_durations, transcribe_durations
-def parse_args():
-    """Parse the command-line options of the decoder test script.
-
-    All options are optional and fall back to the defaults listed below.
-    """
-    parser = argparse.ArgumentParser(description="FireRedASROnnxModel Test")
-    # (flag, type, default, help), registered in this order.
-    option_table = (
-        ("--encoder", str, "axmodel/encoder.axmodel", "Path to onnx encoder"),
-        ("--decoder", str, "onnx_decoder/decoder_main.onnx", "Path to onnx decoder"),
-        ("--cmvn", str, "axmodel/cmvn.ark", "Path to cmvn"),
-        ("--dict", str, "axmodel/dict.txt", "Path to dict"),
-        ("--spm_model", str, "axmodel/train_bpe1000.model", "Path to spm model"),
-        ("--wavlist", str, "wavlist.txt", "File to wav path list"),
-        ("--hypo", str, "hypo_encoder.txt", "File of hypos"),
-        ("--beam_size", int, 3, ""),
-        ("--nbest", int, 1, ""),
-    )
-    for flag, value_type, fallback, help_text in option_table:
-        parser.add_argument(flag, type=value_type, default=fallback, help=help_text)
-    return parser.parse_args()
-def parse_wavlist(wavlist: str):
-    """Read wav paths from `wavlist`, one per line, keeping those that exist.
-
-    Surrounding whitespace is stripped. Blank lines are skipped silently;
-    each non-blank path that does not exist is reported on stdout and dropped.
-
-    Returns:
-        The existing paths, in file order.
-    """
-    wavpaths = []
-    with open(wavlist) as f:
-        for line in f:
-            line = line.strip()
-            if not line:
-                # A blank line is not a path; don't report it as missing.
-                continue
-            if not os.path.exists(line):
-                print(f"{line} doesn't exist.")
-                continue
-            wavpaths.append(line)
-    return wavpaths
-def main():
-    """Transcribe every wav in the wav list and report real-time factors.
-
-    One "<text> (<wav>)" line per result is written to the hypothesis file.
-    Per-file and total durations and RTFs are logged. An RTF is reported as
-    NaN when the corresponding audio duration is zero (for example when the
-    wav list is empty), instead of raising ZeroDivisionError.
-    """
-    args = parse_args()
-    print(args)
-    onnx_model = FireRedASROnnxModel(args.encoder,
-                                     args.decoder,
-                                     args.cmvn,
-                                     args.dict,
-                                     args.spm_model)
-    total_wav_durations = 0
-    total_transcribe_durations = 0
-    # `with` closes the hypothesis file even if transcription raises.
-    with open(args.hypo, "wt") as wf:
-        wavlist = parse_wavlist(args.wavlist)
-        for wav in wavlist:
-            batch_wav = [wav]
-            results, wav_durations, transcribe_durations = onnx_model.transcribe(batch_wav, args.beam_size, args.nbest)
-            wav_durations = sum(wav_durations)
-            total_wav_durations += wav_durations
-            total_transcribe_durations += transcribe_durations
-            logger.info(f"{batch_wav}")
-            logger.info(f"Durations: {wav_durations}")
-            logger.info(f"Transcribe Durations: {transcribe_durations}")
-            rtf = transcribe_durations / wav_durations if wav_durations else float("nan")
-            logger.info(f"(Real time factor) RTF: {rtf}")
-            for result in results:
-                logger.info(f"wav: {result['wav']}")
-                logger.info(f"text: {result['text']}")
-                logger.info(f"score: {result['score']}")
-                logger.info("")
-                wf.write(f"{result['text']} ({result['wav']})\n")
-        logger.info(f"total wav durations: {total_wav_durations}")
-        logger.info(f"total transcribe durations: {total_transcribe_durations}")
-        avg_ref = total_transcribe_durations / total_wav_durations if total_wav_durations else float("nan")
-        logger.info(f"AVG RTF: {avg_ref}")
-if __name__ == "__main__":
-    main()

test_encoder.py DELETED Viewed

@@ -1,646 +0,0 @@
-from fireredasr.data.asr_feat import ASRFeatExtractor
-from fireredasr.tokenizer.aed_tokenizer import ChineseCharEnglishSpmTokenizer
-import onnxruntime as ort
-import axengine as axe
-import torch
-import torch.nn.functional as F
-import numpy as np
-from torch import Tensor
-from typing import Tuple, List, Dict
-import argparse
-import os
-import time
-import logging
-# Configure the root logger: INFO level, with one stream handler that also
-# filters at INFO.
-logger = logging.getLogger()
-logger.setLevel(logging.INFO)
-logger_stream_hander = logging.StreamHandler()
-logger_stream_hander.setLevel("INFO")
-logger.addHandler(logger_stream_hander)
-# Large finite magnitude; -INF is used below as the score of beam slots that
-# must never be selected.
-INF = 1e10
-def to_numpy(tensor):
-    """Convert `tensor` to a NumPy array; ndarrays are returned as they are.
-
-    Other inputs are treated as torch tensors: detached when they require
-    grad, then moved to CPU and converted with `.numpy()`.
-    """
-    if isinstance(tensor, np.ndarray):
-        return tensor
-    plain = tensor.detach() if tensor.requires_grad else tensor
-    return plain.cpu().numpy()
-def set_finished_beam_score_to_zero(scores, is_finished):
-    """Give finished rows the fixed candidate scores [0, -INF, -INF, ...].
-
-    Rows where `is_finished` is 0 keep their values from `scores`; rows where
-    it is 1 are replaced. (assumes `is_finished` broadcasts against the 2-D
-    `scores` - TODO confirm)
-    """
-    row_count, width = scores.size()
-    done = is_finished.float()
-    pattern = torch.tensor([0.0] + [-INF] * (width - 1)).float()
-    pattern = pattern.view(1, width).repeat(row_count, 1)
-    live = 1 - done
-    return scores * live + pattern * done
-def set_finished_beam_y_to_eos(ys, is_finished, eos_id):
-    """Replace the token ids of finished rows with `eos_id`.
-
-    `is_finished` is cast to long and used as a 0/1 selector between `ys`
-    (flag 0) and `eos_id` (flag 1).
-    """
-    done = is_finished.long()
-    live = 1 - done
-    return ys * live + eos_id * done
-class FireRedASROnnxModel:
-    def __init__(
-        self,
-        encoder_path: str,
-        decoder_path: str,
-        cmvn_file: str,
-        dict_file: str,
-        spm_model_path: str,
-        providers=['AXCLRTExecutionProvider', 'AxEngineExecutionProvider']
-    ):
-        """Set the decoding constants and load the encoder-side resources.
-
-        Only the encoder session and the positional-encoding table are
-        loaded; no decoder session is created here, so `self.decoder` stays
-        None.
-        """
-        # NOTE: maximum decoding length, chosen with reference to Whisper.
-        # The longest audio supported by the FireRedASR-AED model is 60s.
-        # ref: https://github.com/FireRedTeam/FireRedASR?tab=readme-ov-file#input-length-limitations
-        self.decode_max_len = 448
-        self.decoder_hidden_dim, self.num_decoder_blocks = 1280, 16
-        # Special token ids.
-        self.blank_id, self.pad_id, self.sos_id, self.eos_id = 0, 2, 3, 4
-        # Single-threaded ONNX Runtime options, kept for the decoder loaders.
-        ort_options = ort.SessionOptions()
-        ort_options.intra_op_num_threads = 1
-        ort_options.inter_op_num_threads = 1
-        self.session_opts = ort_options
-        self.feature_extractor = ASRFeatExtractor(cmvn_file)
-        self.tokenizer = ChineseCharEnglishSpmTokenizer(dict_file, spm_model_path)
-        self.encoder = self.decoder = None
-        self.init_encoder(encoder_path, providers)
-        self.pe = self.init_pe(decoder_path)
-    def init_encoder(self, encoder_path, providers=None):
-        """Open an axengine session for `encoder_path` as `self.encoder`.
-
-        No session options are passed; the load time is logged.
-        """
-        began = time.time()
-        self.encoder = axe.InferenceSession(encoder_path, providers=providers)
-        elapsed = time.time() - began
-        logger.info(f"load encoder cost {elapsed} seconds")
-    def init_decoder(self, decoder_path, providers=None):
-        """Open an ONNX Runtime session for `decoder_path` as `self.decoder`.
-
-        The session always uses the CPU execution provider; the `providers`
-        argument is accepted but not used. The load time is logged.
-        """
-        cpu_only = ['CPUExecutionProvider']
-        began = time.time()
-        self.decoder = ort.InferenceSession(
-            decoder_path, sess_options=self.session_opts, providers=cpu_only
-        )
-        elapsed = time.time() - began
-        logger.info(f"load decoder cost {elapsed} seconds")
-    def init_decoder_main(self, decoder_path, providers=None):
-        """Load decoder_main.onnx from the directory containing `decoder_path`.
-
-        The session is stored as `self.decoder_main` and always uses the CPU
-        execution provider (`providers` is not used). The load time is logged
-        and the session's input names are printed.
-        """
-        main_graph = os.path.join(os.path.dirname(decoder_path), "decoder_main.onnx")
-        cpu_only = ['CPUExecutionProvider']
-        began = time.time()
-        self.decoder_main = ort.InferenceSession(
-            main_graph, sess_options=self.session_opts, providers=cpu_only
-        )
-        elapsed = time.time() - began
-        logger.info(f"load decoder_main cost {elapsed} seconds")
-        print(f"decoder_main.input_names: {[meta.name for meta in self.decoder_main.get_inputs()]}")
-    def init_decoder_loop(self, decoder_path, providers=None):
-        """Load decoder_loop.onnx from the directory containing `decoder_path`.
-
-        The session is stored as `self.decoder_loop` and always uses the CPU
-        execution provider (`providers` is not used). The load time is logged
-        and the session's input names are printed.
-        """
-        loop_graph = os.path.join(os.path.dirname(decoder_path), "decoder_loop.onnx")
-        cpu_only = ['CPUExecutionProvider']
-        began = time.time()
-        self.decoder_loop = ort.InferenceSession(
-            loop_graph, sess_options=self.session_opts, providers=cpu_only
-        )
-        elapsed = time.time() - began
-        logger.info(f"load decoder_loop cost {elapsed} seconds")
-        print(f"decoder_loop.input_names: {[meta.name for meta in self.decoder_loop.get_inputs()]}")
-    def init_pe(self, decoder_path):
-        """Load the array stored at axmodel/pe.npy.
-
-        The path is fixed, relative to the current working directory; the
-        `decoder_path` argument is not used.
-        """
-        pe_file = os.path.join("axmodel", "pe.npy")
-        return np.load(pe_file)
-    def run_encoder(self, input: np.ndarray,
-                    input_length: np.ndarray
-    ) -> Tuple[Tensor, Tensor, Tensor]:
-        """Run the encoder session and return its three outputs.
-
-        `input` is fed as "encoder_input" and `input_length`, cast to int32,
-        as "encoder_input_lengths". The outputs are returned as
-        (n_layer_cross_k, n_layer_cross_v, cross_attn_mask).
-        """
-        feeds = {
-            "encoder_input": input,
-            "encoder_input_lengths": input_length.astype(np.int32),
-        }
-        cross_k, cross_v, cross_mask = self.encoder.run(None, feeds)
-        return cross_k, cross_v, cross_mask
-    def decode_one_token(
-        self,
-        tokens: np.ndarray,
-        n_layer_self_k_cache: np.ndarray,
-        n_layer_self_v_cache: np.ndarray,
-        n_layer_cross_k_cache: np.ndarray,
-        n_layer_cross_v_cache: np.ndarray,
-        offset: np.ndarray,
-        self_attn_mask: np.ndarray,
-        cross_attn_mask: np.ndarray
-    ) -> Tuple[Tensor, Tensor, Tensor]:
-        """Run one step of the single-graph decoder `self.decoder`.
-
-        The eight arguments are bound, in order, to the session's first eight
-        inputs. Returns the session's three outputs:
-        (logits, updated self K cache, updated self V cache).
-        """
-        ordered_feeds = (
-            tokens,
-            n_layer_self_k_cache,
-            n_layer_self_v_cache,
-            n_layer_cross_k_cache,
-            n_layer_cross_v_cache,
-            offset,
-            self_attn_mask,
-            cross_attn_mask,
-        )
-        session_inputs = self.decoder.get_inputs()
-        feeds = {
-            session_inputs[position].name: array
-            for position, array in enumerate(ordered_feeds)
-        }
-        logits, next_self_k, next_self_v = self.decoder.run(None, feeds)
-        return logits, next_self_k, next_self_v
-    def decode_main_one_token(
-        self,
-        tokens: np.ndarray,
-        n_layer_self_k_cache: np.ndarray,
-        n_layer_self_v_cache: np.ndarray,
-        n_layer_cross_k_cache: np.ndarray,
-        n_layer_cross_v_cache: np.ndarray,
-        pe: np.ndarray,
-        self_attn_mask: np.ndarray,
-        cross_attn_mask: np.ndarray
-    ) -> Tuple[Tensor, Tensor, Tensor]:
-        """Run one step of `self.decoder_main`.
-
-        Only six values are fed, bound by position to the session's first six
-        inputs: tokens, cross K, cross V, pe, self-attention mask and
-        cross-attention mask. The two self-cache arguments are accepted but
-        not passed to the session. Returns the session's three outputs:
-        (logits, self K cache, self V cache).
-        """
-        ordered_feeds = (
-            tokens,
-            n_layer_cross_k_cache,
-            n_layer_cross_v_cache,
-            pe,
-            self_attn_mask,
-            cross_attn_mask,
-        )
-        session_inputs = self.decoder_main.get_inputs()
-        feeds = {
-            session_inputs[position].name: array
-            for position, array in enumerate(ordered_feeds)
-        }
-        logits, next_self_k, next_self_v = self.decoder_main.run(None, feeds)
-        return logits, next_self_k, next_self_v
-    def decode_loop_one_token(
-        self,
-        tokens: np.ndarray,
-        n_layer_self_k_cache: np.ndarray,
-        n_layer_self_v_cache: np.ndarray,
-        n_layer_cross_k_cache: np.ndarray,
-        n_layer_cross_v_cache: np.ndarray,
-        pe: np.ndarray,
-        self_attn_mask: np.ndarray,
-        cross_attn_mask: np.ndarray
-    ) -> Tuple[Tensor, Tensor, Tensor]:
-        # print("decode_loop:")
-        # print(f"tokens.shape: {tokens.shape}")
-        # print(f"n_layer_self_k_cache.shape: {n_layer_self_k_cache.shape}")
-        # print(f"n_layer_self_v_cache.shape: {n_layer_self_v_cache.shape}")
-        # print(f"n_layer_cross_k_cache.shape: {n_layer_cross_k_cache.shape}")
-        # print(f"n_layer_cross_v_cache.shape: {n_layer_cross_v_cache.shape}")
-        # print(f"pe.shape: {pe.shape}")
-        # print(f"self_attn_mask.shape: {self_attn_mask.shape}")
-        # print(f"cross_attn_mask.shape: {cross_attn_mask.shape}")
-        logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache = self.decoder_loop.run(
-            None,
-            {
-                self.decoder_loop.get_inputs()[0].name: tokens,
-                self.decoder_loop.get_inputs()[1].name: n_layer_self_k_cache,
-                self.decoder_loop.get_inputs()[2].name: n_layer_self_v_cache,
-                self.decoder_loop.get_inputs()[3].name: n_layer_cross_k_cache,
-                self.decoder_loop.get_inputs()[4].name: n_layer_cross_v_cache,
-                self.decoder_loop.get_inputs()[5].name: pe,
-                self.decoder_loop.get_inputs()[6].name: self_attn_mask,
-                self.decoder_loop.get_inputs()[7].name: cross_attn_mask,
-            }
-        )
-        return (
-            logits,
-            out_n_layer_self_k_cache,
-            out_n_layer_self_v_cache
-        )
-    def run_decoder(
-        self,
-        n_layer_cross_k,
-        n_layer_cross_v,
-        cross_attn_mask,
-        beam_size,
-        nbest
-    ):
-        num_layer, batch_size, Ti, encoder_out_dim = n_layer_cross_k.shape
-        encoder_out_length = cross_attn_mask.shape[-1]
-        cross_attn_mask = torch.from_numpy(cross_attn_mask).to(torch.float32)
-        cross_attn_mask = cross_attn_mask.unsqueeze(1).repeat(
-            1, beam_size, 1, 1
-        ).view(beam_size * batch_size, -1, encoder_out_length)
-        n_layer_cross_k = torch.from_numpy(n_layer_cross_k)
-        n_layer_cross_v = torch.from_numpy(n_layer_cross_v)
-        n_layer_cross_k = n_layer_cross_k.unsqueeze(2).repeat(
-            1, 1, beam_size, 1, 1
-        ).view(num_layer, beam_size * batch_size, Ti, encoder_out_dim)
-        n_layer_cross_v = n_layer_cross_v.unsqueeze(2).repeat(
-            1, 1, beam_size, 1, 1
-        ).view(num_layer, beam_size * batch_size, Ti, encoder_out_dim)
-        prediction_tokens = torch.ones(
-            beam_size * batch_size, 1).fill_(self.sos_id).long()
-        tokens = prediction_tokens
-        offset = torch.zeros(1, dtype=torch.int64)
-        n_layer_self_k_cache, n_layer_self_v_cache = self.get_initialized_self_cache(
-            batch_size, beam_size
-        )
-        scores = torch.tensor([0.0] + [-INF]*(beam_size - 1)).float()
-        scores = scores.repeat(batch_size).view(batch_size * beam_size, 1)
-        is_finished = torch.zeros_like(scores)
-        # self_attn_mask = torch.zeros(
-        #     batch_size * beam_size,
-        #     1, 1
-        # )
-        self_attn_mask = np.zeros((batch_size * beam_size, 1, 1), dtype=np.float32)
-        results = [self.sos_id]
-        for i in range(self.decode_max_len):
-            self_attn_mask = torch.empty(
-                batch_size * beam_size,
-                prediction_tokens.shape[-1], prediction_tokens.shape[-1]
-            ).fill_(-np.inf).triu_(1)
-            self_attn_mask = self_attn_mask[:, -1:, :]
-            self_attn_mask = to_numpy(self_attn_mask)
-            logits, n_layer_self_k_cache, n_layer_self_v_cache = self.decode_one_token(
-                to_numpy(tokens),
-                to_numpy(n_layer_self_k_cache),
-                to_numpy(n_layer_self_v_cache),
-                to_numpy(n_layer_cross_k),
-                to_numpy(n_layer_cross_v),
-                to_numpy(offset),
-                to_numpy(self_attn_mask),
-                to_numpy(cross_attn_mask)
-            )
-            tokens = to_numpy(tokens)
-            n_layer_self_k_cache = to_numpy(n_layer_self_k_cache)
-            n_layer_self_v_cache = to_numpy(n_layer_self_v_cache)
-            n_layer_cross_k = to_numpy(n_layer_cross_k)
-            n_layer_cross_v = to_numpy(n_layer_cross_v)
-            cross_attn_mask = to_numpy(cross_attn_mask)
-            # if i == 0:
-            #     logits, n_layer_self_k_cache, n_layer_self_v_cache = self.decode_main_one_token(
-            #         to_numpy(tokens),
-            #         to_numpy(n_layer_self_k_cache),
-            #         to_numpy(n_layer_self_v_cache),
-            #         to_numpy(n_layer_cross_k),
-            #         to_numpy(n_layer_cross_v),
-            #         self.pe[offset],
-            #         self_attn_mask,
-            #         to_numpy(cross_attn_mask)
-            #     )
-            # else:
-            #     logits, n_layer_self_k_cache, n_layer_self_v_cache = self.decode_loop_one_token(
-            #         to_numpy(tokens),
-            #         to_numpy(n_layer_self_k_cache),
-            #         to_numpy(n_layer_self_v_cache),
-            #         to_numpy(n_layer_cross_k),
-            #         to_numpy(n_layer_cross_v),
-            #         self.pe[offset],
-            #         self_attn_mask,
-            #         to_numpy(cross_attn_mask)
-            #     )
-            offset += 1
-            logits = torch.from_numpy(logits)
-            logits = logits.squeeze(1)
-            t_scores = F.log_softmax(logits, dim=-1)
-            t_topB_scores, t_topB_ys = torch.topk(t_scores, k=beam_size, dim=1)
-            t_topB_scores = set_finished_beam_score_to_zero(t_topB_scores, is_finished)
-            t_topB_ys = set_finished_beam_y_to_eos(t_topB_ys, is_finished, self.eos_id)
-            scores = scores + t_topB_scores
-            scores = scores.view(batch_size, beam_size * beam_size)
-            scores, topB_score_ids = torch.topk(scores, k=beam_size, dim=1)
-            scores = scores.view(-1, 1)
-            topB_row_number_in_each_B_rows_of_ys = torch.div(
-                topB_score_ids, beam_size).view(batch_size * beam_size)
-            stride = beam_size * torch.arange(batch_size).view(
-                batch_size, 1).repeat(1, beam_size).view(batch_size * beam_size)
-            topB_row_number_in_ys = topB_row_number_in_each_B_rows_of_ys.long() + stride.long()
-            prediction_tokens = prediction_tokens[topB_row_number_in_ys]
-            t_ys = torch.gather(
-                t_topB_ys.view(batch_size, beam_size * beam_size),
-                dim=1, index=topB_score_ids
-            ).view(beam_size * batch_size, 1)
-            tokens = t_ys
-            prediction_tokens = torch.cat((prediction_tokens, t_ys), dim=1)
-            n_layer_self_k_cache = torch.from_numpy(n_layer_self_k_cache)
-            n_layer_self_v_cache = torch.from_numpy(n_layer_self_v_cache)
-            for i, self_k_cache in enumerate(n_layer_self_k_cache):
-                n_layer_self_k_cache[i] = n_layer_self_k_cache[i][topB_row_number_in_ys]
-            for i, self_v_cache in enumerate(n_layer_self_v_cache):
-                n_layer_self_v_cache[i] = n_layer_self_v_cache[i][topB_row_number_in_ys]
-            is_finished = t_ys.eq(self.eos_id)
-            if is_finished.sum().item() == beam_size * batch_size:
-                break
-        scores = scores.view(batch_size, beam_size)
-        prediction_valid_token_lengths = torch.sum(
-            torch.ne(
-                prediction_tokens.view(batch_size, beam_size, -1),
-                self.eos_id),
-            dim=-1
-        ).int()
-        nbest_scores, nbest_ids = torch.topk(scores, k=nbest, dim=1)
-        index = nbest_ids + beam_size * torch.arange(batch_size).view(batch_size, 1).long()
-        nbest_prediction_tokens = prediction_tokens.view(batch_size * beam_size, -1)[index.view(-1)]
-        nbest_prediction_tokens = nbest_prediction_tokens.view(batch_size, nbest_ids.size(1), -1)
-        nbest_prediction_valid_token_lengths = prediction_valid_token_lengths.view(
-            batch_size * beam_size)[index.view(-1)].view(batch_size, -1)
-        nbest_hyps: List[List[Dict[str, torch.Tensor]]] = []
-        for i in range(batch_size):
-            i_best_hyps: List[Dict[str, torch.Tensor]] = []
-            for j, score in enumerate(nbest_scores[i]):
-                hyp = {
-                    "token_ids": nbest_prediction_tokens[i, j, 1:nbest_prediction_valid_token_lengths[i, j]],
-                    "score": score
-                }
-                i_best_hyps.append(hyp)
-            nbest_hyps.append(i_best_hyps)
-        return nbest_hyps
-    def get_initialized_self_cache(self,
-                                   batch_size,
-                                   beam_size
-                                   ) -> Tuple[Tensor, Tensor]:
-        n_layer_self_k_cache = torch.zeros(
-            self.num_decoder_blocks,
-            batch_size * beam_size,
-            self.decode_max_len,
-            self.decoder_hidden_dim,
-        )
-        n_layer_self_v_cache = torch.zeros(
-            self.num_decoder_blocks,
-            batch_size * beam_size,
-            self.decode_max_len,
-            self.decoder_hidden_dim,
-        )
-        return n_layer_self_k_cache, n_layer_self_v_cache
-    def calc_feat_len(self, audio_dur):
-        import math
-        sample_rate = 16000
-        frame_length = 25 * sample_rate / 1000
-        frame_shift = 10 * sample_rate / 1000
-        length = math.floor((audio_dur * sample_rate - frame_length) / frame_shift) + 1
-        return length
-    def transcribe(self,
-                   batch_wav_path: List[str],
-                   beam_size: int = 1,
-                   nbest: int = 1
-                ) -> List[Dict]:
-        feats, lengths, wav_durations = self.feature_extractor(batch_wav_path)
-        print(f"feats.shape: {feats.shape}")
-        maxlen = self.calc_feat_len(10)
-        if feats.shape[1] < maxlen:
-            feats = np.concatenate([feats, np.zeros((1, maxlen - feats.shape[1], 80), dtype=np.float32)], axis=1)
-        feats = feats[:, :maxlen, :]
-        encoder_data_path = os.path.join("encoder_output", os.path.basename(batch_wav_path[0]))
-        # decoder_data_path = os.path.join("calib_dataset", "decoder", os.path.basename(batch_wav_path[0]))
-        os.makedirs(encoder_data_path, exist_ok=True)
-        # os.makedirs(decoder_data_path, exist_ok=True)
-        feats = to_numpy(feats)
-        lengths = to_numpy(lengths)
-        # for name, npy in zip(["encoder_input", "encoder_input_lengths"], [feats, lengths]):
-        #     file_path = os.path.join(encoder_data_path, name + ".npy")
-        #     np.save(file_path, npy)
-        start_time = time.time()
-        n_layer_cross_k, n_layer_cross_v, cross_attn_mask = self.run_encoder(
-            to_numpy(feats),
-            to_numpy(lengths)
-        )
-        for name, npy in zip(["n_layer_cross_k", "n_layer_cross_v", "cross_attn_mask"], [n_layer_cross_k, n_layer_cross_v, cross_attn_mask]):
-            file_path = os.path.join(encoder_data_path, name + ".npy")
-            np.save(file_path, npy)
-        # nbest_hyps = self.run_decoder(n_layer_cross_k,
-        #                               n_layer_cross_v,
-        #                               cross_attn_mask,
-        #                               beam_size,
-        #                               nbest
-        #                               )
-        # transcribe_durations = time.time() - start_time
-        # results: List[Dict] = []
-        # for wav, hyp in zip(batch_wav_path, nbest_hyps):
-        #     hyp = hyp[0]
-        #     hyp_ids = [int(id) for id in hyp["token_ids"].cpu()]
-        #     score = hyp["score"].item()
-        #     text = self.tokenizer.detokenize(hyp_ids)
-        #     results.append(
-        #         {
-        #             "wav": wav,
-        #             "text": text,
-        #             "score": score
-        #         }
-        #     )
-        # return results, wav_durations, transcribe_durations
-def parse_args():
-    parser = argparse.ArgumentParser(description="FireRedASROnnxModel Test")
-    parser.add_argument(
-        "--encoder",
-        type=str,
-        default="axmodel/encoder.axmodel",
-        help="Path to onnx encoder"
-    )
-    parser.add_argument(
-        "--decoder",
-        type=str,
-        default="onnx_decoder/decoder.onnx",
-        help="Path to onnx decoder"
-    )
-    parser.add_argument(
-        "--cmvn",
-        type=str,
-        default="axmodel/cmvn.ark",
-        help="Path to cmvn"
-    )
-    parser.add_argument(
-        "--dict",
-        type=str,
-        default="axmodel/dict.txt",
-        help="Path to dict"
-    )
-    parser.add_argument(
-        "--spm_model",
-        type=str,
-        default="axmodel/train_bpe1000.model",
-        help="Path to spm model"
-    )
-    parser.add_argument(
-        "--wavlist",
-        type=str,
-        default="wavlist.txt",
-        help="File to wav path list"
-    )
-    parser.add_argument(
-        "--hypo",
-        type=str,
-        default="hypo_axmodel.txt",
-        help="File of hypos"
-    )
-    parser.add_argument(
-        "--beam_size",
-        type=int,
-        default=3,
-        help=""
-    )
-    parser.add_argument(
-        "--nbest",
-        type=int,
-        default=1,
-        help=""
-    )
-    return parser.parse_args()
-def parse_wavlist(wavlist: str):
-    wavpaths = []
-    with open(wavlist) as f:
-        for line in f:
-            line = line.strip()
-            if not os.path.exists(line):
-                print(f"{line} doesn't exist.")
-                continue
-            wavpaths.append(line)
-    return wavpaths
-def main():
-    args = parse_args()
-    print(args)
-    onnx_model = FireRedASROnnxModel(args.encoder,
-                                     args.decoder,
-                                     args.cmvn,
-                                     args.dict,
-                                     args.spm_model)
-    wf = open(args.hypo, "wt")
-    wavlist = parse_wavlist(args.wavlist)
-    total_wav_durations = 0
-    total_transcribe_durations = 0
-    for wav in wavlist:
-        batch_wav = [wav]
-        onnx_model.transcribe(batch_wav, args.beam_size, args.nbest)
-    #     wav_durations = sum(wav_durations)
-    #     total_wav_durations += wav_durations
-    #     total_transcribe_durations += transcribe_durations
-    #     logger.info(f"{batch_wav}")
-    #     logger.info(f"Durations: {wav_durations}")
-    #     logger.info(f"Transcribe Durations: {transcribe_durations}")
-    #     rtf = transcribe_durations / wav_durations
-    #     logger.info(f"(Real time factor) RTF: {rtf}")
-    #     for result in results:
-    #         logger.info(f"wav: {result['wav']}")
-    #         logger.info(f"text: {result['text']}")
-    #         logger.info(f"score: {result['score']}")
-    #         logger.info("")
-    #         wf.write(f"{result['text']} ({result['wav']})\n")
-    # logger.info(f"total wav durations: {total_wav_durations}")
-    # logger.info(f"total transcribe durations: {total_transcribe_durations}")
-    # avg_ref = total_transcribe_durations / total_wav_durations
-    # logger.info(f"AVG RTF: {avg_ref}")
-    wf.close()
-if __name__ == "__main__":
-    main()

test_onnx_model.py DELETED Viewed

@@ -1,684 +0,0 @@
-from fireredasr.data.asr_feat import ASRFeatExtractor
-from fireredasr.tokenizer.aed_tokenizer import ChineseCharEnglishSpmTokenizer
-import onnxruntime as ort
-import torch
-import torch.nn.functional as F
-import numpy as np
-from torch import Tensor
-from typing import Tuple, List, Dict
-import argparse
-import os
-import time
-import logging
-# Configure the root logger (getLogger() without a name) to emit INFO and
-# above through a stream handler.
-logger = logging.getLogger()
-logger.setLevel(logging.INFO)
-logger_stream_hander = logging.StreamHandler()
-logger_stream_hander.setLevel("INFO")
-logger.addHandler(logger_stream_hander)
-# Large finite stand-in for infinity; negated to build beam-search scores.
-INF = 1e10
-def to_numpy(tensor):
-    if isinstance(tensor, np.ndarray):
-        return tensor
-    if tensor.requires_grad:
-        return tensor.detach().cpu().numpy()
-    else:
-        return tensor.cpu().numpy()
-def set_finished_beam_score_to_zero(scores, is_finished):
-    NB, B = scores.size()
-    is_finished = is_finished.float()
-    mask_score = torch.tensor([0.0] + [-INF]*(B-1)).float()
-    mask_score = mask_score.view(1, B).repeat(NB, 1)
-    return scores * (1 - is_finished) + mask_score * is_finished
-def set_finished_beam_y_to_eos(ys, is_finished, eos_id):
-    is_finished = is_finished.long()
-    return ys * (1 - is_finished) + eos_id * is_finished
-class FireRedASROnnxModel:
-    def __init__(
-        self,
-        encoder_path: str,
-        decoder_path: str,
-        cmvn_file: str,
-        dict_file: str,
-        spm_model_path: str,
-        providers=["CPUExecutionProvider"]
-    ):
-        session_opts = ort.SessionOptions()
-        session_opts.inter_op_num_threads = 1
-        session_opts.intra_op_num_threads = 1
-        # session_opts.log_severity_level = 1
-        self.session_opts = session_opts
-        # NOTE: 参考whisper设置的最大的解码长度
-        # FireRedASR-AED 模型支持的最长语音为 60s
-        # ref: https://github.com/FireRedTeam/FireRedASR?tab=readme-ov-file#input-length-limitations
-        self.decode_max_len = 448
-        self.decoder_hidden_dim = 1280
-        self.num_decoder_blocks = 16
-        self.blank_id = 0
-        self.sos_id = 3
-        self.eos_id = 4
-        self.pad_id = 2
-        self.feature_extractor = ASRFeatExtractor(cmvn_file)
-        self.tokenizer = ChineseCharEnglishSpmTokenizer(dict_file, spm_model_path)
-        self.encoder = None
-        self.decoder = None
-        self.init_encoder(encoder_path, providers)
-        self.init_decoder(decoder_path, providers)
-        self.init_decoder_main(decoder_path, providers)
-        self.init_decoder_loop(decoder_path, providers)
-        self.pe = self.init_pe(decoder_path)
-    def init_encoder(self, encoder_path, providers=None):
-        start_time = time.time()
-        self.encoder = ort.InferenceSession(
-            encoder_path,
-            sess_options=self.session_opts,
-            providers=providers
-        )
-        end_time = time.time()
-        logger.info(f"load encoder cost {end_time - start_time} seconds")
-    def init_decoder(self, decoder_path, providers=None):
-        start_time = time.time()
-        self.decoder = ort.InferenceSession(
-            decoder_path,
-            sess_options=self.session_opts,
-            providers=providers
-        )
-        end_time = time.time()
-        logger.info(f"load decoder cost {end_time - start_time} seconds")
-    def init_decoder_main(self, decoder_path, providers=None):
-        decoder_path = os.path.dirname(decoder_path)
-        decoder_path = os.path.join(decoder_path, "decoder_main.onnx")
-        start_time = time.time()
-        self.decoder_main = ort.InferenceSession(
-            decoder_path,
-            sess_options=self.session_opts,
-            providers=providers
-        )
-        end_time = time.time()
-        logger.info(f"load decoder_main cost {end_time - start_time} seconds")
-        input_names = [i.name for i in self.decoder_main.get_inputs()]
-        print(f"decoder_main.input_names: {input_names}")
-    def init_decoder_loop(self, decoder_path, providers=None):
-        decoder_path = os.path.dirname(decoder_path)
-        decoder_path = os.path.join(decoder_path, "decoder_loop.onnx")
-        start_time = time.time()
-        self.decoder_loop = ort.InferenceSession(
-            decoder_path,
-            sess_options=self.session_opts,
-            providers=providers
-        )
-        end_time = time.time()
-        logger.info(f"load decoder_loop cost {end_time - start_time} seconds")
-        input_names = [i.name for i in self.decoder_loop.get_inputs()]
-        print(f"decoder_loop.input_names: {input_names}")
-    def init_pe(self, decoder_path):
-        decoder_path = os.path.dirname(decoder_path)
-        decoder_path = os.path.join(decoder_path, "pe.npy")
-        return np.load(decoder_path)
-    def run_encoder(self, input: np.ndarray,
-                    input_length: np.ndarray
-    ) -> Tuple[Tensor, Tensor, Tensor]:
-        n_layer_cross_k, n_layer_cross_v, cross_attn_mask = self.encoder.run(
-            None,
-            {
-                self.encoder.get_inputs()[0].name: input,
-                self.encoder.get_inputs()[1].name: input_length
-            }
-        )
-        return (
-            n_layer_cross_k,
-            n_layer_cross_v,
-            cross_attn_mask
-        )
-    def decode_one_token(
-        self,
-        tokens: np.ndarray,
-        n_layer_self_k_cache: np.ndarray,
-        n_layer_self_v_cache: np.ndarray,
-        n_layer_cross_k_cache: np.ndarray,
-        n_layer_cross_v_cache: np.ndarray,
-        offset: np.ndarray,
-        self_attn_mask: np.ndarray,
-        cross_attn_mask: np.ndarray
-    ) -> Tuple[Tensor, Tensor, Tensor]:
-        # print("decode:")
-        # print(f"tokens.shape: {tokens.shape}")
-        # print(f"n_layer_self_k_cache.shape: {n_layer_self_k_cache.shape}")
-        # print(f"n_layer_self_v_cache.shape: {n_layer_self_v_cache.shape}")
-        # print(f"n_layer_cross_k_cache.shape: {n_layer_cross_k_cache.shape}")
-        # print(f"n_layer_cross_v_cache.shape: {n_layer_cross_v_cache.shape}")
-        # print(f"offset.shape: {offset.shape}")
-        # print(f"self_attn_mask.shape: {self_attn_mask.shape}")
-        # print(f"cross_attn_mask.shape: {cross_attn_mask.shape}")
-        # print(f"self_attn_mask: {self_attn_mask}")
-        logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache = self.decoder.run(
-            None,
-            {
-                self.decoder.get_inputs()[0].name: tokens,
-                self.decoder.get_inputs()[1].name: n_layer_self_k_cache,
-                self.decoder.get_inputs()[2].name: n_layer_self_v_cache,
-                self.decoder.get_inputs()[3].name: n_layer_cross_k_cache,
-                self.decoder.get_inputs()[4].name: n_layer_cross_v_cache,
-                self.decoder.get_inputs()[5].name: offset,
-                self.decoder.get_inputs()[6].name: self_attn_mask,
-                self.decoder.get_inputs()[7].name: cross_attn_mask,
-            }
-        )
-        return (
-            logits,
-            out_n_layer_self_k_cache,
-            out_n_layer_self_v_cache
-        )
-    def decode_main_one_token(
-        self,
-        tokens: np.ndarray,
-        n_layer_self_k_cache: np.ndarray,
-        n_layer_self_v_cache: np.ndarray,
-        n_layer_cross_k_cache: np.ndarray,
-        n_layer_cross_v_cache: np.ndarray,
-        pe: np.ndarray,
-        self_attn_mask: np.ndarray,
-        cross_attn_mask: np.ndarray
-    ) -> Tuple[Tensor, Tensor, Tensor]:
-        # print("decode_main:")
-        # print(f"tokens.shape: {tokens.shape}")
-        # print(f"n_layer_self_k_cache.shape: {n_layer_self_k_cache.shape}")
-        # print(f"n_layer_self_v_cache.shape: {n_layer_self_v_cache.shape}")
-        # print(f"n_layer_cross_k_cache.shape: {n_layer_cross_k_cache.shape}")
-        # print(f"n_layer_cross_v_cache.shape: {n_layer_cross_v_cache.shape}")
-        # print(f"pe.shape: {pe.shape}")
-        # print(f"self_attn_mask.shape: {self_attn_mask.shape}")
-        # print(f"cross_attn_mask.shape: {cross_attn_mask.shape}")
-        logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache = self.decoder_main.run(
-            None,
-            {
-                self.decoder_main.get_inputs()[0].name: tokens,
-                # self.decoder_main.get_inputs()[1].name: n_layer_self_k_cache,
-                self.decoder_main.get_inputs()[1].name: n_layer_cross_k_cache,
-                self.decoder_main.get_inputs()[2].name: n_layer_cross_v_cache,
-                self.decoder_main.get_inputs()[3].name: pe,
-                self.decoder_main.get_inputs()[4].name: self_attn_mask,
-                self.decoder_main.get_inputs()[5].name: cross_attn_mask,
-                # self.decoder_main.get_inputs()[7].name: cross_attn_mask,
-            }
-        )
-        return (
-            logits,
-            out_n_layer_self_k_cache,
-            out_n_layer_self_v_cache
-        )
-    def decode_loop_one_token(
-        self,
-        tokens: np.ndarray,
-        n_layer_self_k_cache: np.ndarray,
-        n_layer_self_v_cache: np.ndarray,
-        n_layer_cross_k_cache: np.ndarray,
-        n_layer_cross_v_cache: np.ndarray,
-        pe: np.ndarray,
-        self_attn_mask: np.ndarray,
-        cross_attn_mask: np.ndarray
-    ) -> Tuple[Tensor, Tensor, Tensor]:
-        # print("decode_loop:")
-        # print(f"tokens.shape: {tokens.shape}")
-        # print(f"n_layer_self_k_cache.shape: {n_layer_self_k_cache.shape}")
-        # print(f"n_layer_self_v_cache.shape: {n_layer_self_v_cache.shape}")
-        # print(f"n_layer_cross_k_cache.shape: {n_layer_cross_k_cache.shape}")
-        # print(f"n_layer_cross_v_cache.shape: {n_layer_cross_v_cache.shape}")
-        # print(f"pe.shape: {pe.shape}")
-        # print(f"self_attn_mask.shape: {self_attn_mask.shape}")
-        # print(f"cross_attn_mask.shape: {cross_attn_mask.shape}")
-        logits, out_n_layer_self_k_cache, out_n_layer_self_v_cache = self.decoder_loop.run(
-            None,
-            {
-                self.decoder_loop.get_inputs()[0].name: tokens,
-                self.decoder_loop.get_inputs()[1].name: n_layer_self_k_cache,
-                self.decoder_loop.get_inputs()[2].name: n_layer_self_v_cache,
-                self.decoder_loop.get_inputs()[3].name: n_layer_cross_k_cache,
-                self.decoder_loop.get_inputs()[4].name: n_layer_cross_v_cache,
-                self.decoder_loop.get_inputs()[5].name: pe,
-                self.decoder_loop.get_inputs()[6].name: self_attn_mask,
-                self.decoder_loop.get_inputs()[7].name: cross_attn_mask,
-            }
-        )
-        return (
-            logits,
-            out_n_layer_self_k_cache,
-            out_n_layer_self_v_cache
-        )
-    def run_decoder(
-        self,
-        n_layer_cross_k,
-        n_layer_cross_v,
-        cross_attn_mask,
-        beam_size,
-        nbest,
-        decoder_data_path
-    ):
-        num_layer, batch_size, Ti, encoder_out_dim = n_layer_cross_k.shape
-        encoder_out_length = cross_attn_mask.shape[-1]
-        cross_attn_mask = torch.from_numpy(cross_attn_mask).to(torch.float32)
-        cross_attn_mask = cross_attn_mask.unsqueeze(1).repeat(
-            1, beam_size, 1, 1
-        ).view(beam_size * batch_size, -1, encoder_out_length)
-        n_layer_cross_k = torch.from_numpy(n_layer_cross_k)
-        n_layer_cross_v = torch.from_numpy(n_layer_cross_v)
-        n_layer_cross_k = n_layer_cross_k.unsqueeze(2).repeat(
-            1, 1, beam_size, 1, 1
-        ).view(num_layer, beam_size * batch_size, Ti, encoder_out_dim)
-        n_layer_cross_v = n_layer_cross_v.unsqueeze(2).repeat(
-            1, 1, beam_size, 1, 1
-        ).view(num_layer, beam_size * batch_size, Ti, encoder_out_dim)
-        prediction_tokens = torch.ones(
-            beam_size * batch_size, 1).fill_(self.sos_id).long()
-        tokens = prediction_tokens
-        offset = torch.zeros(1, dtype=torch.int64)
-        n_layer_self_k_cache, n_layer_self_v_cache = self.get_initialized_self_cache(
-            batch_size, beam_size
-        )
-        scores = torch.tensor([0.0] + [-INF]*(beam_size - 1)).float()
-        scores = scores.repeat(batch_size).view(batch_size * beam_size, 1)
-        is_finished = torch.zeros_like(scores)
-        # self_attn_mask = torch.zeros(
-        #     batch_size * beam_size,
-        #     1, 1
-        # )
-        self_attn_mask = np.zeros((batch_size * beam_size, 1, 1), dtype=np.float32)
-        results = [self.sos_id]
-        for i in range(self.decode_max_len):
-            # ==== ORIGIN ====
-            # self_attn_mask = torch.empty(
-            #     batch_size * beam_size,
-            #     prediction_tokens.shape[-1], prediction_tokens.shape[-1]
-            # ).fill_(-np.inf).triu_(1)
-            # self_attn_mask = self_attn_mask[:, -1:, :]
-            # self_attn_mask = to_numpy(self_attn_mask)
-            # logits, n_layer_self_k_cache, n_layer_self_v_cache = self.decode_one_token(
-            #     to_numpy(tokens),
-            #     to_numpy(n_layer_self_k_cache),
-            #     to_numpy(n_layer_self_v_cache),
-            #     to_numpy(n_layer_cross_k),
-            #     to_numpy(n_layer_cross_v),
-            #     to_numpy(offset),
-            #     to_numpy(self_attn_mask),
-            #     to_numpy(cross_attn_mask)
-            # )
-            # ==== ORIGIN ====
-            # tokens = to_numpy(tokens)
-            # n_layer_self_k_cache = to_numpy(n_layer_self_k_cache)
-            # n_layer_self_v_cache = to_numpy(n_layer_self_v_cache)
-            # n_layer_cross_k = to_numpy(n_layer_cross_k)
-            # n_layer_cross_v = to_numpy(n_layer_cross_v)
-            # cross_attn_mask = to_numpy(cross_attn_mask)
-            # for name, npy in zip(
-            #     ["tokens", "n_layer_self_k_cache", "n_layer_self_v_cache", "n_layer_cross_k", "n_layer_cross_v", "pe", "self_attn_mask", "cross_attn_mask"],
-            #     [tokens, n_layer_self_k_cache, n_layer_self_v_cache, n_layer_cross_k, n_layer_cross_v, self.pe[offset], self_attn_mask, cross_attn_mask]
-            # ):
-            #     file_path = os.path.join(decoder_data_path, name)
-            #     os.makedirs(file_path, exist_ok=True)
-            #     np.save(os.path.join(file_path, f"{i}.npy"), npy)
-            # if i == 0:
-            #     logits, n_layer_self_k_cache, n_layer_self_v_cache = self.decode_main_one_token(
-            #         to_numpy(tokens),
-            #         to_numpy(n_layer_self_k_cache),
-            #         to_numpy(n_layer_self_v_cache),
-            #         to_numpy(n_layer_cross_k),
-            #         to_numpy(n_layer_cross_v),
-            #         self.pe[0],
-            #         self_attn_mask,
-            #         to_numpy(cross_attn_mask)
-            #     )
-            # else:
-            #     logits, n_layer_self_k_cache, n_layer_self_v_cache = self.decode_loop_one_token(
-            #         to_numpy(tokens),
-            #         to_numpy(n_layer_self_k_cache),
-            #         to_numpy(n_layer_self_v_cache),
-            #         to_numpy(n_layer_cross_k),
-            #         to_numpy(n_layer_cross_v),
-            #         self.pe[offset],
-            #         self_attn_mask,
-            #         to_numpy(cross_attn_mask)
-            #     )
-            logits, n_layer_self_k_cache, n_layer_self_v_cache = self.decode_loop_one_token(
-                    to_numpy(tokens),
-                    to_numpy(n_layer_self_k_cache),
-                    to_numpy(n_layer_self_v_cache),
-                    to_numpy(n_layer_cross_k),
-                    to_numpy(n_layer_cross_v),
-                    self.pe[offset],
-                    self_attn_mask,
-                    to_numpy(cross_attn_mask)
-                )
-            offset += 1
-            logits = torch.from_numpy(logits)
-            logits = logits.squeeze(1)
-            t_scores = F.log_softmax(logits, dim=-1)
-            t_topB_scores, t_topB_ys = torch.topk(t_scores, k=beam_size, dim=1)
-            t_topB_scores = set_finished_beam_score_to_zero(t_topB_scores, is_finished)
-            t_topB_ys = set_finished_beam_y_to_eos(t_topB_ys, is_finished, self.eos_id)
-            scores = scores + t_topB_scores
-            scores = scores.view(batch_size, beam_size * beam_size)
-            scores, topB_score_ids = torch.topk(scores, k=beam_size, dim=1)
-            scores = scores.view(-1, 1)
-            topB_row_number_in_each_B_rows_of_ys = torch.div(
-                topB_score_ids, beam_size).view(batch_size * beam_size)
-            stride = beam_size * torch.arange(batch_size).view(
-                batch_size, 1).repeat(1, beam_size).view(batch_size * beam_size)
-            topB_row_number_in_ys = topB_row_number_in_each_B_rows_of_ys.long() + stride.long()
-            prediction_tokens = prediction_tokens[topB_row_number_in_ys]
-            t_ys = torch.gather(
-                t_topB_ys.view(batch_size, beam_size * beam_size),
-                dim=1, index=topB_score_ids
-            ).view(beam_size * batch_size, 1)
-            tokens = t_ys
-            prediction_tokens = torch.cat((prediction_tokens, t_ys), dim=1)
-            n_layer_self_k_cache = torch.from_numpy(n_layer_self_k_cache)
-            n_layer_self_v_cache = torch.from_numpy(n_layer_self_v_cache)
-            for i, self_k_cache in enumerate(n_layer_self_k_cache):
-                n_layer_self_k_cache[i] = n_layer_self_k_cache[i][topB_row_number_in_ys]
-            for i, self_v_cache in enumerate(n_layer_self_v_cache):
-                n_layer_self_v_cache[i] = n_layer_self_v_cache[i][topB_row_number_in_ys]
-            is_finished = t_ys.eq(self.eos_id)
-            if is_finished.sum().item() == beam_size * batch_size:
-                break
-        scores = scores.view(batch_size, beam_size)
-        prediction_valid_token_lengths = torch.sum(
-            torch.ne(
-                prediction_tokens.view(batch_size, beam_size, -1),
-                self.eos_id),
-            dim=-1
-        ).int()
-        nbest_scores, nbest_ids = torch.topk(scores, k=nbest, dim=1)
-        index = nbest_ids + beam_size * torch.arange(batch_size).view(batch_size, 1).long()
-        nbest_prediction_tokens = prediction_tokens.view(batch_size * beam_size, -1)[index.view(-1)]
-        nbest_prediction_tokens = nbest_prediction_tokens.view(batch_size, nbest_ids.size(1), -1)
-        nbest_prediction_valid_token_lengths = prediction_valid_token_lengths.view(
-            batch_size * beam_size)[index.view(-1)].view(batch_size, -1)
-        nbest_hyps: List[List[Dict[str, torch.Tensor]]] = []
-        for i in range(batch_size):
-            i_best_hyps: List[Dict[str, torch.Tensor]] = []
-            for j, score in enumerate(nbest_scores[i]):
-                hyp = {
-                    "token_ids": nbest_prediction_tokens[i, j, 1:nbest_prediction_valid_token_lengths[i, j]],
-                    "score": score
-                }
-                i_best_hyps.append(hyp)
-            nbest_hyps.append(i_best_hyps)
-        return nbest_hyps
-    def get_initialized_self_cache(self,
-                                   batch_size,
-                                   beam_size
-                                   ) -> Tuple[Tensor, Tensor]:
-        """Allocate zero-filled self-attention key and value caches.
-
-        Both tensors have the shape
-        ``(num_decoder_blocks, batch_size * beam_size, decode_max_len,
-        decoder_hidden_dim)``, all taken from attributes of ``self``.
-
-        Returns:
-            A ``(k_cache, v_cache)`` pair of separately allocated tensors.
-        """
-        cache_shape = (
-            self.num_decoder_blocks,
-            batch_size * beam_size,
-            self.decode_max_len,
-            self.decoder_hidden_dim,
-        )
-        # Two distinct allocations: the key and value caches must not alias.
-        return torch.zeros(cache_shape), torch.zeros(cache_shape)
-    def calc_feat_len(self, audio_dur, sample_rate=16000,
-                      frame_length_ms=25, frame_shift_ms=10):
-        """Return the number of feature frames produced for ``audio_dur`` seconds.
-
-        The count is ``floor((num_samples - frame_length) / frame_shift) + 1``,
-        i.e. only windows that fit entirely inside the signal are counted.
-
-        Args:
-            audio_dur: Audio duration in seconds.
-            sample_rate: Samples per second (default 16000).
-            frame_length_ms: Analysis window length in milliseconds (default 25).
-            frame_shift_ms: Hop between windows in milliseconds (default 10).
-
-        Returns:
-            The frame count as an int; 0 when the audio is shorter than a
-            single window (the bare formula would go negative there).
-        """
-        import math
-        num_samples = audio_dur * sample_rate
-        frame_length = frame_length_ms * sample_rate / 1000
-        frame_shift = frame_shift_ms * sample_rate / 1000
-        if num_samples < frame_length:
-            # Not even one complete window fits.
-            return 0
-        return math.floor((num_samples - frame_length) / frame_shift) + 1
-    def transcribe(self,
-                   batch_wav_path: List[str],
-                   beam_size: int = 1,
-                   nbest: int = 1
-                ) -> Tuple[List[Dict], List[float], float]:
-        """Extract features, run the encoder and the beam-search decoder.
-
-        Features are zero-padded or cropped along the time axis to exactly
-        ``self.calc_feat_len(10)`` frames before they are fed to the encoder.
-
-        Args:
-            batch_wav_path: Paths of the wav files forming one batch.
-            beam_size: Beam width forwarded to ``run_decoder``.
-            nbest: Number of hypotheses requested from ``run_decoder``; only
-                the first hypothesis of each utterance ends up in the result.
-
-        Returns:
-            A ``(results, wav_durations, transcribe_durations)`` tuple.
-            ``results`` holds one ``{"wav", "text", "score"}`` dict per input.
-            ``wav_durations`` is passed through unchanged from
-            ``self.feature_extractor`` (presumably one duration per wav --
-            confirm against the extractor). ``transcribe_durations`` is the
-            wall-clock time in seconds spent in the encoder and decoder only;
-            feature extraction is not included.
-        """
-        feats, lengths, wav_durations = self.feature_extractor(batch_wav_path)
-        print(f"feats.shape: {feats.shape}")
-        maxlen = self.calc_feat_len(10)
-        num_frames = feats.shape[1]
-        if num_frames < maxlen:
-            # Size the padding from feats itself so any batch size and feature
-            # dimension works, not only a single utterance of 80-dim features.
-            padding = np.zeros(
-                (feats.shape[0], maxlen - num_frames, feats.shape[2]),
-                dtype=np.float32,
-            )
-            feats = np.concatenate([feats, padding], axis=1)
-        # NOTE(review): ``lengths`` is not clamped when feats is cropped here;
-        # confirm the encoder tolerates lengths larger than maxlen.
-        feats = feats[:, :maxlen, :]
-        # Directory name handed to run_decoder. The code that dumped encoder
-        # and decoder inputs under "calib_dataset" as .npy files is disabled,
-        # so nothing is created on disk here.
-        decoder_data_path = os.path.join("calib_dataset", "decoder", os.path.basename(batch_wav_path[0]))
-        feats = to_numpy(feats)
-        lengths = to_numpy(lengths)
-        start_time = time.time()
-        n_layer_cross_k, n_layer_cross_v, cross_attn_mask = self.run_encoder(
-            to_numpy(feats),
-            to_numpy(lengths)
-        )
-        nbest_hyps = self.run_decoder(n_layer_cross_k,
-                                      n_layer_cross_v,
-                                      cross_attn_mask,
-                                      beam_size,
-                                      nbest,
-                                      decoder_data_path)
-        transcribe_durations = time.time() - start_time
-        results: List[Dict] = []
-        for wav, hyps in zip(batch_wav_path, nbest_hyps):
-            # Only the first hypothesis of each utterance is reported.
-            best_hyp = hyps[0]
-            token_ids = [int(token_id) for token_id in best_hyp["token_ids"].cpu()]
-            results.append(
-                {
-                    "wav": wav,
-                    "text": self.tokenizer.detokenize(token_ids),
-                    "score": best_hyp["score"].item(),
-                }
-            )
-        return results, wav_durations, transcribe_durations
-def parse_args():
-    """Parse the command-line options of the ONNX model test script.
-
-    Returns:
-        The ``argparse.Namespace`` with ``encoder``, ``decoder``, ``cmvn``,
-        ``dict``, ``spm_model``, ``wavlist``, ``hypo``, ``beam_size``,
-        ``nbest`` and ``provider`` attributes.
-    """
-    parser = argparse.ArgumentParser(description="FireRedASROnnxModel Test")
-    parser.add_argument(
-        "--encoder",
-        type=str,
-        default="onnx_encoder/encoder.onnx",
-        help="Path to onnx encoder"
-    )
-    parser.add_argument(
-        "--decoder",
-        type=str,
-        default="onnx_decoder/decoder.onnx",
-        help="Path to onnx decoder"
-    )
-    parser.add_argument(
-        "--cmvn",
-        type=str,
-        default="axmodel/cmvn.ark",
-        help="Path to cmvn"
-    )
-    parser.add_argument(
-        "--dict",
-        type=str,
-        default="axmodel/dict.txt",
-        help="Path to dict"
-    )
-    parser.add_argument(
-        "--spm_model",
-        type=str,
-        default="axmodel/train_bpe1000.model",
-        help="Path to spm model"
-    )
-    parser.add_argument(
-        "--wavlist",
-        type=str,
-        default="wavlist.txt",
-        help="File to wav path list"
-    )
-    parser.add_argument(
-        "--hypo",
-        type=str,
-        default="hypo_onnx.txt",
-        help="File of hypos"
-    )
-    parser.add_argument(
-        "--beam_size",
-        type=int,
-        default=3,
-        help="Beam width used by the decoder beam search"
-    )
-    parser.add_argument(
-        "--nbest",
-        type=int,
-        default=1,
-        help="Number of best hypotheses to keep per utterance; "
-             "must not exceed --beam_size"
-    )
-    parser.add_argument(
-        "--provider",
-        type=str,
-        default="CPUExecutionProvider",
-        choices=['CUDAExecutionProvider', 'CPUExecutionProvider'],
-        help="Execution provider used to run the onnx models"
-    )
-    return parser.parse_args()
-def parse_wavlist(wavlist: str):
-    """Read a text file listing one wav path per line.
-
-    Surrounding whitespace is stripped from every line. Blank lines are
-    skipped silently; paths that do not exist on disk are reported on stdout
-    and skipped.
-
-    Args:
-        wavlist: Path to the list file.
-
-    Returns:
-        The existing wav paths, in file order.
-    """
-    wavpaths = []
-    with open(wavlist) as f:
-        for line in f:
-            line = line.strip()
-            if not line:
-                # A blank line is not a missing file; do not warn about it.
-                continue
-            if not os.path.exists(line):
-                print(f"{line} doesn't exist.")
-                continue
-            wavpaths.append(line)
-    return wavpaths
-def main():
-    """Transcribe every wav in the wav list and report timing statistics.
-
-    For each wav the recognised text is written to the hypothesis file as
-    ``"<text> (<wav>)"``, and the durations and real-time factor (RTF,
-    transcription time divided by audio duration) are logged. Totals and the
-    average RTF are logged at the end.
-    """
-    args = parse_args()
-    print(args)
-    onnx_model = FireRedASROnnxModel(args.encoder,
-                                     args.decoder,
-                                     args.cmvn,
-                                     args.dict,
-                                     args.spm_model,
-                                     [args.provider])
-    total_wav_durations = 0
-    total_transcribe_durations = 0
-    # The context manager closes the hypothesis file even if transcription
-    # raises part-way through the list.
-    with open(args.hypo, "wt") as wf:
-        wavlist = parse_wavlist(args.wavlist)
-        for wav in wavlist:
-            # Wavs are transcribed one at a time (batch of size one).
-            batch_wav = [wav]
-            results, wav_durations, transcribe_durations = onnx_model.transcribe(
-                batch_wav, args.beam_size, args.nbest)
-            wav_durations = sum(wav_durations)
-            total_wav_durations += wav_durations
-            total_transcribe_durations += transcribe_durations
-            logger.info(f"{batch_wav}")
-            logger.info(f"Durations: {wav_durations}")
-            logger.info(f"Transcribe Durations: {transcribe_durations}")
-            if wav_durations > 0:
-                rtf = transcribe_durations / wav_durations
-                logger.info(f"(Real time factor) RTF: {rtf}")
-            else:
-                # Zero-length audio: the ratio is undefined, do not divide.
-                logger.info("(Real time factor) RTF: n/a (zero-length audio)")
-            for result in results:
-                logger.info(f"wav: {result['wav']}")
-                logger.info(f"text: {result['text']}")
-                logger.info(f"score: {result['score']}")
-                logger.info("")
-                wf.write(f"{result['text']} ({result['wav']})\n")
-    logger.info(f"total wav durations: {total_wav_durations}")
-    logger.info(f"total transcribe durations: {total_transcribe_durations}")
-    if total_wav_durations > 0:
-        avg_rtf = total_transcribe_durations / total_wav_durations
-        logger.info(f"AVG RTF: {avg_rtf}")
-    else:
-        # Empty wav list, or only zero-length audio.
-        logger.info("AVG RTF: n/a (no audio was transcribed)")
-    # NOTE: packing the dumped calibration inputs under ./calib_dataset into
-    # per-input .tar.gz archives was done here by code that is now disabled.
-# Call main() only when this file is executed as a script, not when imported.
-if __name__ == "__main__":
-    main()