WeNet / export_onnx.py

First commit

3c50954 about 1 month ago

65.4 kB

	# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	from __future__ import print_function

	import argparse
	import copy
	import logging
	import os
	import sys
	import tarfile
	import urllib.request

	import numpy as np
	import torch
	import torch.nn.functional as F
	import yaml
	from wenet.transformer.ctc import CTC
	from wenet.transformer.decoder import TransformerDecoder
	from wenet.transformer.encoder import BaseEncoder
	from wenet.utils.init_model import init_model
	from wenet.utils.mask import make_pad_mask
	from typing import List, Tuple

	try:
	import onnx
	import onnxruntime
	from onnx import helper, numpy_helper
	from onnxsim import simplify
	except ImportError:
	print("Please install onnxruntime!")
	sys.exit(1)

	# Module-level logger. It is named after this file's path (``__file__``)
	# and its threshold is set to INFO.
	logger = logging.getLogger(__file__)
	logger.setLevel(logging.INFO)

	# Location of a pretrained model archive (a .tar.gz hosted on Hugging Face).
	# NOTE(review): presumably the default for ``args.pretrained_model_url``;
	# the argument parser is not visible here - confirm.
	DEFAULT_PRETRAINED_MODEL_URL = (
	    "https://huggingface.co/openspeech/wenet-models/resolve/main/"
	    "aishell_u2pp_conformer_exp.tar.gz")
	# NOTE(review): presumably the default for ``args.pretrained_model_dir``,
	# i.e. the directory the archive above unpacks to - confirm.
	DEFAULT_PRETRAINED_MODEL_DIR = "pretrained/aishell_u2pp_conformer_exp"


	def safe_extract_tar(tar, output_dir):
	    """Extract ``tar`` into ``output_dir`` after validating every member.

	    Each member's destination is resolved with ``os.path.abspath`` and must
	    be ``output_dir`` itself or a path below it. Symbolic and hard links
	    must additionally point inside ``output_dir``, because a link that
	    escapes the directory lets a later member be written through it.

	    Args:
	        tar: An open ``tarfile.TarFile``.
	        output_dir: Directory the archive is extracted into.

	    Raises:
	        RuntimeError: If a member path or a link target resolves outside
	            ``output_dir``. Validation runs before extraction, so nothing
	            is written in that case.
	    """
	    output_dir = os.path.abspath(output_dir)

	    def is_inside(path):
	        # commonpath compares whole path components and accepts the
	        # directory itself, which archives commonly list as a "./" entry.
	        try:
	            return os.path.commonpath([output_dir, path]) == output_dir
	        except ValueError:
	            # Raised when the paths cannot be compared (e.g. different
	            # drives on Windows); such a path is certainly not inside.
	            return False

	    for member in tar.getmembers():
	        member_path = os.path.abspath(os.path.join(output_dir, member.name))
	        if not is_inside(member_path):
	            raise RuntimeError(f"Unsafe tar member path: {member.name}")

	        if member.issym():
	            # Symlink targets are interpreted relative to the link's folder.
	            link_base = os.path.dirname(member_path)
	        elif member.islnk():
	            # Hard-link targets are interpreted relative to the archive root.
	            link_base = output_dir
	        else:
	            continue
	        link_target = os.path.abspath(
	            os.path.join(link_base, member.linkname))
	        if not is_inside(link_target):
	            raise RuntimeError(
	                f"Unsafe tar link target: {member.name} -> {member.linkname}")

	    tar.extractall(output_dir)


	def download_file(url, output_path):
	    """Download ``url`` to ``output_path``.

	    The parent directory of ``output_path`` is created when the path has
	    one. The payload is first written to ``output_path + ".part"`` and only
	    moved into place once the transfer has finished, so an interrupted
	    download never leaves a truncated file at ``output_path``.

	    Args:
	        url: Location to fetch; passed to ``urllib.request.urlretrieve``.
	        output_path: Destination file path.

	    Raises:
	        Whatever ``urllib.request.urlretrieve`` or the file system
	        operations raise; the partial file is removed first.
	    """
	    output_dir = os.path.dirname(output_path)
	    # A bare file name has no directory part and os.makedirs("") would fail.
	    if output_dir:
	        os.makedirs(output_dir, exist_ok=True)
	    print(f"Downloading pretrained model from {url}")
	    print(f"Saving to {output_path}")

	    partial_path = output_path + ".part"
	    try:
	        urllib.request.urlretrieve(url, partial_path)
	        os.replace(partial_path, output_path)
	    finally:
	        # After a successful os.replace the partial file no longer exists.
	        if os.path.exists(partial_path):
	            os.remove(partial_path)


	def prepare_pretrained_model(args):
	    """Make sure the pretrained model is on disk and point ``args`` at it.

	    When ``args.pretrained_model_dir`` does not exist, the sibling archive
	    ``<pretrained_model_dir>.tar.gz`` is downloaded from
	    ``args.pretrained_model_url`` (unless already present) and extracted
	    into the parent directory. Afterwards ``args.config``,
	    ``args.checkpoint`` and ``args.cmvn_file`` are set to ``train.yaml``,
	    ``final.pt`` and ``global_cmvn`` inside the model directory.

	    Raises:
	        FileNotFoundError: If the config or the checkpoint is missing.
	    """
	    model_dir = args.pretrained_model_dir
	    parent, leaf = os.path.split(model_dir.rstrip(os.sep))
	    archive_dir = parent or "."
	    archive_path = os.path.join(archive_dir, leaf + ".tar.gz")

	    if not os.path.exists(model_dir):
	        archive_available = os.path.exists(archive_path)
	        if not archive_available:
	            download_file(args.pretrained_model_url, archive_path)
	        print(f"Extracting pretrained model to {archive_dir}")
	        with tarfile.open(archive_path, "r:gz") as tar:
	            safe_extract_tar(tar, archive_dir)

	    args.config = os.path.join(model_dir, "train.yaml")
	    args.checkpoint = os.path.join(model_dir, "final.pt")
	    args.cmvn_file = os.path.join(model_dir, "global_cmvn")

	    # Only the config and the checkpoint are mandatory; CMVN is optional.
	    missing = []
	    for required in (args.config, args.checkpoint):
	        if not os.path.exists(required):
	            missing.append(required)
	    if missing:
	        raise FileNotFoundError(
	            "Missing pretrained model files: " + ", ".join(missing))

	    print(f"Using config: {args.config}")
	    print(f"Using checkpoint: {args.checkpoint}")
	    if os.path.exists(args.cmvn_file):
	        print(f"Using CMVN: {args.cmvn_file}")


	def _constant_node_value(node):
	if node is None or node.op_type != "Constant":
	return None
	for attr in node.attribute:
	if attr.name == "value":
	return numpy_helper.to_array(attr.t)
	return None


	def _attribute_value(attr):
	    """Decode an ONNX attribute via ``helper.get_attribute_value``."""
	    decoded = helper.get_attribute_value(attr)
	    return decoded


	def _get_attr(node, name, default=None):
	for attr in node.attribute:
	if attr.name == name:
	return _attribute_value(attr)
	return default


	def _cast_to_onnx_dtype(value, to_dtype):
	    """Cast the numpy array ``value`` to the ONNX element type ``to_dtype``.

	    ``to_dtype`` is a ``TensorProto.DataType`` enum value. ``None`` is
	    returned for element types without an entry in the table below.
	    """
	    # ONNX names that differ from the numpy attribute of the same meaning.
	    numpy_types = {
	        "float": np.float32,
	        "double": np.float64,
	        "bool": np.bool_,
	    }
	    # ONNX names that match the numpy attribute name once lower-cased.
	    shared_names = ("float16", "int64", "int32", "int16", "int8", "uint64",
	                    "uint32", "uint16", "uint8")
	    for shared in shared_names:
	        numpy_types[shared] = getattr(np, shared)

	    type_name = onnx.TensorProto.DataType.Name(to_dtype).lower()
	    target = numpy_types.get(type_name)
	    if target is None:
	        return None
	    return value.astype(target)


	def _shape_from_value_info(value_info):
	if not value_info.type.HasField("tensor_type"):
	return None
	if not value_info.type.tensor_type.shape.dim:
	return None
	shape = []
	for dim in value_info.type.tensor_type.shape.dim:
	if dim.HasField("dim_value") and dim.dim_value > 0:
	shape.append(dim.dim_value)
	else:
	return None
	return tuple(shape)


	def _collect_static_shapes(model):
	    """Map value names to static shapes after ONNX shape inference.

	    Graph inputs, intermediate values and graph outputs contribute an
	    entry only when their shape is fully static. Initializers always
	    contribute their ``dims`` and take precedence on name clashes.
	    """
	    graph = onnx.shape_inference.infer_shapes(model).graph
	    shapes = {}
	    for group in (graph.input, graph.value_info, graph.output):
	        for value_info in group:
	            static_shape = _shape_from_value_info(value_info)
	            if static_shape is not None:
	                shapes[value_info.name] = static_shape
	    shapes.update(
	        (initializer.name, tuple(initializer.dims))
	        for initializer in graph.initializer)
	    return shapes


	def _eval_static_node(node, inputs, static_shapes):
	if node.op_type == "Constant":
	return _constant_node_value(node)
	if node.op_type != "Shape" and any(value is None for value in inputs):
	return None

	try:
	if node.op_type == "Add":
	return np.add(inputs[0], inputs[1])
	if node.op_type == "Sub":
	return np.subtract(inputs[0], inputs[1])
	if node.op_type == "Mul":
	return np.multiply(inputs[0], inputs[1])
	if node.op_type == "Div":
	return np.divide(inputs[0], inputs[1])
	if node.op_type == "Equal":
	return np.equal(inputs[0], inputs[1])
	if node.op_type == "Greater":
	return np.greater(inputs[0], inputs[1])
	if node.op_type == "GreaterOrEqual":
	return np.greater_equal(inputs[0], inputs[1])
	if node.op_type == "Less":
	return np.less(inputs[0], inputs[1])
	if node.op_type == "LessOrEqual":
	return np.less_equal(inputs[0], inputs[1])
	if node.op_type == "Where":
	return np.where(inputs[0], inputs[1], inputs[2])
	if node.op_type == "Concat":
	axis = _get_attr(node, "axis", 0)
	return np.concatenate(inputs, axis=axis)
	if node.op_type == "Unsqueeze":
	axes = _get_attr(node, "axes")
	if axes is None and len(inputs) > 1:
	axes = inputs[1]
	axes = tuple(int(axis) for axis in np.asarray(axes).reshape(-1))
	return np.expand_dims(inputs[0], axis=axes)
	if node.op_type == "Squeeze":
	axes = _get_attr(node, "axes")
	if axes is None and len(inputs) > 1:
	axes = inputs[1]
	if axes is None:
	return np.squeeze(inputs[0])
	axes = tuple(int(axis) for axis in np.asarray(axes).reshape(-1))
	return np.squeeze(inputs[0], axis=axes)
	if node.op_type == "Cast":
	return _cast_to_onnx_dtype(inputs[0], _get_attr(node, "to"))
	if node.op_type == "Reshape":
	return np.reshape(inputs[0], tuple(int(i) for i in inputs[1]))
	if node.op_type == "Shape":
	if inputs[0] is not None:
	shape = inputs[0].shape
	else:
	shape = static_shapes.get(node.input[0])
	if shape is None:
	return None
	return np.asarray(shape, dtype=np.int64)
	if node.op_type == "Slice":
	data = inputs[0]
	starts = np.asarray(inputs[1]).reshape(-1)
	ends = np.asarray(inputs[2]).reshape(-1)
	axes = (np.asarray(inputs[3]).reshape(-1)
	if len(inputs) > 3 and inputs[3] is not None else
	np.arange(len(starts)))
	steps = (np.asarray(inputs[4]).reshape(-1)
	if len(inputs) > 4 and inputs[4] is not None else
	np.ones(len(starts), dtype=np.int64))
	slices = [slice(None)] * data.ndim
	for start, end, axis, step in zip(starts, ends, axes, steps):
	axis = int(axis)
	start = int(start)
	end = int(end)
	step = int(step)
	if end >= np.iinfo(np.int32).max:
	end = None
	if end <= np.iinfo(np.int32).min:
	end = None
	slices[axis] = slice(start, end, step)
	return data[tuple(slices)]
	if node.op_type == "Gather":
	axis = _get_attr(node, "axis", 0)
	return np.take(inputs[0], inputs[1], axis=axis)
	except Exception:
	return None
	return None


	def _constant_node(output_name, value, name):
	    """Build a ``Constant`` node called ``name`` that yields ``value``.

	    The node has no inputs and the single output ``output_name``; the
	    embedded tensor is named ``output_name + "_value"``.
	    """
	    tensor = numpy_helper.from_array(np.asarray(value),
	                                     name=f"{output_name}_value")
	    node_kwargs = {
	        "inputs": [],
	        "outputs": [output_name],
	        "name": name,
	        "value": tensor,
	    }
	    return helper.make_node("Constant", **node_kwargs)


	def _node_attributes(node):
	return {attr.name: helper.get_attribute_value(attr) for attr in node.attribute}


	def _copy_node(node, inputs=None, outputs=None, name=None):
	copied = copy.deepcopy(node)
	if inputs is not None:
	del copied.input[:]
	copied.input.extend(inputs)
	if outputs is not None:
	del copied.output[:]
	copied.output.extend(outputs)
	if name is not None:
	copied.name = name
	return copied


	def _producer_map(model):
	return {output: node for node in model.graph.node for output in node.output}


	def _unsqueeze_greater_equal_pattern(producer, value_name):
	unsqueeze = producer.get(value_name)
	if unsqueeze is None or unsqueeze.op_type != "Unsqueeze":
	return None, None
	compare = producer.get(unsqueeze.input[0])
	if compare is None or compare.op_type != "GreaterOrEqual":
	return None, None
	return unsqueeze, compare


	def rewrite_pulsar2_bool_not(onnx_path):
	    """Remove simple Not nodes that Pulsar2 quantization can cast to float.

	    The encoder mask contains Not(Unsqueeze(GreaterOrEqual(...))) and another
	    Not over a sliced version of that mask. Pulsar2 can quantize the Not input
	    to FP32 and then fail because bitwise Not only accepts bool/integer tensors.
	    Rewriting those patterns keeps the graph boolean-equivalent without Not.

	    Three patterns are rewritten, tried in this order for every Not node:

	    * ``Not(GreaterOrEqual(a, b))`` becomes ``Less(a, b)``.
	    * ``Not(Unsqueeze(GreaterOrEqual(a, b)))`` becomes
	      ``Unsqueeze(Less(a, b))``.
	    * ``Not(Slice(Slice(Not(Unsqueeze(GreaterOrEqual)))))`` becomes the two
	      Slices applied directly to the Unsqueeze output (double negation).

	    The nodes that fed a rewritten Not are kept in the graph; only the Not
	    itself is replaced. The model is checked and written back to
	    ``onnx_path`` only when at least one Not was rewritten.

	    Args:
	        onnx_path: Path of the ONNX model, updated in place.
	    """
	    model = onnx.load(onnx_path)
	    producer = _producer_map(model)
	    rewritten = 0
	    new_nodes = []

	    for node in model.graph.node:
	        if node.op_type != "Not":
	            new_nodes.append(node)
	            continue

	        source_name = node.input[0]

	        # Pattern 1: Not directly over the comparison.
	        compare = producer.get(source_name)
	        if compare is not None and compare.op_type == "GreaterOrEqual":
	            new_nodes.append(
	                helper.make_node("Less",
	                                 inputs=list(compare.input),
	                                 outputs=list(node.output),
	                                 name=node.name + "_less",
	                                 **_node_attributes(compare)))
	            rewritten += 1
	            continue

	        # Pattern 2: Not over the unsqueezed comparison.
	        unsqueeze, compare = _unsqueeze_greater_equal_pattern(
	            producer, source_name)
	        if unsqueeze is not None:
	            less_output = node.output[0] + "_less"
	            less = helper.make_node("Less",
	                                    inputs=list(compare.input),
	                                    outputs=[less_output],
	                                    name=node.name + "_less",
	                                    **_node_attributes(compare))
	            rewritten_unsqueeze = _copy_node(
	                unsqueeze,
	                inputs=[less_output] + list(unsqueeze.input[1:]),
	                outputs=list(node.output),
	                name=node.name + "_unsqueeze")
	            new_nodes.extend([less, rewritten_unsqueeze])
	            rewritten += 1
	            continue

	        # Pattern 3: Not over a twice-sliced, already negated mask.
	        sliced = _match_sliced_negated_mask(producer, source_name)
	        if sliced is not None:
	            slice_1, slice_0, unsqueeze = sliced
	            slice_0_output = node.output[0] + "_slice0"
	            rewritten_slice_0 = _copy_node(
	                slice_0,
	                inputs=[unsqueeze.output[0]] + list(slice_0.input[1:]),
	                outputs=[slice_0_output],
	                name=node.name + "_slice0")
	            rewritten_slice_1 = _copy_node(
	                slice_1,
	                inputs=[slice_0_output] + list(slice_1.input[1:]),
	                outputs=list(node.output),
	                name=node.name + "_slice1")
	            new_nodes.extend([rewritten_slice_0, rewritten_slice_1])
	            rewritten += 1
	            continue

	        new_nodes.append(node)

	    if rewritten:
	        del model.graph.node[:]
	        model.graph.node.extend(new_nodes)
	        onnx.checker.check_model(model)
	        onnx.save(model, onnx_path)
	        print(f"Rewrote {rewritten} bool Not node(s) in {onnx_path}")


	def _match_sliced_negated_mask(producer, value_name):
	    """Match Slice(Slice(Not(Unsqueeze(GreaterOrEqual)))) at ``value_name``.

	    Returns ``(outer_slice, inner_slice, unsqueeze)`` on a match, otherwise
	    ``None``. Every operator type is verified before the node's first input
	    is read, so producers without inputs (e.g. Constant) are rejected
	    instead of raising ``IndexError``.
	    """
	    outer_slice = producer.get(value_name)
	    if outer_slice is None or outer_slice.op_type != "Slice":
	        return None
	    inner_slice = producer.get(outer_slice.input[0])
	    if inner_slice is None or inner_slice.op_type != "Slice":
	        return None
	    inner_not = producer.get(inner_slice.input[0])
	    if inner_not is None or inner_not.op_type != "Not":
	        return None
	    unsqueeze, _ = _unsqueeze_greater_equal_pattern(producer,
	                                                    inner_not.input[0])
	    if unsqueeze is None:
	        return None
	    return outer_slice, inner_slice, unsqueeze


	def rewrite_pulsar2_bool_and(onnx_path):
	    """Replace boolean And with arithmetic comparison for Pulsar2 quantization.

	    Every ``And`` node with exactly two inputs and one output is replaced
	    by ``Greater(Add(Cast(a, INT32), Cast(b, INT32)), 1)``. The model is
	    checked and saved back to ``onnx_path`` only when something changed.
	    """
	    model = onnx.load(onnx_path)
	    new_nodes = []
	    rewritten = 0

	    for node in model.graph.node:
	        is_binary_and = (node.op_type == "And" and len(node.input) == 2
	                         and len(node.output) == 1)
	        if not is_binary_and:
	            new_nodes.append(node)
	            continue

	        base = node.output[0]
	        left = base + "_left_i32"
	        right = base + "_right_i32"
	        added = base + "_sum"
	        threshold = base + "_threshold"

	        cast_specs = (
	            (node.input[0], left, "_cast_left"),
	            (node.input[1], right, "_cast_right"),
	        )
	        for source, target, suffix in cast_specs:
	            new_nodes.append(
	                helper.make_node("Cast",
	                                 inputs=[source],
	                                 outputs=[target],
	                                 name=node.name + suffix,
	                                 to=onnx.TensorProto.INT32))

	        sum_node = helper.make_node("Add",
	                                    inputs=[left, right],
	                                    outputs=[added],
	                                    name=node.name + "_add")
	        # Both operands are 0 or 1, so the sum exceeds 1 only when both
	        # inputs are true.
	        threshold_node = _constant_node(threshold,
	                                        np.asarray(1, dtype=np.int32),
	                                        node.name + "_threshold")
	        greater_node = helper.make_node("Greater",
	                                        inputs=[added, threshold],
	                                        outputs=list(node.output),
	                                        name=node.name + "_greater")
	        new_nodes.extend([sum_node, threshold_node, greater_node])
	        rewritten += 1

	    if rewritten:
	        del model.graph.node[:]
	        model.graph.node.extend(new_nodes)
	        onnx.checker.check_model(model)
	        onnx.save(model, onnx_path)
	        print(f"Rewrote {rewritten} bool And node(s) in {onnx_path}")


	def simplify_pulsar2_onnx(onnx_path):
	    """Run onnxsim on the model at ``onnx_path`` and overwrite it in place.

	    Raises:
	        RuntimeError: If onnxsim reports that the simplified model failed
	            its validation.
	    """
	    simplified, validated = simplify(onnx.load(onnx_path))
	    if validated:
	        onnx.checker.check_model(simplified)
	        onnx.save(simplified, onnx_path)
	        print(f"Simplified {onnx_path} for Pulsar2")
	    else:
	        raise RuntimeError(f"onnxsim failed to validate {onnx_path}")


	def fold_static_pulsar2_subgraphs(onnx_path):
	    """Fold static ONNX patterns that Pulsar2 5.0 cannot infer reliably.

	    Pulsar2 5.0 can fail shape inference on ConstantOfShape when its input is
	    a constant tensor value instead of an initializer. The legacy exporter emits
	    this pattern for masks/padding in the encoder graphs. It can also fail when
	    an Expand shape is produced by a constant-only subgraph such as
	    Mul/Equal/Where. Fold those static pieces before handing the model to
	    Pulsar2.

	    Nodes are visited in graph order. A single-output node whose value can
	    be computed from initializers and previously computed values is
	    replaced by a ``Constant`` node. Existing ``Constant`` nodes only
	    contribute their value to later folds: they are kept unchanged and are
	    not counted, so the reported number covers real folds only and an
	    already folded model is not rewritten. The model is checked and saved
	    back to ``onnx_path`` only when at least one node was folded.

	    Args:
	        onnx_path: Path of the ONNX model, updated in place.
	    """
	    model = onnx.load(onnx_path)
	    static_shapes = _collect_static_shapes(model)
	    constants = {
	        initializer.name: numpy_helper.to_array(initializer)
	        for initializer in model.graph.initializer
	    }
	    folded = 0
	    new_nodes = []
	    for node in model.graph.node:
	        inputs = [constants.get(name) for name in node.input]
	        if node.op_type == "ConstantOfShape" and node.input:
	            if inputs[0] is not None:
	                value = _eval_constant_of_shape(node, inputs[0])
	            else:
	                value = None
	        else:
	            value = _eval_static_node(node, inputs, static_shapes)

	        if value is None or len(node.output) != 1:
	            new_nodes.append(node)
	            continue

	        constants[node.output[0]] = value
	        if node.op_type == "Constant":
	            # Already constant: remember the value, keep the node as it is.
	            new_nodes.append(node)
	            continue
	        new_nodes.append(_constant_node(node.output[0], value, node.name))
	        folded += 1

	    if folded:
	        del model.graph.node[:]
	        model.graph.node.extend(new_nodes)
	        onnx.checker.check_model(model)
	        onnx.save(model, onnx_path)
	        print(f"Folded {folded} static node(s) in {onnx_path}")


	def _eval_constant_of_shape(node, shape_value):
	    """Evaluate a ``ConstantOfShape`` node for the known ``shape_value``.

	    The fill value comes from the node's ``value`` attribute and defaults
	    to float32 zero when the attribute is absent.
	    """
	    fill_value = np.array(0, dtype=np.float32)
	    for attr in node.attribute:
	        if attr.name == "value":
	            fill_value = numpy_helper.to_array(attr.t)
	            break
	    shape = tuple(int(dim) for dim in np.asarray(shape_value).reshape(-1))
	    return np.full(shape, fill_value.reshape(-1)[0], dtype=fill_value.dtype)


	class Encoder(torch.nn.Module):
	    """Export wrapper running the encoder, the CTC projection and a top-k."""

	    def __init__(self, encoder: BaseEncoder, ctc: CTC, beam_size: int = 10):
	        super().__init__()
	        self.encoder = encoder
	        self.ctc = ctc
	        self.beam_size = beam_size

	    def forward(self, speech: torch.Tensor, speech_lengths: torch.Tensor):
	        """Encode a batch of utterances.

	        Args:
	            speech: (Batch, Length, ...)
	            speech_lengths: (Batch, )
	        Returns:
	            encoder_out: B x T x F
	            encoder_out_lens: B, cast to int
	            ctc_log_probs: B x T x V, the output of ``ctc.linear``
	                (log_softmax is not applied here)
	            beam_log_probs: B x T x beam_size
	            beam_log_probs_idx: B x T x beam_size
	        """
	        encoder_out, encoder_mask = self.encoder(speech, speech_lengths, -1,
	                                                 -1)
	        # Number of unmasked frames per utterance.
	        valid_frames = encoder_mask.squeeze(1).sum(1).int()
	        ctc_scores = self.ctc.linear(encoder_out)
	        top_scores, top_indices = torch.topk(ctc_scores,
	                                             self.beam_size,
	                                             dim=2)
	        return (
	            encoder_out,
	            valid_frames,
	            ctc_scores,
	            top_scores,
	            top_indices,
	        )


	class StreamingEncoder(torch.nn.Module):
	    """Chunk-by-chunk export wrapper around ``model.encoder`` and ``model.ctc``.

	    ``forward`` re-implements the encoder's chunk forward pass with explicit
	    attention/CNN caches and a cache mask passed in and returned as tensors.
	    """

	    def __init__(
	        self,
	        model,
	        required_cache_size,
	        beam_size,
	        transformer=False,
	        return_ctc_logprobs=False,
	    ):
	        """Keep references to the pieces of ``model`` used by ``forward``.

	        Args:
	            model: Object exposing ``ctc`` and ``encoder`` (with ``embed``,
	                ``global_cmvn``, ``encoders``, ``normalize_before`` and
	                ``after_norm``).
	            required_cache_size: Number of trailing frames kept in the
	                returned attention cache and cache mask.
	            beam_size: ``k`` of the top-k taken over the CTC output.
	            transformer: When true, no CNN cache is collected.
	            return_ctc_logprobs: When true, ``forward`` returns the full
	                CTC output instead of its top-k.
	        """
	        super().__init__()
	        self.ctc = model.ctc
	        self.subsampling_rate = model.encoder.embed.subsampling_rate
	        self.embed = model.encoder.embed
	        self.global_cmvn = model.encoder.global_cmvn
	        self.required_cache_size = required_cache_size
	        self.beam_size = beam_size
	        self.encoder = model.encoder
	        self.transformer = transformer
	        self.return_ctc_logprobs = return_ctc_logprobs

	    def forward(self, chunk_xs, chunk_lens, offset, att_cache, cnn_cache,
	                cache_mask):
	        """Streaming Encoder

	        ``self.required_cache_size`` (set in the constructor) is the cache
	        size required for the next chunk computation:
	            > 0: actual cache size
	            <= 0: not allowed in streaming gpu encoder

	        Args:
	            chunk_xs (torch.Tensor): chunk input, with shape
	                (b, time, mel-dim), where `time == (chunk_size - 1) *
	                subsample_rate + subsample.right_context + 1`
	            chunk_lens (torch.Tensor): lengths used to build the padding
	                mask of `chunk_xs`; presumably shape (b,) - TODO confirm
	            offset (torch.Tensor): offset with shape (b, 1)
	                1 is retained for triton deployment
	            att_cache (torch.Tensor): cache tensor for KEY & VALUE in
	                transformer/conformer attention, with shape
	                (b, elayers, head, cache_t1, d_k * 2), where
	                `head * d_k == hidden-dim` and
	                `cache_t1 == chunk_size * num_decoding_left_chunks`.
	            cnn_cache (torch.Tensor): cache tensor for cnn_module in
	                conformer, documented as
	                (b, elayers, b, hidden-dim, cache_t2), where
	                `cache_t2 == cnn.lorder - 1`
	                NOTE(review): the repeated `b` looks like a typo - confirm.
	            cache_mask (torch.Tensor): cache mask, documented with shape
	                (b, required_cache_size). In a batch of requests, each
	                request may have a different history cache; the cache mask
	                indicates the effective cache for each request.
	                NOTE(review): it is concatenated with the chunk mask along
	                dim 2, so it is presumably 3-D - confirm.

	        Returns:
	            tuple: when ``self.return_ctc_logprobs`` is true
	                (log_ctc_probs, chunk_out, chunk_out_lens, r_offset,
	                r_att_cache, r_cnn_cache, r_cache_mask); otherwise
	                (log_probs, log_probs_idx, chunk_out, chunk_out_lens,
	                r_offset, r_att_cache, r_cnn_cache, r_cache_mask), where

	            log_ctc_probs: output of ``ctc.linear`` for the chunk
	                (log_softmax is not applied here)
	            log_probs / log_probs_idx: top ``beam_size`` values of
	                log_ctc_probs and their indices, documented with shape
	                (b, chunk_size, beam)
	            chunk_out: output of current input xs,
	                with shape (b, chunk_size, hidden-dim).
	            chunk_out_lens: ``chunk_lens // subsampling_rate``
	            r_offset: ``offset`` advanced by the chunk output length, with
	                the trailing dimension restored
	            r_att_cache: new attention cache required for next chunk, with
	                same shape (b, elayers, head, cache_t1, d_k * 2)
	                as the original att_cache
	            r_cnn_cache: new conformer cnn cache required for next chunk,
	                with same shape as the original cnn_cache; an empty list
	                when ``self.transformer`` is true
	            r_cache_mask: new cache mask, with same shape as the original
	                cache mask
	        """
	        offset = offset.squeeze(1)
	        T = chunk_xs.size(1)
	        # make_pad_mask comes from wenet.utils.mask; its result is inverted
	        # here. Annotated upstream as B X 1 X T.
	        chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1)
	        # B X 1 X T
	        chunk_mask = chunk_mask.to(chunk_xs.dtype)
	        # transpose batch & num_layers dim
	        att_cache = torch.transpose(att_cache, 0, 1)
	        cnn_cache = torch.transpose(cnn_cache, 0, 1)

	        # rewrite encoder.forward_chunk
	        # <---------forward_chunk START--------->
	        xs = self.global_cmvn(chunk_xs)
	        # chunk mask is important for batch inferencing since
	        # different sequence in a batch has different length
	        xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset)
	        cache_size = att_cache.size(3)  # required cache size
	        masks = torch.cat((cache_mask, chunk_mask), dim=2)
	        index = offset - cache_size

	        # The positional embedding returned by self.embed above is replaced
	        # by one that also covers the cached frames.
	        pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1))
	        pos_emb = pos_emb.to(dtype=xs.dtype)

	        # Negative start index: keep only the last required_cache_size
	        # entries along the sliced dimension.
	        next_cache_start = -self.required_cache_size
	        r_cache_mask = masks[:, :, next_cache_start:]

	        r_att_cache = []
	        r_cnn_cache = []
	        for i, layer in enumerate(self.encoder.encoders):
	            i_kv_cache = att_cache[i]
	            # The last dimension is split in two halves that are passed to
	            # the layer as a pair.
	            size = att_cache.size(-1) // 2
	            kv_cache = (i_kv_cache[:, :, :, :size], i_kv_cache[:, :, :, size:])
	            xs, _, new_kv_cache, new_cnn_cache = layer(
	                xs,
	                masks,
	                pos_emb,
	                att_cache=kv_cache,
	                cnn_cache=cnn_cache[i],
	            )
	            # shape(new_att_cache) is (B, head, attention_key_size, d_k * 2),
	            # shape(new_cnn_cache) is (B, hidden-dim, cache_t2)
	            new_att_cache = torch.cat(new_kv_cache, dim=-1)
	            r_att_cache.append(
	                new_att_cache[:, :, next_cache_start:, :].unsqueeze(1))
	            if not self.transformer:
	                r_cnn_cache.append(new_cnn_cache.unsqueeze(1))
	        if self.encoder.normalize_before:
	            chunk_out = self.encoder.after_norm(xs)
	        else:
	            chunk_out = xs

	        r_att_cache = torch.cat(r_att_cache, dim=1)  # concat on layers idx
	        if not self.transformer:
	            r_cnn_cache = torch.cat(r_cnn_cache, dim=1)  # concat on layers

	        # <---------forward_chunk END--------->

	        # log_ctc_probs = self.ctc.log_softmax(chunk_out)
	        log_ctc_probs = self.ctc.linear(chunk_out)
	        log_probs, log_probs_idx = torch.topk(log_ctc_probs,
	                                              self.beam_size,
	                                              dim=2)
	        log_probs = log_probs.to(chunk_xs.dtype)

	        r_offset = offset + chunk_out.shape[1]
	        # the below ops not supported in Tensorrt
	        # chunk_out_lens = torch.div(chunk_lens, subsampling_rate,
	        # rounding_mode='floor')
	        chunk_out_lens = chunk_lens // self.subsampling_rate
	        r_offset = r_offset.unsqueeze(1)
	        if self.return_ctc_logprobs:
	            return (
	                log_ctc_probs,
	                chunk_out,
	                chunk_out_lens,
	                r_offset,
	                r_att_cache,
	                r_cnn_cache,
	                r_cache_mask,
	            )
	        else:
	            return (
	                log_probs,
	                log_probs_idx,
	                chunk_out,
	                chunk_out_lens,
	                r_offset,
	                r_att_cache,
	                r_cnn_cache,
	                r_cache_mask,
	            )


	class StreamingSqueezeformerEncoder(torch.nn.Module):
	    """Chunk-by-chunk export wrapper for an encoder with time reduction.

	    ``forward`` re-implements the encoder's chunk forward pass, including
	    the time-reduction and time-recovery steps selected by
	    ``encoder.reduce_idx`` and ``encoder.recover_idx``.
	    """

	    def __init__(self, model, required_cache_size, beam_size):
	        """Keep references to the pieces of ``model`` used by ``forward``.

	        ``self.time_reduce`` is ``None`` when the encoder has no
	        ``reduce_idx``, ``"normal"`` when it reduces without recovering and
	        ``"recover"`` when both index lists are present (they must then
	        have the same length).
	        """
	        super().__init__()
	        self.ctc = model.ctc
	        self.subsampling_rate = model.encoder.embed.subsampling_rate
	        self.embed = model.encoder.embed
	        self.global_cmvn = model.encoder.global_cmvn
	        self.required_cache_size = required_cache_size
	        self.beam_size = beam_size
	        self.encoder = model.encoder
	        self.reduce_idx = model.encoder.reduce_idx
	        self.recover_idx = model.encoder.recover_idx
	        if self.reduce_idx is None:
	            self.time_reduce = None
	        else:
	            if self.recover_idx is None:
	                self.time_reduce = "normal"  # no recovery at the end
	            else:
	                self.time_reduce = "recover"  # recovery at the end
	                assert len(self.reduce_idx) == len(self.recover_idx)

	    def calculate_downsampling_factor(self, i: int) -> int:
	        """Return the time downsampling factor in effect at layer ``i``.

	        The factor is ``2 ** (reductions - recoveries)``, where the two
	        counts are one plus the position of the last entry of
	        ``reduce_idx`` / ``recover_idx`` that is ``<= i`` (0 when there is
	        none). It is 1 when the encoder has no ``reduce_idx``.
	        """
	        if self.reduce_idx is None:
	            return 1
	        else:
	            reduce_exp, recover_exp = 0, 0
	            for exp, rd_idx in enumerate(self.reduce_idx):
	                if i >= rd_idx:
	                    reduce_exp = exp + 1
	            if self.recover_idx is not None:
	                for exp, rc_idx in enumerate(self.recover_idx):
	                    if i >= rc_idx:
	                        recover_exp = exp + 1
	            return int(2**(reduce_exp - recover_exp))

	    def forward(self, chunk_xs, chunk_lens, offset, att_cache, cnn_cache,
	                cache_mask):
	        """Streaming Encoder

	        ``self.required_cache_size`` (set in the constructor) is the cache
	        size required for the next chunk computation:
	            > 0: actual cache size
	            <= 0: not allowed in streaming gpu encoder

	        Args:
	            chunk_xs (torch.Tensor): chunk input, with shape
	                (b, time, mel-dim), where `time == (chunk_size - 1) *
	                subsample_rate + subsample.right_context + 1`
	            chunk_lens (torch.Tensor): lengths used to build the padding
	                mask of `chunk_xs`; presumably shape (b,) - TODO confirm
	            offset (torch.Tensor): offset with shape (b, 1)
	                1 is retained for triton deployment
	            att_cache (torch.Tensor): cache tensor for KEY & VALUE in
	                transformer/conformer attention, with shape
	                (b, elayers, head, cache_t1, d_k * 2), where
	                `head * d_k == hidden-dim` and
	                `cache_t1 == chunk_size * num_decoding_left_chunks`.
	            cnn_cache (torch.Tensor): cache tensor for cnn_module in
	                conformer, documented as
	                (b, elayers, b, hidden-dim, cache_t2), where
	                `cache_t2 == cnn.lorder - 1`
	                NOTE(review): the repeated `b` looks like a typo - confirm.
	            cache_mask (torch.Tensor): cache mask, documented with shape
	                (b, required_cache_size). In a batch of requests, each
	                request may have a different history cache; the cache mask
	                indicates the effective cache for each request.
	                NOTE(review): it is concatenated with the chunk mask along
	                dim 2, so it is presumably 3-D - confirm.

	        Returns:
	            tuple: (log_probs, log_probs_idx, chunk_out, chunk_out_lens,
	                r_offset, r_att_cache, r_cnn_cache, r_cache_mask), where

	            log_probs / log_probs_idx: top ``beam_size`` values of the
	                ``ctc.linear`` output (log_softmax is not applied here)
	                and their indices, documented with shape
	                (b, chunk_size, beam)
	            chunk_out: output of current input xs,
	                with shape (b, chunk_size, hidden-dim).
	            chunk_out_lens: ``chunk_lens // subsampling_rate``
	            r_offset: ``offset`` advanced by the chunk output length, with
	                the trailing dimension restored
	            r_att_cache: new attention cache required for next chunk, with
	                same shape (b, elayers, head, cache_t1, d_k * 2)
	                as the original att_cache
	            r_cnn_cache: new conformer cnn cache required for next chunk,
	                with same shape as the original cnn_cache.
	            r_cache_mask: new cache mask, with same shape as the original
	                cache mask
	        """
	        offset = offset.squeeze(1)
	        T = chunk_xs.size(1)
	        # make_pad_mask comes from wenet.utils.mask; its result is inverted
	        # here. Annotated upstream as B X 1 X T.
	        chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1)
	        # B X 1 X T
	        chunk_mask = chunk_mask.to(chunk_xs.dtype)
	        # transpose batch & num_layers dim
	        att_cache = torch.transpose(att_cache, 0, 1)
	        cnn_cache = torch.transpose(cnn_cache, 0, 1)

	        # rewrite encoder.forward_chunk
	        # <---------forward_chunk START--------->
	        xs = self.global_cmvn(chunk_xs)
	        # chunk mask is important for batch inferencing since
	        # different sequence in a batch has different length
	        xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset)
	        elayers, cache_size = att_cache.size(0), att_cache.size(3)
	        att_mask = torch.cat((cache_mask, chunk_mask), dim=2)
	        index = offset - cache_size

	        # The positional embedding returned by self.embed above is replaced
	        # by one that also covers the cached frames.
	        pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1))
	        pos_emb = pos_emb.to(dtype=xs.dtype)

	        # Negative start index: keep only the last required_cache_size
	        # entries along the sliced dimension.
	        next_cache_start = -self.required_cache_size
	        r_cache_mask = att_mask[:, :, next_cache_start:]

	        r_att_cache = []
	        r_cnn_cache = []
	        mask_pad = torch.ones(1,
	                              xs.size(1),
	                              device=xs.device,
	                              dtype=torch.bool)
	        mask_pad = mask_pad.unsqueeze(1)
	        max_att_len: int = 0
	        # Stack of (xs, att_mask, pos_emb, mask_pad) saved before each time
	        # reduction and restored at the matching recovery layer.
	        recover_activations: List[Tuple[torch.Tensor, torch.Tensor,
	                                        torch.Tensor, torch.Tensor]] = []
	        # From here on `index` is reused as the depth of that stack.
	        index = 0
	        xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int)
	        xs = self.encoder.preln(xs)
	        for i, layer in enumerate(self.encoder.encoders):
	            if self.reduce_idx is not None:
	                if self.time_reduce is not None and i in self.reduce_idx:
	                    recover_activations.append(
	                        (xs, att_mask, pos_emb, mask_pad))
	                    (
	                        xs,
	                        xs_lens,
	                        att_mask,
	                        mask_pad,
	                    ) = self.encoder.time_reduction_layer(
	                        xs, xs_lens, att_mask, mask_pad)
	                    # Keep every second positional embedding.
	                    pos_emb = pos_emb[:, ::2, :]
	                    if self.encoder.pos_enc_layer_type == "rel_pos_repaired":
	                        pos_emb = pos_emb[:, :xs.size(1) * 2 - 1, :]
	                    index += 1

	            if self.recover_idx is not None:
	                if self.time_reduce == "recover" and i in self.recover_idx:
	                    index -= 1
	                    (
	                        recover_tensor,
	                        recover_att_mask,
	                        recover_pos_emb,
	                        recover_mask_pad,
	                    ) = recover_activations[index]
	                    # recover output length for ctc decode
	                    # (each frame is repeated twice along the time axis)
	                    xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2)
	                    xs = self.encoder.time_recover_layer(xs)
	                    recoverd_t = recover_tensor.size(1)
	                    xs = recover_tensor + xs[:, :recoverd_t, :].contiguous()
	                    att_mask = recover_att_mask
	                    pos_emb = recover_pos_emb
	                    mask_pad = recover_mask_pad

	            factor = self.calculate_downsampling_factor(i)

	            # The layer's cache is strided by `factor` along dim 2 and then
	            # truncated to pos_emb.size(1) - xs.size(1) entries.
	            xs, _, new_att_cache, new_cnn_cache = layer(
	                xs,
	                att_mask,
	                pos_emb,
	                att_cache=att_cache[i][:, :, ::factor, :]
	                [:, :, :pos_emb.size(1) - xs.size(1), :]
	                if elayers > 0 else att_cache[:, :, ::factor, :],
	                cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache,
	            )
	            cached_att = new_att_cache[:, :, next_cache_start // factor:, :]
	            cached_cnn = new_cnn_cache.unsqueeze(1)
	            # Repeat every cached frame `factor` times along dim 2.
	            cached_att = (cached_att.unsqueeze(3).repeat(1, 1, 1, factor,
	                                                         1).flatten(2, 3))
	            if i == 0:
	                # record length for the first block as max length
	                max_att_len = cached_att.size(2)
	            r_att_cache.append(cached_att[:, :, :max_att_len, :].unsqueeze(1))
	            r_cnn_cache.append(cached_cnn)

	        chunk_out = xs
	        r_att_cache = torch.cat(r_att_cache, dim=1)  # concat on layers idx
	        r_cnn_cache = torch.cat(r_cnn_cache, dim=1)  # concat on layers

	        # <---------forward_chunk END--------->

	        # log_ctc_probs = self.ctc.log_softmax(chunk_out)
	        log_ctc_probs = self.ctc.linear(chunk_out)
	        log_probs, log_probs_idx = torch.topk(log_ctc_probs,
	                                              self.beam_size,
	                                              dim=2)
	        log_probs = log_probs.to(chunk_xs.dtype)

	        r_offset = offset + chunk_out.shape[1]
	        # the below ops not supported in Tensorrt
	        # chunk_out_lens = torch.div(chunk_lens, subsampling_rate,
	        # rounding_mode='floor')
	        chunk_out_lens = chunk_lens // self.subsampling_rate
	        r_offset = r_offset.unsqueeze(1)

	        return (
	            log_probs,
	            log_probs_idx,
	            chunk_out,
	            chunk_out_lens,
	            r_offset,
	            r_att_cache,
	            r_cnn_cache,
	            r_cache_mask,
	        )


	class StreamingEfficientConformerEncoder(torch.nn.Module):

	def __init__(self, model, required_cache_size, beam_size):
	super().__init__()
	self.ctc = model.ctc
	self.subsampling_rate = model.encoder.embed.subsampling_rate
	self.embed = model.encoder.embed
	self.global_cmvn = model.encoder.global_cmvn
	self.required_cache_size = required_cache_size
	self.beam_size = beam_size
	self.encoder = model.encoder

	# Efficient Conformer
	self.stride_layer_idx = model.encoder.stride_layer_idx
	self.stride = model.encoder.stride
	self.num_blocks = model.encoder.num_blocks
	self.cnn_module_kernel = model.encoder.cnn_module_kernel

	def calculate_downsampling_factor(self, i: int) -> int:
	    """Return the product of the strides of all stride layers before ``i``.

	    A stride layer contributes only when its index is strictly smaller
	    than ``i``; with no such layer the factor is 1.
	    """
	    applied_strides = [
	        self.stride[position]
	        for position, layer_idx in enumerate(self.stride_layer_idx)
	        if i > layer_idx
	    ]
	    factor = 1
	    for stride in applied_strides:
	        factor *= stride
	    return factor

	def forward(self, chunk_xs, chunk_lens, offset, att_cache, cnn_cache,
	cache_mask):
	"""Streaming Encoder
	Args:
	chunk_xs (torch.Tensor): chunk input, with shape (b, time, mel-dim),
	where `time == (chunk_size - 1) * subsample_rate + \
	subsample.right_context + 1`
	chunk_lens (torch.Tensor):
	offset (torch.Tensor): offset with shape (b, 1)
	1 is retained for triton deployment
	att_cache (torch.Tensor): cache tensor for KEY & VALUE in
	transformer/conformer attention, with shape
	(b, elayers, head, cache_t1, d_k * 2), where
	`head * d_k == hidden-dim` and
	`cache_t1 == chunk_size * num_decoding_left_chunks`.
	cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer,
	(b, elayers, hidden-dim, cache_t2), where
	`cache_t2 == cnn.lorder - 1`
	cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size)
	in a batch of request, each request may have different
	history cache. Cache mask is used to indidate the effective
	cache for each request
	Returns:
	torch.Tensor: log probabilities of ctc output and cutoff by beam size
	with shape (b, chunk_size, beam)
	torch.Tensor: index of top beam size probabilities for each timestep
	with shape (b, chunk_size, beam)
	torch.Tensor: output of current input xs,
	with shape (b, chunk_size, hidden-dim).
	torch.Tensor: new attention cache required for next chunk, with
	same shape (b, elayers, head, cache_t1, d_k * 2)
	as the original att_cache
	torch.Tensor: new conformer cnn cache required for next chunk, with
	same shape as the original cnn_cache.
	torch.Tensor: new cache mask, with same shape as the original
	cache mask
	"""
	offset = offset.squeeze(1) # (b, )
	offset *= self.calculate_downsampling_factor(self.num_blocks + 1)

	T = chunk_xs.size(1)
	chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) # (b, 1, T)
	# B X 1 X T
	chunk_mask = chunk_mask.to(chunk_xs.dtype)
	# transpose batch & num_layers dim
	# Shape(att_cache): (elayers, b, head, cache_t1, d_k * 2)
	# Shape(cnn_cache): (elayers, b, outsize, cnn_kernel)
	att_cache = torch.transpose(att_cache, 0, 1)
	cnn_cache = torch.transpose(cnn_cache, 0, 1)

	# rewrite encoder.forward_chunk
	# <---------forward_chunk START--------->
	xs = self.global_cmvn(chunk_xs)
	# chunk mask is important for batch inferencing since
	# different sequence in a batch has different length
	xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset)
	cache_size = att_cache.size(3) # required cache size
	masks = torch.cat((cache_mask, chunk_mask), dim=2)
	att_mask = torch.cat((cache_mask, chunk_mask), dim=2)
	index = offset - cache_size

	pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1))
	pos_emb = pos_emb.to(dtype=xs.dtype)

	next_cache_start = -self.required_cache_size
	r_cache_mask = masks[:, :, next_cache_start:]

	r_att_cache = []
	r_cnn_cache = []
	mask_pad = chunk_mask.to(torch.bool)
	max_att_len, max_cnn_len = (
	0,
	0,
	) # for repeat_interleave of new_att_cache
	for i, layer in enumerate(self.encoder.encoders):
	factor = self.calculate_downsampling_factor(i)
	# NOTE(xcsong): Before layer.forward
	# shape(att_cache[i:i + 1]) is (b, head, cache_t1, d_k * 2),
	# shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2)
	# shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ]
	att_cache_trunc = 0
	if xs.size(1) + att_cache.size(3) / factor > pos_emb.size(1):
	# The time step is not divisible by the downsampling multiple
	# We propose to double the chunk_size.
	att_cache_trunc = (xs.size(1) + att_cache.size(3) // factor -
	pos_emb.size(1) + 1)
	xs, _, new_att_cache, new_cnn_cache = layer(
	xs,
	att_mask,
	pos_emb,
	mask_pad=mask_pad,
	att_cache=att_cache[i][:, :, ::factor, :][:, :,
	att_cache_trunc:, :],
	cnn_cache=cnn_cache[i, :, :, :]
	if cnn_cache.size(0) > 0 else cnn_cache,
	)

	if i in self.stride_layer_idx:
	# compute time dimension for next block
	efficient_index = self.stride_layer_idx.index(i)
	att_mask = att_mask[:, ::self.stride[efficient_index], ::self.
	stride[efficient_index], ]
	mask_pad = mask_pad[:, ::self.stride[efficient_index], ::self.
	stride[efficient_index], ]
	pos_emb = pos_emb[:, ::self.stride[efficient_index], :]

	# shape(new_att_cache) = [batch, head, time2, outdim]
	new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :]
	# shape(new_cnn_cache) = [batch, 1, outdim, cache_t2]
	new_cnn_cache = new_cnn_cache.unsqueeze(1) # shape(1):layerID

	# use repeat_interleave to new_att_cache
	# new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2)
	new_att_cache = (new_att_cache.unsqueeze(3).repeat(
	1, 1, 1, factor, 1).flatten(2, 3))
	# left-pad new_cnn_cache to cnn.lorder for causal convolution
	new_cnn_cache = F.pad(
	new_cnn_cache,
	(self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0),
	)

	if i == 0:
	# record length for the first block as max length
	max_att_len = new_att_cache.size(2)
	max_cnn_len = new_cnn_cache.size(3)

	# update real shape of att_cache and cnn_cache
	r_att_cache.append(new_att_cache[:, :,
	-max_att_len:, :].unsqueeze(1))
	r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:])

	if self.encoder.normalize_before:
	chunk_out = self.encoder.after_norm(xs)
	else:
	chunk_out = xs

	# shape of r_att_cache: (b, elayers, head, time2, outdim)
	r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx
	# shape of r_cnn_cache: (b, elayers, outdim, cache_t2)
	r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers

	# <---------forward_chunk END--------->

	# log_ctc_probs = self.ctc.log_softmax(chunk_out)
	log_ctc_probs = self.ctc.linear(chunk_out)
	log_probs, log_probs_idx = torch.topk(log_ctc_probs,
	self.beam_size,
	dim=2)
	log_probs = log_probs.to(chunk_xs.dtype)

	r_offset = offset + chunk_out.shape[1]
	# the below ops not supported in Tensorrt
	# chunk_out_lens = torch.div(chunk_lens, subsampling_rate,
	# rounding_mode='floor')
	chunk_out_lens = (
	chunk_lens // self.subsampling_rate //
	self.calculate_downsampling_factor(self.num_blocks + 1))
	chunk_out_lens += 1
	r_offset = r_offset.unsqueeze(1)

	return (
	log_probs,
	log_probs_idx,
	chunk_out,
	chunk_out_lens,
	r_offset,
	r_att_cache,
	r_cnn_cache,
	r_cache_mask,
	)


	class Decoder(torch.nn.Module):
	    """Rescoring wrapper around a WeNet decoder, shaped for ONNX export.

	    Each utterance arrives with ``beam_size`` candidate hypotheses. The
	    wrapped decoder scores every candidate, the score is mixed with the
	    supplied CTC score, and the index of the best candidate is returned.
	    """

	    def __init__(
	        self,
	        decoder: TransformerDecoder,
	        ctc_weight: float = 0.5,
	        reverse_weight: float = 0.0,
	        beam_size: int = 10,
	        decoder_fastertransformer: bool = False,
	    ):
	        super().__init__()
	        self.decoder = decoder
	        self.beam_size = beam_size
	        self.ctc_weight = ctc_weight
	        self.reverse_weight = reverse_weight
	        self.decoder_fastertransformer = decoder_fastertransformer

	    def forward(
	        self,
	        encoder_out: torch.Tensor,
	        encoder_lens: torch.Tensor,
	        hyps_pad_sos_eos: torch.Tensor,
	        hyps_lens_sos: torch.Tensor,
	        r_hyps_pad_sos_eos: torch.Tensor,
	        ctc_score: torch.Tensor,
	    ):
	        """Rescore the hypotheses and pick the best one per utterance.

	        Args:
	            encoder_out: B x T x F
	            encoder_lens: B
	            hyps_pad_sos_eos: B x beam x (T2+1),
	                hyps with sos & eos and padded by ignore id
	            hyps_lens_sos: B x beam, length for each hyp with sos
	            r_hyps_pad_sos_eos: B x beam x (T2+1),
	                reversed hyps with sos & eos and padded by ignore id
	            ctc_score: B x beam, ctc score for each hyp
	        Returns:
	            best_index (B) when ``decoder_fastertransformer`` is False,
	            otherwise the pair (decoder_out, best_index) where decoder_out
	            is B x beam x T2 x V.
	        """
	        batch, enc_len, feat_dim = encoder_out.shape
	        beam = self.beam_size
	        flat_batch = batch * beam

	        # Give every hypothesis its own copy of the encoder output and mask.
	        encoder_out = encoder_out.repeat(1, beam, 1).view(
	            flat_batch, enc_len, feat_dim)
	        encoder_mask = ~make_pad_mask(encoder_lens, enc_len).unsqueeze(1)
	        encoder_mask = encoder_mask.repeat(1, beam, 1).view(
	            flat_batch, 1, enc_len)

	        hyp_len = hyps_pad_sos_eos.shape[2] - 1
	        hyps_flat = hyps_pad_sos_eos.view(flat_batch, hyp_len + 1)
	        hyps_lens = hyps_lens_sos.view(flat_batch, )
	        # Decoder input drops the last token, the target drops the first.
	        hyps_in = hyps_flat[:, :-1].contiguous()
	        hyps_target = hyps_flat[:, 1:].contiguous()

	        r_hyps_flat = r_hyps_pad_sos_eos.view(flat_batch, hyp_len + 1)
	        r_hyps_in = r_hyps_flat[:, :-1].contiguous()
	        r_hyps_target = r_hyps_flat[:, 1:].contiguous()

	        decoder_out, r_decoder_out, _ = self.decoder(
	            encoder_out,
	            encoder_mask,
	            hyps_in,
	            hyps_lens,
	            r_hyps_in,
	            self.reverse_weight,
	        )
	        # No log_softmax is applied here; the decoder output is used as is.
	        vocab = decoder_out.shape[-1]
	        decoder_out = decoder_out.view(flat_batch, hyp_len, vocab)
	        valid = ~make_pad_mask(hyps_lens, hyp_len)  # B2 x T2
	        # Multiplying by the mask replaces the ignore id with index 0 so
	        # that gather() stays in range; those positions are zeroed below.
	        gather_idx = (hyps_target * valid).unsqueeze(2).to(torch.long)
	        score = torch.gather(decoder_out, 2, gather_idx).squeeze(2)
	        score = score * valid  # B2 x T2, padded part masked out
	        decoder_out = decoder_out.view(batch, beam, hyp_len, vocab)

	        if self.reverse_weight > 0:
	            r_decoder_out = r_decoder_out.view(flat_batch, hyp_len, vocab)
	            gather_idx = (r_hyps_target * valid).unsqueeze(2).to(torch.long)
	            r_score = torch.gather(r_decoder_out, 2, gather_idx).squeeze(2)
	            r_score = r_score * valid
	            score = (score * (1 - self.reverse_weight) +
	                     self.reverse_weight * r_score)
	            # Reshaped like decoder_out; not part of the returned values.
	            r_decoder_out = r_decoder_out.view(batch, beam, hyp_len, vocab)

	        score = score.sum(dim=1)  # B2
	        score = score.reshape(batch, beam) + self.ctc_weight * ctc_score
	        best_index = torch.argmax(score, dim=1)

	        if self.decoder_fastertransformer:
	            return decoder_out, best_index
	        return best_index


	def to_numpy(tensors):
	    """Convert an iterable of torch tensors into a list of NumPy arrays.

	    Args:
	        tensors: a list/tuple (or any iterable) of ``torch.Tensor``.
	            Tensors that require grad are detached; every tensor is moved
	            to the CPU before conversion.

	    Returns:
	        list of ``numpy.ndarray``, one entry per item yielded by
	        ``tensors``.

	    Note:
	        A bare tensor is *not* wrapped into a one-element list: it is
	        iterated along its first dimension, yielding one array per row.
	        The previous guard (``type(tensors) == torch.tensor``) compared a
	        type against the ``torch.tensor`` factory function and therefore
	        never matched, so this has always been the effective behaviour.
	        It is kept as-is because call sites may depend on it; wrap a single
	        tensor in a list to get a single array back.
	    """
	    # detach() is harmless on tensors that do not require grad, so a single
	    # expression covers both of the former branches.
	    return [tensor.detach().cpu().numpy() for tensor in tensors]


	def test(xlist, blist, rtol=1e-3, atol=1e-5, tolerate_small_mismatch=True):
	    """Compare two sequences of outputs element by element.

	    Pairs are formed with ``zip`` (so the shorter sequence decides how many
	    comparisons run) and checked with ``torch.testing.assert_allclose``.

	    Args:
	        xlist: reference outputs.
	        blist: outputs to check against the reference.
	        rtol: relative tolerance forwarded to the comparison.
	        atol: absolute tolerance forwarded to the comparison.
	        tolerate_small_mismatch: when True a failed comparison is printed
	            and the loop continues; when False the AssertionError
	            propagates.
	    """
	    for reference, candidate in zip(xlist, blist):
	        try:
	            torch.testing.assert_allclose(reference,
	                                          candidate,
	                                          rtol=rtol,
	                                          atol=atol)
	        except AssertionError as mismatch:
	            if not tolerate_small_mismatch:
	                raise
	            print(mismatch)


	def export_offline_encoder(model, configs, args, logger, encoder_onnx_path):
	    """Export the non-streaming encoder to ONNX and compare it with PyTorch.

	    The encoder is traced with a fixed-shape random input
	    (batch 1, 1024 frames), written to ``encoder_onnx_path``, post-processed
	    in place by the pulsar2 helper passes of this file, and finally run
	    through onnxruntime so its outputs can be compared with the PyTorch
	    outputs via ``test``.

	    Args:
	        model: initialised model providing ``encoder`` and ``ctc``.
	        configs: training configuration; ``input_dim`` and ``model_conf``
	            are read.
	        args: parsed CLI arguments; ``beam_size`` is read.
	        logger: logger used for the success message.
	        encoder_onnx_path: destination path of the ONNX file.

	    Returns:
	        dict with ``beam_size``, ``reverse_weight`` and ``ctc_weight``.
	    """
	    bz = 1
	    seq_len = 1024
	    beam_size = args.beam_size
	    feature_size = configs["input_dim"]

	    speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32)
	    speech_lens = torch.randint(low=10,
	                                high=seq_len,
	                                size=(bz, ),
	                                dtype=torch.int32)
	    encoder = Encoder(model.encoder, model.ctc, beam_size)
	    encoder.eval()

	    # The export is deliberately static-shape (dynamic_axes=None). For a
	    # dynamic export, axis 0 ("B") would be dynamic on every input/output
	    # and axis 1 on "speech" ("T") and on encoder_out / ctc_log_probs /
	    # beam_log_probs / beam_log_probs_idx ("T_OUT").
	    torch.onnx.export(
	        encoder,
	        (speech, speech_lens),
	        encoder_onnx_path,
	        export_params=True,
	        opset_version=13,
	        do_constant_folding=True,
	        input_names=["speech", "speech_lengths"],
	        output_names=[
	            "encoder_out",
	            "encoder_out_lens",
	            "ctc_log_probs",
	            "beam_log_probs",
	            "beam_log_probs_idx",
	        ],
	        dynamic_axes=None,
	        verbose=False,
	        dynamo=False,
	    )
	    fold_static_pulsar2_subgraphs(encoder_onnx_path)
	    simplify_pulsar2_onnx(encoder_onnx_path)
	    rewrite_pulsar2_bool_not(encoder_onnx_path)

	    with torch.no_grad():
	        o0, o1, o2, o3, o4 = encoder(speech, speech_lens)

	    providers = ["CPUExecutionProvider"]
	    ort_session = onnxruntime.InferenceSession(encoder_onnx_path,
	                                               providers=providers)
	    # Convert both inputs through a list so that each one becomes a single
	    # ndarray of the exported shape. Passing a bare tensor to to_numpy()
	    # would instead yield a Python list of per-row arrays and leave it to
	    # onnxruntime to stack them back together.
	    speech_np, speech_lens_np = to_numpy([speech, speech_lens])
	    ort_inputs = {
	        "speech": speech_np,
	        "speech_lengths": speech_lens_np,
	    }
	    ort_outs = ort_session.run(None, ort_inputs)

	    # check encoder output
	    test(to_numpy([o0, o1, o2, o3, o4]), ort_outs)
	    logger.info("export offline onnx encoder succeed!")
	    onnx_config = {
	        "beam_size": args.beam_size,
	        "reverse_weight": configs["model_conf"]["reverse_weight"],
	        "ctc_weight": configs["model_conf"]["ctc_weight"],
	    }
	    return onnx_config


	def export_online_encoder(model, configs, args, logger, encoder_onnx_path):
	    """Export the streaming (chunk-wise) encoder to ONNX and verify it.

	    A streaming wrapper matching ``configs["encoder"]`` is traced with one
	    fixed-shape chunk plus attention/CNN caches, written to
	    ``encoder_onnx_path``, post-processed in place by the pulsar2 helper
	    passes of this file and then run through onnxruntime so the outputs can
	    be compared with the PyTorch outputs via ``test``.

	    Args:
	        model: initialised model; ``encoder.embed.subsampling_rate`` and
	            ``encoder.embed.right_context`` are read.
	        configs: training configuration (``input_dim``, ``encoder``,
	            ``encoder_conf``).
	        args: parsed CLI arguments (``decoding_chunk_size``,
	            ``num_decoding_left_chunks``, ``beam_size``,
	            ``return_ctc_logprobs``).
	        logger: logger used for the success message.
	        encoder_onnx_path: destination path of the ONNX file.

	    Returns:
	        dict describing the streaming setup (chunk size, window, cache
	        sizes, ...) for the runtime configuration file.

	    Raises:
	        AssertionError: if ``return_ctc_logprobs`` is requested for a
	            transformer (no CNN module) encoder.
	    """
	    decoding_chunk_size = args.decoding_chunk_size
	    subsampling = model.encoder.embed.subsampling_rate
	    context = model.encoder.embed.right_context + 1
	    decoding_window = (decoding_chunk_size - 1) * subsampling + context
	    batch_size = 1
	    audio_len = decoding_window
	    encoder_conf = configs["encoder_conf"]
	    feature_size = configs["input_dim"]
	    output_size = encoder_conf["output_size"]
	    num_layers = encoder_conf["num_blocks"]
	    # in transformer the cnn module will not be available, which shows up
	    # as a CNN cache length of zero
	    cnn_module_kernel = encoder_conf.get("cnn_module_kernel", 1) - 1
	    transformer = not cnn_module_kernel
	    num_decoding_left_chunks = args.num_decoding_left_chunks
	    required_cache_size = decoding_chunk_size * num_decoding_left_chunks
	    if configs["encoder"] == "squeezeformer":
	        encoder = StreamingSqueezeformerEncoder(model, required_cache_size,
	                                                args.beam_size)
	    elif configs["encoder"] == "efficientConformer":
	        encoder = StreamingEfficientConformerEncoder(model,
	                                                     required_cache_size,
	                                                     args.beam_size)
	    else:
	        encoder = StreamingEncoder(
	            model,
	            required_cache_size,
	            args.beam_size,
	            transformer,
	            args.return_ctc_logprobs,
	        )
	    encoder.eval()

	    # begin to export encoder
	    chunk_xs = torch.randn(batch_size,
	                           audio_len,
	                           feature_size,
	                           dtype=torch.float32)
	    chunk_lens = torch.ones(batch_size, dtype=torch.int32) * audio_len

	    offset = torch.arange(0, batch_size, dtype=torch.int32).unsqueeze(1)
	    # Caches are built batch-first here:
	    # att_cache is (b, elayers, head, cache_t1, d_k * 2)
	    head = encoder_conf["attention_heads"]
	    d_k = encoder_conf["output_size"] // head
	    att_cache = torch.randn(
	        batch_size,
	        num_layers,
	        head,
	        required_cache_size,
	        d_k * 2,
	        dtype=torch.float32,
	    )
	    # cnn_cache is (b, elayers, outsize, cnn_kernel - 1)
	    cnn_cache = torch.randn(
	        batch_size,
	        num_layers,
	        output_size,
	        cnn_module_kernel,
	        dtype=torch.float32,
	    )

	    cache_mask = torch.ones(batch_size,
	                            1,
	                            required_cache_size,
	                            dtype=torch.float32)
	    input_names = [
	        "chunk_xs",
	        "chunk_lens",
	        "offset",
	        "att_cache",
	        "cnn_cache",
	        "cache_mask",
	    ]
	    output_names = [
	        "log_probs",
	        "log_probs_idx",
	        "chunk_out",
	        "chunk_out_lens",
	        "r_offset",
	        "r_att_cache",
	        "r_cnn_cache",
	        "r_cache_mask",
	    ]
	    if args.return_ctc_logprobs:
	        output_names = [
	            "ctc_log_probs",
	            "chunk_out",
	            "chunk_out_lens",
	            "r_offset",
	            "r_att_cache",
	            "r_cnn_cache",
	            "r_cache_mask",
	        ]
	    input_tensors = (
	        chunk_xs,
	        chunk_lens,
	        offset,
	        att_cache,
	        cnn_cache,
	        cache_mask,
	    )
	    if transformer:
	        assert (args.return_ctc_logprobs is
	                False), "return_ctc_logprobs is not supported in transformer"
	        # index 6 is "r_cnn_cache" in the default output list
	        output_names.pop(6)

	    # The export is deliberately static-shape. To make only the batch axis
	    # dynamic, pass
	    #   dynamic_axes={name: {0: "B"} for name in input_names + output_names}
	    torch.onnx.export(
	        encoder,
	        input_tensors,
	        encoder_onnx_path,
	        export_params=True,
	        opset_version=14,
	        do_constant_folding=True,
	        input_names=input_names,
	        output_names=output_names,
	        dynamic_axes=None,
	        verbose=False,
	        dynamo=False,
	    )
	    fold_static_pulsar2_subgraphs(encoder_onnx_path)
	    simplify_pulsar2_onnx(encoder_onnx_path)
	    rewrite_pulsar2_bool_not(encoder_onnx_path)

	    with torch.no_grad():
	        torch_outs = encoder(chunk_xs, chunk_lens, offset, att_cache,
	                             cnn_cache, cache_mask)
	    if transformer:
	        # Drop the CNN-cache output so the remaining outputs line up with
	        # output_names. list.pop() returns the removed element, so the
	        # list has to be bound first; binding the result of pop() would
	        # keep only the removed tensor and make the check below
	        # meaningless.
	        torch_outs = list(torch_outs)
	        torch_outs.pop(6)
	    ort_session = onnxruntime.InferenceSession(
	        encoder_onnx_path, providers=["CPUExecutionProvider"])

	    ort_inputs = dict(zip(input_names, to_numpy(input_tensors)))
	    if transformer:
	        # NOTE(review): presumably the unused cnn_cache input is pruned
	        # from the exported graph for transformer encoders - confirm.
	        del ort_inputs["cnn_cache"]
	    ort_outs = ort_session.run(None, ort_inputs)
	    test(to_numpy(torch_outs), ort_outs, rtol=1e-03, atol=1e-05)
	    logger.info("export to onnx streaming encoder succeed!")
	    onnx_config = {
	        "subsampling_rate": subsampling,
	        "context": context,
	        "decoding_chunk_size": decoding_chunk_size,
	        "num_decoding_left_chunks": num_decoding_left_chunks,
	        "beam_size": args.beam_size,
	        "feat_size": feature_size,
	        "decoding_window": decoding_window,
	        "cnn_module_kernel_cache": cnn_module_kernel,
	        "return_ctc_logprobs": args.return_ctc_logprobs,
	    }
	    return onnx_config


	def export_rescoring_decoder(model, configs, args, logger, decoder_onnx_path,
	                             decoder_fastertransformer):
	    """Export the rescoring decoder to ONNX and compare it with PyTorch.

	    The ``Decoder`` wrapper is traced with fixed-shape random inputs
	    (batch 1, 32 steps), written to ``decoder_onnx_path``, post-processed in
	    place by the pulsar2 helper passes of this file and then executed with
	    onnxruntime; the result is compared with the PyTorch output via
	    ``test``.

	    Args:
	        model: initialised model providing ``decoder``, ``ctc_weight`` and
	            ``reverse_weight``.
	        configs: training configuration; ``encoder_conf.output_size`` is
	            read.
	        args: parsed CLI arguments; ``beam_size`` is read.
	        logger: logger used for the success message.
	        decoder_onnx_path: destination path of the ONNX file.
	        decoder_fastertransformer: when True the exported graph also
	            returns ``decoder_out`` in front of ``best_index``.
	    """
	    batch_size, seq_len = 1, 32
	    beam_size = args.beam_size
	    decoder = Decoder(
	        model.decoder,
	        model.ctc_weight,
	        model.reverse_weight,
	        beam_size,
	        decoder_fastertransformer,
	    )
	    decoder.eval()

	    def random_int32(shape, upper):
	        # Random int32 values in [3, upper).
	        return torch.randint(low=3,
	                             high=upper,
	                             size=shape,
	                             dtype=torch.int32)

	    # The creation order below is kept fixed because the script seeds the
	    # RNG; changing it would change the example values.
	    hyps_pad_sos_eos = random_int32((batch_size, beam_size, seq_len), 1000)
	    hyps_lens_sos = random_int32((batch_size, beam_size), seq_len)
	    r_hyps_pad_sos_eos = random_int32((batch_size, beam_size, seq_len), 1000)

	    hidden_dim = configs["encoder_conf"]["output_size"]
	    encoder_out = torch.randn(batch_size,
	                              seq_len,
	                              hidden_dim,
	                              dtype=torch.float32)
	    encoder_out_lens = random_int32((batch_size, ), seq_len)
	    ctc_score = torch.randn(batch_size, beam_size, dtype=torch.float32)

	    # Insertion order defines the positional order of the graph inputs.
	    named_inputs = {
	        "encoder_out": encoder_out,
	        "encoder_out_lens": encoder_out_lens,
	        "hyps_pad_sos_eos": hyps_pad_sos_eos,
	        "hyps_lens_sos": hyps_lens_sos,
	        "r_hyps_pad_sos_eos": r_hyps_pad_sos_eos,
	        "ctc_score": ctc_score,
	    }
	    input_names = list(named_inputs)
	    example_inputs = tuple(named_inputs.values())
	    if decoder_fastertransformer:
	        output_names = ["decoder_out", "best_index"]
	    else:
	        output_names = ["best_index"]

	    # Static-shape export (dynamic_axes=None). For a dynamic export, axis 0
	    # ("B") would be dynamic on every tensor, axis 1 of encoder_out ("T")
	    # and axis 2 of both hypothesis tensors ("T2").
	    torch.onnx.export(
	        decoder,
	        example_inputs,
	        decoder_onnx_path,
	        export_params=True,
	        opset_version=13,
	        do_constant_folding=True,
	        input_names=input_names,
	        output_names=output_names,
	        dynamic_axes=None,
	        verbose=False,
	        dynamo=False,
	    )
	    fold_static_pulsar2_subgraphs(decoder_onnx_path)
	    simplify_pulsar2_onnx(decoder_onnx_path)
	    rewrite_pulsar2_bool_not(decoder_onnx_path)
	    rewrite_pulsar2_bool_and(decoder_onnx_path)

	    with torch.no_grad():
	        torch_out = decoder(*example_inputs)

	    session = onnxruntime.InferenceSession(
	        decoder_onnx_path, providers=["CPUExecutionProvider"])
	    ort_inputs = dict(zip(input_names, to_numpy(example_inputs)))

	    # If model.reverse_weight == 0, r_hyps_pad_sos_eos is removed from the
	    # onnx decoder since it doesn't play any role, so it must not be fed.
	    if model.reverse_weight == 0:
	        del ort_inputs["r_hyps_pad_sos_eos"]
	    ort_outs = session.run(None, ort_inputs)

	    # check decoder output: with decoder_fastertransformer the module
	    # already returns a tuple, otherwise the single tensor is wrapped
	    reference = torch_out if decoder_fastertransformer else [torch_out]
	    test(to_numpy(reference), ort_outs, rtol=1e-03, atol=1e-05)
	    logger.info("export to onnx decoder succeed!")


	if __name__ == "__main__":
	    # Script entry point: exports the streaming encoder, the offline encoder
	    # and the rescoring decoder into --output_onnx_dir, together with a
	    # config.yaml describing the export.
	    parser = argparse.ArgumentParser(description="export x86_gpu model")
	    parser.add_argument(
	        "--pretrained_model_dir",
	        default=DEFAULT_PRETRAINED_MODEL_DIR,
	        help=("pretrained model directory containing train.yaml, final.pt, "
	              "and global_cmvn"),
	    )
	    parser.add_argument(
	        "--pretrained_model_url",
	        default=DEFAULT_PRETRAINED_MODEL_URL,
	        help="pretrained model tar.gz URL used when pretrained_model_dir is missing",
	    )
	    parser.add_argument(
	        "--reverse_weight",
	        default=-1.0,
	        type=float,
	        required=False,
	        help="reverse weight for bitransformer," +
	        "default value is in config file",
	    )
	    parser.add_argument(
	        "--ctc_weight",
	        default=-1.0,
	        type=float,
	        required=False,
	        help="ctc weight, default value is in config file",
	    )
	    parser.add_argument(
	        "--beam_size",
	        default=10,
	        type=int,
	        required=False,
	        help="beam size would be ctc output size",
	    )
	    parser.add_argument(
	        "--output_onnx_dir",
	        default="onnx_model",
	        help="output onnx encoder and decoder directory",
	    )
	    # arguments for streaming encoder (the streaming encoder is always
	    # exported, so there is no --streaming switch)
	    parser.add_argument(
	        "--decoding_chunk_size",
	        default=16,
	        type=int,
	        required=False,
	        help="the decoding chunk size, <=0 is not supported",
	    )
	    parser.add_argument(
	        "--num_decoding_left_chunks",
	        default=5,
	        type=int,
	        required=False,
	        help="number of left chunks, <= 0 is not supported",
	    )
	    parser.add_argument(
	        "--decoder_fastertransformer",
	        action="store_true",
	        help="return decoder_out and best_index for ft",
	    )
	    parser.add_argument(
	        "--return_ctc_logprobs",
	        action="store_true",
	        help="return full ctc_log_probs for TLG streaming encoder",
	    )
	    args = parser.parse_args()
	    # Validate before any model preparation starts; unlike assert, this
	    # still runs under `python -O`.
	    if args.decoding_chunk_size <= 0:
	        parser.error("--decoding_chunk_size must be > 0")
	    if args.num_decoding_left_chunks <= 0:
	        parser.error("--num_decoding_left_chunks must be > 0")

	    # NOTE(review): args.config and args.cmvn_file are read below but are
	    # not declared as CLI options here; presumably
	    # prepare_pretrained_model() sets them - confirm.
	    prepare_pretrained_model(args)

	    torch.manual_seed(0)
	    torch.set_printoptions(precision=10)

	    # NOTE(review): yaml.FullLoader is applied to a config that may come
	    # from a downloaded archive; consider yaml.safe_load if that source is
	    # not trusted.
	    with open(args.config, "r") as fin:
	        configs = yaml.load(fin, Loader=yaml.FullLoader)
	    if os.path.exists(args.cmvn_file):
	        if 'cmvn' not in configs:
	            configs['cmvn'] = "global_cmvn"
	            configs['cmvn_conf'] = {}
	        else:
	            assert configs['cmvn'] == "global_cmvn"
	            assert configs['cmvn_conf'] is not None
	        configs['cmvn_conf']["cmvn_file"] = args.cmvn_file
	        configs['cmvn_conf'].setdefault(
	            "is_json_cmvn", configs.get("is_json_cmvn", True))
	    elif configs.get('cmvn', None) == 'global_cmvn':
	        raise FileNotFoundError(
	            f"Expected global_cmvn in pretrained model dir: {args.cmvn_file}")
	    # -1 means "keep the value from the config file"
	    if (args.reverse_weight != -1.0
	            and "reverse_weight" in configs["model_conf"]):
	        configs["model_conf"]["reverse_weight"] = args.reverse_weight
	        print("Update reverse weight to", args.reverse_weight)
	    if args.ctc_weight != -1:
	        print("Update ctc weight to ", args.ctc_weight)
	        configs["model_conf"]["ctc_weight"] = args.ctc_weight
	    configs["encoder_conf"]["use_dynamic_chunk"] = False

	    model, configs = init_model(args, configs)
	    model.eval()

	    # makedirs also handles nested output paths and an already existing
	    # directory.
	    os.makedirs(args.output_onnx_dir, exist_ok=True)

	    # Both encoder flavours are exported.
	    online_encoder_path = os.path.join(args.output_onnx_dir,
	                                       "encoder_online.onnx")
	    online_config = export_online_encoder(model, configs, args, logger,
	                                          online_encoder_path)

	    offline_encoder_path = os.path.join(args.output_onnx_dir,
	                                        "encoder_offline.onnx")
	    offline_config = export_offline_encoder(model, configs, args, logger,
	                                            offline_encoder_path)

	    # Keep the settings of both exports. Previously the offline dict simply
	    # replaced the online one, so the streaming settings never reached
	    # config.yaml. Offline values win on shared keys, which keeps every key
	    # that was written before unchanged.
	    onnx_config = {**online_config, **offline_config}

	    decoder_onnx_path = os.path.join(args.output_onnx_dir, "decoder.onnx")
	    export_rescoring_decoder(
	        model,
	        configs,
	        args,
	        logger,
	        decoder_onnx_path,
	        args.decoder_fastertransformer,
	    )

	    config_path = os.path.join(args.output_onnx_dir, "config.yaml")
	    with open(config_path, "w") as out:
	        yaml.dump(onnx_config, out)