{
  "model": {
    "type": "nemotron_speech",
    "vocab_size": 1025,
    "num_mels": 128,
    "fft_size": 512,
    "hop_length": 160,
    "win_length": 400,
    "preemph": 0.97,
    "log_eps": 5.96046448e-08,
    "subsampling_factor": 8,
    "left_context": 70,
    "conv_context": 8,
    "pre_encode_cache_size": 9,
    "sample_rate": 16000,
    "chunk_samples": 8960,
    "blank_id": 1024,
    "max_symbols_per_step": 10,
    "encoder": {
      "filename": "encoder.onnx",
      "hidden_size": 1024,
      "num_hidden_layers": 24,
      "inputs": {
        "audio_features": "audio_signal",
        "input_lengths": "length",
        "cache_last_channel": "cache_last_channel",
        "cache_last_time": "cache_last_time",
        "cache_last_channel_len": "cache_last_channel_len"
      },
      "outputs": {
        "encoder_outputs": "outputs",
        "output_lengths": "encoded_lengths",
        "cache_last_channel_next": "cache_last_channel_next",
        "cache_last_time_next": "cache_last_time_next",
        "cache_last_channel_len_next": "cache_last_channel_len_next"
      }
    },
    "decoder": {
      "filename": "decoder.onnx",
      "hidden_size": 640,
      "num_hidden_layers": 2,
      "inputs": {
        "targets": "targets",
        "lstm_hidden_state": "h_in",
        "lstm_cell_state": "c_in"
      },
      "outputs": {
        "outputs": "decoder_output",
        "lstm_hidden_state": "h_out",
        "lstm_cell_state": "c_out"
      }
    },
    "joiner": {
      "filename": "joint.onnx",
      "inputs": {
        "encoder_outputs": "encoder_output",
        "decoder_outputs": "decoder_output"
      },
      "outputs": {
        "logits": "joint_output"
      }
    }
  }
}