{
  "schema": "visualears-preprocessor-v2-nemo-fbank",
  "description": "NeMo AudioToMelSpectrogramPreprocessor-compatible browser feature pipeline for shenava_koochik_1_0. normalize=NA, preemphasis=0.97, torch.stft center=True, Slaney mel filters, requires processed_signal_length.",
  "sample_rate": 16000,
  "n_fft": 512,
  "win_length": 400,
  "hop_length": 160,
  "n_mels": 80,
  "window": "hann_periodic_false",
  "window_formula": "0.5 - 0.5*cos(2*pi*i/(win_length-1))",
  "center": true,
  "center_pad": 256,
  "pad_mode": "reflect",
  "preemphasis": 0.97,
  "mel_scale": "slaney/librosa.filters.mel(htk=False,norm='slaney')",
  "mel_filters_file": "mel_filters_slaney_80x257.json",
  "spectrum": "magnitude_power_2_no_fft_normalization",
  "log": "natural",
  "log_zero_guard_type": "add",
  "log_zero_guard_value": 5.960464477539063e-08,
  "normalize": "NA",
  "fixed_frames": 2005,
  "pad_value": 0.0,
  "frame_count_formula": "max(1, min(fixed_frames, floor(num_samples / hop_length) + 1))",
  "inputs": [
    {
      "name": "processed_signal",
      "dtype": "float16",
      "shape": [
        "batch",
        80,
        2005
      ]
    },
    {
      "name": "processed_signal_length",
      "dtype": "int64",
      "shape": [
        "batch"
      ]
    }
  ],
  "outputs": [
    {
      "name": "logits",
      "dtype": "float16",
      "shape": [
        "batch",
        252,
        1025
      ]
    },
    {
      "name": "encoded_lengths",
      "dtype": "int64",
      "shape": [
        "batch"
      ]
    }
  ],
  "output_stride": 8,
  "usable_steps_formula": "min(encoded_lengths[0], logits_steps)",
  "ms_per_output_step": 80,
  "blank_id": 1024,
  "ctc_decode": "greedy argmax; drop blank, repeats, and SentencePiece special tokens; join pieces; '▁' -> space"
}