{ "schema": "visualears-preprocessor-v2-nemo-fbank", "description": "NeMo AudioToMelSpectrogramPreprocessor-compatible feature pipeline for shenava-32m-v5.", "sample_rate": 16000, "n_fft": 512, "win_length": 400, "hop_length": 160, "n_mels": 80, "window": "hann_periodic_false", "center": true, "center_pad": 256, "pad_mode": "reflect", "preemphasis": 0.97, "mel_scale": "slaney/librosa.filters.mel(htk=False,norm='slaney')", "mel_filters_file": "mel_filters_slaney_80x257.json", "spectrum": "magnitude_power_2_no_fft_normalization", "log": "natural", "log_zero_guard_type": "add", "log_zero_guard_value": 5.960464477539063e-08, "normalize": "NA", "fixed_frames": 2005, "pad_value": 0.0, "frame_count_formula": "max(1, min(fixed_frames, floor(num_samples / hop_length) + 1))", "output_stride": 8, "usable_steps_formula": "min(encoded_lengths[0], logits_steps)", "ms_per_output_step": 80, "blank_id": 1024, "ctc_decode": "greedy argmax; collapse consecutive repeated ids, then drop blank and SentencePiece specials; join; '▁' -> space" }