# Configuration for VAD (voice activity detection) inference and for
# postprocessing frame-level predictions into speech segments.
name: &name "vad_inference_postprocessing"

# Path of the JSON manifest file with the evaluation data.
# Audio files should have unique names.
dataset: null
num_workers: 4
sample_rate: 16000

# Functionality
# Whether to convert frame-level predictions into speech/non-speech segments
# in start/end time format.
gen_seg_table: true
# Whether to write the segments above to a single manifest JSON file.
write_to_manifest: true

prepare_manifest:
  # Whether to automatically split manifest entries by split_duration to
  # avoid potential CUDA out-of-memory issues.
  auto_split: true
  # Try a smaller number if you still run into CUDA memory issues.
  split_duration: 400

vad:
  # Local .nemo model path, pretrained model name, or none.
  model_path: "vad_multilingual_marblenet"
  # Parameters were tuned on 0~20 dB SNR noisy and clean multilingual ASR data.
  parameters:
    normalize_audio: false
    # Window length in seconds for the VAD context input.
    window_length_in_sec: 0.63
    # Shift length in seconds for generating frame-level VAD predictions.
    # 0.08 is used here for faster inference.
    shift_length_in_sec: 0.08
    # false, or the type of smoothing method (e.g. median, mean).
    smoothing: false
    # Overlap ratio for the overlapped mean/median smoothing filter.
    # Ignored when smoothing is false.
    overlap: 0.875
    postprocessing:
      # Onset threshold for detecting the beginning of speech.
      onset: 0.5
      # Offset threshold for detecting the end of speech.
      offset: 0.3
      # Duration added before each speech segment.
      pad_onset: 0.2
      # Duration added after each speech segment.
      pad_offset: 0.2
      # Threshold for deleting short speech segments.
      min_duration_on: 0.5
      # Threshold for deleting short non-speech gaps.
      min_duration_off: 0.5
      filter_speech_first: true

# If not specified, it is automatically set to "manifest_vad_input.json".
prepared_manifest_vad_input: null
frame_out_dir: "vad_frame"
# If not specified, it is automatically set to
# frame_out_dir + "/overlap_smoothing_output" + "_" + smoothing_method + "_" + str(overlap)
smoothing_out_dir: null
# If not specified, it is automatically set to
# frame_out_dir + "/table_output_tmp_" + the keys and values of the postprocessing params
table_out_dir: null
# If not specified, it is automatically set to "vad_out.json".
out_manifest_filepath: null

# JSON manifest line example:
# {"audio_filepath": "/path/to/audio_file", "offset": 0, "duration": null, "label": "infer", "text": "-"}