| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| """ |
| During inference, we perform frame-level prediction by two approaches: |
| 1) shift the window of length window_length_in_sec (e.g. 0.63s) by shift_length_in_sec (e.g. 10ms) to generate the frame and use the prediction of the window to represent the label for the frame; |
| [this script demonstrates how to do this approach] |
| 2) generate predictions with overlapping input segments. Then a smoothing filter is applied to decide the label for a frame spanned by multiple segments. |
| [get frame-level predictions with this script, then use vad_overlap_posterior.py in NeMo/scripts/voice_activity_detection; |
| that script also shows how to convert frame-level predictions |
| to speech/no-speech segments in start/end-time format.] |
| |
| Image https://raw.githubusercontent.com/NVIDIA/NeMo/main/tutorials/asr/images/vad_post_overlap_diagram.png |
| will help you understand this method. |
| |
| This script will also help you perform postprocessing and generate speech segments if needed |
| |
| Usage: |
| python vad_infer.py --config-path="../conf/vad" --config-name="vad_inference_postprocessing.yaml" dataset=<Path of json file of evaluation data. Audio files should have unique names> |
| |
| """ |
| import json |
| import os |
|
|
| import torch |
|
|
| from nemo.collections.asr.parts.utils.speaker_utils import write_rttm2manifest |
| from nemo.collections.asr.parts.utils.vad_utils import ( |
| generate_overlap_vad_seq, |
| generate_vad_frame_pred, |
| generate_vad_segment_table, |
| init_vad_model, |
| prepare_manifest, |
| ) |
| from nemo.core.config import hydra_runner |
| from nemo.utils import logging |
|
|
# Pick the compute device once at import time: first CUDA GPU when one is
# available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
|
|
|
|
@hydra_runner(config_path="../conf/vad", config_name="vad_inference_postprocessing.yaml")
def main(cfg):
    """Run frame-level VAD inference over a manifest, with optional postprocessing.

    Pipeline stages, each controlled by ``cfg``:
      1. Validate the input manifest and collect a unique base name per audio file.
      2. Optionally split long audio entries to avoid CUDA OOM (``prepare_manifest``).
      3. Generate frame-level speech/non-speech predictions.
      4. Optionally smooth predictions from overlapping segments (``vad.parameters.smoothing``).
      5. Optionally convert frame predictions to start/end-time segment tables (``gen_seg_table``).
      6. Optionally write the segment tables back into an output manifest (``write_to_manifest``).

    Args:
        cfg: Hydra config (see ../conf/vad/vad_inference_postprocessing.yaml).

    Raises:
        ValueError: if no dataset is given, if two manifest entries share an
            audio base name, or if ``write_to_manifest`` is requested without
            ``gen_seg_table`` (the manifest stage consumes the table output).
    """
    if not cfg.dataset:
        raise ValueError("You must input the path of json file of evaluation data")

    # Fail fast: the write_to_manifest stage reads table_out_dir, which is only
    # assigned by the gen_seg_table stage. The original code crashed with a
    # confusing NameError at the very end of the run in this configuration.
    if cfg.write_to_manifest and not cfg.gen_seg_table:
        raise ValueError("write_to_manifest=True requires gen_seg_table=True")

    # Map each audio file's unique base name to its metadata. Base names must
    # be unique because per-file prediction outputs are keyed by that name.
    key_meta_map = {}
    with open(cfg.dataset, 'r') as manifest:
        for line in manifest:
            audio_filepath = json.loads(line.strip())['audio_filepath']
            uniq_audio_name = audio_filepath.split('/')[-1].rsplit('.', 1)[0]
            if uniq_audio_name in key_meta_map:
                raise ValueError("Please make sure each line is with different audio_filepath! ")
            key_meta_map[uniq_audio_name] = {'audio_filepath': audio_filepath}

    # Optionally split long audio files into shorter entries so that a single
    # long utterance does not exhaust GPU memory during inference.
    manifest_vad_input = cfg.dataset
    if cfg.prepare_manifest.auto_split:
        logging.info("Split long audio file to avoid CUDA memory issue")
        logging.debug("Try smaller split_duration if you still have CUDA memory issue")
        config = {
            'input': manifest_vad_input,
            'window_length_in_sec': cfg.vad.parameters.window_length_in_sec,
            'split_duration': cfg.prepare_manifest.split_duration,
            'num_workers': cfg.num_workers,
            'prepared_manifest_vad_input': cfg.prepared_manifest_vad_input,
        }
        manifest_vad_input = prepare_manifest(config)
    else:
        logging.warning(
            "If you encounter CUDA memory issue, try splitting manifest entry by split_duration to avoid it."
        )

    # Inference only: disable autograd globally for everything below.
    torch.set_grad_enabled(False)
    vad_model = init_vad_model(cfg.vad.model_path)

    vad_model.setup_test_data(
        test_data_config={
            'vad_stream': True,
            'sample_rate': 16000,
            'manifest_filepath': manifest_vad_input,
            'labels': ['infer',],
            'num_workers': cfg.num_workers,
            'shuffle': False,
            'window_length_in_sec': cfg.vad.parameters.window_length_in_sec,
            'shift_length_in_sec': cfg.vad.parameters.shift_length_in_sec,
            'trim_silence': False,
            'normalize_audio': cfg.vad.parameters.normalize_audio,
        }
    )

    vad_model = vad_model.to(device)
    vad_model.eval()

    if not os.path.exists(cfg.frame_out_dir):
        os.mkdir(cfg.frame_out_dir)
    else:
        logging.warning(
            "Note frame_out_dir exists. If new file has same name as file inside existing folder, it will append result to existing file and might cause mistakes for next steps."
        )

    logging.info("Generating frame level prediction ")
    pred_dir = generate_vad_frame_pred(
        vad_model=vad_model,
        window_length_in_sec=cfg.vad.parameters.window_length_in_sec,
        shift_length_in_sec=cfg.vad.parameters.shift_length_in_sec,
        manifest_vad_input=manifest_vad_input,
        out_dir=cfg.frame_out_dir,
    )
    logging.info(
        f"Finish generating VAD frame level prediction with window_length_in_sec={cfg.vad.parameters.window_length_in_sec} and shift_length_in_sec={cfg.vad.parameters.shift_length_in_sec}"
    )
    # Without smoothing there is one prediction per shift, so the frame length
    # equals the shift length.
    frame_length_in_sec = cfg.vad.parameters.shift_length_in_sec

    if cfg.vad.parameters.smoothing:
        # Smooth the per-window predictions from overlapping input segments
        # into a denser frame-level sequence.
        logging.info("Generating predictions with overlapping input segments")
        smoothing_pred_dir = generate_overlap_vad_seq(
            frame_pred_dir=pred_dir,
            smoothing_method=cfg.vad.parameters.smoothing,
            overlap=cfg.vad.parameters.overlap,
            window_length_in_sec=cfg.vad.parameters.window_length_in_sec,
            shift_length_in_sec=cfg.vad.parameters.shift_length_in_sec,
            num_workers=cfg.num_workers,
            out_dir=cfg.smoothing_out_dir,
        )
        logging.info(
            f"Finish generating predictions with overlapping input segments with smoothing_method={cfg.vad.parameters.smoothing} and overlap={cfg.vad.parameters.overlap}"
        )
        pred_dir = smoothing_pred_dir
        # Smoothed output is emitted at a fixed 10 ms frame resolution.
        frame_length_in_sec = 0.01

    if cfg.gen_seg_table:
        logging.info("Converting frame level prediction to speech/no-speech segment in start and end times format.")
        table_out_dir = generate_vad_segment_table(
            vad_pred_dir=pred_dir,
            postprocessing_params=cfg.vad.parameters.postprocessing,
            frame_length_in_sec=frame_length_in_sec,
            num_workers=cfg.num_workers,
            out_dir=cfg.table_out_dir,
        )
        logging.info(
            f"Finish generating speech segments table with postprocessing_params: {cfg.vad.parameters.postprocessing}"
        )

    if cfg.write_to_manifest:
        for i in key_meta_map:
            # generate_vad_segment_table writes one <base_name>.txt per audio file.
            key_meta_map[i]['rttm_filepath'] = os.path.join(table_out_dir, i + ".txt")

        # Fall back to a default output path when none is configured.
        out_manifest_filepath = cfg.out_manifest_filepath or "vad_out.json"
        out_manifest_filepath = write_rttm2manifest(key_meta_map, out_manifest_filepath)
        logging.info(f"Writing VAD output to manifest: {out_manifest_filepath}")
|
|
|
|
if __name__ == '__main__':
    # The hydra_runner decorator supplies cfg, so main() takes no explicit args here.
    main()
|
|