| data_simulator: | |
| manifest_filepath: ??? # Manifest file with paths to single speaker audio files | |
| sr: 16000 # Sampling rate of the input audio files from the manifest | |
| random_seed: 42 | |
| multiprocessing_chunksize: 10000 # Max number that multiprocessing can handle at once | |
| session_config: | |
| num_speakers: 4 # Number of unique speakers per multispeaker audio session | |
| num_sessions: 60 # Number of sessions to simulate | |
| session_length: 600 # Length of each simulated multispeaker audio session (seconds) | |
| session_params: | |
| max_audio_read_sec: 20.0 # The maximum audio length in seconds when loading an audio file. The bigger the number, the slower the reading speed. Should be greater than 2.5 seconds. | |
| sentence_length_params: # k,p values for a negative_binomial distribution which is sampled to get the sentence length (in number of words) | |
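| - 0.4 # k (number of successes until the experiment is stopped); must be positive. NOTE: the default 0.4 is not an integer, so non-integer values are presumably accepted (confirm against the sampler). | |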
| - 0.05 # p (Success probability) must be in the range (0, 1]. The average sentence length will be k*(1-p)/p | |
| dominance_var: 0.11 # Variance in speaker dominance (where each speaker's dominance is sampled from a normal distribution centered on 1/`num_speakers`, and then the dominance values are together normalized to 1) | |
| min_dominance: 0.05 # Minimum percentage of speaking time per speaker (note that this can cause the dominance of the other speakers to be slightly reduced) | |
| turn_prob: 0.875 # Probability of switching speakers after each utterance | |
| min_turn_prob: 0.5 # Minimum turn probability when enforcement mode is active, to prevent excessively long sessions | |
| mean_silence: 0.15 # Mean proportion of silence to speaking time in the audio session. Should be in range [0, 1). | |
| mean_silence_var: 0.01 # Variance of the mean silence across all audio sessions. This value should satisfy 0 <= mean_silence_var < mean_silence * (1 - mean_silence) | |
| per_silence_var: 900 # var for per silence in each session, set large values to de-correlate silence lengths with the latest speech segment lengths | |
| per_silence_min: 0.0 # minimum per silence duration in seconds | |
| per_silence_max: -1 # maximum per silence duration in seconds, set -1 for no maximum | |
| mean_overlap: 0.1 # Mean proportion of overlap in the overall non-silence duration. Should be in the range [0, 1); the recommended range is [0, 0.15]. | |
| mean_overlap_var: 0.01 # Variance of the mean overlap across all audio sessions. This value should satisfy 0 <= mean_overlap_var < mean_overlap * (1 - mean_overlap) | |
| per_overlap_var: 900 # var for per overlap in each session, set large values to de-correlate overlap lengths with the latest speech segment lengths | |
| per_overlap_min: 0.0 # minimum per overlap duration in seconds | |
| per_overlap_max: -1 # maximum per overlap duration in seconds, set -1 for no maximum | |
| start_window: true # Window the start of sentences to smooth the audio signal (and remove silence at the start of the clip) | |
| window_type: hamming # Type of windowing used when segmenting utterances ("hamming", "hann", "cosine") | |
| window_size: 0.05 # Length of window at the start or the end of segmented utterance (seconds) | |
| start_buffer: 0.1 # Buffer of silence before the start of the sentence (to avoid cutting off speech or starting abruptly) | |
| split_buffer: 0.1 # Split RTTM labels if greater than twice this amount of silence (to avoid long gaps between utterances being labelled as speech) | |
| release_buffer: 0.1 # Buffer before window at end of sentence (to avoid cutting off speech or ending abruptly) | |
| normalize: true # Normalize speaker volumes | |
| normalization_type: equal # Normalizing speakers ("equal" - same volume per speaker, "var" - variable volume per speaker) | |
| normalization_var: 0.1 # Variance in speaker volume (sampled from a normal distribution centered at 1) | |
| min_volume: 0.75 # Minimum speaker volume (only used when variable normalization is used) | |
| max_volume: 1.25 # Maximum speaker volume (only used when variable normalization is used) | |
| end_buffer: 0.5 # Buffer at the end of the session to leave blank | |
| outputs: | |
| output_dir: ??? # Output directory for audio sessions and corresponding label files | |
| output_filename: multispeaker_session # Output filename for the wav and rttm files | |
| overwrite_output: true # If true, delete the output directory if it exists | |
| output_precision: 3 # Number of decimal places in output files | |
| background_noise: # If bg noise is used, a noise source position must be passed for RIR mode | |
| add_bg: false # Add ambient background noise if true | |
| background_manifest: null # Path to background noise manifest file | |
| num_noise_files: 10 # Number of randomly chosen noise source files to be potentially included in one session | |
| snr: 60 # SNR for background noise (using average speaker power), set `snr_min` and `snr_max` values to enable random SNR | |
| snr_min: null # Min random SNR for background noise (using average speaker power), set `null` to use fixed SNR | |
| snr_max: null # Max random SNR for background noise (using average speaker power), set `null` to use fixed SNR | |
| # Segment and session augmentations. Available augmentations are in nemo/collections/asr/parts/preprocessing/perturb.py | |
| # See tutorial at https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/Online_Noise_Augmentation.ipynb | |
| # Note that ImpulsePerturbation, NoisePerturbation, RirAndNoisePerturbation and other perturbations that use `collections.ASRAudioText` | |
| # cannot use multiprocessing in simulation, due to pickling errors (these objects are not picklable). | |
| segment_augmentor: | |
| add_seg_aug: False # Set True to enable augmentation on each speech segment | |
| augmentor: | |
| gain: # Randomly perturb the gain of each speech segment | |
| prob: 0.5 # Probability of applying gain augmentation | |
| min_gain_dbfs: -10.0 # Min dB level to add | |
| max_gain_dbfs: 10.0 # Max dB level to add | |
| session_augmentor: | |
| add_sess_aug: False # Set True to enable audio augmentation on the whole session | |
| augmentor: | |
| white_noise: # Add random white noise to the whole session | |
| prob: 1.0 # Probability of adding white noise | |
| min_level: -90 # Min level of noise loudness (dB) | |
| max_level: -46 # Max level of noise loudness (dB) | |
| speaker_enforcement: | |
| enforce_num_speakers: true # Enforce that all requested speakers are present in the output wav file | |
| enforce_time: # Percentage of the way through the audio session that enforcement mode is triggered (sampled between time 1 and 2) | |
| - 0.25 | |
| - 0.75 | |
| segment_manifest: # Parameters for regenerating the segment manifest file | |
| window: 0.5 # Window length for segmentation | |
| shift: 0.25 # Shift length for segmentation | |
| step_count: 50 # Number of the unit segments you want to create per utterance | |
| deci: 3 # Rounding decimals for segment manifest file | |
| rir_generation: # Using synthetic RIR augmentation | |
| use_rir: false # Whether to generate synthetic RIR | |
| toolkit: 'pyroomacoustics' # Which toolkit to use ("pyroomacoustics", "gpuRIR") | |
| room_config: | |
| room_sz: # Size of the shoebox room environment (1d array for specific, 2d array for random range to be sampled from) | |
| - - 2 | |
| - 3 | |
| - - 2 | |
| - 3 | |
| - - 2 | |
| - 3 | |
| pos_src: # Positions of the speakers in the simulated room environment (2d array for specific, 3d array for random ranges to be sampled from) | |
| - - - 0.5 | |
| - 1.5 | |
| - - 0.5 | |
| - 1.5 | |
| - - 0.5 | |
| - 1.5 | |
| - - - 0.5 | |
| - 1.5 | |
| - - 0.5 | |
| - 1.5 | |
| - - 0.5 | |
| - 1.5 | |
| - - - 0.5 | |
| - 1.5 | |
| - - 0.5 | |
| - 1.5 | |
| - - 0.5 | |
| - 1.5 | |
| - - - 0.5 | |
| - 1.5 | |
| - - 0.5 | |
| - 1.5 | |
| - - 0.5 | |
| - 1.5 | |
| noise_src_pos: # Position in room for the ambient background noise source | |
| - 1.5 | |
| - 1.5 | |
| - 2 | |
| mic_config: | |
| num_channels: 2 # Number of output audio channels | |
| pos_rcv: # Microphone positions in the simulated room environment (1d/2d array for specific, 2d/3d array for range assuming num_channels is 1/2+) | |
| - - - 0.5 | |
| - 1.5 | |
| - - 0.5 | |
| - 1.5 | |
| - - 0.5 | |
| - 1.5 | |
| - - - 0.5 | |
| - 1.5 | |
| - - 0.5 | |
| - 1.5 | |
| - - 0.5 | |
| - 1.5 | |
| orV_rcv: null # Microphone orientations (needed for non-omnidirectional microphones) | |
| mic_pattern: omni # Microphone type ("omni" - omnidirectional) - currently only omnidirectional microphones are supported for pyroomacoustics | |
| absorbtion_params: # Note: only `T60` is used for pyroomacoustics simulations | |
| abs_weights: # Absorption coefficient ratios for each surface | |
| - 0.9 | |
| - 0.9 | |
| - 0.9 | |
| - 0.9 | |
| - 0.9 | |
| - 0.9 | |
| T60: 0.1 # Room reverberation time (`T60` is the time it takes for the RIR to decay by 60 dB) | |
| att_diff: 15.0 # Starting attenuation (if this is different than att_max, the diffuse reverberation model is used by gpuRIR) | |
| att_max: 60.0 # End attenuation when using the diffuse reverberation model (gpuRIR) | |