# Configuration for simulating multispeaker audio sessions from single-speaker recordings.
# `???` marks values that are intentionally left unset here and must be supplied by the user
# (presumably OmegaConf's mandatory-value marker; confirm against the loader).
data_simulator:
  # Manifest file with paths to single speaker audio files
  manifest_filepath: ???
  # Sampling rate of the input audio files from the manifest
  sr: 16000
  random_seed: 42
  # Max number that multiprocessing can handle at once
  multiprocessing_chunksize: 10000

  session_config:
    # Number of unique speakers per multispeaker audio session
    num_speakers: 4
    # Number of sessions to simulate
    num_sessions: 60
    # Length of each simulated multispeaker audio session (seconds)
    session_length: 600

  session_params:
    # The maximum audio length in seconds when loading an audio file.
    # The bigger the number, the slower the reading speed. Should be greater than 2.5 seconds.
    max_audio_read_sec: 20.0
    # k, p values for a negative binomial distribution which is sampled to get the sentence length
    # (in number of words). The average sentence length is k*(1-p)/p (7.6 words with the values below).
    sentence_length_params:
      # k (number of successes until the experiment is stopped). Must be positive.
      # NOTE(review): the previous comment required a positive integer, but the default is 0.4;
      # confirm that the sampler accepts non-integer k.
      - 0.4
      # p (success probability). Must be in the range (0, 1].
      - 0.05
    # Variance in speaker dominance (each speaker's dominance is sampled from a normal distribution
    # centered on 1/`num_speakers`, and then the dominance values are together normalized to 1)
    dominance_var: 0.11
    # Minimum percentage of speaking time per speaker
    # (note that this can cause the dominance of the other speakers to be slightly reduced)
    min_dominance: 0.05
    # Probability of switching speakers after each utterance
    turn_prob: 0.875
    # Minimum turn probability when enforce mode is on, to prevent excessive session length
    min_turn_prob: 0.5
    # Mean proportion of silence to speaking time in the audio session. Should be in range [0, 1).
    mean_silence: 0.15
    # Variance of the mean silence across all audio sessions.
    # This value should be 0 <= mean_silence_var < mean_silence * (1 - mean_silence)
    mean_silence_var: 0.01
    # Variance of each silence within a session; set large values to de-correlate silence lengths
    # from the latest speech segment lengths
    per_silence_var: 900
    # Minimum per-silence duration in seconds
    per_silence_min: 0.0
    # Maximum per-silence duration in seconds, set -1 for no maximum
    per_silence_max: -1
    # Mean proportion of overlap in the overall non-silence duration.
    # Should be in range [0, 1); the recommended range is [0, 0.15].
    mean_overlap: 0.1
    # Variance of the mean overlap across all audio sessions.
    # This value should be 0 <= mean_overlap_var < mean_overlap * (1 - mean_overlap)
    mean_overlap_var: 0.01
    # Variance of each overlap within a session; set large values to de-correlate overlap lengths
    # from the latest speech segment lengths
    per_overlap_var: 900
    # Minimum per-overlap duration in seconds
    per_overlap_min: 0.0
    # Maximum per-overlap duration in seconds, set -1 for no maximum
    per_overlap_max: -1
    # Window the start of sentences to smooth the audio signal (and remove silence at the start of the clip)
    start_window: true
    # Type of windowing used when segmenting utterances ("hamming", "hann", "cosine")
    window_type: hamming
    # Length of window at the start or the end of segmented utterance (seconds)
    window_size: 0.05
    # Buffer of silence before the start of the sentence (to avoid cutting off speech or starting abruptly)
    start_buffer: 0.1
    # Split RTTM labels if greater than twice this amount of silence
    # (to avoid long gaps between utterances as being labelled as speech)
    split_buffer: 0.1
    # Buffer before window at end of sentence (to avoid cutting off speech or ending abruptly)
    release_buffer: 0.1
    # Normalize speaker volumes
    normalize: true
    # Normalizing speakers ("equal" - same volume per speaker, "var" - variable volume per speaker)
    normalization_type: equal
    # Variance in speaker volume (volumes are sampled from a distribution centered at 1)
    normalization_var: 0.1
    # Minimum speaker volume (only used when variable normalization is used)
    min_volume: 0.75
    # Maximum speaker volume (only used when variable normalization is used)
    max_volume: 1.25
    # Buffer at the end of the session to leave blank
    end_buffer: 0.5

  outputs:
    # Output directory for audio sessions and corresponding label files
    output_dir: ???
    # Output filename for the wav and rttm files
    output_filename: multispeaker_session
    # If true, delete the output directory if it exists
    overwrite_output: true
    # Number of decimal places in output files
    output_precision: 3

  # If bg noise is used, a noise source position must be passed for RIR mode
  background_noise:
    # Add ambient background noise if true
    add_bg: false
    # Path to background noise manifest file
    background_manifest: null
    # Number of randomly chosen noise source files to be potentially included in one session
    num_noise_files: 10
    # SNR for background noise (using average speaker power),
    # set `snr_min` and `snr_max` values to enable random SNR
    snr: 60
    # Min random SNR for background noise (using average speaker power), set `null` to use fixed SNR
    snr_min: null
    # Max random SNR for background noise (using average speaker power), set `null` to use fixed SNR
    snr_max: null

  # Segment and session augmentations.
  # Available augmentations are in nemo/collections/asr/parts/preprocessing/perturb.py
  # See tutorial at https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/Online_Noise_Augmentation.ipynb
  # Note that ImpulsePerturbation, NoisePerturbation, RirAndNoisePerturbation and other perturbations
  # that use `collections.ASRAudioText` cannot use multiprocessing in simulation, because they are
  # not picklable.
  segment_augmentor:
    # Set true to enable augmentation on each speech segment
    add_seg_aug: false
    augmentor:
      # Randomly perturb the gain of each speech segment
      gain:
        # Probability of applying gain augmentation
        prob: 0.5
        # Min dB level to add
        min_gain_dbfs: -10.0
        # Max dB level to add
        max_gain_dbfs: 10.0

  session_augmentor:
    # Set true to enable audio augmentation on the whole session
    add_sess_aug: false
    augmentor:
      # Add random white noise to the whole session
      white_noise:
        # Probability of adding white noise
        prob: 1.0
        # Min level of noise loudness (dB)
        min_level: -90
        # Max level of noise loudness (dB)
        max_level: -46

  speaker_enforcement:
    # Enforce that all requested speakers are present in the output wav file
    enforce_num_speakers: true
    # Fraction of the way through the audio session at which enforcement mode is triggered
    # (sampled between the first and the second value)
    enforce_time: [0.25, 0.75]

  # Parameters for regenerating the segment manifest file
  segment_manifest:
    # Window length for segmentation
    window: 0.5
    # Shift length for segmentation
    shift: 0.25
    # Number of the unit segments you want to create per utterance
    step_count: 50
    # Rounding decimals for segment manifest file
    deci: 3

  # Using synthetic RIR augmentation
  rir_generation:
    # Whether to generate synthetic RIR
    use_rir: false
    # Which toolkit to use ("pyroomacoustics", "gpuRIR")
    toolkit: 'pyroomacoustics'
    room_config:
      # Size of the shoebox room environment
      # (1d array for specific, 2d array for random range to be sampled from)
      room_sz:
        - [2, 3]
        - [2, 3]
        - [2, 3]
      # Positions of the speakers in the simulated room environment
      # (2d array for specific, 3d array for random ranges to be sampled from).
      # One entry per speaker; each entry holds one [min, max] range per room dimension.
      pos_src:
        - [[0.5, 1.5], [0.5, 1.5], [0.5, 1.5]]
        - [[0.5, 1.5], [0.5, 1.5], [0.5, 1.5]]
        - [[0.5, 1.5], [0.5, 1.5], [0.5, 1.5]]
        - [[0.5, 1.5], [0.5, 1.5], [0.5, 1.5]]
      # Position in room for the ambient background noise source
      noise_src_pos: [1.5, 1.5, 2]
    mic_config:
      # Number of output audio channels
      num_channels: 2
      # Microphone positions in the simulated room environment
      # (1d/2d array for specific, 2d/3d array for range assuming num_channels is 1/2+)
      pos_rcv:
        - [[0.5, 1.5], [0.5, 1.5], [0.5, 1.5]]
        - [[0.5, 1.5], [0.5, 1.5], [0.5, 1.5]]
      # Microphone orientations (needed for non-omnidirectional microphones)
      orV_rcv: null
      # Microphone type ("omni" - omnidirectional);
      # currently only omnidirectional microphones are supported for pyroomacoustics
      mic_pattern: omni
    # Note: only `T60` is used for pyroomacoustics simulations.
    # NOTE(review): the key is spelled `absorbtion_params` (sic); it is kept unchanged because
    # consumers presumably look it up by this exact name.
    absorbtion_params:
      # Absorption coefficient ratios for each surface
      abs_weights: [0.9, 0.9, 0.9, 0.9, 0.9, 0.9]
      # Room reverberation time (`T60` is the time it takes for the RIR to decay by 60 dB)
      T60: 0.1
      # Starting attenuation
      # (if this is different than att_max, the diffuse reverberation model is used by gpuRIR)
      att_diff: 15.0
      # End attenuation when using the diffuse reverberation model (gpuRIR)
      att_max: 60.0