# NeMo_Canary / tools / speech_data_simulator / conf / data_simulator.yaml
#
# Upload folder using huggingface_hub
#
# b386992 verified 6 months ago
#
# 9.13 kB

	data_simulator:
	manifest_filepath: ??? # Manifest file with paths to single speaker audio files

	sr: 16000 # Sampling rate of the input audio files from the manifest
	random_seed: 42
	multiprocessing_chunksize: 10000 # Max number that multiprocessing can handle at once

	session_config:
	num_speakers: 4 # Number of unique speakers per multispeaker audio session
	num_sessions: 60 # Number of sessions to simulate
	session_length: 600 # Length of each simulated multispeaker audio session (seconds)

	session_params:
	max_audio_read_sec: 20.0 # The maximum audio length in second when loading an audio file. The bigger the number, the slower the reading speed. Should be greater than 2.5 second.
	sentence_length_params: # k,p values for a negative_binomial distribution which is sampled to get the sentence length (in number of words)
	- 0.4 # k (Number of successes until the experiment is stopped) value must be a positive integer.
	- 0.05 # p (Success probability) must be in the range (0, 1]. The average sentence length will be k*(1-p)/p
	dominance_var: 0.11 # Variance in speaker dominance (where each speaker's dominance is sampled from a normal distribution centered on 1/`num_speakers`, and then the dominance values are together normalized to 1)
	min_dominance: 0.05 # Minimum percentage of speaking time per speaker (note that this can cause the dominance of the other speakers to be slightly reduced)
	turn_prob: 0.875 # Probability of switching speakers after each utterance
	min_turn_prob: 0.5 # Minimum turn probability when enforce mode is True to prevent from making excessive session length
	mean_silence: 0.15 # Mean proportion of silence to speaking time in the audio session. Should be in range [0, 1).
	mean_silence_var: 0.01 # var for mean silence in all audio sessions. This value should be 0 <= mean_silence_var < mean_silence * (1 - mean_silence)
	per_silence_var: 900 # var for per silence in each session, set large values to de-correlate silence lengths with the latest speech segment lengths
	per_silence_min: 0.0 # minimum per silence duration in seconds
	per_silence_max: -1 # maximum per silence duration in seconds, set -1 for no maximum
	mean_overlap: 0.1 # Mean proportion of overlap in the overall non-silence duration. Should be in range [0, 1) and recommend [0, 0.15] range.
	mean_overlap_var: 0.01 # var for mean overlap in all audio sessions. This value should be 0 <= mean_overlap_var < mean_overlap * (1 - mean_overlap)
	per_overlap_var: 900 # var for per overlap in each session, set large values to de-correlate silence lengths with the latest speech segment lengths
	per_overlap_min: 0.0 # minimum per overlap duration in seconds
	per_overlap_max: -1 # maximum per overlap duration in seconds, set -1 for no maximum
	start_window: true # Window the start of sentences to smooth the audio signal (and remove silence at the start of the clip)
	window_type: hamming # Type of windowing used when segmenting utterances ("hamming", "hann", "cosine")
	window_size: 0.05 # Length of window at the start or the end of segmented utterance (seconds)
	start_buffer: 0.1 # Buffer of silence before the start of the sentence (to avoid cutting off speech or starting abruptly)
	split_buffer: 0.1 # Split RTTM labels if greater than twice this amount of silence (to avoid long gaps between utterances as being labelled as speech)
	release_buffer: 0.1 # Buffer before window at end of sentence (to avoid cutting off speech or ending abruptly)
	normalize: true # Normalize speaker volumes
	normalization_type: equal # Normalizing speakers ("equal" - same volume per speaker, "var" - variable volume per speaker)
	normalization_var: 0.1 # Variance in speaker volume (sample from standard deviation centered at 1)
	min_volume: 0.75 # Minimum speaker volume (only used when variable normalization is used)
	max_volume: 1.25 # Maximum speaker volume (only used when variable normalization is used)
	end_buffer: 0.5 # Buffer at the end of the session to leave blank

	outputs:
	output_dir: ??? # Output directory for audio sessions and corresponding label files
	output_filename: multispeaker_session # Output filename for the wav and rttm files
	overwrite_output: true # If true, delete the output directory if it exists
	output_precision: 3 # Number of decimal places in output files

	background_noise: # If bg noise is used, a noise source position must be passed for RIR mode
	add_bg: false # Add ambient background noise if true
	background_manifest: null # Path to background noise manifest file
	num_noise_files: 10 # Number of randomly chosen noise source files to be potentially included in one session
	snr: 60 # SNR for background noise (using average speaker power), set `snr_min` and `snr_max` values to enable random SNR
	snr_min: null # Min random SNR for background noise (using average speaker power), set `null` to use fixed SNR
	snr_max: null # Max random SNR for background noise (using average speaker power), set `null` to use fixed SNR

	# Segment and session augmentations. Available augmentations are in nemo/collections/asr/parts/preprocessing/perturb.py
	# See tutorial at https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/Online_Noise_Augmentation.ipynb
	# Note that ImpulsePerturbation, NoisePerturbation, RirAndNoisePerturbation and other perturbations that uses `collections.ASRAudioText`
	# cannot use multi-proccessing in simulation, due to non-pickable errors.
	segment_augmentor:
	add_seg_aug: False # Set True to enable augmentation on each speech segment
	augmentor:
	gain: # Randomly perturb the gain of each speech segment
	prob: 0.5 # Probability of applying gain augmentation
	min_gain_dbfs: -10.0 # Min dB level to add
	max_gain_dbfs: 10.0 # Max dB level to add

	session_augmentor:
	add_sess_aug: False # Set True to enable audio augmentation on the whole session
	augmentor:
	white_noise: # Add random white noise to the whole session
	prob: 1.0 # Probability of adding white noise
	min_level: -90 # Min level of noise loudness (dB)
	max_level: -46 # Max level of noise loudness (dB)

	speaker_enforcement:
	enforce_num_speakers: true # Enforce that all requested speakers are present in the output wav file
	enforce_time: # Percentage of the way through the audio session that enforcement mode is triggered (sampled between time 1 and 2)
	- 0.25
	- 0.75

	segment_manifest: # Parameters for regenerating the segment manifest file
	window: 0.5 # Window length for segmentation
	shift: 0.25 # Shift length for segmentation
	step_count: 50 # Number of the unit segments you want to create per utterance
	deci: 3 # Rounding decimals for segment manifest file

	rir_generation: # Using synthetic RIR augmentation
	use_rir: false # Whether to generate synthetic RIR
	toolkit: 'pyroomacoustics' # Which toolkit to use ("pyroomacoustics", "gpuRIR")
	room_config:
	room_sz: # Size of the shoebox room environment (1d array for specific, 2d array for random range to be sampled from)
	- - 2
	- 3
	- - 2
	- 3
	- - 2
	- 3
	pos_src: # Positions of the speakers in the simulated room environment (2d array for specific, 3d array for random ranges to be sampled from)
	- - - 0.5
	- 1.5
	- - 0.5
	- 1.5
	- - 0.5
	- 1.5
	- - - 0.5
	- 1.5
	- - 0.5
	- 1.5
	- - 0.5
	- 1.5
	- - - 0.5
	- 1.5
	- - 0.5
	- 1.5
	- - 0.5
	- 1.5
	- - - 0.5
	- 1.5
	- - 0.5
	- 1.5
	- - 0.5
	- 1.5
	noise_src_pos: # Position in room for the ambient background noise source
	- 1.5
	- 1.5
	- 2
	mic_config:
	num_channels: 2 # Number of output audio channels
	pos_rcv: # Microphone positions in the simulated room environment (1d/2d array for specific, 2d/3d array for range assuming num_channels is 1/2+)
	- - - 0.5
	- 1.5
	- - 0.5
	- 1.5
	- - 0.5
	- 1.5
	- - - 0.5
	- 1.5
	- - 0.5
	- 1.5
	- - 0.5
	- 1.5
	orV_rcv: null # Microphone orientations (needed for non-omnidirectional microphones)
	mic_pattern: omni # Microphone type ("omni" - omnidirectional) - currently only omnidirectional microphones are supported for pyroomacoustics

	absorbtion_params: # Note: only `T60` is used for pyroomacoustics simulations
	abs_weights: # Absorption coefficient ratios for each surface
	- 0.9
	- 0.9
	- 0.9
	- 0.9
	- 0.9
	- 0.9
	T60: 0.1 # Room reverberation time (`T60` is the time it takes for the RIR to decay by 60DB)
	att_diff: 15.0 # Starting attenuation (if this is different than att_max, the diffuse reverberation model is used by gpuRIR)
	att_max: 60.0 # End attenuation when using the diffuse reverberation model (gpuRIR)