File size: 9,132 Bytes
b386992
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
data_simulator:
  # Manifest file with paths to single speaker audio files.
  # NOTE(review): `???` presumably marks a value that must be supplied by the user - confirm.
  manifest_filepath: ???

  # Sampling rate of the input audio files from the manifest
  sr: 16000
  random_seed: 42
  # Max number that multiprocessing can handle at once
  multiprocessing_chunksize: 10000

  session_config:
    # Number of unique speakers per multispeaker audio session
    num_speakers: 4
    # Number of sessions to simulate
    num_sessions: 60
    # Length of each simulated multispeaker audio session (seconds)
    session_length: 600

  session_params:
    # Maximum audio length (seconds) read when loading an audio file. The bigger
    # the number, the slower the reading speed. Should be greater than 2.5 seconds.
    max_audio_read_sec: 20.0
    # [k, p] values of the negative binomial distribution that is sampled to get
    # the sentence length (in number of words):
    #   k - number of successes until the experiment is stopped; must be positive.
    #       NOTE(review): this was described as a "positive integer", yet the
    #       default 0.4 is fractional - confirm the sampler accepts non-integer k.
    #   p - success probability; must be in the range (0, 1].
    # The average sentence length is k*(1-p)/p (7.6 words with these defaults).
    sentence_length_params: [0.4, 0.05]
    # Variance in speaker dominance. Each speaker's dominance is sampled from a
    # normal distribution centered on 1/`num_speakers`, and the dominance values
    # are then normalized together to 1.
    dominance_var: 0.11
    # Minimum percentage of speaking time per speaker (note that this can cause
    # the dominance of the other speakers to be slightly reduced)
    min_dominance: 0.05
    # Probability of switching speakers after each utterance
    turn_prob: 0.875
    # Minimum turn probability when enforce mode is True, to prevent excessive
    # session length
    min_turn_prob: 0.5
    # Mean proportion of silence to speaking time in the audio session.
    # Should be in range [0, 1).
    mean_silence: 0.15
    # Variance of the mean silence over all audio sessions.
    # Should satisfy 0 <= mean_silence_var < mean_silence * (1 - mean_silence).
    mean_silence_var: 0.01
    # Variance of each silence within a session; set large values to de-correlate
    # silence lengths with the latest speech segment lengths
    per_silence_var: 900
    # Minimum duration of each silence (seconds)
    per_silence_min: 0.0
    # Maximum duration of each silence (seconds); set -1 for no maximum
    per_silence_max: -1
    # Mean proportion of overlap in the overall non-silence duration.
    # Should be in range [0, 1); the recommended range is [0, 0.15].
    mean_overlap: 0.1
    # Variance of the mean overlap over all audio sessions.
    # Should satisfy 0 <= mean_overlap_var < mean_overlap * (1 - mean_overlap).
    mean_overlap_var: 0.01
    # Variance of each overlap within a session; set large values to de-correlate
    # overlap lengths with the latest speech segment lengths
    # (presumably overlap lengths, mirroring `per_silence_var` - confirm)
    per_overlap_var: 900
    # Minimum duration of each overlap (seconds)
    per_overlap_min: 0.0
    # Maximum duration of each overlap (seconds); set -1 for no maximum
    per_overlap_max: -1
    # Window the start of sentences to smooth the audio signal (and remove
    # silence at the start of the clip)
    start_window: true
    # Type of windowing used when segmenting utterances ("hamming", "hann", "cosine")
    window_type: hamming
    # Length of window at the start or the end of a segmented utterance (seconds)
    window_size: 0.05
    # Buffer of silence before the start of the sentence (to avoid cutting off
    # speech or starting abruptly)
    start_buffer: 0.1
    # Split RTTM labels if there is more than twice this amount of silence (to
    # avoid long gaps between utterances being labelled as speech)
    split_buffer: 0.1
    # Buffer before window at end of sentence (to avoid cutting off speech or
    # ending abruptly)
    release_buffer: 0.1
    # Normalize speaker volumes
    normalize: true
    # Normalizing speakers ("equal" - same volume per speaker,
    # "var" - variable volume per speaker)
    normalization_type: equal
    # Variance in speaker volume (volumes are sampled from a distribution
    # centered at 1)
    normalization_var: 0.1
    # Minimum speaker volume (only used when variable normalization is used)
    min_volume: 0.75
    # Maximum speaker volume (only used when variable normalization is used)
    max_volume: 1.25
    # Buffer at the end of the session to leave blank
    end_buffer: 0.5

  outputs:
    # Output directory for audio sessions and corresponding label files
    output_dir: ???
    # Output filename for the wav and rttm files
    output_filename: multispeaker_session
    # If true, delete the output directory if it exists
    overwrite_output: true
    # Number of decimal places in output files
    output_precision: 3

  # If bg noise is used, a noise source position must be passed for RIR mode
  background_noise:
    # Add ambient background noise if true
    add_bg: false
    # Path to background noise manifest file
    background_manifest: null
    # Number of randomly chosen noise source files to be potentially included
    # in one session
    num_noise_files: 10
    # SNR for background noise (using average speaker power); set `snr_min` and
    # `snr_max` values to enable random SNR
    snr: 60
    # Min random SNR for background noise (using average speaker power);
    # set `null` to use fixed SNR
    snr_min: null
    # Max random SNR for background noise (using average speaker power);
    # set `null` to use fixed SNR
    snr_max: null

  # Segment and session augmentations. Available augmentations are in nemo/collections/asr/parts/preprocessing/perturb.py
  # See tutorial at https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/Online_Noise_Augmentation.ipynb
  # Note that ImpulsePerturbation, NoisePerturbation, RirAndNoisePerturbation and other perturbations that use `collections.ASRAudioText`
  # cannot use multiprocessing in simulation, due to non-picklable object errors.
  segment_augmentor:
    # Set true to enable augmentation on each speech segment.
    # Written as canonical lowercase `false`, matching the other booleans in this file.
    add_seg_aug: false
    augmentor:
      # Randomly perturb the gain of each speech segment
      gain:
        # Probability of applying gain augmentation
        prob: 0.5
        # Min dB level to add
        min_gain_dbfs: -10.0
        # Max dB level to add
        max_gain_dbfs: 10.0

  session_augmentor:
    # Set true to enable audio augmentation on the whole session.
    # Written as canonical lowercase `false`, matching the other booleans in this file.
    add_sess_aug: false
    augmentor:
      # Add random white noise to the whole session
      white_noise:
        # Probability of adding white noise
        prob: 1.0
        # Min level of noise loudness (dB)
        min_level: -90
        # Max level of noise loudness (dB)
        max_level: -46

  speaker_enforcement:
    # Enforce that all requested speakers are present in the output wav file
    enforce_num_speakers: true
    # Percentage of the way through the audio session that enforcement mode is
    # triggered (sampled between time 1 and 2)
    enforce_time: [0.25, 0.75]

  # Parameters for regenerating the segment manifest file
  segment_manifest:
    # Window length for segmentation
    window: 0.5
    # Shift length for segmentation
    shift: 0.25
    # Number of the unit segments you want to create per utterance
    step_count: 50
    # Rounding decimals for segment manifest file
    deci: 3

  # Using synthetic RIR augmentation
  rir_generation:
    # Whether to generate synthetic RIR
    use_rir: false
    # Which toolkit to use ("pyroomacoustics", "gpuRIR")
    toolkit: pyroomacoustics
    room_config:
      # Size of the shoebox room environment (1d array for specific, 2d array
      # for random range to be sampled from)
      room_sz:
      - [2, 3]
      - [2, 3]
      - [2, 3]
      # Positions of the speakers in the simulated room environment (2d array
      # for specific, 3d array for random ranges to be sampled from).
      # One outer entry per line (presumably one per speaker - confirm).
      pos_src:
      - [[0.5, 1.5], [0.5, 1.5], [0.5, 1.5]]
      - [[0.5, 1.5], [0.5, 1.5], [0.5, 1.5]]
      - [[0.5, 1.5], [0.5, 1.5], [0.5, 1.5]]
      - [[0.5, 1.5], [0.5, 1.5], [0.5, 1.5]]
      # Position in room for the ambient background noise source
      noise_src_pos: [1.5, 1.5, 2]
    mic_config:
      # Number of output audio channels
      num_channels: 2
      # Microphone positions in the simulated room environment (1d/2d array for
      # specific, 2d/3d array for range assuming num_channels is 1/2+).
      # One outer entry per line (presumably one per channel - confirm).
      pos_rcv:
      - [[0.5, 1.5], [0.5, 1.5], [0.5, 1.5]]
      - [[0.5, 1.5], [0.5, 1.5], [0.5, 1.5]]
      # Microphone orientations (needed for non-omnidirectional microphones)
      orV_rcv: null
      # Microphone type ("omni" - omnidirectional); currently only
      # omnidirectional microphones are supported for pyroomacoustics
      mic_pattern: omni

    # Note: only `T60` is used for pyroomacoustics simulations.
    # NOTE(review): the key is spelled `absorbtion_params`; it is left unchanged
    # because renaming it would change the config interface.
    absorbtion_params:
      # Absorption coefficient ratios for each surface
      abs_weights: [0.9, 0.9, 0.9, 0.9, 0.9, 0.9]
      # Room reverberation time (`T60` is the time it takes for the RIR to
      # decay by 60 dB)
      T60: 0.1
      # Starting attenuation (if this is different than att_max, the diffuse
      # reverberation model is used by gpuRIR)
      att_diff: 15.0
      # End attenuation when using the diffuse reverberation model (gpuRIR)
      att_max: 60.0