# File size: 7,537 Bytes
# 7934b29
# Configuration for simulating multispeaker audio sessions from single-speaker audio files.
data_simulator:
  # NOTE(review): `???` looks like a required-value placeholder (e.g. OmegaConf's mandatory-missing marker) — confirm with the config loader.
  manifest_filepath: ??? # Manifest file with paths to single-speaker audio files

  sr: 16000 # Sampling rate of the input audio files from the manifest
  random_seed: 42 # Seed for random sampling. NOTE(review): presumably makes simulated sessions reproducible — confirm with the consumer

  # Overall shape of the simulated dataset: speakers per session, number of sessions, and session duration.
  session_config:
    num_speakers: 4 # Number of unique speakers per multispeaker audio session
    num_sessions: 60 # Number of sessions to simulate
    session_length: 600 # Length of each simulated multispeaker audio session (seconds)

  # Per-session sampling parameters: sentence length, speaker dominance and turn-taking,
  # silence/overlap statistics, windowing and buffers, and volume normalization.
  session_params:
    sentence_length_params: # k, p values for a negative binomial distribution which is sampled to get the sentence length (in number of words)
    - 0.4 # k (number of successes until the experiment is stopped); must be positive. NOTE(review): this was previously documented as a positive integer, but the default 0.4 is fractional — confirm the sampler accepts real-valued k (NumPy's negative_binomial does)
    - 0.05 # p (success probability); must be in the range (0, 1]. The average sentence length will be k*(1-p)/p (7.6 words with these defaults)
    dominance_var: 0.11 # Variance in speaker dominance (each speaker's dominance is sampled from a normal distribution centered on 1/`num_speakers`, and then the dominance values are together normalized to 1)
    min_dominance: 0.05 # Minimum fraction of speaking time per speaker, e.g. 0.05 = 5% (note that this can cause the dominance of the other speakers to be slightly reduced)
    turn_prob: 0.875 # Probability of switching speakers after each utterance
    min_turn_prob: 0.5 # Minimum turn probability when enforce mode is true, to keep sessions from becoming excessively long
    mean_silence: 0.15 # Mean proportion of silence to speaking time in the audio session. Should be in range [0, 1).
    mean_silence_var: 0.01 # Variance of the mean silence across all audio sessions. Must satisfy 0 <= mean_silence_var < mean_silence * (1 - mean_silence)
    per_silence_var: 900 # Variance of the individual silences within each session; set large values to de-correlate silence lengths from the latest speech segment lengths
    per_silence_min: 0.0 # Minimum duration of a single silence, in seconds
    per_silence_max: -1 # Maximum duration of a single silence, in seconds; set -1 for no maximum
    mean_overlap: 0.1 # Mean proportion of overlap in the overall non-silence duration. Should be in range [0, 1); the recommended range is [0, 0.15].
    mean_overlap_var: 0.01 # Variance of the mean overlap across all audio sessions. Must satisfy 0 <= mean_overlap_var < mean_overlap * (1 - mean_overlap)
    per_overlap_var: 900 # Variance of the individual overlaps within each session; set large values to de-correlate overlap lengths from the latest speech segment lengths
    per_overlap_min: 0.0 # Minimum duration of a single overlap, in seconds
    per_overlap_max: -1 # Maximum duration of a single overlap, in seconds; set -1 for no maximum
    start_window: true # Window the start of sentences to smooth the audio signal (and remove silence at the start of the clip)
    window_type: hamming # Type of windowing used when segmenting utterances ("hamming", "hann", "cosine")
    window_size: 0.05 # Length of window at the start or the end of a segmented utterance (seconds)
    start_buffer: 0.1 # Buffer of silence before the start of the sentence (to avoid cutting off speech or starting abruptly)
    split_buffer: 0.1 # Split RTTM labels if there is more than twice this amount of silence (to avoid long gaps between utterances being labelled as speech)
    release_buffer: 0.1 # Buffer before the window at the end of the sentence (to avoid cutting off speech or ending abruptly)
    normalize: true # Normalize speaker volumes
    normalization_type: equal # How speakers are normalized ("equal" - same volume per speaker, "var" - variable volume per speaker)
    normalization_var: 0.1 # Variance in speaker volume (volume is sampled from a distribution centered at 1 — presumably normal; confirm)
    min_volume: 0.75 # Minimum speaker volume (only used when variable normalization is used)
    max_volume: 1.25 # Maximum speaker volume (only used when variable normalization is used)
    end_buffer: 0.5 # Buffer at the end of the session to leave blank

  # Where simulated sessions are written and how the output files are named and formatted.
  outputs:
    output_dir: ??? # Output directory for audio sessions and corresponding label files
    output_filename: multispeaker_session # Output filename for the wav and rttm files
    overwrite_output: true # If true, delete the output directory if it already exists
    output_precision: 3 # Number of decimal places in output files

  background_noise: # If background noise is used, a noise source position must be passed for RIR mode (see `rir_generation.room_config.noise_src_pos`)
    add_bg: false # Add ambient background noise if true
    background_manifest: null # Path to background noise manifest file
    num_noise_files: 10 # Number of randomly chosen noise source files to be potentially included in one session
    snr: 60 # SNR for background noise (using average speaker power)

  # Controls whether every requested speaker is guaranteed to appear in a session.
  speaker_enforcement:
    enforce_num_speakers: true # Enforce that all requested speakers are present in the output wav file
    # Fraction of the way through the audio session at which enforcement mode is triggered
    # (sampled between the first and second value).
    enforce_time: [0.25, 0.75]

  segment_manifest: # Parameters for regenerating the segment manifest file
    window: 0.5 # Window length for segmentation (presumably seconds — confirm)
    shift: 0.25 # Shift length for segmentation (presumably seconds — confirm)
    step_count: 50 # Number of unit segments to create per utterance
    deci: 3 # Number of decimals used when rounding values in the segment manifest file

  # Synthetic room impulse response (RIR) augmentation.
  rir_generation:
    use_rir: false # Whether to generate synthetic RIR
    toolkit: 'pyroomacoustics' # Which toolkit to use ("pyroomacoustics", "gpuRIR")
    room_config:
      # Size of the shoebox room environment (1d array for a specific size, 2d array for
      # random ranges to be sampled from). The default is the 2d form: one two-element range per entry.
      room_sz: [[2, 3], [2, 3], [2, 3]]
      # Positions of the speakers in the simulated room environment (2d array for specific
      # positions, 3d array for random ranges to be sampled from). The default is the 3d form:
      # one row per speaker, each row holding three two-element ranges.
      pos_src:
      - [[0.5, 1.5], [0.5, 1.5], [0.5, 1.5]]
      - [[0.5, 1.5], [0.5, 1.5], [0.5, 1.5]]
      - [[0.5, 1.5], [0.5, 1.5], [0.5, 1.5]]
      - [[0.5, 1.5], [0.5, 1.5], [0.5, 1.5]]
      noise_src_pos: [1.5, 1.5, 2] # Position in room for the ambient background noise source
    mic_config:
      num_channels: 2 # Number of output audio channels
      # Microphone positions in the simulated room environment (1d/2d array for specific
      # positions, 2d/3d array for ranges, assuming num_channels is 1/2+). The default is the
      # 3d form: one row per channel, each row holding three two-element ranges.
      pos_rcv:
      - [[0.5, 1.5], [0.5, 1.5], [0.5, 1.5]]
      - [[0.5, 1.5], [0.5, 1.5], [0.5, 1.5]]
      orV_rcv: null # Microphone orientations (needed for non-omnidirectional microphones)
      mic_pattern: omni # Microphone type ("omni" - omnidirectional); currently only omnidirectional microphones are supported for pyroomacoustics

    # Note: only `T60` is used for pyroomacoustics simulations.
    # The key spelling (`absorbtion`) is kept verbatim; renaming it would change the lookup key.
    absorbtion_params:
      abs_weights: [0.9, 0.9, 0.9, 0.9, 0.9, 0.9] # Absorption coefficient ratios for each surface
      T60: 0.1 # Room reverberation time (`T60` is the time it takes for the RIR to decay by 60 dB)
      att_diff: 15.0 # Starting attenuation (if this is different than att_max, the diffuse reverberation model is used by gpuRIR)
      att_max: 60.0 # End attenuation when using the diffuse reverberation model (gpuRIR)