---
# Temporal Reasoning Audio Dataset Pipeline Configuration
## uniform distribution for clip duration
##not mixing datasets 

##count
##pick dataset -> pick class -> pick audio clip -> get duration -> concatenate clips to reach target duration -> modulo to get num clips -> inserting silences randomly based on remainder

##duration
##amplitude based filtering -> normalize -> threshold based selection
##gap between audio clips - x2/1.5 the shorter one -> add as param
## different clips of the same class can be concatenated to reach target duration
##consecutive ordering only
##based on n unique sources and total clips we can have -> shortest and longest duration calculation

##reject datapoint if same target audio clip cannot be repeated to maintain the gap - arg
##sample different clip from the same class -> check if different clips can be used to fill the gap - arg

##amplitude filtered durations in metadata csv

##get_max_clip_num_to_be_joined()
##pick dataset -> pick class -> pick audio clip -> get duration -> concatenate clips to reach target duration -> modulo to get num clips -> inserting silences randomly based on remainder

##ensure_silence_between_clips()
##silence should always be there between two clips

##order
##repeat target clips 
##second and second last - modify question types

##volume
## amplitude average loudness for an audio clip -> repetitions but same clip(argument) -> different volume levels based on dB levels

##add crossfade

##trimming - threshold separately for each audio clip - normalize 1 and 0 - get threshold -> trim -> concatenate
##leftmost and rightmost silence trimming
##buffer for trimming - cutting early and cutting a bit late to avoid cutting important parts 
## periodicity effect

##volume - trim and get average loudness -> normalize -> adjust volume levels 

##number of clips per samples to avoid silence 


# ESC-50 Dataset paths (each clip is 5 seconds)
esc50:
  audio_path: "/path/to/ESC-50_github/audio"  # directory containing the raw ESC-50 wav files
  metadata_path: "/path/to/ESC-50_github/meta/esc50.csv"  # per-clip labels/folds CSV shipped with ESC-50

# Synthetic silence audio for concatenation
synthetic_silence:
  path: "/path/to/synthetic_silences"  # directory of pre-generated silence clips inserted between sounds
  
# Output configuration
output:
  base_path: "/path/to/pipeline/test_ood"  # root directory where generated samples/metadata are written
# Dataset class-subset configuration
# Use this to create datasets (train/val/test) from a persistent subset
# of classes (e.g. use 40 of 50 classes for in-distribution splits and
# optionally create an OOD test set using all 50 classes).
dataset:
  use_class_subset: false                # if false, use all available classes
  num_classes_subset: 40                # number of classes to use for train/val/test (ignored when use_class_subset is false)
  subset_persist_path: "/path/to/class_subset.json"  # sampled subset is saved/loaded here for reuse across runs
  subset_seed: 42                       # RNG seed when sampling the subset (persisted)

# Audio generation parameters
audio:
  # Duration range for each GENERATED clip (in seconds)
  # Original ESC-50 clips are 5s and will be concatenated to create clips in this range
  min_clip_duration: 20.0     # Minimum duration for each generated clip
  max_clip_duration: 60.0     # Maximum duration for each generated clip

  # Crossfade and silence
  # NOTE(review): a 500 ms crossfade on each side of a 1000 ms silence gap can
  # consume most of the gap — confirm the intended audible-silence length.
  crossfade_duration: 500    # Crossfade between audio and silence (milliseconds) for smooth transitions
  silence_duration: 1000     # Default silence between clips (milliseconds)
  min_silence_duration: 100  # Minimum silence ALWAYS inserted between clips (milliseconds)
  max_extra_silence_per_gap: 500  # Maximum extra silence per gap when distributing remainder
  crossfade_within_source: 50  # Small crossfade within same-source repetitions (count task)
  with_silence: true         # Add silence between clips
  # Duration (seconds) of individual source clips (ESC-50 are 5s by default).
  # Used to compute how many source clips are concatenated to reach a target
  # generated clip duration. Change only if your source clips differ.
  source_clip_duration: 5.0

  # Audio normalization
  normalize: false             # disabled: clips keep their original level
  normalize_target_dBFS: -20.0  # target level, used only when normalize is true

# Random seed for reproducibility (global seed for all sampling decisions)
random_seed: 42

# LLM for question generation (local Llama 3.1 8B)
llm:
  enabled: false  # Set to true to use LLM for question generation

# Task-specific configurations
tasks:
  count:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    # Pipeline will calculate number of samples based on min/max clip durations
    task_duration_size: 2.0  # hours

    # Maximum unique sound sources per sample (single number)
    # Actual number will be subsampled from max(1, max_clips-3) to min(max_clips, max_clips_per_sample)
    max_clips_per_sample: 10

    # Ordering mode for repeated clips of same source:
    # "random": Clips are shuffled randomly (A B A C B A C...) - tests recognition of recurring sounds
    # "consecutive": Same-source clips grouped together (AAA BBB CCC) - easier, just count blocks
    ordering_mode: "random"

    # Question templates for MCQ (one template is picked per generated sample)
    mcq_questions:
      - "What is the number of distinct sound sources in the audio file?"
      - "How many different types of sounds can be identified in this recording?"
      - "How many unique types of sound are present in this audio?"
      - "Identify the count of different sound sources in this clip."
      - "What is the total number of unique sounds heard in this audio?"
      - "How many distinct sound categories are there in this audio file?"
      - "Determine the number of unique sound sources in this recording."
      - "How many separate sound sources are included in the audio?"
      - "What is the total number of unique sound types in this audio?"
      - "How many different sound sources can be heard in this clip?"
    # Question templates for open-text answers
    open_text_questions:
      - "How many distinct sound sources are present in the audio?"
      - "Count the number of unique sounds in this recording."
      - "What is the total count of different sound categories heard?"
      - "Identify and count all unique sound types in the clip."
    
  duration:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    task_duration_size: 2.0  # hours

    # Number of unique sound sources per sample (can be single int or list)
    # Single int (e.g., 15): randomly samples from 1 to 15 (like count/order tasks)
    # List (e.g., [2,3,4]): randomly picks from the list
    # The script will automatically generate repetition patterns to create
    # shortest/longest variations based on the target clip duration
    num_unique_sources: 10

    # Ordering: only keep "consecutive" so repeated segments of the same
    # source remain grouped together, ensuring that multiple consecutive
    # clips of the same audio yield the longest duration unambiguously.
    ordering_methods: ["consecutive"]

    # =====================================================
    # Amplitude-based filtering parameters (preprocessing)
    # =====================================================
    # RELATIVE dB threshold below peak to consider as silence
    # For each clip: silence_threshold = clip_peak_dB + amplitude_threshold_db
    # Example: If clip peak is -5 dB and threshold is -20, silence threshold = -25 dB
    # Based on ESC-50 analysis: -20 dB gives ~60% effective duration (good balance)
    # More aggressive (removes more silence): -15 dB
    # More conservative (keeps more sound): -25 dB
    amplitude_threshold_db: -20.0

    # Minimum duration of sound region to keep (milliseconds)
    # Filters out very short transient noise spikes
    # ESC-50 is curated, so 20-30ms is sufficient
    min_sound_duration_ms: 25

    # =====================================================
    # Adaptive threshold strategy
    # =====================================================
    # "peak_relative": threshold = peak_dB + amplitude_threshold_db (fixed offset from peak)
    #   - Simple but not adaptive to actual noise levels
    # "noise_floor": threshold = percentile(dB, N) + delta_dB (RECOMMENDED)
    #   - Fully adaptive per-clip based on its own noise floor
    #   - Each clip analyzed independently - no fixed dB values needed
    #   - Better for diverse audio with varying noise levels
    threshold_strategy: "noise_floor"

    # Noise floor estimation percentile (used when threshold_strategy = noise_floor)
    # Lower percentile = more conservative estimate of background noise
    # e.g. 2.0 = use the 2nd percentile of dB values as the noise floor estimate
    # (lower values suit clips with sparse sounds)
    noise_floor_percentile: 2.0

    # Delta above noise floor (dB) to set as threshold
    # This is relative to EACH clip's own noise floor, not a fixed dB value
    # NOTE(review): earlier guidance suggested 8 dB for ESC-50; current value is
    # 5 dB — confirm which is intended.
    # Higher = more conservative (keeps more), Lower = more aggressive (removes more)
    noise_floor_delta_db: 5.0

    # Path to preprocessed ESC-50 data (effective durations + trimmed audio)
    preprocessed_data_path: "/path/to/ESC-50_preprocessed"

    # =====================================================
    # Duration gap multipliers
    # =====================================================
    # For LONGEST questions: target_effective >= max_background × multiplier_longest
    multiplier_longest: 1.5
    # For SHORTEST questions: target_effective <= min_background × multiplier_shortest
    # Using 0.75 instead of 0.5 for smaller gaps (easier to distinguish)
    multiplier_shortest: 0.75

    # Minimum effective duration per source (seconds)
    # Clips with less than this duration are harder to distinguish
    min_effective_duration_per_source: 1.0

    # =====================================================
    # Fallback/rejection options
    # =====================================================
    # Reject sample if duration gap cannot be satisfied
    reject_if_gap_not_met: true
    # Try different clips from same class if one clip isn't enough
    sample_different_clips_same_class: true

    # Question types
    question_types: ["shortest", "longest"]
    # MCQ questions
    mcq_questions:
      shortest: "Which of the following sounds is heard for the shortest duration?"
      longest: "Which of the following sounds is heard for the longest duration?"
    # Open-text questions
    open_text_questions:
      shortest: "Which sound is heard for the shortest duration in the audio?"
      longest: "Which sound is heard for the longest duration in the audio?"
    
  order:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    task_duration_size: 2.0  # hours

    # Maximum clips to join per sample (minimum 2 for ordering)
    # Actual number will be subsampled from max(2, max_clips-3) to min(max_clips, max_clips_per_sample)
    max_clips_per_sample: 10

    # Whether to allow repeating clips from the same source category
    # If true: sequence could be [dog, dog, cat, bird] (same clip repeated)
    # If false: sequence is always unique sources
    allow_source_repetition: false

    # Minimum clips needed for "second" and "second_last" questions
    # NOTE(review): with 3 clips, "second" and "second_last" both refer to the
    # middle clip (position 1); raise to 4 if those positions must be distinct.
    min_clips_for_second_questions: 3

    # Question types: "first", "last", "after", "before", "second", "second_last"
    # "second" and "second_last" only generated when n_clips >= min_clips_for_second_questions
    question_types: ["first", "last", "after", "before", "second", "second_last"]

    # MCQ question templates ({sound1}/{sound2} are filled in per sample)
    mcq_questions:
      first: "Which sound appears first in the audio clip?"
      last: "Which sound appears last in the audio clip?"
      after: "Which sound comes after {sound1}?"
      before: "Which sound comes before {sound2}?"
      second: "Which sound appears second in the audio clip?"
      second_last: "Which sound appears second to last in the audio clip?"
    # Open-text question templates
    open_text_questions:
      first: "What is the first sound you hear in the audio?"
      last: "What is the last sound you hear in the audio?"
      after: "What sound comes after {sound1}?"
      before: "What sound comes before {sound2}?"
      second: "What is the second sound you hear in the audio?"
      second_last: "What sound is second to last in the audio?"
      sequence: "List the sounds in the order they appear in the audio."
    
  volume:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    task_duration_size: 2.0  # hours

    # Maximum clips with different volumes per sample
    # Actual number will be subsampled from max(2, max_clips-3) to min(max_clips, max_clips_per_sample)
    max_clips_per_sample: 10

    # =====================================================
    # Normalization settings (CRITICAL for volume comparison)
    # =====================================================
    # All clips are FIRST normalized to baseline, THEN volume adjusted
    # This ensures volume differences are controlled and comparable
    normalize_to_baseline: true
    baseline_dBFS: -20.0  # Normalize all clips to this level first (used if use_lufs=false)

    # =====================================================
    # LUFS (Perceived Loudness) Settings
    # =====================================================
    # LUFS (Loudness Units Full Scale) measures PERCEIVED loudness
    # Unlike dBFS which only measures RMS amplitude, LUFS accounts for
    # human hearing sensitivity to different frequencies (K-weighting)
    #
    # IMPORTANT: For volume comparison task, we DISABLE LUFS normalization!
    # LUFS makes everything the same perceived loudness, defeating the purpose.
    # Instead, we normalize to a baseline dBFS then apply LARGE volume adjustments.
    use_lufs: false               # DISABLED for audible volume differences
    baseline_lufs: -23.0          # EBU R128 standard (not used when use_lufs=false)

    # =====================================================
    # Volume gap multipliers (similar to duration task)
    # =====================================================
    # For MAX_LOUDNESS questions: target_loudness >= second_loudest × multiplier_max
    # Multiplier 2.5 = ~8dB difference = clearly audible
    # Multiplier 4.0 = ~12dB difference = very obvious (4x perceived loudness)
    multiplier_max_loudness: 4.0

    # For MIN_LOUDNESS questions: target_loudness <= second_softest × multiplier_min
    # Multiplier 0.25 = ~12dB quieter = clearly distinguishable
    multiplier_min_loudness: 0.25

    # Reject sample if loudness gap cannot be satisfied
    reject_if_gap_not_met: true

    # =====================================================
    # Source clip options
    # =====================================================
    # If true: same clip can be repeated at different volumes
    # If false: always use different source clips (default behavior)
    use_same_clip_different_volumes: false

    # If use_same_clip_different_volumes is true, how many repetitions per source?
    # Can be a single int or list for variety
    repetitions_per_source: [2, 3, 4]

    # Question types: "max_loudness", "min_loudness"
    question_types: ["max_loudness", "min_loudness"]

    # MCQ questions
    mcq_questions:
      max_loudness: "Which sound has the maximum loudness in the audio?"
      min_loudness: "Which sound has the minimum loudness in the audio?"
    # Open-text questions
    open_text_questions:
      max_loudness: "Identify the sound with maximum loudness in the audio clip."
      min_loudness: "Identify the sound with minimum loudness in the audio clip."
      order_volume: "List the sounds in order from maximum to minimum loudness."

# MCQ options configuration
mcq:
  num_options: 4                          # total choices per question (1 correct + 3 distractors)
  option_labels: ["A", "B", "C", "D"]     # labels shown to the model; length should match num_options
  # Strategy for generating distractor options
  # "present_only": only use sounds present in audio
  # "mixed": mix of present and absent sounds
  # "balanced": balanced distribution
  distractor_strategy: "balanced"

# Logging configuration
logging:
  level: "INFO"  # DEBUG, INFO, WARNING, ERROR, CRITICAL
  log_file: "pipeline.log"  # log file path (relative paths resolve against the working directory)
  console_output: true      # also echo log records to stdout/stderr