---
# Temporal Reasoning Audio Dataset Pipeline Configuration
# uniform distribution for clip duration
##not mixing datasets
##count
##pick dataset -> pick class -> pick audio clip -> get duration -> concatenate clips to reach target duration -> modulo to get num clips -> inserting silences randomly based on remainder
##duration
##amplitude based filtering -> normalize -> threshold based selection
##gap between audio clips - x2/1.5 the shorter one -> add as param
# different clips of the same class can be concatenated to reach target duration
##consecutive ordering only
##based on n unique sources and total clips we can have -> shortest and longest duration calculation
##reject datapoint if same target audio clip cannot be repeated to maintain the gap - arg
##sample different clip from the same class -> check if different clips can be used to fill the gap - arg
##amplitude filtered durations in metadata csv
##get_max_clip_num_to_be_joined()
##pick dataset -> pick class -> pick audio clip -> get duration -> concatenate clips to reach target duration -> modulo to get num clips -> inserting silences randomly based on remainder
##ensure_silence_between_clips()
##silence should always be there between two clips
##order
##repeat target clips
##second and second last - modify question types
##volume
# amplitude average loudness for an audio clip -> repetitions but same clip (argument) -> different volume levels based on dB levels
##add crossfade
##trimming - threshold separately for each audio clip - normalize 1 and 0 - get threshold -> trim -> concatenate
##leftmost and rightmost silence trimming
##buffer for trimming - cutting early and cutting a bit late to avoid cutting important parts
# periodicity effect
##volume - trim and get average loudness -> normalize -> adjust volume levels
# number of clips per sample to avoid silence
# ESC-50 Dataset paths (each clip is 5 seconds)
esc50:
audio_path: "/path/to/ESC-50_github/audio"
metadata_path: "/path/to/ESC-50_github/meta/esc50.csv"
# Synthetic silence audio for concatenation
synthetic_silence:
path: "/path/to/synthetic_silences"
# Output configuration
output:
base_path: "/path/to/pipeline/test_ood"
# Dataset class-subset configuration
# Use this to create datasets (train/val/test) from a persistent subset
# of classes (e.g. use 40 of 50 classes for in-distribution splits and
# optionally create an OOD test set using all 50 classes).
dataset:
use_class_subset: false # if false, use all available classes
num_classes_subset: 40 # number of classes to use for train/val/test
subset_persist_path: "/path/to/class_subset.json"
subset_seed: 42 # RNG seed when sampling the subset (persisted)
# Audio generation parameters
audio:
# Duration range for each GENERATED clip (in seconds)
# Original ESC-50 clips are 5s and will be concatenated to create clips in this range
min_clip_duration: 20.0 # Minimum duration for each generated clip
max_clip_duration: 60.0 # Maximum duration for each generated clip
# Crossfade and silence
crossfade_duration: 500 # Crossfade between audio and silence (milliseconds) for smooth transitions
silence_duration: 1000 # Default silence between clips (milliseconds)
min_silence_duration: 100 # Minimum silence ALWAYS inserted between clips (milliseconds)
max_extra_silence_per_gap: 500 # Maximum extra silence per gap when distributing remainder
crossfade_within_source: 50 # Small crossfade within same-source repetitions (count task)
with_silence: true # Add silence between clips
# Duration (seconds) of individual source clips (ESC-50 are 5s by default).
# Used to compute how many source clips are concatenated to reach a target
# generated clip duration. Change only if your source clips differ.
source_clip_duration: 5.0
# Audio normalization
normalize: false
normalize_target_dBFS: -20.0
# Random seed for reproducibility
random_seed: 42
# LLM for question generation (local Llama 3.1 8B)
llm:
enabled: false # Set to true to use LLM for question generation
# Task-specific configurations
tasks:
count:
enabled: true
# Total duration for ALL samples in this task combined (in hours)
# Pipeline will calculate number of samples based on min/max clip durations
task_duration_size: 2.0 # hours
# Maximum unique sound sources per sample (single number)
# Actual number will be subsampled from max(1, max_clips-3) to min(max_clips, max_clips_per_sample)
max_clips_per_sample: 10
# Ordering mode for repeated clips of same source:
# "random": Clips are shuffled randomly (A B A C B A C...) - tests recognition of recurring sounds
# "consecutive": Same-source clips grouped together (AAA BBB CCC) - easier, just count blocks
ordering_mode: "random"
# Question templates for MCQ
mcq_questions:
- "What is the number of distinct sound sources in the audio file?"
- "How many different types of sounds can be identified in this recording?"
- "How many unique types of sound are present in this audio?"
- "Identify the count of different sound sources in this clip."
- "What is the total number of unique sounds heard in this audio?"
- "How many distinct sound categories are there in this audio file?"
- "Determine the number of unique sound sources in this recording."
- "How many separate sound sources are included in the audio?"
- "What is the total number of unique sound types in this audio?"
- "How many different sound sources can be heard in this clip?"
# Question templates for open-text
open_text_questions:
- "How many distinct sound sources are present in the audio?"
- "Count the number of unique sounds in this recording."
- "What is the total count of different sound categories heard?"
- "Identify and count all unique sound types in the clip."
duration:
enabled: true
# Total duration for ALL samples in this task combined (in hours)
task_duration_size: 2.0 # hours
# Number of unique sound sources per sample (can be single int or list)
# Single int (e.g., 15): randomly samples from 1 to 15 (like count/order tasks)
# List (e.g., [2,3,4]): randomly picks from the list
# The script will automatically generate repetition patterns to create
# shortest/longest variations based on the target clip duration
num_unique_sources: 10
# Ordering: only keep "consecutive" so repeated segments of the same
# source remain grouped together, ensuring that multiple consecutive
# clips of the same audio yield the longest duration unambiguously.
ordering_methods: ["consecutive"]
# =====================================================
# Amplitude-based filtering parameters (preprocessing)
# =====================================================
# RELATIVE dB threshold below peak to consider as silence
# For each clip: silence_threshold = clip_peak_dB + amplitude_threshold_db
# Example: If clip peak is -5 dB and threshold is -20, silence threshold = -25 dB
# Based on ESC-50 analysis: -20 dB gives ~60% effective duration (good balance)
# More aggressive (removes more silence): -15 dB
# More conservative (keeps more sound): -25 dB
amplitude_threshold_db: -20.0
# Minimum duration of sound region to keep (milliseconds)
# Filters out very short transient noise spikes
# ESC-50 is curated, so 20-30ms is sufficient
min_sound_duration_ms: 25
# =====================================================
# Adaptive threshold strategy
# =====================================================
# "peak_relative": threshold = peak_dB + amplitude_threshold_db (fixed offset from peak)
# - Simple but not adaptive to actual noise levels
# "noise_floor": threshold = percentile(dB, N) + delta_dB (RECOMMENDED)
# - Fully adaptive per-clip based on its own noise floor
# - Each clip analyzed independently - no fixed dB values needed
# - Better for diverse audio with varying noise levels
threshold_strategy: "noise_floor"
# Noise floor estimation percentile (used when threshold_strategy = noise_floor)
# Lower percentile = more conservative estimate of background noise
# 5 = use 5th percentile of dB values as noise floor estimate (better for sparse sounds)
noise_floor_percentile: 2.0
# Delta above noise floor (dB) to set as threshold
# This is relative to EACH clip's own noise floor, not a fixed dB value
# 8dB above the clip's noise floor works well for most ESC-50 clips
# Higher = more conservative (keeps more), Lower = more aggressive (removes more)
noise_floor_delta_db: 5.0
# Path to preprocessed ESC-50 data (effective durations + trimmed audio)
preprocessed_data_path: "/path/to/ESC-50_preprocessed"
# =====================================================
# Duration gap multipliers
# =====================================================
# For LONGEST questions: target_effective >= max_background × multiplier_longest
multiplier_longest: 1.5
# For SHORTEST questions: target_effective <= min_background × multiplier_shortest
# Using 0.75 instead of 0.5 for smaller gaps (easier to distinguish)
multiplier_shortest: 0.75
# Minimum effective duration per source (seconds)
# Clips with less than this duration are harder to distinguish
min_effective_duration_per_source: 1.0
# =====================================================
# Fallback/rejection options
# =====================================================
# Reject sample if duration gap cannot be satisfied
reject_if_gap_not_met: true
# Try different clips from same class if one clip isn't enough
sample_different_clips_same_class: true
# Question types
question_types: ["shortest", "longest"]
# MCQ questions
mcq_questions:
shortest: "Which of the following sounds is heard for the shortest duration?"
longest: "Which of the following sounds is heard for the longest duration?"
# Open-text questions
open_text_questions:
shortest: "Which sound is heard for the shortest duration in the audio?"
longest: "Which sound is heard for the longest duration in the audio?"
order:
enabled: true
# Total duration for ALL samples in this task combined (in hours)
task_duration_size: 2.0 # hours
# Maximum clips to join per sample (minimum 2 for ordering)
# Actual number will be subsampled from max(2, max_clips-3) to min(max_clips, max_clips_per_sample)
max_clips_per_sample: 10
# Whether to allow repeating clips from the same source category
# If true: sequence could be [dog, dog, cat, bird] (same clip repeated)
# If false: sequence is always unique sources
allow_source_repetition: false
# Minimum clips needed for "second" and "second_last" questions
# Set to 4 to ensure second and second_last refer to different positions
# (with 3 clips, both would refer to middle clip at position 1)
min_clips_for_second_questions: 4
# Question types: "first", "last", "after", "before", "second", "second_last"
# "second" and "second_last" only generated when n_clips >= min_clips_for_second_questions
question_types: ["first", "last", "after", "before", "second", "second_last"]
# MCQ question templates
mcq_questions:
first: "Which sound appears first in the audio clip?"
last: "Which sound appears last in the audio clip?"
after: "Which sound comes after {sound1}?"
before: "Which sound comes before {sound2}?"
second: "Which sound appears second in the audio clip?"
second_last: "Which sound appears second to last in the audio clip?"
# Open-text question templates
open_text_questions:
first: "What is the first sound you hear in the audio?"
last: "What is the last sound you hear in the audio?"
after: "What sound comes after {sound1}?"
before: "What sound comes before {sound2}?"
second: "What is the second sound you hear in the audio?"
second_last: "What sound is second to last in the audio?"
sequence: "List the sounds in the order they appear in the audio."
volume:
enabled: true
# Total duration for ALL samples in this task combined (in hours)
task_duration_size: 2.0 # hours
# Maximum clips with different volumes per sample
# Actual number will be subsampled from max(2, max_clips-3) to min(max_clips, max_clips_per_sample)
max_clips_per_sample: 10
# =====================================================
# Normalization settings (CRITICAL for volume comparison)
# =====================================================
# All clips are FIRST normalized to baseline, THEN volume adjusted
# This ensures volume differences are controlled and comparable
normalize_to_baseline: true
baseline_dBFS: -20.0 # Normalize all clips to this level first (used if use_lufs=false)
# =====================================================
# LUFS (Perceived Loudness) Settings
# =====================================================
# LUFS (Loudness Units Full Scale) measures PERCEIVED loudness
# Unlike dBFS which only measures RMS amplitude, LUFS accounts for
# human hearing sensitivity to different frequencies (K-weighting)
#
# IMPORTANT: For volume comparison task, we DISABLE LUFS normalization!
# LUFS makes everything the same perceived loudness, defeating the purpose.
# Instead, we normalize to a baseline dBFS then apply LARGE volume adjustments.
use_lufs: false # DISABLED for audible volume differences
baseline_lufs: -23.0 # EBU R128 standard (not used when use_lufs=false)
# =====================================================
# Volume gap multipliers (similar to duration task)
# =====================================================
# For MAX_LOUDNESS questions: target_loudness >= second_loudest × multiplier_max
# Multiplier 2.5 = ~8dB difference = clearly audible
# Multiplier 4.0 = ~12dB difference = very obvious (4x perceived loudness)
multiplier_max_loudness: 4.0
# For MIN_LOUDNESS questions: target_loudness <= second_softest × multiplier_min
# Multiplier 0.25 = ~12dB quieter = clearly distinguishable
multiplier_min_loudness: 0.25
# Reject sample if loudness gap cannot be satisfied
reject_if_gap_not_met: true
# =====================================================
# Source clip options
# =====================================================
# If true: same clip can be repeated at different volumes
# If false: always use different source clips (default behavior)
use_same_clip_different_volumes: false
# If use_same_clip_different_volumes is true, how many repetitions per source?
# Can be a single int or list for variety
repetitions_per_source: [2, 3, 4]
# Question types: "max_loudness", "min_loudness"
question_types: ["max_loudness", "min_loudness"]
# MCQ questions
mcq_questions:
max_loudness: "Which sound has the maximum loudness in the audio?"
min_loudness: "Which sound has the minimum loudness in the audio?"
# Open-text questions
open_text_questions:
max_loudness: "Identify the sound with maximum loudness in the audio clip."
min_loudness: "Identify the sound with minimum loudness in the audio clip."
order_volume: "List the sounds in order from maximum to minimum loudness."
# MCQ options configuration
mcq:
num_options: 4
option_labels: ["A", "B", "C", "D"]
# Strategy for generating distractor options
# "present_only": only use sounds present in audio
# "mixed": mix of present and absent sounds
# "balanced": balanced distribution
distractor_strategy: "balanced"
# Logging configuration
logging:
level: "INFO" # DEBUG, INFO, WARNING, ERROR, CRITICAL
log_file: "pipeline.log"
console_output: true