---
# Temporal Reasoning Audio Dataset Pipeline Configuration
#
# Design / implementation notes (scratch header, grouped by task):
#
# General:
# - uniform distribution for clip duration
# - not mixing datasets
#
# count task:
# - pick dataset -> pick class -> pick audio clip -> get duration ->
#   concatenate clips to reach target duration -> modulo to get num clips ->
#   inserting silences randomly based on remainder
# - get_max_clip_num_to_be_joined()
# - ensure_silence_between_clips(): silence should always be there between
#   two clips
# - number of clips per sample chosen to avoid silence-dominated samples
#
# duration task:
# - amplitude based filtering -> normalize -> threshold based selection
# - gap between audio clips - x2/1.5 the shorter one -> add as param
# - different clips of the same class can be concatenated to reach target
#   duration
# - consecutive ordering only
# - based on n unique sources and total clips we can have -> shortest and
#   longest duration calculation
# - reject datapoint if same target audio clip cannot be repeated to
#   maintain the gap - arg
# - sample different clip from the same class -> check if different clips
#   can be used to fill the gap - arg
# - amplitude filtered durations in metadata csv
#
# order task:
# - repeat target clips
# - second and second last - modify question types
#
# volume task:
# - average loudness for an audio clip -> repetitions but same clip
#   (argument) -> different volume levels based on dB levels
# - add crossfade
# - trimming - threshold separately for each audio clip - normalize between
#   0 and 1 - get threshold -> trim -> concatenate
# - leftmost and rightmost silence trimming
# - buffer for trimming - cutting early and cutting a bit late to avoid
#   cutting important parts
# - periodicity effect
# - volume - trim and get average loudness -> normalize -> adjust volume
#   levels

# ESC-50 Dataset paths (each clip is 5 seconds)
esc50:
  audio_path: "/path/to/ESC-50_github/audio"
  metadata_path: "/path/to/ESC-50_github/meta/esc50.csv"

# Synthetic silence audio for concatenation
synthetic_silence:
  path: "/path/to/synthetic_silences"

# Output configuration
output:
  base_path: "/path/to/pipeline/test_ood"

# Dataset class-subset configuration
# Use this to create datasets (train/val/test) from a persistent subset
# of classes (e.g. use 40 of 50 classes for in-distribution splits and
# optionally create an OOD test set using all 50 classes).
dataset:
  use_class_subset: false  # if false, use all available classes
  num_classes_subset: 40  # number of classes to use for train/val/test
  subset_persist_path: "/path/to/class_subset.json"
  subset_seed: 42  # RNG seed when sampling the subset (persisted)

# Audio generation parameters
audio:
  # Duration range for each GENERATED clip (in seconds).
  # Original ESC-50 clips are 5s and will be concatenated to create clips
  # in this range.
  min_clip_duration: 20.0  # Minimum duration for each generated clip
  max_clip_duration: 60.0  # Maximum duration for each generated clip
  # Crossfade and silence (all values in milliseconds)
  crossfade_duration: 500  # Crossfade between audio and silence for smooth transitions
  silence_duration: 1000  # Default silence between clips
  min_silence_duration: 100  # Minimum silence ALWAYS inserted between clips
  max_extra_silence_per_gap: 500  # Max extra silence per gap when distributing remainder
  crossfade_within_source: 50  # Small crossfade within same-source repetitions (count task)
  with_silence: true  # Add silence between clips
  # Duration (seconds) of individual source clips (ESC-50 are 5s by default).
  # Used to compute how many source clips are concatenated to reach a target
  # generated clip duration. Change only if your source clips differ.
  source_clip_duration: 5.0

# Audio normalization
normalize: false
normalize_target_dBFS: -20.0

# Random seed for reproducibility
random_seed: 42

# LLM for question generation (local Llama 3.1 8B)
llm:
  enabled: false  # Set to true to use LLM for question generation

# Task-specific configurations
tasks:
  count:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours).
    # Pipeline will calculate number of samples based on min/max clip
    # durations.
    task_duration_size: 2.0  # hours
    # Maximum unique sound sources per sample (single number).
    # Actual number will be subsampled from max(1, max_clips-3) to
    # min(max_clips, max_clips_per_sample).
    max_clips_per_sample: 10
    # Ordering mode for repeated clips of same source:
    #   "random": Clips are shuffled randomly (A B A C B A C...) - tests
    #     recognition of recurring sounds
    #   "consecutive": Same-source clips grouped together (AAA BBB CCC) -
    #     easier, just count blocks
    ordering_mode: "random"
    # Question templates for MCQ
    mcq_questions:
      - "What is the number of distinct sound sources in the audio file?"
      - "How many different types of sounds can be identified in this recording?"
      - "How many unique types of sound are present in this audio?"
      - "Identify the count of different sound sources in this clip."
      - "What is the total number of unique sounds heard in this audio?"
      - "How many distinct sound categories are there in this audio file?"
      - "Determine the number of unique sound sources in this recording."
      - "How many separate sound sources are included in the audio?"
      - "What is the total number of unique sound types in this audio?"
      - "How many different sound sources can be heard in this clip?"
    # Question templates for open-text
    open_text_questions:
      - "How many distinct sound sources are present in the audio?"
      - "Count the number of unique sounds in this recording."
      - "What is the total count of different sound categories heard?"
      - "Identify and count all unique sound types in the clip."

  duration:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    task_duration_size: 2.0  # hours
    # Number of unique sound sources per sample (single int or list).
    #   Single int (e.g., 15): randomly samples from 1 to 15 (like
    #     count/order tasks)
    #   List (e.g., [2, 3, 4]): randomly picks from the list
    # The script will automatically generate repetition patterns to create
    # shortest/longest variations based on the target clip duration.
    num_unique_sources: 10
    # Ordering: only keep "consecutive" so repeated segments of the same
    # source remain grouped together, ensuring that multiple consecutive
    # clips of the same audio yield the longest duration unambiguously.
    ordering_methods: ["consecutive"]
    # =====================================================
    # Amplitude-based filtering parameters (preprocessing)
    # =====================================================
    # RELATIVE dB threshold below peak to consider as silence.
    # For each clip: silence_threshold = clip_peak_dB + amplitude_threshold_db
    # Example: If clip peak is -5 dB and threshold is -20, silence
    # threshold = -25 dB.
    # Based on ESC-50 analysis: -20 dB gives ~60% effective duration
    # (good balance).
    #   More aggressive (removes more silence): -15 dB
    #   More conservative (keeps more sound): -25 dB
    amplitude_threshold_db: -20.0
    # Minimum duration of sound region to keep (milliseconds).
    # Filters out very short transient noise spikes.
    # ESC-50 is curated, so 20-30ms is sufficient.
    min_sound_duration_ms: 25
    # =====================================================
    # Adaptive threshold strategy
    # =====================================================
    # "peak_relative": threshold = peak_dB + amplitude_threshold_db
    #   (fixed offset from peak) - simple but not adaptive to actual
    #   noise levels
    # "noise_floor": threshold = percentile(dB, N) + delta_dB (RECOMMENDED)
    #   - fully adaptive per-clip based on its own noise floor
    #   - each clip analyzed independently - no fixed dB values needed
    #   - better for diverse audio with varying noise levels
    threshold_strategy: "noise_floor"
    # Noise floor estimation percentile (used when
    # threshold_strategy = noise_floor).
    # Lower percentile = more conservative estimate of background noise.
    # e.g. 5 = use 5th percentile of dB values as noise floor estimate;
    # 2.0 here is even more conservative (better for sparse sounds).
    noise_floor_percentile: 2.0
    # Delta above noise floor (dB) to set as threshold.
    # This is relative to EACH clip's own noise floor, not a fixed dB value.
    # Roughly 5-8 dB above the clip's noise floor works for most ESC-50
    # clips; higher = more conservative (keeps more), lower = more
    # aggressive (removes more).
    noise_floor_delta_db: 5.0
    # Path to preprocessed ESC-50 data (effective durations + trimmed audio)
    preprocessed_data_path: "/path/to/ESC-50_preprocessed"
    # =====================================================
    # Duration gap multipliers
    # =====================================================
    # For LONGEST questions:
    #   target_effective >= max_background * multiplier_longest
    multiplier_longest: 1.5
    # For SHORTEST questions:
    #   target_effective <= min_background * multiplier_shortest
    # Using 0.75 instead of 0.5 for smaller gaps (easier to distinguish).
    multiplier_shortest: 0.75
    # Minimum effective duration per source (seconds).
    # Clips with less than this duration are harder to distinguish.
    min_effective_duration_per_source: 1.0
    # =====================================================
    # Fallback/rejection options
    # =====================================================
    # Reject sample if duration gap cannot be satisfied
    reject_if_gap_not_met: true
    # Try different clips from same class if one clip isn't enough
    sample_different_clips_same_class: true
    # Question types
    question_types: ["shortest", "longest"]
    # MCQ questions
    mcq_questions:
      shortest: "Which of the following sounds is heard for the shortest duration?"
      longest: "Which of the following sounds is heard for the longest duration?"
    # Open-text questions
    open_text_questions:
      shortest: "Which sound is heard for the shortest duration in the audio?"
      longest: "Which sound is heard for the longest duration in the audio?"

  order:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    task_duration_size: 2.0  # hours
    # Maximum clips to join per sample (minimum 2 for ordering).
    # Actual number will be subsampled from max(2, max_clips-3) to
    # min(max_clips, max_clips_per_sample).
    max_clips_per_sample: 10
    # Whether to allow repeating clips from the same source category.
    #   If true: sequence could be [dog, dog, cat, bird] (same clip repeated)
    #   If false: sequence is always unique sources
    allow_source_repetition: false
    # Minimum clips needed for "second" and "second_last" questions.
    # Must be 4 so that second (position 1) and second_last (position n-2)
    # always refer to different positions; with 3 clips both would refer to
    # the middle clip at position 1.
    # NOTE(review): raised from 3 to 4 to match the rationale above - with
    # the old value, 3-clip samples generated two questions with the same
    # answer position.
    min_clips_for_second_questions: 4
    # Question types: "first", "last", "after", "before", "second",
    # "second_last". "second" and "second_last" are only generated when
    # n_clips >= min_clips_for_second_questions.
    question_types: ["first", "last", "after", "before", "second", "second_last"]
    # MCQ question templates
    mcq_questions:
      first: "Which sound appears first in the audio clip?"
      last: "Which sound appears last in the audio clip?"
      after: "Which sound comes after {sound1}?"
      before: "Which sound comes before {sound2}?"
      second: "Which sound appears second in the audio clip?"
      second_last: "Which sound appears second to last in the audio clip?"
    # Open-text question templates
    open_text_questions:
      first: "What is the first sound you hear in the audio?"
      last: "What is the last sound you hear in the audio?"
      after: "What sound comes after {sound1}?"
      before: "What sound comes before {sound2}?"
      second: "What is the second sound you hear in the audio?"
      second_last: "What sound is second to last in the audio?"
      sequence: "List the sounds in the order they appear in the audio."

  volume:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    task_duration_size: 2.0  # hours
    # Maximum clips with different volumes per sample.
    # Actual number will be subsampled from max(2, max_clips-3) to
    # min(max_clips, max_clips_per_sample).
    max_clips_per_sample: 10
    # =====================================================
    # Normalization settings (CRITICAL for volume comparison)
    # =====================================================
    # All clips are FIRST normalized to baseline, THEN volume adjusted.
    # This ensures volume differences are controlled and comparable.
    normalize_to_baseline: true
    baseline_dBFS: -20.0  # Normalize all clips to this level first (used if use_lufs=false)
    # =====================================================
    # LUFS (Perceived Loudness) Settings
    # =====================================================
    # LUFS (Loudness Units Full Scale) measures PERCEIVED loudness.
    # Unlike dBFS which only measures RMS amplitude, LUFS accounts for
    # human hearing sensitivity to different frequencies (K-weighting).
    #
    # IMPORTANT: For volume comparison task, we DISABLE LUFS normalization!
    # LUFS makes everything the same perceived loudness, defeating the
    # purpose. Instead, we normalize to a baseline dBFS then apply LARGE
    # volume adjustments.
    use_lufs: false  # DISABLED for audible volume differences
    baseline_lufs: -23.0  # EBU R128 standard (not used when use_lufs=false)
    # =====================================================
    # Volume gap multipliers (similar to duration task)
    # =====================================================
    # For MAX_LOUDNESS questions:
    #   target_loudness >= second_loudest * multiplier_max
    # Multiplier 2.5 = ~8dB difference = clearly audible
    # Multiplier 4.0 = ~12dB difference = very obvious (4x perceived loudness)
    multiplier_max_loudness: 4.0
    # For MIN_LOUDNESS questions:
    #   target_loudness <= second_softest * multiplier_min
    # Multiplier 0.25 = ~12dB quieter = clearly distinguishable
    multiplier_min_loudness: 0.25
    # Reject sample if loudness gap cannot be satisfied
    reject_if_gap_not_met: true
    # =====================================================
    # Source clip options
    # =====================================================
    # If true: same clip can be repeated at different volumes.
    # If false: always use different source clips (default behavior).
    use_same_clip_different_volumes: false
    # If use_same_clip_different_volumes is true, how many repetitions per
    # source? Can be a single int or list for variety.
    repetitions_per_source: [2, 3, 4]
    # Question types: "max_loudness", "min_loudness"
    question_types: ["max_loudness", "min_loudness"]
    # MCQ questions
    mcq_questions:
      max_loudness: "Which sound has the maximum loudness in the audio?"
      min_loudness: "Which sound has the minimum loudness in the audio?"
    # Open-text questions
    open_text_questions:
      max_loudness: "Identify the sound with maximum loudness in the audio clip."
      min_loudness: "Identify the sound with minimum loudness in the audio clip."
      order_volume: "List the sounds in order from maximum to minimum loudness."

# MCQ options configuration
mcq:
  num_options: 4
  option_labels: ["A", "B", "C", "D"]
  # Strategy for generating distractor options:
  #   "present_only": only use sounds present in audio
  #   "mixed": mix of present and absent sounds
  #   "balanced": balanced distribution
  distractor_strategy: "balanced"

# Logging configuration
logging:
  level: "INFO"  # DEBUG, INFO, WARNING, ERROR, CRITICAL
  log_file: "pipeline.log"
  console_output: true