---
# Temporal Reasoning Audio Dataset Pipeline Configuration
## uniform distribution for clip duration
## not mixing datasets
## count
## pick dataset -> pick class -> pick audio clip -> get duration -> concatenate clips to reach target duration -> modulo to get num clips -> inserting silences randomly based on remainder
## duration
## amplitude based filtering -> normalize -> threshold based selection
## gap between audio clips - x2/1.5 the shorter one -> add as param
## different clips of the same class can be concatenated to reach target duration
## consecutive ordering only
## based on n unique sources and total clips we can have -> shortest and longest duration calculation
## reject datapoint if same target audio clip cannot be repeated to maintain the gap - arg
## sample different clip from the same class -> check if different clips can be used to fill the gap - arg
## amplitude filtered durations in metadata csv
## get_max_clip_num_to_be_joined()
## pick dataset -> pick class -> pick audio clip -> get duration -> concatenate clips to reach target duration -> modulo to get num clips -> inserting silences randomly based on remainder
## ensure_silence_between_clips()
## silence should always be there between two clips
## order
## repeat target clips
## second and second last - modify question types
## volume
## amplitude average loudness for an audio clip -> repetitions but same clip (argument) -> different volume levels based on dB levels
## add crossfade
## trimming - threshold separately for each audio clip - normalize 1 and 0 - get threshold -> trim -> concatenate
## leftmost and rightmost silence trimming
## buffer for trimming - cutting early and cutting a bit late to avoid cutting important parts
## periodicity effect
## volume - trim and get average loudness -> normalize -> adjust volume levels
## number of clips per sample to avoid silence
# ESC-50 Dataset paths (each clip is 5 seconds)
esc50:
  audio_path: "/path/to/ESC-50_github/audio"
  metadata_path: "/path/to/ESC-50_github/meta/esc50.csv"

# Synthetic silence audio for concatenation
synthetic_silence:
  path: "/path/to/synthetic_silences"

# Output configuration
output:
  base_path: "/path/to/pipeline/test_ood"

# Dataset class-subset configuration
# Use this to create datasets (train/val/test) from a persistent subset
# of classes (e.g. use 40 of 50 classes for in-distribution splits and
# optionally create an OOD test set using all 50 classes).
dataset:
  use_class_subset: false  # if false, use all available classes
  num_classes_subset: 40  # number of classes to use for train/val/test
  subset_persist_path: "/path/to/class_subset.json"
  subset_seed: 42  # RNG seed when sampling the subset (persisted)
# Audio generation parameters
audio:
  # Duration range for each GENERATED clip (in seconds)
  # Original ESC-50 clips are 5s and will be concatenated to create clips in this range
  min_clip_duration: 20.0  # Minimum duration for each generated clip
  max_clip_duration: 60.0  # Maximum duration for each generated clip

  # Crossfade and silence
  crossfade_duration: 500  # Crossfade between audio and silence (milliseconds) for smooth transitions
  silence_duration: 1000  # Default silence between clips (milliseconds)
  min_silence_duration: 100  # Minimum silence ALWAYS inserted between clips (milliseconds)
  max_extra_silence_per_gap: 500  # Maximum extra silence per gap when distributing remainder
  crossfade_within_source: 50  # Small crossfade within same-source repetitions (count task)
  with_silence: true  # Add silence between clips

  # Duration (seconds) of individual source clips (ESC-50 are 5s by default).
  # Used to compute how many source clips are concatenated to reach a target
  # generated clip duration. Change only if your source clips differ.
  source_clip_duration: 5.0

  # Audio normalization
  normalize: false
  normalize_target_dBFS: -20.0

# Random seed for reproducibility
random_seed: 42
# LLM for question generation (local Llama 3.1 8B)
llm:
  enabled: false  # Set to true to use LLM for question generation
# Task-specific configurations
tasks:
  count:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    # Pipeline will calculate number of samples based on min/max clip durations
    task_duration_size: 2.0  # hours
    # Maximum unique sound sources per sample (single number)
    # Actual number will be subsampled from max(1, max_clips-3) to min(max_clips, max_clips_per_sample)
    max_clips_per_sample: 10
    # Ordering mode for repeated clips of same source:
    #   "random": Clips are shuffled randomly (A B A C B A C...) - tests recognition of recurring sounds
    #   "consecutive": Same-source clips grouped together (AAA BBB CCC) - easier, just count blocks
    ordering_mode: "random"
    # Question templates for MCQ
    mcq_questions:
      - "What is the number of distinct sound sources in the audio file?"
      - "How many different types of sounds can be identified in this recording?"
      - "How many unique types of sound are present in this audio?"
      - "Identify the count of different sound sources in this clip."
      - "What is the total number of unique sounds heard in this audio?"
      - "How many distinct sound categories are there in this audio file?"
      - "Determine the number of unique sound sources in this recording."
      - "How many separate sound sources are included in the audio?"
      - "What is the total number of unique sound types in this audio?"
      - "How many different sound sources can be heard in this clip?"
    # Question templates for open-text
    open_text_questions:
      - "How many distinct sound sources are present in the audio?"
      - "Count the number of unique sounds in this recording."
      - "What is the total count of different sound categories heard?"
      - "Identify and count all unique sound types in the clip."
  duration:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    task_duration_size: 2.0  # hours
    # Number of unique sound sources per sample (can be single int or list)
    #   Single int (e.g., 15): randomly samples from 1 to 15 (like count/order tasks)
    #   List (e.g., [2,3,4]): randomly picks from the list
    # The script will automatically generate repetition patterns to create
    # shortest/longest variations based on the target clip duration
    num_unique_sources: 10
    # Ordering: only keep "consecutive" so repeated segments of the same
    # source remain grouped together, ensuring that multiple consecutive
    # clips of the same audio yield the longest duration unambiguously.
    ordering_methods: ["consecutive"]

    # =====================================================
    # Amplitude-based filtering parameters (preprocessing)
    # =====================================================
    # RELATIVE dB threshold below peak to consider as silence
    # For each clip: silence_threshold = clip_peak_dB + amplitude_threshold_db
    # Example: If clip peak is -5 dB and threshold is -20, silence threshold = -25 dB
    # Based on ESC-50 analysis: -20 dB gives ~60% effective duration (good balance)
    #   More aggressive (removes more silence): -15 dB
    #   More conservative (keeps more sound): -25 dB
    amplitude_threshold_db: -20.0
    # Minimum duration of sound region to keep (milliseconds)
    # Filters out very short transient noise spikes
    # ESC-50 is curated, so 20-30ms is sufficient
    min_sound_duration_ms: 25

    # =====================================================
    # Adaptive threshold strategy
    # =====================================================
    # "peak_relative": threshold = peak_dB + amplitude_threshold_db (fixed offset from peak)
    #   - Simple but not adaptive to actual noise levels
    # "noise_floor": threshold = percentile(dB, N) + delta_dB (RECOMMENDED)
    #   - Fully adaptive per-clip based on its own noise floor
    #   - Each clip analyzed independently - no fixed dB values needed
    #   - Better for diverse audio with varying noise levels
    threshold_strategy: "noise_floor"
    # Noise floor estimation percentile (used when threshold_strategy = noise_floor)
    # Lower percentile = more conservative estimate of background noise
    # 2 = use 2nd percentile of dB values as noise floor estimate (better for sparse sounds)
    noise_floor_percentile: 2.0
    # Delta above noise floor (dB) to set as threshold
    # This is relative to EACH clip's own noise floor, not a fixed dB value
    # 5 dB above the clip's noise floor works well for most ESC-50 clips
    # Higher = more conservative (keeps more), Lower = more aggressive (removes more)
    noise_floor_delta_db: 5.0
    # Path to preprocessed ESC-50 data (effective durations + trimmed audio)
    preprocessed_data_path: "/path/to/ESC-50_preprocessed"

    # =====================================================
    # Duration gap multipliers
    # =====================================================
    # For LONGEST questions: target_effective >= max_background × multiplier_longest
    multiplier_longest: 1.5
    # For SHORTEST questions: target_effective <= min_background × multiplier_shortest
    # Using 0.75 instead of 0.5 for smaller gaps (easier to distinguish)
    multiplier_shortest: 0.75
    # Minimum effective duration per source (seconds)
    # Clips with less than this duration are harder to distinguish
    min_effective_duration_per_source: 1.0

    # =====================================================
    # Fallback/rejection options
    # =====================================================
    # Reject sample if duration gap cannot be satisfied
    reject_if_gap_not_met: true
    # Try different clips from same class if one clip isn't enough
    sample_different_clips_same_class: true
    # Question types
    question_types: ["shortest", "longest"]
    # MCQ questions
    mcq_questions:
      shortest: "Which of the following sounds is heard for the shortest duration?"
      longest: "Which of the following sounds is heard for the longest duration?"
    # Open-text questions
    open_text_questions:
      shortest: "Which sound is heard for the shortest duration in the audio?"
      longest: "Which sound is heard for the longest duration in the audio?"
  order:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    task_duration_size: 2.0  # hours
    # Maximum clips to join per sample (minimum 2 for ordering)
    # Actual number will be subsampled from max(2, max_clips-3) to min(max_clips, max_clips_per_sample)
    max_clips_per_sample: 10
    # Whether to allow repeating clips from the same source category
    #   If true: sequence could be [dog, dog, cat, bird] (same clip repeated)
    #   If false: sequence is always unique sources
    allow_source_repetition: false
    # Minimum clips needed for "second" and "second_last" questions
    # Set to 4 to ensure second and second_last refer to different positions
    # (with 3 clips, both would refer to middle clip at position 1)
    min_clips_for_second_questions: 4
    # Question types: "first", "last", "after", "before", "second", "second_last"
    # "second" and "second_last" only generated when n_clips >= min_clips_for_second_questions
    question_types: ["first", "last", "after", "before", "second", "second_last"]
    # MCQ question templates
    mcq_questions:
      first: "Which sound appears first in the audio clip?"
      last: "Which sound appears last in the audio clip?"
      after: "Which sound comes after {sound1}?"
      before: "Which sound comes before {sound2}?"
      second: "Which sound appears second in the audio clip?"
      second_last: "Which sound appears second to last in the audio clip?"
    # Open-text question templates
    open_text_questions:
      first: "What is the first sound you hear in the audio?"
      last: "What is the last sound you hear in the audio?"
      after: "What sound comes after {sound1}?"
      before: "What sound comes before {sound2}?"
      second: "What is the second sound you hear in the audio?"
      second_last: "What sound is second to last in the audio?"
      sequence: "List the sounds in the order they appear in the audio."
  volume:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    task_duration_size: 2.0  # hours
    # Maximum clips with different volumes per sample
    # Actual number will be subsampled from max(2, max_clips-3) to min(max_clips, max_clips_per_sample)
    max_clips_per_sample: 10

    # =====================================================
    # Normalization settings (CRITICAL for volume comparison)
    # =====================================================
    # All clips are FIRST normalized to baseline, THEN volume adjusted
    # This ensures volume differences are controlled and comparable
    normalize_to_baseline: true
    baseline_dBFS: -20.0  # Normalize all clips to this level first (used if use_lufs=false)

    # =====================================================
    # LUFS (Perceived Loudness) Settings
    # =====================================================
    # LUFS (Loudness Units Full Scale) measures PERCEIVED loudness
    # Unlike dBFS which only measures RMS amplitude, LUFS accounts for
    # human hearing sensitivity to different frequencies (K-weighting)
    #
    # IMPORTANT: For volume comparison task, we DISABLE LUFS normalization!
    # LUFS makes everything the same perceived loudness, defeating the purpose.
    # Instead, we normalize to a baseline dBFS then apply LARGE volume adjustments.
    use_lufs: false  # DISABLED for audible volume differences
    baseline_lufs: -23.0  # EBU R128 standard (not used when use_lufs=false)

    # =====================================================
    # Volume gap multipliers (similar to duration task)
    # =====================================================
    # For MAX_LOUDNESS questions: target_loudness >= second_loudest × multiplier_max
    #   Multiplier 2.5 = ~8dB difference = clearly audible
    #   Multiplier 4.0 = ~12dB difference = very obvious (4x perceived loudness)
    multiplier_max_loudness: 4.0
    # For MIN_LOUDNESS questions: target_loudness <= second_softest × multiplier_min
    #   Multiplier 0.25 = ~12dB quieter = clearly distinguishable
    multiplier_min_loudness: 0.25
    # Reject sample if loudness gap cannot be satisfied
    reject_if_gap_not_met: true

    # =====================================================
    # Source clip options
    # =====================================================
    # If true: same clip can be repeated at different volumes
    # If false: always use different source clips (default behavior)
    use_same_clip_different_volumes: false
    # If use_same_clip_different_volumes is true, how many repetitions per source?
    # Can be a single int or list for variety
    repetitions_per_source: [2, 3, 4]
    # Question types: "max_loudness", "min_loudness"
    question_types: ["max_loudness", "min_loudness"]
    # MCQ questions
    mcq_questions:
      max_loudness: "Which sound has the maximum loudness in the audio?"
      min_loudness: "Which sound has the minimum loudness in the audio?"
    # Open-text questions
    open_text_questions:
      max_loudness: "Identify the sound with maximum loudness in the audio clip."
      min_loudness: "Identify the sound with minimum loudness in the audio clip."
      order_volume: "List the sounds in order from maximum to minimum loudness."
# MCQ options configuration
mcq:
  num_options: 4
  option_labels: ["A", "B", "C", "D"]
  # Strategy for generating distractor options
  #   "present_only": only use sounds present in audio
  #   "mixed": mix of present and absent sounds
  #   "balanced": balanced distribution
  distractor_strategy: "balanced"

# Logging configuration
logging:
  level: "INFO"  # DEBUG, INFO, WARNING, ERROR, CRITICAL
  log_file: "pipeline.log"
  console_output: true