---
# Temporal Reasoning Audio Dataset Pipeline Configuration
## uniform distribution for clip duration
##not mixing datasets 

##count
##pick dataset -> pick class -> pick audio clip -> get duration -> concatenate clips to reach target duration -> modulo to get num clips -> inserting silences randomly based on remainder

##duration
##amplitude based filtering -> normalize -> threshold based selection
##gap between audio clips - x2/1.5 the shorter one -> add as param
## different clips of the same class can be concatenated to reach target duration
##consecutive ordering only
##based on n unique sources and total clips we can have -> shortest and longest duration calculation

##reject datapoint if same target audio clip cannot be repeated to maintain the gap - arg
##sample different clip from the same class -> check if different clips can be used to fill the gap - arg

##amplitude filtered durations in metadata csv

##get_max_clip_num_to_be_joined()
##pick dataset -> pick class -> pick audio clip -> get duration -> concatenate clips to reach target duration -> modulo to get num clips -> inserting silences randomly based on remainder

##ensure_silence_between_clips()
##silence should always be there between two clips

##order
##repeat target clips 
##second and second last - modify question types

##volume
## amplitude average loudness for an audio clip -> repetitions but same clip(argument) -> different volume levels based on dB levels

##add crossfade

##trimming - threshold separately for each audio clip - normalize 1 and 0 - get threshold -> trim -> concatenate
##leftmost and rightmost silence trimming
##buffer for trimming - cutting early and cutting a bit late to avoid cutting important parts 
## periodicity effect

##volume - trim and get average loudness -> normalize -> adjust volume levels 

##number of clips per samples to avoid silence 


# ESC-50 Dataset paths (each clip is 5 seconds)
esc50:
  audio_path: "/path/to/ESC-50_github/audio"  # directory containing the raw ESC-50 wav files
  metadata_path: "/path/to/ESC-50_github/meta/esc50.csv"  # per-clip labels/folds CSV shipped with ESC-50

# Synthetic silence audio for concatenation
synthetic_silence:
  path: "/path/to/synthetic_silences"  # directory of pre-generated silence clips inserted between sounds
  
# Output configuration
output:
  base_path: "/path/to/pipeline/test_ood"  # root directory where generated samples/metadata are written
# Dataset class-subset configuration
# Use this to create datasets (train/val/test) from a persistent subset
# of classes (e.g. use 40 of 50 classes for in-distribution splits and
# optionally create an OOD test set using all 50 classes).
dataset:
  use_class_subset: false                # if false, use all available classes
  num_classes_subset: 40                # number of classes to use for train/val/test (ignored when use_class_subset is false)
  subset_persist_path: "/path/to/class_subset.json"  # sampled subset is saved/loaded here for reuse across runs
  subset_seed: 42                       # RNG seed when sampling the subset (persisted)

# Audio generation parameters
audio:
  # Duration range for each GENERATED clip (in seconds)
  # Original ESC-50 clips are 5s and will be concatenated to create clips in this range
  min_clip_duration: 20.0     # Minimum duration for each generated clip
  max_clip_duration: 60.0     # Maximum duration for each generated clip

  # Crossfade and silence
  # NOTE(review): a 500 ms crossfade on each side of a 1000 ms silence gap can
  # consume most of the gap — confirm the intended audible-silence length.
  crossfade_duration: 500    # Crossfade between audio and silence (milliseconds) for smooth transitions
  silence_duration: 1000     # Default silence between clips (milliseconds)
  min_silence_duration: 100  # Minimum silence ALWAYS inserted between clips (milliseconds)
  max_extra_silence_per_gap: 500  # Maximum extra silence per gap when distributing remainder
  crossfade_within_source: 50  # Small crossfade within same-source repetitions (count task)
  with_silence: true         # Add silence between clips
  # Duration (seconds) of individual source clips (ESC-50 are 5s by default).
  # Used to compute how many source clips are concatenated to reach a target
  # generated clip duration. Change only if your source clips differ.
  source_clip_duration: 5.0

  # Audio normalization
  normalize: false             # disabled: clips keep their original level
  normalize_target_dBFS: -20.0  # target level, used only when normalize is true

# Random seed for reproducibility (global seed for all sampling decisions)
random_seed: 42

# LLM for question generation (local Llama 3.1 8B)
llm:
  enabled: false  # Set to true to use LLM for question generation

# Task-specific configurations
tasks:
  count:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    # Pipeline will calculate number of samples based on min/max clip durations
    task_duration_size: 2.0  # hours

    # Maximum unique sound sources per sample (single number)
    # Actual number will be subsampled from max(1, max_clips-3) to min(max_clips, max_clips_per_sample)
    max_clips_per_sample: 10

    # Ordering mode for repeated clips of same source:
    # "random": Clips are shuffled randomly (A B A C B A C...) - tests recognition of recurring sounds
    # "consecutive": Same-source clips grouped together (AAA BBB CCC) - easier, just count blocks
    ordering_mode: "random"

    # Question templates for MCQ (one template is picked per generated sample)
    mcq_questions:
      - "What is the number of distinct sound sources in the audio file?"
      - "How many different types of sounds can be identified in this recording?"
      - "How many unique types of sound are present in this audio?"
      - "Identify the count of different sound sources in this clip."
      - "What is the total number of unique sounds heard in this audio?"
      - "How many distinct sound categories are there in this audio file?"
      - "Determine the number of unique sound sources in this recording."
      - "How many separate sound sources are included in the audio?"
      - "What is the total number of unique sound types in this audio?"
      - "How many different sound sources can be heard in this clip?"
    # Question templates for open-text answers
    open_text_questions:
      - "How many distinct sound sources are present in the audio?"
      - "Count the number of unique sounds in this recording."
      - "What is the total count of different sound categories heard?"
      - "Identify and count all unique sound types in the clip."
    
  duration:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    task_duration_size: 2.0  # hours

    # Number of unique sound sources per sample (can be single int or list)
    # Single int (e.g., 15): randomly samples from 1 to 15 (like count/order tasks)
    # List (e.g., [2,3,4]): randomly picks from the list
    # The script will automatically generate repetition patterns to create
    # shortest/longest variations based on the target clip duration
    num_unique_sources: 10

    # Ordering: only keep "consecutive" so repeated segments of the same
    # source remain grouped together, ensuring that multiple consecutive
    # clips of the same audio yield the longest duration unambiguously.
    ordering_methods: ["consecutive"]

    # =====================================================
    # Amplitude-based filtering parameters (preprocessing)
    # =====================================================
    # RELATIVE dB threshold below peak to consider as silence
    # For each clip: silence_threshold = clip_peak_dB + amplitude_threshold_db
    # Example: If clip peak is -5 dB and threshold is -20, silence threshold = -25 dB
    # Based on ESC-50 analysis: -20 dB gives ~60% effective duration (good balance)
    # More aggressive (removes more silence): -15 dB
    # More conservative (keeps more sound): -25 dB
    amplitude_threshold_db: -20.0

    # Minimum duration of sound region to keep (milliseconds)
    # Filters out very short transient noise spikes
    # ESC-50 is curated, so 20-30ms is sufficient
    min_sound_duration_ms: 25

    # =====================================================
    # Adaptive threshold strategy
    # =====================================================
    # "peak_relative": threshold = peak_dB + amplitude_threshold_db (fixed offset from peak)
    #   - Simple but not adaptive to actual noise levels
    # "noise_floor": threshold = percentile(dB, N) + delta_dB (RECOMMENDED)
    #   - Fully adaptive per-clip based on its own noise floor
    #   - Each clip analyzed independently - no fixed dB values needed
    #   - Better for diverse audio with varying noise levels
    threshold_strategy: "noise_floor"

    # Noise floor estimation percentile (used when threshold_strategy = noise_floor)
    # Lower percentile = more conservative estimate of background noise
    # e.g. 2.0 = use the 2nd percentile of dB values as the noise floor estimate
    # (lower values suit clips with sparse sounds)
    noise_floor_percentile: 2.0

    # Delta above noise floor (dB) to set as threshold
    # This is relative to EACH clip's own noise floor, not a fixed dB value
    # NOTE(review): earlier guidance suggested 8 dB for ESC-50; current value is
    # 5 dB — confirm which is intended.
    # Higher = more conservative (keeps more), Lower = more aggressive (removes more)
    noise_floor_delta_db: 5.0

    # Path to preprocessed ESC-50 data (effective durations + trimmed audio)
    preprocessed_data_path: "/path/to/ESC-50_preprocessed"

    # =====================================================
    # Duration gap multipliers
    # =====================================================
    # For LONGEST questions: target_effective >= max_background × multiplier_longest
    multiplier_longest: 1.5
    # For SHORTEST questions: target_effective <= min_background × multiplier_shortest
    # Using 0.75 instead of 0.5 for smaller gaps (easier to distinguish)
    multiplier_shortest: 0.75

    # Minimum effective duration per source (seconds)
    # Clips with less than this duration are harder to distinguish
    min_effective_duration_per_source: 1.0

    # =====================================================
    # Fallback/rejection options
    # =====================================================
    # Reject sample if duration gap cannot be satisfied
    reject_if_gap_not_met: true
    # Try different clips from same class if one clip isn't enough
    sample_different_clips_same_class: true

    # Question types
    question_types: ["shortest", "longest"]
    # MCQ questions
    mcq_questions:
      shortest: "Which of the following sounds is heard for the shortest duration?"
      longest: "Which of the following sounds is heard for the longest duration?"
    # Open-text questions
    open_text_questions:
      shortest: "Which sound is heard for the shortest duration in the audio?"
      longest: "Which sound is heard for the longest duration in the audio?"
    
  order:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    task_duration_size: 2.0  # hours

    # Maximum clips to join per sample (minimum 2 for ordering)
    # Actual number will be subsampled from max(2, max_clips-3) to min(max_clips, max_clips_per_sample)
    max_clips_per_sample: 10

    # Whether to allow repeating clips from the same source category
    # If true: sequence could be [dog, dog, cat, bird] (same clip repeated)
    # If false: sequence is always unique sources
    allow_source_repetition: false

    # Minimum clips needed for "second" and "second_last" questions
    # NOTE(review): with 3 clips, "second" and "second_last" both refer to the
    # middle clip (position 1); raise to 4 if those positions must be distinct.
    min_clips_for_second_questions: 3

    # Question types: "first", "last", "after", "before", "second", "second_last"
    # "second" and "second_last" only generated when n_clips >= min_clips_for_second_questions
    question_types: ["first", "last", "after", "before", "second", "second_last"]

    # MCQ question templates ({sound1}/{sound2} are filled in per sample)
    mcq_questions:
      first: "Which sound appears first in the audio clip?"
      last: "Which sound appears last in the audio clip?"
      after: "Which sound comes after {sound1}?"
      before: "Which sound comes before {sound2}?"
      second: "Which sound appears second in the audio clip?"
      second_last: "Which sound appears second to last in the audio clip?"
    # Open-text question templates
    open_text_questions:
      first: "What is the first sound you hear in the audio?"
      last: "What is the last sound you hear in the audio?"
      after: "What sound comes after {sound1}?"
      before: "What sound comes before {sound2}?"
      second: "What is the second sound you hear in the audio?"
      second_last: "What sound is second to last in the audio?"
      sequence: "List the sounds in the order they appear in the audio."
    
  volume:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    task_duration_size: 2.0  # hours

    # Maximum clips with different volumes per sample
    # Actual number will be subsampled from max(2, max_clips-3) to min(max_clips, max_clips_per_sample)
    max_clips_per_sample: 10

    # =====================================================
    # Normalization settings (CRITICAL for volume comparison)
    # =====================================================
    # All clips are FIRST normalized to baseline, THEN volume adjusted
    # This ensures volume differences are controlled and comparable
    normalize_to_baseline: true
    baseline_dBFS: -20.0  # Normalize all clips to this level first (used if use_lufs=false)

    # =====================================================
    # LUFS (Perceived Loudness) Settings
    # =====================================================
    # LUFS (Loudness Units Full Scale) measures PERCEIVED loudness
    # Unlike dBFS which only measures RMS amplitude, LUFS accounts for
    # human hearing sensitivity to different frequencies (K-weighting)
    #
    # IMPORTANT: For volume comparison task, we DISABLE LUFS normalization!
    # LUFS makes everything the same perceived loudness, defeating the purpose.
    # Instead, we normalize to a baseline dBFS then apply LARGE volume adjustments.
    use_lufs: false               # DISABLED for audible volume differences
    baseline_lufs: -23.0          # EBU R128 standard (not used when use_lufs=false)

    # =====================================================
    # Volume gap multipliers (similar to duration task)
    # =====================================================
    # For MAX_LOUDNESS questions: target_loudness >= second_loudest × multiplier_max
    # Multiplier 2.5 = ~8dB difference = clearly audible
    # Multiplier 4.0 = ~12dB difference = very obvious (4x perceived loudness)
    multiplier_max_loudness: 4.0

    # For MIN_LOUDNESS questions: target_loudness <= second_softest × multiplier_min
    # Multiplier 0.25 = ~12dB quieter = clearly distinguishable
    multiplier_min_loudness: 0.25

    # Reject sample if loudness gap cannot be satisfied
    reject_if_gap_not_met: true

    # =====================================================
    # Source clip options
    # =====================================================
    # If true: same clip can be repeated at different volumes
    # If false: always use different source clips (default behavior)
    use_same_clip_different_volumes: false

    # If use_same_clip_different_volumes is true, how many repetitions per source?
    # Can be a single int or list for variety
    repetitions_per_source: [2, 3, 4]

    # Question types: "max_loudness", "min_loudness"
    question_types: ["max_loudness", "min_loudness"]

    # MCQ questions
    mcq_questions:
      max_loudness: "Which sound has the maximum loudness in the audio?"
      min_loudness: "Which sound has the minimum loudness in the audio?"
    # Open-text questions
    open_text_questions:
      max_loudness: "Identify the sound with maximum loudness in the audio clip."
      min_loudness: "Identify the sound with minimum loudness in the audio clip."
      order_volume: "List the sounds in order from maximum to minimum loudness."

# MCQ options configuration
mcq:
  num_options: 4                          # total choices per question (1 correct + 3 distractors)
  option_labels: ["A", "B", "C", "D"]     # labels shown to the model; length should match num_options
  # Strategy for generating distractor options
  # "present_only": only use sounds present in audio
  # "mixed": mix of present and absent sounds
  # "balanced": balanced distribution
  distractor_strategy: "balanced"

# Logging configuration
logging:
  level: "INFO"  # DEBUG, INFO, WARNING, ERROR, CRITICAL
  log_file: "pipeline.log"  # log file path (relative paths resolve against the working directory)
  console_output: true      # also echo log records to stdout/stderr