# TREA_2.0_codebase / config.yaml
# Temporal Reasoning Audio Dataset Pipeline Configuration
## uniform distribution for clip duration
##not mixing datasets
##count
##pick dataset -> pick class -> pick audio clip -> get duration -> concatenate clips to reach target duration -> modulo to get num clips -> inserting silences randomly based on remainder
##duration
##amplitude based filtering -> normalize -> threshold based selection
##gap between audio clips - x2/1.5 the shorter one -> add as param
## different clips of the same class can be concatenated to reach target duration
##consecutive ordering only
##based on n unique sources and total clips we can have -> shortest and longest duration calculation
##reject datapoint if same target audio clip cannot be repeated to maintain the gap - arg
##sample different clip from the same class -> check if different clips can be used to fill the gap - arg
##amplitude filtered durations in metadata csv
##get_max_clip_num_to_be_joined()
##pick dataset -> pick class -> pick audio clip -> get duration -> concatenate clips to reach target duration -> modulo to get num clips -> inserting silences randomly based on remainder
##ensure_silence_between_clips()
##silence should always be there between two clips
##order
##repeat target clips
##second and second last - modify question types
##volume
##amplitude average loudness for a audio clip -> repetitions but same clip(argument) -> different volume levels based on dB levels
##add crossfade
##trimming - threshold separately for each audio clip - normalize 1 and 0 - get threshold -> trim -> concatenate
##leftmost and rightmost silence trimming
##buffer for trimming - cutting early and cutting a bit late to avoid cutting important parts
## periodicity effect
##volume - trim and get average loudness -> normalize -> adjust volume levels
##number of clips per samples to avoid silence
# ESC-50 Dataset paths (each clip is 5 seconds)
esc50:
  audio_path: "/path/to/ESC-50_github/audio"
  metadata_path: "/path/to/ESC-50_github/meta/esc50.csv"
# Synthetic silence audio for concatenation
synthetic_silence:
  path: "/path/to/synthetic_silences"
# Output configuration
output:
  base_path: "/path/to/pipeline/test_ood"
# Dataset class-subset configuration
# Use this to create datasets (train/val/test) from a persistent subset
# of classes (e.g. use 40 of 50 classes for in-distribution splits and
# optionally create an OOD test set using all 50 classes).
dataset:
  use_class_subset: false  # if false, use all available classes
  num_classes_subset: 40  # number of classes to use for train/val/test
  subset_persist_path: "/path/to/class_subset.json"
  subset_seed: 42  # RNG seed when sampling the subset (persisted)
# Audio generation parameters
audio:
  # Duration range for each GENERATED clip (in seconds)
  # Original ESC-50 clips are 5s and will be concatenated to create clips in this range
  min_clip_duration: 20.0  # Minimum duration for each generated clip
  max_clip_duration: 60.0  # Maximum duration for each generated clip
  # Crossfade and silence
  crossfade_duration: 500  # Crossfade between audio and silence (milliseconds) for smooth transitions
  silence_duration: 1000  # Default silence between clips (milliseconds)
  min_silence_duration: 100  # Minimum silence ALWAYS inserted between clips (milliseconds)
  max_extra_silence_per_gap: 500  # Maximum extra silence per gap when distributing remainder
  crossfade_within_source: 50  # Small crossfade within same-source repetitions (count task)
  with_silence: true  # Add silence between clips
  # Duration (seconds) of individual source clips (ESC-50 are 5s by default).
  # Used to compute how many source clips are concatenated to reach a target
  # generated clip duration. Change only if your source clips differ.
  source_clip_duration: 5.0
  # Audio normalization
  normalize: false
  normalize_target_dBFS: -20.0

# Random seed for reproducibility
# NOTE(review): indentation was lost in the source copy; this key is assumed to be
# top-level (pipeline-wide seed, distinct from dataset.subset_seed) — confirm against
# the config consumer.
random_seed: 42
# LLM for question generation (local Llama 3.1 8B)
llm:
  enabled: false  # Set to true to use LLM for question generation
# Task-specific configurations
tasks:
  count:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    # Pipeline will calculate number of samples based on min/max clip durations
    task_duration_size: 2.0  # hours
    # Maximum unique sound sources per sample (single number)
    # Actual number will be subsampled from max(1, max_clips-3) to min(max_clips, max_clips_per_sample)
    max_clips_per_sample: 10
    # Ordering mode for repeated clips of same source:
    # "random": Clips are shuffled randomly (A B A C B A C...) - tests recognition of recurring sounds
    # "consecutive": Same-source clips grouped together (AAA BBB CCC) - easier, just count blocks
    ordering_mode: "random"
    # Question templates for MCQ
    mcq_questions:
      - "What is the number of distinct sound sources in the audio file?"
      - "How many different types of sounds can be identified in this recording?"
      - "How many unique types of sound are present in this audio?"
      - "Identify the count of different sound sources in this clip."
      - "What is the total number of unique sounds heard in this audio?"
      - "How many distinct sound categories are there in this audio file?"
      - "Determine the number of unique sound sources in this recording."
      - "How many separate sound sources are included in the audio?"
      - "What is the total number of unique sound types in this audio?"
      - "How many different sound sources can be heard in this clip?"
    # Question templates for open-text
    open_text_questions:
      - "How many distinct sound sources are present in the audio?"
      - "Count the number of unique sounds in this recording."
      - "What is the total count of different sound categories heard?"
      - "Identify and count all unique sound types in the clip."
  duration:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    task_duration_size: 2.0  # hours
    # Number of unique sound sources per sample (can be single int or list)
    # Single int (e.g., 15): randomly samples from 1 to 15 (like count/order tasks)
    # List (e.g., [2,3,4]): randomly picks from the list
    # The script will automatically generate repetition patterns to create
    # shortest/longest variations based on the target clip duration
    num_unique_sources: 10
    # Ordering: only keep "consecutive" so repeated segments of the same
    # source remain grouped together, ensuring that multiple consecutive
    # clips of the same audio yield the longest duration unambiguously.
    ordering_methods: ["consecutive"]
    # =====================================================
    # Amplitude-based filtering parameters (preprocessing)
    # =====================================================
    # RELATIVE dB threshold below peak to consider as silence
    # For each clip: silence_threshold = clip_peak_dB + amplitude_threshold_db
    # Example: If clip peak is -5 dB and threshold is -20, silence threshold = -25 dB
    # Based on ESC-50 analysis: -20 dB gives ~60% effective duration (good balance)
    # More aggressive (removes more silence): -15 dB
    # More conservative (keeps more sound): -25 dB
    amplitude_threshold_db: -20.0
    # Minimum duration of sound region to keep (milliseconds)
    # Filters out very short transient noise spikes
    # ESC-50 is curated, so 20-30ms is sufficient
    min_sound_duration_ms: 25
    # =====================================================
    # Adaptive threshold strategy
    # =====================================================
    # "peak_relative": threshold = peak_dB + amplitude_threshold_db (fixed offset from peak)
    #   - Simple but not adaptive to actual noise levels
    # "noise_floor": threshold = percentile(dB, N) + delta_dB (RECOMMENDED)
    #   - Fully adaptive per-clip based on its own noise floor
    #   - Each clip analyzed independently - no fixed dB values needed
    #   - Better for diverse audio with varying noise levels
    threshold_strategy: "noise_floor"
    # Noise floor estimation percentile (used when threshold_strategy = noise_floor)
    # Lower percentile = more conservative estimate of background noise
    # 2.0 = use 2nd percentile of dB values as noise floor estimate (better for sparse sounds)
    noise_floor_percentile: 2.0
    # Delta above noise floor (dB) to set as threshold
    # This is relative to EACH clip's own noise floor, not a fixed dB value
    # 5 dB above the clip's noise floor is used here; ~8 dB also works for most ESC-50 clips
    # Higher = more conservative (keeps more), Lower = more aggressive (removes more)
    noise_floor_delta_db: 5.0
    # Path to preprocessed ESC-50 data (effective durations + trimmed audio)
    preprocessed_data_path: "/path/to/ESC-50_preprocessed"
    # =====================================================
    # Duration gap multipliers
    # =====================================================
    # For LONGEST questions: target_effective >= max_background × multiplier_longest
    multiplier_longest: 1.5
    # For SHORTEST questions: target_effective <= min_background × multiplier_shortest
    # Using 0.75 instead of 0.5 for smaller gaps (easier to distinguish)
    multiplier_shortest: 0.75
    # Minimum effective duration per source (seconds)
    # Clips with less than this duration are harder to distinguish
    min_effective_duration_per_source: 1.0
    # =====================================================
    # Fallback/rejection options
    # =====================================================
    # Reject sample if duration gap cannot be satisfied
    reject_if_gap_not_met: true
    # Try different clips from same class if one clip isn't enough
    sample_different_clips_same_class: true
    # Question types
    question_types: ["shortest", "longest"]
    # MCQ questions
    mcq_questions:
      shortest: "Which of the following sounds is heard for the shortest duration?"
      longest: "Which of the following sounds is heard for the longest duration?"
    # Open-text questions
    open_text_questions:
      shortest: "Which sound is heard for the shortest duration in the audio?"
      longest: "Which sound is heard for the longest duration in the audio?"
  order:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    task_duration_size: 2.0  # hours
    # Maximum clips to join per sample (minimum 2 for ordering)
    # Actual number will be subsampled from max(2, max_clips-3) to min(max_clips, max_clips_per_sample)
    max_clips_per_sample: 10
    # Whether to allow repeating clips from the same source category
    # If true: sequence could be [dog, dog, cat, bird] (same clip repeated)
    # If false: sequence is always unique sources
    allow_source_repetition: false
    # Minimum clips needed for "second" and "second_last" questions
    # NOTE(review): with exactly 3 clips, "second" and "second_last" both refer
    # to the middle clip (position 1); set this to 4 if those two questions must
    # always target distinct positions.
    min_clips_for_second_questions: 3
    # Question types: "first", "last", "after", "before", "second", "second_last"
    # "second" and "second_last" only generated when n_clips >= min_clips_for_second_questions
    question_types: ["first", "last", "after", "before", "second", "second_last"]
    # MCQ question templates
    mcq_questions:
      first: "Which sound appears first in the audio clip?"
      last: "Which sound appears last in the audio clip?"
      after: "Which sound comes after {sound1}?"
      before: "Which sound comes before {sound2}?"
      second: "Which sound appears second in the audio clip?"
      second_last: "Which sound appears second to last in the audio clip?"
    # Open-text question templates
    open_text_questions:
      first: "What is the first sound you hear in the audio?"
      last: "What is the last sound you hear in the audio?"
      after: "What sound comes after {sound1}?"
      before: "What sound comes before {sound2}?"
      second: "What is the second sound you hear in the audio?"
      second_last: "What sound is second to last in the audio?"
      sequence: "List the sounds in the order they appear in the audio."
  volume:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    task_duration_size: 2.0  # hours
    # Maximum clips with different volumes per sample
    # Actual number will be subsampled from max(2, max_clips-3) to min(max_clips, max_clips_per_sample)
    max_clips_per_sample: 10
    # =====================================================
    # Normalization settings (CRITICAL for volume comparison)
    # =====================================================
    # All clips are FIRST normalized to baseline, THEN volume adjusted
    # This ensures volume differences are controlled and comparable
    normalize_to_baseline: true
    baseline_dBFS: -20.0  # Normalize all clips to this level first (used if use_lufs=false)
    # =====================================================
    # LUFS (Perceived Loudness) Settings
    # =====================================================
    # LUFS (Loudness Units Full Scale) measures PERCEIVED loudness
    # Unlike dBFS which only measures RMS amplitude, LUFS accounts for
    # human hearing sensitivity to different frequencies (K-weighting)
    #
    # IMPORTANT: For volume comparison task, we DISABLE LUFS normalization!
    # LUFS makes everything the same perceived loudness, defeating the purpose.
    # Instead, we normalize to a baseline dBFS then apply LARGE volume adjustments.
    use_lufs: false  # DISABLED for audible volume differences
    baseline_lufs: -23.0  # EBU R128 standard (not used when use_lufs=false)
    # =====================================================
    # Volume gap multipliers (similar to duration task)
    # =====================================================
    # For MAX_LOUDNESS questions: target_loudness >= second_loudest × multiplier_max
    # Multiplier 2.5 = ~8dB difference = clearly audible
    # Multiplier 4.0 = ~12dB difference = very obvious (4x perceived loudness)
    multiplier_max_loudness: 4.0
    # For MIN_LOUDNESS questions: target_loudness <= second_softest × multiplier_min
    # Multiplier 0.25 = ~12dB quieter = clearly distinguishable
    multiplier_min_loudness: 0.25
    # Reject sample if loudness gap cannot be satisfied
    reject_if_gap_not_met: true
    # =====================================================
    # Source clip options
    # =====================================================
    # If true: same clip can be repeated at different volumes
    # If false: always use different source clips (default behavior)
    use_same_clip_different_volumes: false
    # If use_same_clip_different_volumes is true, how many repetitions per source?
    # Can be a single int or list for variety
    repetitions_per_source: [2, 3, 4]
    # Question types: "max_loudness", "min_loudness"
    question_types: ["max_loudness", "min_loudness"]
    # MCQ questions
    mcq_questions:
      max_loudness: "Which sound has the maximum loudness in the audio?"
      min_loudness: "Which sound has the minimum loudness in the audio?"
    # Open-text questions
    open_text_questions:
      max_loudness: "Identify the sound with maximum loudness in the audio clip."
      min_loudness: "Identify the sound with minimum loudness in the audio clip."
      order_volume: "List the sounds in order from maximum to minimum loudness."
# MCQ options configuration
mcq:
  num_options: 4
  option_labels: ["A", "B", "C", "D"]
  # Strategy for generating distractor options
  # "present_only": only use sounds present in audio
  # "mixed": mix of present and absent sounds
  # "balanced": balanced distribution
  distractor_strategy: "balanced"
# Logging configuration
logging:
  level: "INFO"  # DEBUG, INFO, WARNING, ERROR, CRITICAL
  log_file: "pipeline.log"
  console_output: true