---
# Temporal Reasoning Audio Dataset Pipeline Configuration
## uniform distribution for clip duration
## not mixing datasets
## count
## pick dataset -> pick class -> pick audio clip -> get duration -> concatenate clips to reach target duration -> modulo to get num clips -> inserting silences randomly based on remainder
## duration
## amplitude based filtering -> normalize -> threshold based selection
## gap between audio clips - x2/1.5 the shorter one -> add as param
## different clips of the same class can be concatenated to reach target duration
## consecutive ordering only
## based on n unique sources and total clips we can have -> shortest and longest duration calculation
## reject datapoint if same target audio clip cannot be repeated to maintain the gap - arg
## sample different clip from the same class -> check if different clips can be used to fill the gap - arg
## amplitude filtered durations in metadata csv
## get_max_clip_num_to_be_joined()
## pick dataset -> pick class -> pick audio clip -> get duration -> concatenate clips to reach target duration -> modulo to get num clips -> inserting silences randomly based on remainder
## ensure_silence_between_clips()
## silence should always be there between two clips
## order
## repeat target clips
## second and second last - modify question types
## volume
## amplitude average loudness for an audio clip -> repetitions but same clip (argument) -> different volume levels based on dB levels
## add crossfade
## trimming - threshold separately for each audio clip - normalize 1 and 0 - get threshold -> trim -> concatenate
## leftmost and rightmost silence trimming
## buffer for trimming - cutting early and cutting a bit late to avoid cutting important parts
## periodicity effect
## volume - trim and get average loudness -> normalize -> adjust volume levels
## number of clips per sample to avoid silence
# ESC-50 Dataset paths (each clip is 5 seconds)
esc50:
  audio_path: "/path/to/ESC-50_github/audio"
  metadata_path: "/path/to/ESC-50_github/meta/esc50.csv"

# Synthetic silence audio for concatenation
synthetic_silence:
  path: "/path/to/synthetic_silences"

# Output configuration
output:
  base_path: "/path/to/pipeline/test_ood"

# Dataset class-subset configuration
# Use this to create datasets (train/val/test) from a persistent subset
# of classes (e.g. use 40 of 50 classes for in-distribution splits and
# optionally create an OOD test set using all 50 classes).
dataset:
  use_class_subset: false  # if false, use all available classes
  num_classes_subset: 40  # number of classes to use for train/val/test
  subset_persist_path: "/path/to/class_subset.json"
  subset_seed: 42  # RNG seed when sampling the subset (persisted)
# Audio generation parameters
audio:
  # Duration range for each GENERATED clip (in seconds)
  # Original ESC-50 clips are 5s and will be concatenated to create clips in this range
  min_clip_duration: 20.0  # Minimum duration for each generated clip
  max_clip_duration: 60.0  # Maximum duration for each generated clip

  # Crossfade and silence
  crossfade_duration: 500  # Crossfade between audio and silence (milliseconds) for smooth transitions
  silence_duration: 1000  # Default silence between clips (milliseconds)
  min_silence_duration: 100  # Minimum silence ALWAYS inserted between clips (milliseconds)
  max_extra_silence_per_gap: 500  # Maximum extra silence per gap when distributing remainder
  crossfade_within_source: 50  # Small crossfade within same-source repetitions (count task)
  with_silence: true  # Add silence between clips

  # Duration (seconds) of individual source clips (ESC-50 are 5s by default).
  # Used to compute how many source clips are concatenated to reach a target
  # generated clip duration. Change only if your source clips differ.
  source_clip_duration: 5.0

  # Audio normalization
  normalize: false
  normalize_target_dBFS: -20.0

# Random seed for reproducibility
random_seed: 42
# LLM for question generation (local Llama 3.1 8B)
llm:
  enabled: false  # Set to true to use LLM for question generation
# Task-specific configurations
tasks:
  count:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    # Pipeline will calculate number of samples based on min/max clip durations
    task_duration_size: 2.0  # hours
    # Maximum unique sound sources per sample (single number)
    # Actual number will be subsampled from max(1, max_clips-3) to min(max_clips, max_clips_per_sample)
    max_clips_per_sample: 10
    # Ordering mode for repeated clips of same source:
    #   "random": Clips are shuffled randomly (A B A C B A C...) - tests recognition of recurring sounds
    #   "consecutive": Same-source clips grouped together (AAA BBB CCC) - easier, just count blocks
    ordering_mode: "random"
    # Question templates for MCQ
    mcq_questions:
      - "What is the number of distinct sound sources in the audio file?"
      - "How many different types of sounds can be identified in this recording?"
      - "How many unique types of sound are present in this audio?"
      - "Identify the count of different sound sources in this clip."
      - "What is the total number of unique sounds heard in this audio?"
      - "How many distinct sound categories are there in this audio file?"
      - "Determine the number of unique sound sources in this recording."
      - "How many separate sound sources are included in the audio?"
      - "What is the total number of unique sound types in this audio?"
      - "How many different sound sources can be heard in this clip?"
    # Question templates for open-text
    open_text_questions:
      - "How many distinct sound sources are present in the audio?"
      - "Count the number of unique sounds in this recording."
      - "What is the total count of different sound categories heard?"
      - "Identify and count all unique sound types in the clip."
  duration:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    task_duration_size: 2.0  # hours
    # Number of unique sound sources per sample (can be single int or list)
    #   Single int (e.g., 15): randomly samples from 1 to 15 (like count/order tasks)
    #   List (e.g., [2,3,4]): randomly picks from the list
    # The script will automatically generate repetition patterns to create
    # shortest/longest variations based on the target clip duration
    num_unique_sources: 10
    # Ordering: only keep "consecutive" so repeated segments of the same
    # source remain grouped together, ensuring that multiple consecutive
    # clips of the same audio yield the longest duration unambiguously.
    ordering_methods: ["consecutive"]

    # =====================================================
    # Amplitude-based filtering parameters (preprocessing)
    # =====================================================
    # RELATIVE dB threshold below peak to consider as silence
    # For each clip: silence_threshold = clip_peak_dB + amplitude_threshold_db
    # Example: If clip peak is -5 dB and threshold is -20, silence threshold = -25 dB
    # Based on ESC-50 analysis: -20 dB gives ~60% effective duration (good balance)
    #   More aggressive (removes more silence): -15 dB
    #   More conservative (keeps more sound): -25 dB
    amplitude_threshold_db: -20.0
    # Minimum duration of sound region to keep (milliseconds)
    # Filters out very short transient noise spikes
    # ESC-50 is curated, so 20-30ms is sufficient
    min_sound_duration_ms: 25

    # =====================================================
    # Adaptive threshold strategy
    # =====================================================
    # "peak_relative": threshold = peak_dB + amplitude_threshold_db (fixed offset from peak)
    #   - Simple but not adaptive to actual noise levels
    # "noise_floor": threshold = percentile(dB, N) + delta_dB (RECOMMENDED)
    #   - Fully adaptive per-clip based on its own noise floor
    #   - Each clip analyzed independently - no fixed dB values needed
    #   - Better for diverse audio with varying noise levels
    threshold_strategy: "noise_floor"
    # Noise floor estimation percentile (used when threshold_strategy = noise_floor)
    # Lower percentile = more conservative estimate of background noise
    # 2 = use 2nd percentile of dB values as noise floor estimate (better for sparse sounds)
    noise_floor_percentile: 2.0
    # Delta above noise floor (dB) to set as threshold
    # This is relative to EACH clip's own noise floor, not a fixed dB value
    # 5 dB above the clip's noise floor works well for most ESC-50 clips
    # Higher = more conservative (keeps more), Lower = more aggressive (removes more)
    noise_floor_delta_db: 5.0
    # Path to preprocessed ESC-50 data (effective durations + trimmed audio)
    preprocessed_data_path: "/path/to/ESC-50_preprocessed"

    # =====================================================
    # Duration gap multipliers
    # =====================================================
    # For LONGEST questions: target_effective >= max_background × multiplier_longest
    multiplier_longest: 1.5
    # For SHORTEST questions: target_effective <= min_background × multiplier_shortest
    # Using 0.75 instead of 0.5 for smaller gaps (easier to distinguish)
    multiplier_shortest: 0.75
    # Minimum effective duration per source (seconds)
    # Clips with less than this duration are harder to distinguish
    min_effective_duration_per_source: 1.0

    # =====================================================
    # Fallback/rejection options
    # =====================================================
    # Reject sample if duration gap cannot be satisfied
    reject_if_gap_not_met: true
    # Try different clips from same class if one clip isn't enough
    sample_different_clips_same_class: true
    # Question types
    question_types: ["shortest", "longest"]
    # MCQ questions
    mcq_questions:
      shortest: "Which of the following sounds is heard for the shortest duration?"
      longest: "Which of the following sounds is heard for the longest duration?"
    # Open-text questions
    open_text_questions:
      shortest: "Which sound is heard for the shortest duration in the audio?"
      longest: "Which sound is heard for the longest duration in the audio?"
  order:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    task_duration_size: 2.0  # hours
    # Maximum clips to join per sample (minimum 2 for ordering)
    # Actual number will be subsampled from max(2, max_clips-3) to min(max_clips, max_clips_per_sample)
    max_clips_per_sample: 10
    # Whether to allow repeating clips from the same source category
    #   If true: sequence could be [dog, dog, cat, bird] (same clip repeated)
    #   If false: sequence is always unique sources
    allow_source_repetition: false
    # Minimum clips needed for "second" and "second_last" questions
    # Set to 4 to ensure second and second_last refer to different positions
    # (with 3 clips, both would refer to middle clip at position 1)
    min_clips_for_second_questions: 4
    # Question types: "first", "last", "after", "before", "second", "second_last"
    # "second" and "second_last" only generated when n_clips >= min_clips_for_second_questions
    question_types: ["first", "last", "after", "before", "second", "second_last"]
    # MCQ question templates
    mcq_questions:
      first: "Which sound appears first in the audio clip?"
      last: "Which sound appears last in the audio clip?"
      after: "Which sound comes after {sound1}?"
      before: "Which sound comes before {sound2}?"
      second: "Which sound appears second in the audio clip?"
      second_last: "Which sound appears second to last in the audio clip?"
    # Open-text question templates
    open_text_questions:
      first: "What is the first sound you hear in the audio?"
      last: "What is the last sound you hear in the audio?"
      after: "What sound comes after {sound1}?"
      before: "What sound comes before {sound2}?"
      second: "What is the second sound you hear in the audio?"
      second_last: "What sound is second to last in the audio?"
      sequence: "List the sounds in the order they appear in the audio."
  volume:
    enabled: true
    # Total duration for ALL samples in this task combined (in hours)
    task_duration_size: 2.0  # hours
    # Maximum clips with different volumes per sample
    # Actual number will be subsampled from max(2, max_clips-3) to min(max_clips, max_clips_per_sample)
    max_clips_per_sample: 10

    # =====================================================
    # Normalization settings (CRITICAL for volume comparison)
    # =====================================================
    # All clips are FIRST normalized to baseline, THEN volume adjusted
    # This ensures volume differences are controlled and comparable
    normalize_to_baseline: true
    baseline_dBFS: -20.0  # Normalize all clips to this level first (used if use_lufs=false)

    # =====================================================
    # LUFS (Perceived Loudness) Settings
    # =====================================================
    # LUFS (Loudness Units Full Scale) measures PERCEIVED loudness
    # Unlike dBFS which only measures RMS amplitude, LUFS accounts for
    # human hearing sensitivity to different frequencies (K-weighting)
    #
    # IMPORTANT: For volume comparison task, we DISABLE LUFS normalization!
    # LUFS makes everything the same perceived loudness, defeating the purpose.
    # Instead, we normalize to a baseline dBFS then apply LARGE volume adjustments.
    use_lufs: false  # DISABLED for audible volume differences
    baseline_lufs: -23.0  # EBU R128 standard (not used when use_lufs=false)

    # =====================================================
    # Volume gap multipliers (similar to duration task)
    # =====================================================
    # For MAX_LOUDNESS questions: target_loudness >= second_loudest × multiplier_max
    #   Multiplier 2.5 = ~8dB difference = clearly audible
    #   Multiplier 4.0 = ~12dB difference = very obvious (4x perceived loudness)
    multiplier_max_loudness: 4.0
    # For MIN_LOUDNESS questions: target_loudness <= second_softest × multiplier_min
    #   Multiplier 0.25 = ~12dB quieter = clearly distinguishable
    multiplier_min_loudness: 0.25
    # Reject sample if loudness gap cannot be satisfied
    reject_if_gap_not_met: true

    # =====================================================
    # Source clip options
    # =====================================================
    # If true: same clip can be repeated at different volumes
    # If false: always use different source clips (default behavior)
    use_same_clip_different_volumes: false
    # If use_same_clip_different_volumes is true, how many repetitions per source?
    # Can be a single int or list for variety
    repetitions_per_source: [2, 3, 4]
    # Question types: "max_loudness", "min_loudness"
    question_types: ["max_loudness", "min_loudness"]
    # MCQ questions
    mcq_questions:
      max_loudness: "Which sound has the maximum loudness in the audio?"
      min_loudness: "Which sound has the minimum loudness in the audio?"
    # Open-text questions
    open_text_questions:
      max_loudness: "Identify the sound with maximum loudness in the audio clip."
      min_loudness: "Identify the sound with minimum loudness in the audio clip."
      order_volume: "List the sounds in order from maximum to minimum loudness."
# MCQ options configuration
mcq:
  num_options: 4
  option_labels: ["A", "B", "C", "D"]
  # Strategy for generating distractor options
  #   "present_only": only use sounds present in audio
  #   "mixed": mix of present and absent sounds
  #   "balanced": balanced distribution
  distractor_strategy: "balanced"

# Logging configuration
logging:
  level: "INFO"  # DEBUG, INFO, WARNING, ERROR, CRITICAL
  log_file: "pipeline.log"
  console_output: true