diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..9f6451f9501eebea78d0916c1ed865756a794d4b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,22 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +synthetic_silences/silent_1.wav filter=lfs diff=lfs merge=lfs -text +synthetic_silences/silent_10.wav filter=lfs diff=lfs merge=lfs -text +synthetic_silences/silent_11.wav filter=lfs diff=lfs merge=lfs -text +synthetic_silences/silent_12.wav filter=lfs diff=lfs merge=lfs -text +synthetic_silences/silent_13.wav filter=lfs diff=lfs merge=lfs -text +synthetic_silences/silent_14.wav filter=lfs diff=lfs merge=lfs -text +synthetic_silences/silent_15.wav filter=lfs diff=lfs merge=lfs -text +synthetic_silences/silent_16.wav filter=lfs diff=lfs merge=lfs -text +synthetic_silences/silent_17.wav filter=lfs diff=lfs merge=lfs -text +synthetic_silences/silent_18.wav filter=lfs diff=lfs merge=lfs -text +synthetic_silences/silent_2.wav filter=lfs diff=lfs merge=lfs -text +synthetic_silences/silent_20.wav filter=lfs diff=lfs merge=lfs -text +synthetic_silences/silent_3.wav filter=lfs diff=lfs merge=lfs -text +synthetic_silences/silent_4.wav filter=lfs diff=lfs merge=lfs -text +synthetic_silences/silent_5.wav filter=lfs diff=lfs merge=lfs -text +synthetic_silences/silent_6.wav filter=lfs diff=lfs merge=lfs -text +synthetic_silences/silent_7.wav filter=lfs diff=lfs merge=lfs -text +synthetic_silences/silent_8.wav filter=lfs diff=lfs merge=lfs -text +synthetic_silences/silent_9.wav filter=lfs diff=lfs merge=lfs -text diff --git a/DOCS.md b/DOCS.md new file mode 100644 index 0000000000000000000000000000000000000000..ad935fbf00e87e4e07f7f7291992e8c5a8178a93 --- /dev/null +++ b/DOCS.md @@ -0,0 +1,1296 @@ +# TREA 2.0 - Technical Documentation + +Comprehensive technical documentation for the TREA 2.0 audio dataset generation pipeline. This document covers the complete implementation including algorithms, mathematical formulations, configuration parameters, preprocessing details, and capacity-aware balancing mechanisms. + +**For Quick Start Guide**: See [README.md](README.md) + +--- + +## Table of Contents + +1. [Pipeline Overview](#pipeline-overview) +2. [How Sample Durations Are Generated](#how-sample-durations-are-generated) +3. [Configuration Reference](#configuration-reference) +4. [ESC-50 Preprocessing](#esc-50-preprocessing-duration-task-only) +5. [Audio Utilities](#audio-utilities) +6. [Task: COUNT](#task-count) +7. [Task: DURATION](#task-duration) +8. [Task: ORDER](#task-order) +9. [Task: VOLUME](#task-volume) +10. [Deterministic Balancing Mechanisms](#deterministic-balancing-mechanisms) +11. [Rejection Logic and Retry Mechanisms](#rejection-logic-and-retry-mechanisms) +12. [Command-Line Arguments](#command-line-arguments) +13. [Summary](#summary) + +--- + +## Pipeline Overview + +### Architecture + +The pipeline generates four types of audio-based question-answering samples: + +| Task | Question Type | Example Question | +|------|---------------|------------------| +| **COUNT** | Counting unique sounds | "How many unique sounds do you hear?" | +| **DURATION** | Temporal comparison | "Which sound plays for the longest duration?" | +| **ORDER** | Temporal ordering | "Which sound plays first/last/after X?" | +| **VOLUME** | Loudness comparison | "Which sound is the loudest/softest?" 
| + +### Directory Structure + +``` +pipeline/ +├── main.py # Entry point - orchestrates all tasks +├── config.yaml # All configuration parameters +├── tasks/ +│ ├── task_count.py # CountTaskGenerator class +│ ├── task_duration.py # DurationTaskGenerator class +│ ├── task_order.py # OrderTaskGenerator class +│ └── task_volume.py # VolumeTaskGenerator class +├── utils/ +│ ├── __init__.py # Exports all utilities +│ ├── audio_utils.py # Audio processing functions +│ ├── dataset_utils.py # ESC50Dataset, PreprocessedESC50Dataset +│ ├── question_utils.py # QuestionGenerator +│ ├── llm_utils.py # LLMQuestionGenerator +│ └── logger.py # setup_logger +└── output/ # Generated outputs +``` + +### Data Flow + +``` +ESC-50 Dataset (2000 clips, 50 categories, 5s each) + ↓ +[DURATION TASK ONLY] Preprocessing Script (preprocess_esc50.py) +├── Detects sound regions using adaptive noise-floor thresholding +├── Trims leading/trailing silence (keeps internal structure) +├── Calculates effective durations + ↓ +ESC-50_preprocessed/ +├── effective_durations.csv (metadata with effective durations) +└── trimmed_audio/*.wav (edge-trimmed clips) + ↓ +Pipeline (task-specific generation with balancing) +├── COUNT: Uses raw ESC-50 clips +├── DURATION: Uses preprocessed clips with effective durations +├── ORDER: Uses raw ESC-50 clips +└── VOLUME: Uses raw ESC-50 clips (normalized then volume-adjusted) + ↓ +output/{task}/ +├── audios/*.wav (generated audio samples) +├── {task}_mcq.csv (multiple choice questions) +├── {task}_open_text.csv (open-ended questions) +└── {task}_metadata.csv (detailed metadata) +``` + +### Entry Point: `main.py` + +The main orchestration happens via individual task runner functions: + +```python +def run_count_task(config: dict, logger): + generator = CountTaskGenerator(config, logger) + generator.dataset.reset_category_usage() + generator.generate_dataset() + +def run_duration_task(config: dict, logger): + generator = DurationTaskGenerator(config, logger) + generator.dataset.reset_category_usage() + generator.generate_dataset() + +def run_order_task(config: dict, logger): + generator = OrderTaskGenerator(config, logger) + generator.dataset.reset_category_usage() + generator.generate_dataset() + +def run_volume_task(config: dict, logger): + generator = VolumeTaskGenerator(config, logger) + generator.dataset.reset_category_usage() + generator.generate_dataset() +``` + +--- + +## How Sample Durations Are Generated + +**IMPORTANT**: Sample durations are generated upfront to **exactly fill the target task duration**. + +### The Algorithm + +Located in `utils/audio_utils.py`: + +```python +def generate_sample_durations_for_task( + task_duration_hours: float, + min_clip_duration: float, + max_clip_duration: float +) -> list: + """ + Generate sample durations that exactly fill the target task duration. + """ + task_duration_seconds = task_duration_hours * 3600 + remaining = task_duration_seconds + durations = [] + + while remaining >= min_clip_duration: + # Cap max at remaining to avoid overshoot + effective_max = min(max_clip_duration, remaining) + + # If remaining is less than min, we can't fit another sample + if effective_max < min_clip_duration: + break + + # Sample uniformly within valid range + d = random.uniform(min_clip_duration, effective_max) + durations.append(d) + remaining -= d + + # Shuffle to randomize order + random.shuffle(durations) + + return durations +``` + +1. Start with `remaining = total_seconds` +2. 
While `remaining >= min_clip_duration`:
+   - Sample `d ~ Uniform(min, min(max, remaining))`
+   - Append `d` to durations list
+   - Subtract `d` from remaining
+3. Shuffle and return
+
+### Mathematical Properties
+
+**Guarantee**: $\sum_{i=1}^{N} d_i \leq T$ and $T - \sum d_i < d_{\min}$
+
+Where:
+- $T$ = total task duration
+- $d_i$ = duration of sample $i$
+- $d_{\min}$ = minimum clip duration
+- $N$ = number of samples generated (variable, not fixed!)
+
+**Each duration**: $d_i \sim \text{Uniform}(d_{\min}, \min(d_{\max}, \text{remaining}_i))$
+
+### Example
+
+With `task_duration_size = 1.0` hours (3600s), `min = 20s`, `max = 60s`:
+
+```
+remaining=3600   → d₁=45.2s  → remaining=3554.8
+remaining=3554.8 → d₂=28.7s  → remaining=3526.1
+remaining=3526.1 → d₃=52.1s  → remaining=3474.0
+...
+remaining=35.2   → d₈₉=35.2s → remaining=0 (capped at remaining)
+```
+
+Result: 89 samples totaling exactly 3600s (instead of the estimated 90)
+
+### Where It's Called
+
+Each task's `generate_dataset()` method uses this:
+
+```python
+def generate_dataset(self) -> tuple:
+    # Generate all durations upfront
+    sample_durations = generate_sample_durations_for_task(
+        self.task_duration_hours,
+        self.min_clip_duration,
+        self.max_clip_duration
+    )
+    num_samples = len(sample_durations)
+
+    self.logger.info(f"Generating {num_samples} samples...")
+
+    # Each sample uses its pre-assigned duration
+    for i, target_duration in enumerate(sample_durations):
+        metadata = self.generate_sample(i, target_duration=target_duration, ...)
+```
+
+---
+
+## Configuration Reference
+
+All parameters are defined in `config.yaml`.
+
+### Dataset Class Subset Configuration
+
+```yaml
+dataset:
+  use_class_subset: false   # Enable to use only a subset of ESC-50 classes
+  num_classes_subset: 40    # Number of classes for train/val/test (e.g., 40 of 50)
+  subset_persist_path: "output/class_subset.json"  # Path to save/load class subset
+  subset_seed: 42           # Random seed for subset selection (persisted)
+```
+
+**Purpose**: Create in-distribution (ID) splits using a subset of classes, then optionally test on out-of-distribution (OOD) using all classes.
+
+**Workflow**:
+1. Set `use_class_subset: true` and `num_classes_subset: 40`
+2. Run pipeline - 40 classes randomly selected and saved to `class_subset.json`
+3. Generate train/val/test splits - all use same 40 classes
+4. 
For OOD test: Set `use_class_subset: false`, use different output path + +### Global Audio Parameters + +```yaml +audio: + min_clip_duration: 20.0 # Minimum generated clip duration (seconds) + max_clip_duration: 60.0 # Maximum generated clip duration (seconds) + source_clip_duration: 5.0 # ESC-50 clip length (seconds) + + # Silence and crossfade parameters (applied to ALL tasks) + min_silence_duration: 100 # Minimum silence ALWAYS between clips (ms) + max_extra_silence_per_gap: 500 # Max extra silence per gap when distributing remainder (ms) + crossfade_duration: 500 # Crossfade between audio-silence transitions (ms) for smooth joins + crossfade_within_source: 50 # Small crossfade within same-source repetitions (ms) for COUNT task + with_silence: true # Enable silence insertion between clips + + normalize: false + normalize_target_dBFS: -20.0 +``` + +### Task-Specific Parameters + +#### COUNT Task +```yaml +count: + enabled: true + task_duration_size: 2.0 # Hours of total audio to generate + max_clips_per_sample: 10 # Maximum unique sounds per sample (1 to 10) + ordering_mode: "random" # "random" (shuffled clips) or "consecutive" (grouped by source) + + # CAPACITY-AWARE ANSWER BALANCING: + # - Creates balanced distribution of answers from 1 to max_clips_per_sample + # - Sorts samples by capacity (max_clips each can fit) + # - Assigns higher targets to high-capacity samples + # - Clamps targets to what actually fits (reduces excessive silence) +``` + +#### DURATION Task +```yaml +duration: + enabled: true + task_duration_size: 2.0 + preprocessed_data_path: "/home/debarpanb1/TREA_2.0/ESC-50_preprocessed" + question_types: ["shortest", "longest"] + num_unique_sources: 10 # Can be int or list (e.g., [2,3,4,5]) + ordering_methods: ["consecutive"] # Only consecutive for duration task + + # Preprocessing parameters (adaptive noise-floor thresholding) + threshold_strategy: "noise_floor" # Adaptive per-clip (recommended) + noise_floor_percentile: 2.0 # Use 2nd percentile as noise floor + noise_floor_delta_db: 5.0 # Threshold = noise_floor + 5dB + min_sound_duration_ms: 25 # Filter transient spikes + + # Gap multipliers + multiplier_longest: 1.5 # Target must be ≥ 1.5x max background + multiplier_shortest: 0.75 # Target must be ≤ 0.75x min background (changed from 0.5) + min_effective_duration_per_source: 1.0 # Minimum duration per source (seconds) + + reject_if_gap_not_met: true + sample_different_clips_same_class: true +``` + +#### ORDER Task +```yaml +order: + enabled: true + task_duration_size: 2.0 + max_clips_per_sample: 10 # Cap for maximum clips to join + question_types: ["first", "last", "second", "second_last", "after", "before"] + min_clips_for_second_questions: 3 # "second" and "second_last" require ≥3 clips + allow_source_repetition: false # Each clip from unique source + + # CAPACITY-AWARE QUESTION TYPE BALANCING: + # - Each question type appears equally across samples + # - Advanced types (second, second_last) assigned to high-capacity samples + # - Basic types (first, last, after, before) for lower-capacity samples + # - NO n_clips balancing: randomly samples from [max(2, max_clips-3), max_clips_per_sample] +``` + +#### VOLUME Task +```yaml +volume: + enabled: true + task_duration_size: 2.0 + max_clips_per_sample: 10 # Cap for maximum clips with different volumes + question_types: ["max_loudness", "min_loudness"] + + # Normalization (CRITICAL for controlled volume comparison) + normalize_to_baseline: true + baseline_dBFS: -20.0 # All clips normalized to this level first + use_lufs: 
false # DISABLED - LUFS makes everything same perceived loudness! + baseline_lufs: -23.0 # EBU R128 standard (not used when use_lufs=false) + + # Volume gap constraints (multipliers) + multiplier_max_loudness: 4.0 # Max must be ≥ 4x second-loudest (~12 dB) + multiplier_min_loudness: 0.25 # Min must be ≤ 0.25x second-softest (~12 dB) + reject_if_gap_not_met: true + + # Source clip options + use_same_clip_different_volumes: false # Use different clips (not same clip repeated) + repetitions_per_source: [2, 3, 4] # If same clip used, how many repetitions + + # QUESTION TYPE BALANCING: Each question type appears equally across samples + # NO n_clips balancing: randomly samples from [max(2, max_clips-3), max_clips_per_sample] +``` + +--- + +## ESC-50 Preprocessing (Duration Task Only) + +**File**: `preprocess_esc50.py` +**Purpose**: Preprocess ESC-50 clips for duration task by detecting actual sound regions and trimming silence. + +### Why Preprocessing? + +The DURATION task compares sound durations. Raw ESC-50 clips have variable amounts of leading/trailing silence, which would make duration comparisons ambiguous. Preprocessing: + +1. **Detects actual sound regions** using adaptive amplitude thresholding +2. **Trims leading and trailing silence** (preserves internal structure) +3. **Calculates effective duration** (sum of all sound regions) +4. **Generates metadata CSV** with per-clip durations + +### Preprocessing Pipeline + +``` +Raw ESC-50 clip (5s with silence) + ↓ +1. Load audio and convert to amplitude array +2. Compute RMS envelope (frame-by-frame energy) +3. Convert RMS to dB values +4. Apply adaptive threshold strategy +5. Detect contiguous sound regions +6. Trim edges (only if silence >= 100ms) +7. Calculate effective duration +8. Save trimmed audio + metadata +``` + +### Adaptive Noise-Floor Thresholding + +The preprocessing uses an **adaptive per-clip threshold** strategy: + +```python +# Strategy: 'noise_floor' (adaptive, recommended) +noise_floor_db = np.percentile(db_values, noise_floor_percentile) # e.g., 2nd percentile +absolute_threshold = noise_floor_db + noise_floor_delta_db # e.g., +5 dB above noise floor +``` + +**Key Parameters** (from `config.yaml`): +```yaml +duration: + threshold_strategy: "noise_floor" # Adaptive per-clip (recommended) + noise_floor_percentile: 2.0 # Use 2nd percentile as noise floor estimate + noise_floor_delta_db: 5.0 # Threshold = noise_floor + 5 dB + min_sound_duration_ms: 25 # Filter out transient spikes < 25ms +``` + +**Why Adaptive?** +- Each clip has different background noise levels +- Fixed threshold (e.g., -40 dB) works poorly across diverse sounds +- Adaptive threshold adjusts per-clip based on its own noise floor + +**Alternative** (legacy): +```yaml +threshold_strategy: "peak_relative" # threshold = peak_dB - 20 dB (fixed offset) +amplitude_threshold_db: -20.0 +``` + +### Edge Trimming Strategy + +**ADAPTIVE EDGE-ONLY TRIMMING** - preserves natural periodicity: + +```python +def extract_sound_with_edges_trimmed(audio, regions, min_silence_to_trim_ms=100, buffer_ratio=0.1): + """ + Trim ONLY leftmost and rightmost silence IF significant. + Preserves ALL internal structure (perfect for periodic sounds). 
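+
+    Illustrative example (numbers chosen here, not taken from the code): with
+    450 ms of leading silence, the buffer is max(200, int(450 * 0.1)) = 200 ms,
+    so trimming starts at max(0, 450 - 200) = 250 ms; leading silence under
+    100 ms is left untouched.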
+ """ + leading_silence_ms = regions[0][0] # Time before first sound + trailing_silence_ms = len(audio) - regions[-1][1] # Time after last sound + + # Only trim if silence >= 100ms + if leading_silence_ms >= min_silence_to_trim_ms: + buffer_ms = max(200, int(leading_silence_ms * 0.1)) # Keep 10% as buffer + trim_start_ms = max(0, regions[0][0] - buffer_ms) + else: + trim_start_ms = 0 # Keep from start + + # Similar for trailing silence + ... + + return audio[trim_start_ms:trim_end_ms] +``` + +**Why Edge-Only?** +- Clock ticks, footsteps, typing have periodic silence between sounds +- Removing internal silences destroys natural rhythm +- Edge trimming removes irrelevant silence while preserving periodicity + +### Output Files + +``` +ESC-50_preprocessed/ +├── effective_durations.csv +│ ├── filename +│ ├── category +│ ├── raw_duration_s (original 5.0s) +│ ├── final_duration_s (after edge trimming) +│ ├── effective_duration_s (sum of sound regions) +│ ├── num_sound_regions +│ ├── peak_amplitude_db +│ ├── avg_rms_db +│ └── threshold_strategy, noise_floor_percentile, noise_floor_delta_db +└── trimmed_audio/ + ├── 1-100032-A-0.wav (edge-trimmed clips) + └── ... +``` + +### Running Preprocessing + +```bash +# Using config defaults +python preprocess_esc50.py --config config.yaml + +# Override parameters +python preprocess_esc50.py --config config.yaml \ + --threshold-strategy noise_floor \ + --noise-floor-percentile 2.0 \ + --noise-floor-delta-db 5.0 \ + --min-sound-ms 25 + +# Don't save trimmed audio (only CSV) +python preprocess_esc50.py --config config.yaml --no-trimmed-audio +``` + +### Preprocessing Statistics Example + +``` +ESC-50 Preprocessing Summary +============================================================ +Total clips processed: 2000 +Successfully processed: 2000 + +Raw duration statistics: + Mean: 5.000s Std: 0.000s Min: 5.000s Max: 5.000s + +Final duration statistics (edges trimmed): + Mean: 4.723s Std: 0.412s Min: 2.134s Max: 5.000s + +Effective duration statistics (sum of sound regions): + Mean: 3.856s Std: 0.823s Min: 0.542s Max: 4.982s + +Comparison: + Avg effective: 3.856s + Avg final: 4.723s + Difference: 0.867s (internal silences preserved) + +Average edge trimming reduction: 5.5% +``` + +### How Duration Task Uses Preprocessed Data + +The `DurationTaskGenerator` loads preprocessed data: + +```python +self.preprocessed_dataset = PreprocessedESC50Dataset( + metadata_csv=config['tasks']['duration']['preprocessed_data_path'] + '/effective_durations.csv', + audio_dir=config['tasks']['duration']['preprocessed_data_path'] + '/trimmed_audio' +) + +# Calculate average effective duration for slot distribution +effective_durations = self.preprocessed_dataset.metadata_df['effective_duration_s'] +self.avg_effective_duration = effective_durations.mean() # ~3.856s +``` + +--- + +## Audio Utilities + +Located in `utils/audio_utils.py`. + +### `generate_single_clip_duration(min_duration, max_duration) → float` + +**Purpose**: Generate a random target clip duration using UNIFORM sampling. 
+ +**Implementation**: +```python +def generate_single_clip_duration(min_duration: float, max_duration: float) -> float: + return random.uniform(min_duration, max_duration) +``` + +**Mathematical Formulation**: +$$d \sim \text{Uniform}(d_{\min}, d_{\max})$$ + +With default values (20s, 60s): +- Mean: $\mu = \frac{20 + 60}{2} = 40$ seconds +- Standard Deviation: $\sigma = \frac{60 - 20}{\sqrt{12}} \approx 11.5$ seconds + +--- + +### `get_max_clip_num_to_be_joined(target_duration_s, source_duration_s, min_silence_ms) → Tuple[int, float]` + +**Purpose**: Calculate maximum number of source clips that can fit in target duration. + +**Returns**: Tuple of (max_clips, remainder_seconds) + +**Implementation** (conceptual): +```python +def get_max_clip_num_to_be_joined(target_s, source_s, min_silence_ms): + silence_s = min_silence_ms / 1000.0 + # Each clip + silence except last + effective_unit = source_s + silence_s + max_clips = int((target_s + silence_s) / effective_unit) + remainder = target_s - (max_clips * source_s + (max_clips - 1) * silence_s) + return max_clips, remainder +``` + +**Mathematical Formula**: +$$N_{\max} = \left\lfloor \frac{T + g}{S + g} \right\rfloor$$ + +Where: +- $T$ = target duration (seconds) +- $S$ = source clip duration (5.0s for ESC-50) +- $g$ = minimum silence gap (seconds) + +--- + +### `build_count_task_audio(source_audios, source_categories, target_duration, ...)` + +**Purpose**: Build the final audio for COUNT task. + +**Parameters**: +- `source_audios`: List of AudioSegment objects (one per category) +- `source_categories`: List of category names +- `target_duration`: Target total duration in seconds +- `ordering_mode`: "random" or "consecutive" +- `source_clip_duration_seconds`: Duration of each source clip +- `min_silence_ms`, `max_extra_silence_per_gap_ms`: Silence parameters + +**Returns**: Tuple of (final_audio, clip_sequence, build_metadata) + +--- + +### `build_duration_task_audio(...)` + +**Purpose**: Build audio for DURATION task with slot distribution. + +--- + +### `build_clip_sequence_with_silences(clips, target_duration_s, min_silence_ms, max_extra_silence_per_gap_ms, crossfade_ms)` + +**Purpose**: Concatenate clips with random silence gaps and smooth crossfades. + +**Algorithm**: +1. Calculate total audio content duration +2. Calculate minimum required silence: `(n_clips - 1) × min_silence_ms` +3. Calculate available extra time: `target_duration - total_audio - min_silence` +4. Distribute extra time randomly across gaps (up to `max_extra_silence_per_gap_ms` per gap) +5. Build sequence with crossfades: + - Audio → Silence: crossfade for smooth transition + - Silence → Audio: No crossfade (preserves audio start) + +**Crossfade Benefits**: +- Smooth transitions between audio and silence +- Reduces clicks/pops at audio boundaries +- Preserves natural sound attack (no crossfade at audio start) + +--- + +## Task: COUNT + +**File**: `tasks/task_count.py` +**Class**: `CountTaskGenerator` + +### Complete Flow + +``` +CountTaskGenerator.__init__(config, logger) + ↓ + Initialize: + - ESC50Dataset (loads metadata, tracks category usage) + - AudioProcessor + - QuestionGenerator + - LLMQuestionGenerator (if enabled) + ↓ +generate_dataset() + ↓ + 1. num_samples = calculate_num_samples_for_task(task_duration_hours, min, max) + 2. Create balanced_answers list from num_clips_per_sample + 3. Shuffle balanced_answers + 4. For each sample: + generate_sample(sample_id, target_unique_count=balanced_answers[i]) + 5. 
Save CSVs +``` + +### Key Method: `generate_sample(sample_id, target_unique_count)` + +**Pipeline**: +1. Generate random target duration: `clip_duration_seconds = generate_single_clip_duration(min, max)` +2. Calculate max clips: `max_clips, remainder = get_max_clip_num_to_be_joined(...)` +3. Cap `n_unique_audios` at min(target_unique_count, max_clips, 50) +4. Select categories: `selected_categories = dataset.get_least_used_categories(n_unique_audios)` +5. Track usage: Increment `category_usage_counts` for each selected category +6. Sample one file per category: `dataset.sample_file_from_category(category)` +7. Load source audios +8. Build final audio: `build_count_task_audio(source_audios, categories, target_duration, ordering_mode, ...)` +9. Export audio file +10. Generate MCQ and open-text questions +11. Return metadata dict + +### Balanced Answer Distribution (Updated with max_clips_per_sample) + +```python +# In generate_dataset() +max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10) # Single number: 10 +possible_answers = list(range(1, max_clips_per_sample + 1)) # [1, 2, 3, ..., 10] + +samples_per_answer = num_samples // len(possible_answers) +remainder = num_samples % len(possible_answers) + +balanced_answers = [] +for answer in possible_answers: + count = samples_per_answer + (1 if remainder > 0 else 0) + balanced_answers.extend([answer] * count) + remainder = max(0, remainder - 1) + +random.shuffle(balanced_answers) +``` + +**For 90 samples, max_clips_per_sample=10**: Each answer (1-10) appears exactly 9 times. + +### Silence Reduction Strategy (NEW) + +Each sample's target answer is capped at what actually fits in the duration: + +```python +# In generate_sample() +max_clips, _ = get_max_clip_num_to_be_joined(clip_duration_seconds, source_clip_duration, min_silence_ms) + +if target_unique_count is not None: + # Cap target at what actually fits (reduces silence) + n_unique_audios = min(target_unique_count, max_clips, len(CATEGORIES)) +``` + +**Example**: +- Target answer from balanced pool: **8 unique sounds** +- Duration allows: **max_clips = 7** +- Actual n_unique_audios: **min(8, 7) = 7** ✓ (uses max possible, reduces silence) + +**Why?** Prevents excessive silence when target exceeds what fits in duration. + +--- + +## Task: DURATION + +**File**: `tasks/task_duration.py` +**Class**: `DurationTaskGenerator` + +### Complete Flow + +``` +DurationTaskGenerator.__init__(config, logger) + ↓ + Initialize: + - PreprocessedESC50Dataset (uses effective_durations.csv) + - Calculate avg_effective_duration from preprocessed data + - AudioProcessor, QuestionGenerator + - Load multiplier_longest, multiplier_shortest from config + ↓ +generate_dataset() + ↓ + 1. num_samples = calculate_num_samples_for_task(...) + 2. Create balanced question types: ["longest"] * 45 + ["shortest"] * 45 + 3. Shuffle balanced_types + 4. While len(samples) < num_samples: + generate_sample(sample_idx, question_type=balanced_types[idx]) + If returns None → increment rejection_count, continue + 5. Save CSVs +``` + +### Key Methods + +#### `_calculate_max_clips_and_sources(target_duration_s, question_type)` + +**Purpose**: Determine valid number of sources based on question type and duration. 
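+
+The exact bounds are spelled out below; as a rough sketch of the selection logic (the function and variable names here are assumptions for illustration, not the actual implementation):
+
+```python
+import random
+
+def sketch_valid_source_count(max_clips: int, question_type: str,
+                              num_sources_config: list) -> int:
+    """Hypothetical reconstruction of the valid source-count range."""
+    if question_type == "longest":
+        # Target takes >= 2 clips; each background takes 1 clip
+        min_valid, max_valid = 2, max_clips - 1
+    else:  # "shortest"
+        # Target takes 1 clip; each background takes >= 2 clips
+        min_valid, max_valid = 2, 1 + (max_clips - 1) // 2
+    # Filter configured source counts to the valid range, then pick randomly
+    valid = [n for n in num_sources_config if min_valid <= n <= max_valid]
+    return random.choice(valid)
+```
+
+(A real implementation would presumably retry or reject the sample when no configured value falls in the valid range.)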
+ +**For LONGEST**: +- Target needs ≥2 clips to beat backgrounds by 1.5x +- `min_valid_sources = 2` +- `max_valid_sources = max_clips - 2 + 1` + +**For SHORTEST**: +- Target gets 1 clip +- Each background needs ≥2 clips to be 2x target +- `max_valid_sources = 1 + (max_clips - 1) // 2` + +```python +# Filter config values to valid range, then pick RANDOMLY +valid_config_sources = [n for n in num_sources_config if min_valid <= n <= max_valid] +n_sources = random.choice(valid_config_sources) +``` + +#### `_try_generate_sample(sample_id, question_type)` + +**Full Algorithm**: +1. Generate target duration: `generate_single_clip_duration(min, max)` +2. Calculate max_clips and n_sources: `_calculate_max_clips_and_sources(...)` +3. Select target category (least used) +4. Select background categories (from remaining least used) +5. Calculate slot distribution based on question_type +6. For each category, select source files and generate clip durations +7. Load and trim clips +8. Calculate total effective duration per category +9. Verify gap constraint +10. If gap not satisfied, try `_try_improve_slot_distribution()` +11. If still not satisfied, return None (triggers retry) +12. Build audio and generate questions +13. Return metadata + +#### `_try_improve_slot_distribution(slot_distribution, durations, question_type, max_clips)` + +**Purpose**: Redistribute slots to satisfy gap constraint. + +--- + +## Task: ORDER + +**File**: `tasks/task_order.py` +**Class**: `OrderTaskGenerator` + +### Complete Flow + +``` +OrderTaskGenerator.__init__(config, logger) + ↓ + Initialize ESC50Dataset, AudioProcessor, QuestionGenerator + ↓ +generate_dataset() + ↓ + 1. Generate sample durations upfront (exact fill) + 2. num_samples = len(sample_durations) + 3. Create balanced question_types distribution + 4. For each sample: + generate_sample(sample_id, target_question_type=balanced_types[i]) + → n_clips randomly selected from [max(2, max_clips-3), min(max_clips, max_clips_per_sample)] + 5. Save CSVs +``` + +### Key Method: `_get_valid_question_types(n_clips)` + +Filters question types based on clip count: +- `second`, `second_last`: require `n_clips >= min_clips_for_second_questions` (default: 4) +- `after`, `before`: require `n_clips >= 2` +- `first`, `last`: always valid + +### Key Method: `generate_sample(sample_id, target_question_type, target_duration_seconds)` + +**Algorithm**: +1. Use pre-generated `target_duration_seconds` (from sample_durations) +2. Calculate max_clips from duration: `get_max_clip_num_to_be_joined(...)` +3. **Silence reduction - randomly select n_clips**: + ```python + min_clips = max(2, max_clips - 3) + max_clips_allowed = min(max_clips, max_clips_per_sample, len(CATEGORIES)) + if min_clips > max_clips_allowed: # Handle edge case + min_clips = max_clips_allowed + n_clips = random.randint(min_clips, max_clips_allowed) + ``` +4. Get valid question types for n_clips +5. Select answer position based on question type: + - `first` → position 0 + - `last` → position n_clips - 1 + - `second` → position 1 + - `second_last` → position n_clips - 2 + - `after` → random position 1 to n-1 + - `before` → random position 0 to n-2 +6. Select categories using least-used balancing (answer first, then others) +7. Build audio with `build_clip_sequence_with_silences` (includes crossfade) +8. Generate questions including sequence question +9. Return metadata + +**Silence Reduction**: Target n_clips is capped at `max_clips` to avoid excessive silence. 
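+
+As a compact illustration of the answer-position rule in step 5 of the algorithm above, a minimal sketch (helper name assumed, not the actual implementation):
+
+```python
+import random
+
+def sketch_answer_position(question_type: str, n_clips: int) -> int:
+    """Hypothetical mapping from question type to the answer clip's index."""
+    if question_type == "first":
+        return 0
+    if question_type == "last":
+        return n_clips - 1
+    if question_type == "second":
+        return 1
+    if question_type == "second_last":
+        return n_clips - 2
+    if question_type == "after":   # answer comes after a reference clip
+        return random.randint(1, n_clips - 1)
+    if question_type == "before":  # answer comes before a reference clip
+        return random.randint(0, n_clips - 2)
+    raise ValueError(f"Unknown question type: {question_type}")
+```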
+
+---
+
+## Task: VOLUME
+
+**File**: `tasks/task_volume.py`
+**Class**: `VolumeTaskGenerator`
+
+### Complete Flow
+
+```
+VolumeTaskGenerator.__init__(config, logger)
+    ↓
+    Initialize ESC50Dataset, AudioProcessor, QuestionGenerator
+    Load multiplier_max_loudness, multiplier_min_loudness, baseline normalization settings
+    ↓
+generate_dataset()
+    ↓
+    1. Generate sample durations upfront (exact fill)
+    2. num_samples = len(sample_durations)
+    3. Create balanced clips_count_pool from 2 to max_clips_per_sample
+    4. Create balanced question_types: ["max_loudness"] * N/2 + ["min_loudness"] * N/2
+    5. Shuffle both pools
+    6. Store clips_count_pool as instance variable
+    7. For each sample:
+       generate_sample(sample_id, target_question_type=balanced_types[i])
+       → Uses clips_count_pool.pop(0) internally, capped at max_clips_that_fit
+       → Normalizes clips to baseline, applies volume adjustments
+       → Verifies gap constraints (up to 10 attempts)
+    8. Save CSVs
+```
+
+### Key Methods
+
+#### `_normalize_to_baseline(audio)`
+
+```python
+def _normalize_to_baseline(self, audio):
+    if not self.normalize_to_baseline:
+        return audio
+    change_in_dBFS = self.baseline_dBFS - audio.dBFS
+    return audio.apply_gain(change_in_dBFS)
+```
+
+#### `_verify_loudness_gap(volume_levels, question_type)`
+
+**For MAX_LOUDNESS**:
+```python
+required_gap_dB = 20 * math.log10(self.multiplier_max_loudness)  # ≈ 12.04 dB for multiplier 4.0
+actual_gap_dB = max_level - second_max
+gap_satisfied = actual_gap_dB >= required_gap_dB
+```
+
+**For MIN_LOUDNESS**:
+```python
+required_gap_dB = abs(20 * math.log10(self.multiplier_min_loudness))  # ≈ 12.04 dB for multiplier 0.25
+actual_gap_dB = second_min - min_level
+gap_satisfied = actual_gap_dB >= required_gap_dB
+```
+
+#### Volume Level Generation
+
+Volume levels are generated to satisfy gap constraints:
+- For `max_loudness`: target gets +gap_dB above baseline, backgrounds at/below baseline
+- For `min_loudness`: target gets -gap_dB below baseline, backgrounds at/above baseline
+
+---
+
+## Deterministic Balancing Mechanisms
+
+### Overview
+
+The pipeline ensures balanced distributions across multiple dimensions with **capacity-aware assignment**.
+
+### 1. Capacity-Aware Answer Balancing (COUNT Task)
+
+Each possible answer (1-10) appears equally often, but **higher targets are assigned to samples with higher capacity**.
+
+```python
+# Calculate capacity for each sample
+for duration in sample_durations:
+    max_clips, _ = get_max_clip_num_to_be_joined(duration, source_clip_duration, min_silence_ms)
+    max_for_sample = min(max_clips, max_clips_per_sample, len(CATEGORIES))
+    sample_max_clips.append(max_for_sample)
+
+# Create balanced pool
+possible_answers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+samples_per_answer = num_samples // len(possible_answers)
+remainder = num_samples % len(possible_answers)
+
+assignment_pool = []
+for answer in possible_answers:
+    count = samples_per_answer + (1 if remainder > 0 else 0)
+    assignment_pool.extend([answer] * count)
+    remainder = max(0, remainder - 1)
+
+# Sort samples by capacity (descending)
+sample_info.sort(key=lambda x: x[2], reverse=True)
+
+# Sort pool descending - assign high targets first
+assignment_pool.sort(reverse=True)
+
+# Assign targets, clamped to capacity
+for idx, (sample_idx, duration, capacity) in enumerate(sample_info):
+    target = min(assignment_pool[idx], capacity)
+    balanced_assignments[sample_idx] = target
+```
+
+**Guarantee**: Each answer value appears equally, and high targets go to samples that can fit them.
+
+### 2. 
Capacity-Aware Question Type Balancing (ORDER Task)
+
+ORDER task uses **capacity-aware balancing** - advanced question types assigned to high-capacity samples.
+
+```python
+# Separate question types by requirements
+basic_types = ['first', 'last', 'after', 'before']  # Need >= 2 clips
+advanced_types = ['second', 'second_last']  # Need >= min_clips_for_second (e.g., 3)
+
+# Sort samples by capacity (descending)
+sample_info.sort(key=lambda x: x[2], reverse=True)
+
+# Build assignment pool - advanced types first
+samples_per_type = num_samples // len(question_types)
+remainder = num_samples % len(question_types)
+
+assignment_pool = []
+# Add advanced types first (for high-capacity samples)
+for qtype in advanced_types:
+    count = samples_per_type + (1 if remainder > 0 else 0)
+    assignment_pool.extend([qtype] * count)
+    remainder = max(0, remainder - 1)
+
+# Then basic types
+for qtype in basic_types:
+    count = samples_per_type + (1 if remainder > 0 else 0)
+    assignment_pool.extend([qtype] * count)
+    remainder = max(0, remainder - 1)
+
+# Assign with validation
+for idx, (sample_idx, duration, capacity) in enumerate(sample_info):
+    target_qtype = assignment_pool[idx]
+    valid_types = _get_valid_question_types(capacity)
+
+    if target_qtype not in valid_types:
+        # Downgrade to valid type
+        target_qtype = random.choice(valid_types)
+
+    balanced_assignments[sample_idx] = target_qtype
+```
+
+### 3. Simple Question Type Balancing (DURATION, VOLUME Tasks)
+
+```python
+# DURATION: 2 types → N/2 each
+# VOLUME: 2 types → N/2 each
+
+samples_per_type = num_samples // len(question_types)
+remainder = num_samples % len(question_types)
+
+balanced_types = []
+for qtype in question_types:
+    count = samples_per_type + (1 if remainder > 0 else 0)
+    balanced_types.extend([qtype] * count)
+    remainder = max(0, remainder - 1)
+
+random.shuffle(balanced_types)
+```
+
+### 4. Category Usage Balancing
+
+All 50 ESC-50 categories are used equally via least-used selection:
+
+```python
+def get_least_used_categories(self, n: int, exclude: List[str] = None) -> List[str]:
+    # Sort categories by usage count
+    sorted_cats = sorted(
+        self.category_usage_counts.items(),
+        key=lambda x: (x[1], x[0])  # Sort by count, then alphabetically for ties
+    )
+    # Filter excluded and return first n
+    available = [cat for cat, _ in sorted_cats if cat not in (exclude or [])]
+    return available[:n]
+```
+
+Each task calls `reset_category_usage()` at the start to ensure independent balancing.
+
+### 5. N_Clips Selection Strategy
+
+**COUNT Task**: Uses capacity-aware answer balancing (see #1 above)
+
+**ORDER and VOLUME Tasks**: Use **silence reduction strategy** (NOT balanced):
+```python
+# Randomly sample n_clips from valid range to minimize silence
+min_clips = max(2, max_clips - 3)
+max_clips_allowed = min(max_clips, max_clips_per_sample, len(CATEGORIES))
+
+if min_clips > max_clips_allowed:
+    min_clips = max_clips_allowed  # Handle edge case
+
+n_clips = random.randint(min_clips, max_clips_allowed)
+```
+
+This maximizes clip usage within the allowed range, minimizing excessive silence.
+
+---
+
+## Rejection Logic and Retry Mechanisms
+
+### When Samples Are Rejected
+
+Rejections occur only in tasks with gap constraints:
+
+1. **DURATION Task**: Gap constraint not satisfied
+   - LONGEST: target_duration < max_background × 1.5
+   - SHORTEST: target_duration > min_background × 0.75
+
+2. 
**VOLUME Task**: Gap constraint not satisfied
+   - MAX_LOUDNESS: actual_gap_dB < required_gap_dB (≈ 12.04 dB for multiplier 4.0)
+   - MIN_LOUDNESS: actual_gap_dB < required_gap_dB (≈ 12.04 dB for multiplier 0.25)
+
+### DURATION Task Retry Logic
+
+```python
+def generate_dataset(self):
+    all_metadata = []
+    sample_idx = 0
+    type_idx = 0
+
+    while len(all_metadata) < num_samples and type_idx < len(balanced_types) * 2:
+        question_type = balanced_types[type_idx % len(balanced_types)]
+
+        metadata = self.generate_sample(sample_idx, question_type)
+
+        if metadata is not None:
+            all_metadata.append(metadata)
+            sample_idx += 1
+        # If None, sample was rejected - just move to next
+
+        type_idx += 1
+```
+
+### Rejection Rate Calculation
+
+$$\text{Rejection Rate} = \frac{\text{rejections}}{\text{rejections} + \text{successes}} \times 100\%$$
+
+---
+
+## Complete Task Creation Explanation
+
+### How Each Task Is Generated (Step-by-Step)
+
+#### COUNT TASK - "How many unique sounds?"
+
+**Goal**: Create audio with N unique sound sources, ask how many distinct sounds exist.
+
+**Process**:
+1. **Preprocessing**: None (uses raw ESC-50 clips)
+2. **Duration Generation**: `target_duration ~ Uniform(20s, 60s)` per sample
+3. **Calculate Max Clips**: `max_clips = get_max_clip_num_to_be_joined(target_duration, 5s, 100ms)`
+   - Example: 45s duration → ~8 clips of 5s each with 100ms silence between
+4. **Balanced Answer Selection**: Pre-generated pool of answers [1,2,3,...,10] balanced equally
+   - Target answer (e.g., 5 unique sounds) selected from pool
+5. **Silence Reduction**: Cap target at `min(target_answer, max_clips)`
+   - If target=8 but max_clips=6 → use 6 (prevents excessive silence)
+6. **Category Selection**: Pick N least-used categories from ESC-50 (balancing)
+7. **Audio Construction**:
+   - Load one file per category
+   - Calculate repetitions needed: `total_clips = max_clips`
+   - Distribute repetitions across N sources
+   - **Ordering mode**:
+     - `random`: Shuffle clips (A B A C B...) - harder, tests recognition
+     - `consecutive`: Group same-source (AAA BBB CCC) - easier
+8. **Silence Insertion**:
+   - Minimum 100ms silence between EVERY clip
+   - Extra silence (up to 500ms per gap) distributed from remainder
+   - **Crossfade**: 50ms within same-source, 500ms at audio-silence boundaries
+9. **Question Generation**: MCQ + open-text asking "How many unique sounds?"
+10. **Export**: Save audio WAV + metadata
+
+**Example**:
+- Target duration: 40s
+- Max clips that fit: 7 clips (7×5s + 6×0.1s = 35.6s)
+- Target answer: 3 unique sounds
+- Actual: 3 unique sounds (7 total clips: 3+2+2 repetitions)
+- Ordering: Random shuffle → [A B A C B A C]
+- Result: Audio with 3 distinct sounds, some repeated, with silences and crossfades
+
+#### DURATION TASK - "Which sound is longest/shortest?"
+
+**Goal**: Create audio where one sound has clearly longest/shortest duration compared to others.
+
+**Process**:
+1. **Preprocessing** (preprocess_esc50.py - REQUIRED):
+   - Load raw ESC-50 clips
+   - Detect sound regions using adaptive noise-floor thresholding
+   - Trim leading/trailing silence (preserve internal structure)
+   - Calculate effective duration per clip
+   - Save trimmed audio + effective_durations.csv
+2. **Duration Generation**: `target_duration ~ Uniform(20s, 60s)` per sample
+3. **Calculate Max Clips**: Based on average effective duration (~3.86s)
+4. 
**Determine N Sources**: Based on question type and max_clips + - **LONGEST**: Target needs ≥2 clips, backgrounds get 1 each → `n_sources ≤ max_clips - 1` + - **SHORTEST**: Target gets 1 clip, backgrounds need ≥2 each → `n_sources ≤ 1 + (max_clips-1)//2` +5. **Category Selection**: Pick target + backgrounds from least-used categories +6. **Slot Distribution**: Allocate clips to each source + - LONGEST: Give most clips to target, 1 to each background + - SHORTEST: Give 1 to target, multiple to each background +7. **Clip Selection**: For each source, select clips from preprocessed dataset +8. **Gap Verification**: + - LONGEST: `target_duration ≥ max_background × 1.5` ✓ + - SHORTEST: `target_duration ≤ min_background × 0.75` ✓ + - If gap not satisfied: Try redistributing slots, or reject sample +9. **Audio Construction**: + - Load trimmed clips + - Concatenate with consecutive ordering (preserve periodicity) + - Insert silences with crossfades +10. **Question Generation**: "Which sound is longest/shortest?" +11. **Export**: Audio + metadata + +**Example**: +- Question type: LONGEST +- Target duration: 50s, max_clips: 12 +- N sources: 4 (target + 3 backgrounds) +- Slot distribution: Target=6 clips (6×3.8s=22.8s), Backgrounds=2 clips each (2×3.8s=7.6s) +- Gap check: 22.8s ≥ 7.6s × 1.5 = 11.4s ✓ +- Result: Target sound clearly longest + +#### ORDER TASK - "Which sound is first/last/after X?" + +**Goal**: Create ordered sequence of sounds, ask about temporal relationships. + +**Process**: +1. **Preprocessing**: None (uses raw ESC-50) +2. **Duration Generation**: Pre-generated durations to exactly fill task duration +3. **Calculate Max Clips**: `get_max_clip_num_to_be_joined(target_duration, 5s, 100ms)` +4. **Balanced N_Clips Selection**: Pre-generated pool [2,3,4,...,10] balanced equally + - Target n_clips (e.g., 5) selected from pool + - Capped at `min(target_n_clips, max_clips)` (silence reduction) +5. **Question Type Selection**: From balanced pool (first, last, second, after, before, second_last) +6. **Answer Position Determination**: Based on question type + - `first` → position 0 + - `last` → position n_clips-1 + - `second` → position 1 + - `second_last` → position n_clips-2 + - `after`/`before` → random valid position +7. **Category Selection**: Answer category at determined position, others from least-used +8. **Audio Construction**: + - Load one clip per position + - Build sequence with silences (min 100ms + random extra up to 500ms per gap) + - **Crossfade**: 500ms at audio-silence boundaries for smooth transitions +9. **Question Generation**: + - MCQ: "Which sound is first?" with 4 options + - Open-text: "What is the first sound?" + full sequence +10. **Export**: Audio + metadata + +**Example**: +- Target n_clips: 4, max_clips: 8 → use 4 ✓ +- Question: "Which sound is second?" +- Answer position: 1 (0-indexed) +- Sequence: [dog, cat, bird, rain] → Answer: cat +- Audio: 4 clips in order with silences and crossfades + +#### VOLUME TASK - "Which sound is loudest/softest?" + +**Goal**: Create audio with clips at different volume levels, ask about loudness comparison. + +**Process**: +1. **Preprocessing**: None (uses raw ESC-50) +2. **Duration Generation**: Pre-generated durations +3. **Calculate Max Clips**: `get_max_clip_num_to_be_joined(...)` +4. **Balanced N_Clips Selection**: From pool [2,3,...,10], capped at max_clips +5. **Question Type Selection**: "max_loudness" or "min_loudness" (balanced 50/50) +6. 
**Volume Level Generation**: Create n_clips volume adjustments (in dB) + - Ensure gap constraint (multiplier 4.0 for max, 0.25 for min) + - Example: [+12dB, 0dB, -6dB] → max at +12dB has ≥12dB gap from second +7. **Gap Verification** (up to 10 attempts): + - MAX: `max_level - second_max ≥ 20×log10(4.0) ≈ 12dB` + - MIN: `second_min - min_level ≥ 20×log10(4.0) ≈ 12dB` + - If not satisfied: Regenerate levels or reject +8. **Category Selection**: Answer at determined position, others from least-used +9. **Audio Construction**: + - Load clips + - **CRITICAL: Normalize all to baseline (-20 dBFS)** → ensures controlled comparison + - Apply volume adjustments to normalized clips + - Concatenate with silences and crossfades +10. **Question Generation**: "Which sound has maximum/minimum loudness?" +11. **Export**: Audio + metadata with volume levels + +**Example**: +- Target n_clips: 3, max_clips: 6 → use 3 ✓ +- Question: "max_loudness" +- Volume levels: [+12dB, 0dB, -6dB] +- Gap check: 12 - 0 = 12dB ≥ 12dB ✓ +- Process: Normalize all clips to -20dBFS, then adjust to [-8dBFS, -20dBFS, -26dBFS] +- Result: First sound clearly loudest + +### Key Innovations + +1. **Crossfade Everywhere**: Smooth transitions at audio-silence boundaries (500ms), small crossfade within same-source repetitions (50ms) +2. **Adaptive Preprocessing**: Noise-floor thresholding adapts per-clip (duration task) +3. **Silence Reduction**: ORDER/VOLUME tasks sample n_clips from [max_clips-3, max_clips_per_sample] to minimize silence +4. **Balanced Distribution**: + - **COUNT**: Balances answers (1 to max_clips_per_sample) + question types + - **ORDER/VOLUME**: Balances question types only (n_clips uses silence reduction) +5. **Category Balancing**: Least-used selection ensures all 50 ESC-50 categories used evenly +6. **Gap Constraints**: Mathematical guarantees for duration/volume comparisons +7. 
**Exact Duration Filling**: Pre-generate sample durations to exactly fill task duration (no wasted time) + +--- + +## Command-Line Arguments + +### Main Pipeline (`main.py`) + +```bash +python main.py [OPTIONS] + +Options: + --config, -c PATH Path to config YAML (default: config.yaml) + --tasks, -t TASKS Specific tasks to run (choices: count, duration, order, volume) + --output, -o PATH Custom output directory (overrides config) + +Examples: + # Run all enabled tasks with default config + python main.py + + # Run specific tasks only + python main.py --tasks count order + + # Use custom config and output + python main.py --config my_config.yaml --output ./my_dataset +``` + +### Preprocessing Script (`preprocess_esc50.py`) + +```bash +python preprocess_esc50.py [OPTIONS] + +Options: + --config PATH Path to config YAML (default: config.yaml) + --threshold-strategy STRATEGY "noise_floor" or "peak_relative" + --threshold-db FLOAT Threshold in dB (for peak_relative) + --noise-floor-percentile FLOAT Percentile for noise floor estimation + --noise-floor-delta-db FLOAT Delta above noise floor in dB + --min-sound-ms INT Minimum sound duration in ms + --no-trimmed-audio Skip saving trimmed audio files + --output-dir PATH Custom output directory + +Examples: + # Use config defaults + python preprocess_esc50.py --config config.yaml + + # Override threshold parameters + python preprocess_esc50.py --config config.yaml \ + --threshold-strategy noise_floor \ + --noise-floor-percentile 2.0 \ + --noise-floor-delta-db 5.0 \ + --min-sound-ms 25 + + # Generate metadata only (no trimmed audio) + python preprocess_esc50.py --config config.yaml --no-trimmed-audio +``` + +--- + +## Summary + +The TREA 2.0 pipeline generates balanced, constraint-satisfying audio QA samples through: + +1. **Preprocessing** (Duration only): Adaptive noise-floor thresholding + edge trimming +2. **Exact Duration Filling**: Pre-generate sample durations to sum exactly to task duration +3. **Capacity-Aware Balancing**: + - **COUNT**: High answer targets → high-capacity samples + - **ORDER**: Advanced question types → high-capacity samples +4. **Silence Reduction**: ORDER/VOLUME randomly sample n_clips from [max_clips-3, max_clips_per_sample] +5. **Crossfade Transitions**: Smooth audio-silence boundaries (500ms) + within-source (50ms) +6. **Category Balancing**: Least-used selection ensures even ESC-50 category distribution +7. **Gap Constraints**: Mathematical guarantees (1.5x for longest, 0.75x for shortest, 4.0x/0.25x for volume) +8. **Retry Mechanisms**: Failed samples rejected, pipeline continues until target count reached + +All randomness is seeded (`random_seed: 42`) for reproducibility. diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..74fa7906eef5c3d2457e753465e8c52676afb937 --- /dev/null +++ b/README.md @@ -0,0 +1,112 @@ +# TREA 2.0 Pipeline + +Audio question-answering dataset generator using ESC-50. Creates four task types: COUNT, DURATION, ORDER, and VOLUME. + +## Quick Start + +```bash +# 1. Install dependencies +pip install -r requirements.txt + +# 2. Preprocess ESC-50 (required for DURATION task only) +python preprocess_esc50.py --config config.yaml + +# 3. 
Generate datasets +python main.py --config config.yaml +``` + +## Configuration + +Edit `config.yaml` to set: +- **Task duration**: `task_duration_size` (hours) per task +- **Clip duration range**: `min_clip_duration` to `max_clip_duration` (seconds) +- **ESC-50 paths**: Point to your ESC-50 dataset location +- **Enable/disable tasks**: Set `enabled: true/false` for each task + +## Key Files + +- **`config.yaml`** - All configuration parameters +- **`main.py`** - Pipeline entry point (runs all tasks) +- **`preprocess_esc50.py`** - Preprocess ESC-50 for duration task +- **`tasks/task_*.py`** - Individual task generators + +## Tasks + +| Task | Question | Example | +|------|----------|---------| +| **COUNT** | "How many unique sounds?" | Audio with 5 distinct sound types | +| **DURATION** | "Which sound is longest/shortest?" | Compare sound durations | +| **ORDER** | "Which sound is first/last/after X?" | Temporal sequence questions | +| **VOLUME** | "Which sound is loudest/softest?" | Loudness comparison | + +## Output Structure + +``` +output/{task}/ +├── audios/*.wav # Generated audio files +├── {task}_mcq.csv # Multiple choice questions +├── {task}_open_text.csv # Open-ended questions +└── {task}_metadata.csv # Detailed metadata +``` + +## Shell scripts (quick) + +Use the provided shell helpers for simple runs. + +Run full pipeline (uses `python main.py` under the hood): + +```bash +# Make executable and run (from pipeline/) +./run_pipeline.sh + +# With custom config, tasks, and output +./run_pipeline.sh --config my_config.yaml --tasks count,order --output ./my_dataset +``` + +Run the LLM answer generation across splits (uses `llm_answer_generator.py`): + +```bash +# Processes open_text CSVs across splits/tasks defined in the script +./run_llm_answers_all.sh + +# Or run per-file with the helper script directly +python llm_answer_generator.py --input /path/to/count_open_text.csv --mode open_text --task count +``` + + +## Advanced Usage + +```bash +# Run specific tasks only +python main.py --tasks count order + +# Use custom config +python main.py --config my_config.yaml + +# Custom output directory +python main.py --output /path/to/output + +# Preprocess with custom parameters +python preprocess_esc50.py --config config.yaml \ + --threshold-strategy noise_floor \ + --noise-floor-percentile 2.0 \ + --noise-floor-delta-db 5.0 +``` + +## Documentation + +See **`DOCS.md`** for complete technical documentation including: +- Mathematical formulations +- Detailed algorithm explanations +- Configuration parameter reference +- Preprocessing pipeline details +- Balancing mechanisms + +## Requirements + +- Python 3.8+ +- pydub +- numpy +- pandas +- tqdm +- pyyaml diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1ed33629ef32a7e96074f8a9aa7156c1bec2b30b --- /dev/null +++ b/config.yaml @@ -0,0 +1,348 @@ +# Temporal Reasoning Audio Dataset Pipeline Configuration +##uniform distributuon for clip duration +##not mixing datasets + +##count +##pick dataset -> pick class -> pick audio clip -> get duration -> concatenate clips to reach target duration -> modulo to get num clips -> inserting silences randomly based on remainder + +##duration +##amplitude based filtering -> normalize -> threshold based selection +##gap between audio clips - x2/1.5 the shorter one -> add as param +##different clips of the same class can be contatenated to reach target duration +##consecutive ordering only +##based on n unique sources and total clips we can have -> 
shortest and longest duration calculation + +##reject datapoint if same target audio clip cannot be repeated to maintain the gap - arg +##sample different clip from the same class -> check if different clips can be used to fill the gap - arg + +##amplitude filtered durations in metadata csv + +##get_max_clip_num_to_be_joined() +##pick dataset -> pick class -> pick audio clip -> get duration -> concatenate clips to reach target duration -> modulo to get num clips -> inserting silences randomly based on remainder + +##ensure_silence_between_clips() +##silence should always be there between two clips + +##order +##repeat target clips +##second and second last - modify question types + +##volume +##amplitude average loudness for a audio clip -> repetitions but same clip(argument) -> different volume levels based on dB levels + +##add crossfade + +##trimming - threshold separately for each audio clip - normalize 1 and 0 - get threshold -> trim -> concatenate +##leftmost and rightmost silence trimming +##buffer for trimming - cutting early and cutting a bit late to avoid cutting important parts +##periodicity affect + +##volume - trim and get average loudness -> normalize -> adjust volume levels + +##number of clips per samples to avoid silence + + +# ESC-50 Dataset paths (each clip is 5 seconds) +esc50: + audio_path: "/path/to/ESC-50_github/audio" + metadata_path: "/path/to/ESC-50_github/meta/esc50.csv" + +# Synthetic silence audio for concatenation +synthetic_silence: + path: "/path/to/synthetic_silences" + +# Output configuration +output: + base_path: "/path/to/pipeline/test_ood" +# Dataset class-subset configuration +# Use this to create datasets (train/val/test) from a persistent subset +# of classes (e.g. use 40 of 50 classes for in-distribution splits and +# optionally create an OOD test set using all 50 classes). +dataset: + use_class_subset: false # if false, use all available classes + num_classes_subset: 40 # number of classes to use for train/val/test + subset_persist_path: "/path/to/class_subset.json" + subset_seed: 42 # RNG seed when sampling the subset (persisted) + +# Audio generation parameters +audio: + # Duration range for each GENERATED clip (in seconds) + # Original ESC-50 clips are 5s and will be concatenated to create clips in this range + min_clip_duration: 20.0 # Minimum duration for each generated clip + max_clip_duration: 60.0 # Maximum duration for each generated clip + + # Crossfade and silence + crossfade_duration: 500 # Crossfade between audio and silence (milliseconds) for smooth transitions + silence_duration: 1000 # Default silence between clips (milliseconds) + min_silence_duration: 100 # Minimum silence ALWAYS inserted between clips (milliseconds) + max_extra_silence_per_gap: 500 # Maximum extra silence per gap when distributing remainder + crossfade_within_source: 50 # Small crossfade within same-source repetitions (count task) + with_silence: true # Add silence between clips + # Duration (seconds) of individual source clips (ESC-50 are 5s by default). + # Used to compute how many source clips are concatenated to reach a target + # generated clip duration. Change only if your source clips differ. 
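+  # Illustrative arithmetic (not a setting): with a 45 s target, 5 s sources,
+  # and a 100 ms minimum gap, floor((45 + 0.1) / (5 + 0.1)) = 8 source clips fit.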
+ source_clip_duration: 5.0 + + # Audio normalization + normalize: false + normalize_target_dBFS: -20.0 + +# Random seed for reproducibility +random_seed: 42 + +# LLM for question generation (local Llama 3.1 8B) +llm: + enabled: false # Set to true to use LLM for question generation + +# Task-specific configurations +tasks: + count: + enabled: true + # Total duration for ALL samples in this task combined (in hours) + # Pipeline will calculate number of samples based on min/max clip durations + task_duration_size: 2.0 # hours + + # Maximum unique sound sources per sample (single number) + # Actual number will be subsampled from max(1, max_clips-3) to min(max_clips, max_clips_per_sample) + max_clips_per_sample: 10 + + # Ordering mode for repeated clips of same source: + # "random": Clips are shuffled randomly (A B A C B A C...) - tests recognition of recurring sounds + # "consecutive": Same-source clips grouped together (AAA BBB CCC) - easier, just count blocks + ordering_mode: "random" + + # Question templates for MCQ + mcq_questions: + - "What is the number of distinct sound sources in the audio file?" + - "How many different types of sounds can be identified in this recording?" + - "How many unique types of sound are present in this audio?" + - "Identify the count of different sound sources in this clip." + - "What is the total number of unique sounds heard in this audio?" + - "How many distinct sound categories are there in this audio file?" + - "Determine the number of unique sound sources in this recording." + - "How many separate sound sources are included in the audio?" + - "What is the total number of unique sound types in this audio?" + - "How many different sound sources can be heard in this clip?" + # Question templates for open-text + open_text_questions: + - "How many distinct sound sources are present in the audio?" + - "Count the number of unique sounds in this recording." + - "What is the total count of different sound categories heard?" + - "Identify and count all unique sound types in the clip." + + duration: + enabled: true + # Total duration for ALL samples in this task combined (in hours) + task_duration_size: 2.0 # hours + + # Number of unique sound sources per sample (can be single int or list) + # Single int (e.g., 15): randomly samples from 1 to 15 (like count/order tasks) + # List (e.g., [2,3,4]): randomly picks from the list + # The script will automatically generate repetition patterns to create + # shortest/longest variations based on the target clip duration + num_unique_sources: 10 + + # Ordering: only keep "consecutive" so repeated segments of the same + # source remain grouped together, ensuring that multiple consecutive + # clips of the same audio yield the longest duration unambiguously. 
+    ordering_methods: ["consecutive"]
+
+    # =====================================================
+    # Amplitude-based filtering parameters (preprocessing)
+    # =====================================================
+    # RELATIVE dB threshold below peak to consider as silence
+    # For each clip: silence_threshold = clip_peak_dB + amplitude_threshold_db
+    # Example: If clip peak is -5 dB and threshold is -20, silence threshold = -25 dB
+    # Based on ESC-50 analysis: -20 dB gives ~60% effective duration (good balance)
+    # More aggressive (removes more silence): -15 dB
+    # More conservative (keeps more sound): -25 dB
+    amplitude_threshold_db: -20.0
+
+    # Minimum duration of sound region to keep (milliseconds)
+    # Filters out very short transient noise spikes
+    # ESC-50 is curated, so 20-30ms is sufficient
+    min_sound_duration_ms: 25
+
+    # =====================================================
+    # Adaptive threshold strategy
+    # =====================================================
+    # "peak_relative": threshold = peak_dB + amplitude_threshold_db (fixed offset from peak)
+    #   - Simple but not adaptive to actual noise levels
+    # "noise_floor": threshold = percentile(dB, N) + delta_dB (RECOMMENDED)
+    #   - Fully adaptive per-clip based on its own noise floor
+    #   - Each clip analyzed independently - no fixed dB values needed
+    #   - Better for diverse audio with varying noise levels
+    threshold_strategy: "noise_floor"
+
+    # Noise floor estimation percentile (used when threshold_strategy = noise_floor)
+    # Lower percentile = more conservative estimate of background noise
+    # 2 = use the 2nd percentile of dB values as the noise floor estimate (better for sparse sounds)
+    noise_floor_percentile: 2.0
+
+    # Delta above noise floor (dB) to set as threshold
+    # This is relative to EACH clip's own noise floor, not a fixed dB value
+    # A delta of 5-8 dB above the clip's noise floor works well for most ESC-50 clips
+    # Higher = more conservative (keeps more), Lower = more aggressive (removes more)
+    noise_floor_delta_db: 5.0
+
+    # Path to preprocessed ESC-50 data (effective durations + trimmed audio)
+    preprocessed_data_path: "/path/to/ESC-50_preprocessed"
+
+    # =====================================================
+    # Duration gap multipliers
+    # =====================================================
+    # For LONGEST questions: target_effective >= max_background × multiplier_longest
+    multiplier_longest: 1.5
+    # For SHORTEST questions: target_effective <= min_background × multiplier_shortest
+    # Using 0.75 instead of 0.5 keeps the required gap smaller, so fewer samples are rejected
+    # (a worked example appears at the end of this task section)
+    multiplier_shortest: 0.75
+
+    # Minimum effective duration per source (seconds)
+    # Clips with less than this duration are harder to distinguish
+    min_effective_duration_per_source: 1.0
+
+    # =====================================================
+    # Fallback/rejection options
+    # =====================================================
+    # Reject sample if duration gap cannot be satisfied
+    reject_if_gap_not_met: true
+    # Try different clips from same class if one clip isn't enough
+    sample_different_clips_same_class: true
+
+    # Question types
+    question_types: ["shortest", "longest"]
+    # MCQ questions
+    mcq_questions:
+      shortest: "Which of the following sounds is heard for the shortest duration?"
+      longest: "Which of the following sounds is heard for the longest duration?"
+    # Open-text questions
+    open_text_questions:
+      shortest: "Which sound is heard for the shortest duration in the audio?"
+      longest: "Which sound is heard for the longest duration in the audio?"
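+
+    # Worked example of the gap rules above (illustrative durations): with
+    # background sources of 4 s and 8 s of effective sound, a "longest" target
+    # needs >= 8 x 1.5 = 12 s, and a "shortest" target needs <= 4 x 0.75 = 3 s
+    # of effective sound.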
+
+  order:
+    enabled: true
+    # Total duration for ALL samples in this task combined (in hours)
+    task_duration_size: 2.0  # hours
+
+    # Maximum clips to join per sample (minimum 2 for ordering)
+    # The actual number per sample is sampled between max(2, max_clips_per_sample - 3) and max_clips_per_sample
+    max_clips_per_sample: 10
+
+    # Whether to allow repeating clips from the same source category
+    # If true: sequence could be [dog, dog, cat, bird] (same clip repeated)
+    # If false: sequence is always unique sources
+    allow_source_repetition: false
+
+    # Minimum clips needed for "second" and "second_last" questions
+    # Set this to 4 if "second" and "second_last" must refer to different positions
+    # (with 3 clips, both refer to the middle clip at position 1)
+    min_clips_for_second_questions: 3
+
+    # Question types: "first", "last", "after", "before", "second", "second_last"
+    # "second" and "second_last" only generated when n_clips >= min_clips_for_second_questions
+    question_types: ["first", "last", "after", "before", "second", "second_last"]
+
+    # MCQ question templates
+    mcq_questions:
+      first: "Which sound appears first in the audio clip?"
+      last: "Which sound appears last in the audio clip?"
+      after: "Which sound comes after {sound1}?"
+      before: "Which sound comes before {sound2}?"
+      second: "Which sound appears second in the audio clip?"
+      second_last: "Which sound appears second to last in the audio clip?"
+    # Open-text question templates
+    open_text_questions:
+      first: "What is the first sound you hear in the audio?"
+      last: "What is the last sound you hear in the audio?"
+      after: "What sound comes after {sound1}?"
+      before: "What sound comes before {sound2}?"
+      second: "What is the second sound you hear in the audio?"
+      second_last: "What sound is second to last in the audio?"
+      sequence: "List the sounds in the order they appear in the audio."
+
+  volume:
+    enabled: true
+    # Total duration for ALL samples in this task combined (in hours)
+    task_duration_size: 2.0  # hours
+
+    # Maximum clips with different volumes per sample
+    # The actual number per sample is sampled between max(2, max_clips_per_sample - 3) and max_clips_per_sample
+    max_clips_per_sample: 10
+
+    # =====================================================
+    # Normalization settings (CRITICAL for volume comparison)
+    # =====================================================
+    # All clips are FIRST normalized to baseline, THEN volume adjusted
+    # This ensures volume differences are controlled and comparable
+    normalize_to_baseline: true
+    baseline_dBFS: -20.0  # Normalize all clips to this level first (used if use_lufs=false)
+
+    # =====================================================
+    # LUFS (Perceived Loudness) Settings
+    # =====================================================
+    # LUFS (Loudness Units Full Scale) measures PERCEIVED loudness
+    # Unlike dBFS, which only measures RMS amplitude, LUFS accounts for
+    # human hearing sensitivity to different frequencies (K-weighting)
+    #
+    # IMPORTANT: For the volume comparison task, we DISABLE LUFS normalization!
+    # LUFS makes everything the same perceived loudness, defeating the purpose.
+    # Instead, we normalize to a baseline dBFS and then apply LARGE volume adjustments.
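+    # Rule of thumb: a linear gain ratio r corresponds to 20*log10(r) dB, so
+    # 2x ~ +6 dB, 2.5x ~ +8 dB, and 4x ~ +12 dB (see the multipliers below).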
+    use_lufs: false        # DISABLED for audible volume differences
+    baseline_lufs: -23.0   # EBU R128 standard (not used when use_lufs=false)
+
+    # =====================================================
+    # Volume gap multipliers (similar to duration task)
+    # =====================================================
+    # For MAX_LOUDNESS questions: target_loudness >= second_loudest × multiplier_max
+    # Multiplier 2.5 = ~8dB difference = clearly audible
+    # Multiplier 4.0 = ~12dB difference = very obvious (4x amplitude ratio)
+    multiplier_max_loudness: 4.0
+
+    # For MIN_LOUDNESS questions: target_loudness <= second_softest × multiplier_min
+    # Multiplier 0.25 = ~12dB quieter = clearly distinguishable
+    multiplier_min_loudness: 0.25
+
+    # Reject sample if loudness gap cannot be satisfied
+    reject_if_gap_not_met: true
+
+    # =====================================================
+    # Source clip options
+    # =====================================================
+    # If true: same clip can be repeated at different volumes
+    # If false: always use different source clips (default behavior)
+    use_same_clip_different_volumes: false
+
+    # If use_same_clip_different_volumes is true, how many repetitions per source?
+    # Can be a single int or list for variety
+    repetitions_per_source: [2, 3, 4]
+
+    # Question types: "max_loudness", "min_loudness"
+    question_types: ["max_loudness", "min_loudness"]
+
+    # MCQ questions
+    mcq_questions:
+      max_loudness: "Which sound has the maximum loudness in the audio?"
+      min_loudness: "Which sound has the minimum loudness in the audio?"
+    # Open-text questions
+    open_text_questions:
+      max_loudness: "Identify the sound with maximum loudness in the audio clip."
+      min_loudness: "Identify the sound with minimum loudness in the audio clip."
+      order_volume: "List the sounds in order from maximum to minimum loudness."
+
+# MCQ options configuration
+mcq:
+  num_options: 4
+  option_labels: ["A", "B", "C", "D"]
+  # Strategy for generating distractor options
+  # "present_only": only use sounds present in audio
+  # "mixed": mix of present and absent sounds
+  # "balanced": balanced distribution
+  distractor_strategy: "balanced"
+
+# Logging configuration
+logging:
+  level: "INFO"  # DEBUG, INFO, WARNING, ERROR, CRITICAL
+  log_file: "pipeline.log"
+  console_output: true
+
\ No newline at end of file
diff --git a/llm_answer_generator.py b/llm_answer_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8e15e356645d3908ed5538e3a1b403a51577de6
--- /dev/null
+++ b/llm_answer_generator.py
@@ -0,0 +1,268 @@
+import pandas as pd
+import argparse
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+import random
+
+# Convert MCQ CSV to NL answers using a text-only LLM (meta-llama/Llama-3.1-8B-Instruct)
+# Adds: (1) stronger LLM-driven variability for duration/volume in open_text mode via system prompt
+#       (2) --one_word_ratio (default 0.2) to skip the forward pass for a fraction of rows,
+#           outputting the normalized (underscore-removed) answer only.
+
+
+def convert_to_natural_phrase(val):
+    """Convert underscore-separated tokens to natural phrases."""
+    if isinstance(val, str) and "_" in val:
+        val = val.replace("_", " ")
+    return val
+
+
+def generate_answer(tokenizer, model, question, correct_value, device, mode="mcq"):
+    """Generate a natural language answer using a text-only LLM.
+
+    mode: "mcq" (default) uses the original MCQ-oriented prompt.
+          "open_text" uses a direct rewrite prompt for provided question/answer pairs.
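+
+    Example (illustrative only; actual output is sampled and model-dependent):
+
+        >>> generate_answer(tokenizer, model,
+        ...                 "Which sound appears first in the audio clip?",
+        ...                 "dog_bark", model.device, mode="mcq")
+        'The first sound you hear is a dog barking.'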
+ """ + correct_value = convert_to_natural_phrase(correct_value) + + if mode == "open_text": + system_preamble = ( + "You convert (Question, short Answer) into EXACTLY ONE natural English sentence that answers the Question.\n\n" + "HARD RULES:\n" + "- Output exactly ONE sentence. No newlines, no bullet points, no labels, no quotes.\n" + "- Use ONLY the provided Answer content as the factual answer; do not add any new facts.\n" + "- Be concise and direct.\n" + "- Do NOT include any numbers unless the question is a COUNT question.\n" + "- Vary phrasing strongly across items; avoid repeating the same structure.\n\n" + "VARIABILITY REQUIREMENT (IMPORTANT):\n" + "- For all questions, you MUST vary sentence structure.\n" + "- Randomly choose ONE of these patterns each time:\n" + " (A) Start with the sound name (Answer) -> then the relation.\n" + " (B) Start with the relation -> then the sound name (Answer).\n" + " (C) Use an 'it`s...' style clause after the Answer.\n" + " (D) Use a short, natural rephrase with different verbs (e.g., lasts, continues, stands out, comes through).\n" + "- Do not always use 'The sound with the ... is ...' — that pattern should be rare.\n\n" + "TASK HANDLING (infer from the Question):\n" + "- COUNT questions (how many / count / number):\n" + " * If Answer is numeric, write it EITHER as digits (e.g., 10) OR as a word (e.g., ten). Do NOT include both.\n" + "- DURATION questions (longest/shortest):\n" + " * Clearly state longest vs shortest, and use the Answer as the sound name. Do not include any numbers.\n" + "- VOLUME questions (minimum/maximum loudness, quietest/loudest):\n" + " * Match minimum vs maximum loudness and use the Answer as the sound name. No dB values.\n" + "- ORDER questions (first/second/before/after/second-to-last):\n" + " * Match the requested relation and use the Answer as the sound name.\n\n" + "Return only the sentence." + ) + + user_prompt = ( + f"Question: {question}\n" + f"Answer: {correct_value}\n" + "Rewrite the answer as a single, natural sentence that directly answers the question." + ) + else: + system_preamble = ( + "You are a helpful assistant that converts multiple-choice QA pairs into natural language answers.\n" + "CRITICAL RULES:\n" + "1. Write as a human would naturally speak - vary sentence structure and avoid repetitive patterns\n" + "2. Keep responses concise but natural and affirmative avoiding words like 'might/may' or 'could' - one clear sentence\n" + "3. Do not mention 'among the options/among the following' even if the question mentions it. This natural language statement is supposed to be a direct answer.\n" + "4. Do NOT invent sounds.\n" + "5. Do not reason to answer the question, you're just supposed to provide the correct mcq answer as a natural language answer in a single sentence.\n" + "Return only the natural language answer, nothing else." + ) + user_prompt = ( + f"Now, given the question: '{question}' and the correct answer: '{correct_value}', " + f"write one natural-language answer as you would expect from a human." 
+ ) + + # Chat format + messages = [ + {"role": "system", "content": system_preamble}, + {"role": "user", "content": user_prompt}, + ] + inputs = tokenizer.apply_chat_template( + messages, + tokenize=True, + add_generation_prompt=True, + return_tensors="pt", + ).to(device) + + input_length = inputs.shape[1] + + with torch.no_grad(): + output = model.generate( + inputs, + max_new_tokens=64, + do_sample=True, + temperature=0.8, + top_p=0.9, + repetition_penalty=1.05, + no_repeat_ngram_size=3, + pad_token_id=tokenizer.eos_token_id, + eos_token_id=tokenizer.eos_token_id, + ) + + generated_ids = output[0, input_length:] + response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip() + print(f"Model response: {response}") + return response + + +def detect_csv_format(df): + """ + Detect CSV layout and return column mappings. + Supports: + - original MCQ format + - perturbed MCQ format + - open-text format (question/answer present) + """ + columns = df.columns.tolist() + + if "correct" in columns and "id" in columns and "audio_path" in columns: + # Original format (count.csv) + return { + "id_col": "id", + "audio_path_col": "audio_path", + "answer_col": "correct", + "question_col": "question", + "format_type": "original", + } + if "answer" in columns and "idx" in columns and "new_audio_path" in columns: + # Perturbed format (count_perturbed.csv) + return { + "id_col": "idx", + "audio_path_col": "new_audio_path", + "answer_col": "answer", + "question_col": "question", + "format_type": "perturbed", + } + if "answer" in columns and "question" in columns: + # Open-text format + return { + "id_col": "id" if "id" in columns else None, + "audio_path_col": "audio_path" if "audio_path" in columns else None, + "answer_col": "answer", + "question_col": "question", + "format_type": "open_text", + } + + raise ValueError(f"Unknown CSV format. Columns found: {columns}") + + +def main(): + parser = argparse.ArgumentParser( + description="Convert CSV to NL answers (MCQ or open-text) using meta-llama/Llama-3.1-8B-Instruct" + ) + parser.add_argument("--input", required=True, help="Input CSV file") + parser.add_argument("--output", required=False, help="Output CSV file (defaults to input for in-place append)") + parser.add_argument( + "--mode", + required=True, + choices=["mcq", "open_text"], + help="Conversion mode: mcq -> convert MCQ correct option to natural answer; open_text -> rewrite provided short answer to a natural sentence", + ) + parser.add_argument( + "--task", + required=True, + choices=["count", "duration", "order", "volume"], + help="Task type this CSV belongs to (used for bookkeeping/logging)", + ) + + # NEW: one-word skipping + parser.add_argument( + "--one_word_ratio", + type=float, + default=0.2, + help="Fraction of samples to output as just the normalized one-word/phrase answer (no LLM forward pass). 
Default 0.2", + ) + parser.add_argument( + "--seed", + type=int, + default=123, + help="Random seed for reproducible one_word sampling.", + ) + + args = parser.parse_args() + random.seed(args.seed) + + print("Loading meta-llama/Llama-3.1-8B-Instruct tokenizer and model...") + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct", use_fast=False) + model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-3.1-8B-Instruct", + torch_dtype="auto", + device_map="auto", + ) + model.eval() + + df = pd.read_csv(args.input) + + # Detect CSV format and get column mappings + format_info = detect_csv_format(df) + print(f"Detected CSV format: {format_info['format_type']}") + + # Validate requested mode against detected CSV format + if args.mode == "mcq" and format_info["format_type"] == "open_text": + raise ValueError( + "Requested mode=mcq but input appears to be open_text format. Use --mode open_text or supply an MCQ CSV." + ) + if args.mode == "open_text" and format_info["format_type"] != "open_text": + raise ValueError( + "Requested mode=open_text but input does not appear to be open_text format. Use --mode mcq or supply an open_text CSV." + ) + + output_path = args.output if args.output else args.input + + nl_rows = [] + device = model.device + + for i, row in df.iterrows(): + question = row[format_info["question_col"]] + + # Resolve correct_value from CSV format + if format_info["format_type"] == "open_text": + correct_value = row[format_info["answer_col"]] + else: + correct_letter = row[format_info["answer_col"]] + option_map = {"A": "optionA", "B": "optionB", "C": "optionC", "D": "optionD"} + correct_value = row[option_map[correct_letter]] + + # Normalize underscores BEFORE deciding one_word skip + correct_value = convert_to_natural_phrase(correct_value) + + print(f"[{i+1}/{len(df)}] Q: {question} | Ans: {correct_value}") + + # 20%: one-word/phrase answer, no forward pass + if random.random() < args.one_word_ratio: + nl_answer = correct_value + print(f"Skipped LLM (one_word_ratio). Output: {nl_answer}") + else: + nl_answer = generate_answer( + tokenizer, + model, + question, + correct_value, + device, + mode=("open_text" if format_info["format_type"] == "open_text" else "mcq"), + ) + + nl_rows.append( + { + "question": question, + "id": row[format_info["id_col"]] if format_info.get("id_col") and format_info["id_col"] in row else None, + "audio_path": row[format_info["audio_path_col"]] + if format_info.get("audio_path_col") + else None, + "original_answer": correct_value, + "open_text_answer": nl_answer, + } + ) + + # Merge back as new column to the original CSV to preserve all fields + nl_df = pd.DataFrame(nl_rows) + df["open_text_answer"] = nl_df["open_text_answer"] + df.to_csv(output_path, index=False) + print(f"Appended natural language answers to {output_path}") + + +if __name__ == "__main__": + main() diff --git a/main.py b/main.py new file mode 100644 index 0000000000000000000000000000000000000000..d7e9851f9f8d60e29af530a4408160eeac809909 --- /dev/null +++ b/main.py @@ -0,0 +1,272 @@ +""" +Main pipeline runner for temporal reasoning audio dataset generation. + +This script orchestrates the generation of all task datasets. 
+""" + +import argparse +import sys +import yaml +from pathlib import Path +from typing import List, Optional + +# Add project root to path +sys.path.append(str(Path(__file__).parent)) + +from utils import setup_logger, set_random_seed +from tasks.task_count import CountTaskGenerator +from tasks.task_duration import DurationTaskGenerator +from tasks.task_order import OrderTaskGenerator +from tasks.task_volume import VolumeTaskGenerator + + +def load_config(config_path: str) -> dict: + """Load configuration from YAML file.""" + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + return config + + +def run_count_task(config: dict, logger): + """Run the count task generation.""" + if not config['tasks']['count']['enabled']: + logger.info("Count task is disabled, skipping...") + return + + logger.info("=" * 80) + logger.info("STARTING COUNT TASK GENERATION") + logger.info("=" * 80) + + generator = CountTaskGenerator(config, logger) + generator.dataset.reset_category_usage() # Reset counter for this task + generator.generate_dataset() + + # Log category usage statistics + usage_stats = generator.dataset.get_category_usage_stats() + sorted_stats = sorted(usage_stats.items(), key=lambda x: x[1], reverse=True) + logger.info("Category usage statistics (as answers):") + logger.info(f" Min usage: {sorted_stats[-1][1]} (category: {sorted_stats[-1][0]})") + logger.info(f" Max usage: {sorted_stats[0][1]} (category: {sorted_stats[0][0]})") + logger.info(f" Mean usage: {sum(usage_stats.values()) / len(usage_stats):.2f}") + + logger.info("Count task completed successfully!") + + +def run_duration_task(config: dict, logger): + """Run the duration task generation.""" + if not config['tasks']['duration']['enabled']: + logger.info("Duration task is disabled, skipping...") + return + + logger.info("=" * 80) + logger.info("STARTING DURATION TASK GENERATION") + logger.info("=" * 80) + + generator = DurationTaskGenerator(config, logger) + generator.dataset.reset_category_usage() # Reset counter for this task + generator.generate_dataset() + + # Log category usage statistics + usage_stats = generator.dataset.get_category_usage_stats() + sorted_stats = sorted(usage_stats.items(), key=lambda x: x[1], reverse=True) + logger.info("Category usage statistics (as longest/shortest answers):") + logger.info(f" Min usage: {sorted_stats[-1][1]} (category: {sorted_stats[-1][0]})") + logger.info(f" Max usage: {sorted_stats[0][1]} (category: {sorted_stats[0][0]})") + logger.info(f" Mean usage: {sum(usage_stats.values()) / len(usage_stats):.2f}") + + logger.info("Duration task completed successfully!") + + +def run_order_task(config: dict, logger): + """Run the order task generation.""" + if not config['tasks']['order']['enabled']: + logger.info("Order task is disabled, skipping...") + return + + logger.info("=" * 80) + logger.info("STARTING ORDER TASK GENERATION") + logger.info("=" * 80) + + generator = OrderTaskGenerator(config, logger) + generator.dataset.reset_category_usage() # Reset counter for this task + generator.generate_dataset() + + # Log category usage statistics + usage_stats = generator.dataset.get_category_usage_stats() + sorted_stats = sorted(usage_stats.items(), key=lambda x: x[1], reverse=True) + logger.info("Category usage statistics (as first/last/after/before answers):") + logger.info(f" Min usage: {sorted_stats[-1][1]} (category: {sorted_stats[-1][0]})") + logger.info(f" Max usage: {sorted_stats[0][1]} (category: {sorted_stats[0][0]})") + logger.info(f" Mean usage: {sum(usage_stats.values()) / 
len(usage_stats):.2f}") + + logger.info("Order task completed successfully!") + + +def run_volume_task(config: dict, logger): + """Run the volume task generation.""" + if not config['tasks']['volume']['enabled']: + logger.info("Volume task is disabled, skipping...") + return + + logger.info("=" * 80) + logger.info("STARTING VOLUME TASK GENERATION") + logger.info("=" * 80) + + generator = VolumeTaskGenerator(config, logger) + generator.dataset.reset_category_usage() # Reset counter for this task + generator.generate_dataset() + + # Log category usage statistics + usage_stats = generator.dataset.get_category_usage_stats() + sorted_stats = sorted(usage_stats.items(), key=lambda x: x[1], reverse=True) + logger.info("Category usage statistics (as loudest/softest answers):") + logger.info(f" Min usage: {sorted_stats[-1][1]} (category: {sorted_stats[-1][0]})") + logger.info(f" Max usage: {sorted_stats[0][1]} (category: {sorted_stats[0][0]})") + logger.info(f" Mean usage: {sum(usage_stats.values()) / len(usage_stats):.2f}") + + logger.info("Volume task completed successfully!") + + +def run_pipeline( + config_path: str, + tasks: Optional[List[str]] = None, + output_path: Optional[str] = None +): + """ + Run the complete dataset generation pipeline. + + Args: + config_path: Path to configuration YAML file + tasks: Optional list of specific tasks to run (default: all enabled tasks) + output_path: Optional custom output path (overrides config) + """ + # Load configuration + config = load_config(config_path) + + # Override output path if provided + if output_path: + config['output']['base_path'] = output_path + + # Create output directory + output_base = Path(config['output']['base_path']) + output_base.mkdir(parents=True, exist_ok=True) + + # Set random seed + set_random_seed(config['random_seed']) + + # Setup main logger + logger = setup_logger( + 'pipeline', + log_file=str(output_base / config['logging']['log_file']), + level=config['logging']['level'], + console_output=config['logging']['console_output'] + ) + + logger.info("=" * 80) + logger.info("TEMPORAL REASONING AUDIO DATASET GENERATION PIPELINE") + logger.info("=" * 80) + logger.info(f"Configuration: {config_path}") + logger.info(f"Output directory: {output_base}") + logger.info(f"Random seed: {config['random_seed']}") + logger.info(f"ESC-50 audio path: {config['esc50']['audio_path']}") + logger.info(f"ESC-50 metadata path: {config['esc50']['metadata_path']}") + + # Determine which tasks to run + task_map = { + 'count': run_count_task, + 'duration': run_duration_task, + 'order': run_order_task, + 'volume': run_volume_task + } + + if tasks: + tasks_to_run = {k: v for k, v in task_map.items() if k in tasks} + logger.info(f"Running specific tasks: {', '.join(tasks)}") + else: + tasks_to_run = task_map + logger.info("Running all enabled tasks") + + # Run tasks + for task_name, task_func in tasks_to_run.items(): + try: + task_func(config, logger) + except Exception as e: + logger.error(f"Error running {task_name} task: {e}", exc_info=True) + raise + + logger.info("=" * 80) + logger.info("PIPELINE COMPLETED SUCCESSFULLY!") + logger.info("=" * 80) + logger.info(f"All outputs saved to: {output_base}") + + +def main(): + """Main entry point with argument parsing.""" + parser = argparse.ArgumentParser( + description="Temporal Reasoning Audio Dataset Generation Pipeline", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Run all tasks with default config + python main.py + + # Run with custom config + python main.py 
--config my_config.yaml + + # Run specific tasks only + python main.py --tasks count duration + + # Use custom output directory + python main.py --output /path/to/output + + # Combine options + python main.py --config custom.yaml --tasks count order --output ./my_dataset + """ + ) + + parser.add_argument( + '--config', '-c', + type=str, + default='config.yaml', + help='Path to configuration YAML file (default: config.yaml)' + ) + + parser.add_argument( + '--tasks', '-t', + nargs='+', + choices=['count', 'duration', 'order', 'volume'], + help='Specific tasks to run (default: all enabled tasks)' + ) + + parser.add_argument( + '--output', '-o', + type=str, + help='Custom output directory (overrides config)' + ) + + args = parser.parse_args() + + # Check if config file exists + config_path = Path(args.config) + if not config_path.exists(): + # Try relative to script directory + script_dir = Path(__file__).parent + config_path = script_dir / args.config + if not config_path.exists(): + print(f"Error: Config file not found: {args.config}") + sys.exit(1) + + # Run pipeline + try: + run_pipeline( + config_path=str(config_path), + tasks=args.tasks, + output_path=args.output + ) + except Exception as e: + print(f"Pipeline failed with error: {e}") + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/preprocess_esc50.py b/preprocess_esc50.py new file mode 100644 index 0000000000000000000000000000000000000000..f84de2ac2b9c0d7631cfd4791a6b4e8b0e45b205 --- /dev/null +++ b/preprocess_esc50.py @@ -0,0 +1,714 @@ +#!/usr/bin/env python3 +""" +ESC-50 Preprocessing Script for Duration Task + +This script processes all ESC-50 audio clips to: +1. Apply amplitude-based filtering to detect actual sound regions +2. Calculate effective duration (portion containing actual sound) +3. Save trimmed audio files (with silence removed) +4. Generate a CSV with all metadata including effective durations + +Usage: + python preprocess_esc50.py --config config.yaml + python preprocess_esc50.py --config config.yaml --threshold-db -40 --min-sound-ms 50 +""" + +import argparse +import os +import sys +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +import numpy as np +import pandas as pd +from pydub import AudioSegment +from tqdm import tqdm + +# Add parent directory to path for imports +sys.path.insert(0, str(Path(__file__).parent)) + +from utils.logger import setup_logger + +logger = setup_logger(__name__) + + +def get_amplitude_array(audio: AudioSegment) -> np.ndarray: + """ + Convert AudioSegment to numpy array of amplitudes. + + Args: + audio: Input audio segment + + Returns: + Numpy array of amplitude values (normalized to -1 to 1) + """ + samples = np.array(audio.get_array_of_samples()) + + # Handle stereo by averaging channels + if audio.channels == 2: + samples = samples.reshape((-1, 2)).mean(axis=1) + + # Normalize to -1 to 1 range + max_val = float(2 ** (audio.sample_width * 8 - 1)) + samples = samples / max_val + + return samples + + +def compute_rms_envelope(samples: np.ndarray, frame_size_ms: int, hop_size_ms: int, + sample_rate: int) -> Tuple[np.ndarray, np.ndarray]: + """ + Compute RMS envelope of audio signal. 
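+
+    Each frame's RMS is sqrt(mean(frame**2)). For example (illustrative
+    numbers), frame_size_ms=20 and hop_size_ms=10 at a 44100 Hz sample rate
+    give 882-sample frames computed every 441 samples.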
+ + Args: + samples: Audio samples as numpy array + frame_size_ms: Frame size in milliseconds + hop_size_ms: Hop size in milliseconds + sample_rate: Audio sample rate + + Returns: + Tuple of (rms_values, time_stamps_ms) + """ + frame_size = int(sample_rate * frame_size_ms / 1000) + hop_size = int(sample_rate * hop_size_ms / 1000) + + rms_values = [] + time_stamps = [] + + for i in range(0, len(samples) - frame_size + 1, hop_size): + frame = samples[i:i + frame_size] + rms = np.sqrt(np.mean(frame ** 2)) + rms_values.append(rms) + time_stamps.append(i / sample_rate * 1000) # Convert to ms + + return np.array(rms_values), np.array(time_stamps) + + +def rms_to_db(rms: np.ndarray, reference: float = 1.0) -> np.ndarray: + """ + Convert RMS values to decibels. + + Args: + rms: RMS values + reference: Reference value (default 1.0 for normalized audio) + + Returns: + dB values + """ + # Avoid log(0) by using a small epsilon + epsilon = 1e-10 + return 20 * np.log10(np.maximum(rms, epsilon) / reference) + + +def detect_sound_regions( + audio: AudioSegment, + threshold_db: float = -40.0, + min_sound_duration_ms: int = 50, + frame_size_ms: int = 20, + hop_size_ms: int = 10, + merge_gap_ms: int = 100, + threshold_strategy: str = 'noise_floor', + noise_floor_percentile: float = 10.0, + noise_floor_delta_db: float = 15.0 +) -> List[Tuple[int, int]]: + """ + Detect regions in audio that contain actual sound (above threshold). + + Supports two threshold strategies: + - 'peak_relative': threshold = peak_db + threshold_db (old behavior) + - 'noise_floor': threshold = percentile(db_values, p) + delta_db (adaptive per-clip) + + The 'noise_floor' strategy is recommended as it adapts to each clip's + actual background noise level rather than using a fixed offset from peak. 
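+
+    Worked example for 'noise_floor' (illustrative numbers): if the
+    10th-percentile frame level is -52 dB and noise_floor_delta_db is 15,
+    the threshold is -37 dB, capped at peak_db - 1 dB; frames above it are
+    treated as sound.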
+ + Args: + audio: Input audio segment + threshold_db: dB threshold below peak (used if strategy='peak_relative') + min_sound_duration_ms: Minimum duration of sound region to keep + frame_size_ms: Frame size for RMS computation + hop_size_ms: Hop size for RMS computation + merge_gap_ms: Merge regions separated by less than this gap + threshold_strategy: 'peak_relative' or 'noise_floor' + noise_floor_percentile: Percentile for noise floor estimation (default 10) + noise_floor_delta_db: dB above noise floor to set threshold (default 15) + + Returns: + List of (start_ms, end_ms) tuples for sound regions + """ + samples = get_amplitude_array(audio) + sample_rate = audio.frame_rate + + # Compute RMS envelope + rms_values, time_stamps = compute_rms_envelope( + samples, frame_size_ms, hop_size_ms, sample_rate + ) + + if len(rms_values) == 0: + return [] + + # Convert to dB + db_values = rms_to_db(rms_values) + + # Compute threshold based on strategy + peak_db = np.max(db_values) + + if threshold_strategy == 'noise_floor': + # ADAPTIVE: Use noise floor (low percentile) + delta + # This adapts to each clip's actual background noise level + noise_floor_db = np.percentile(db_values, noise_floor_percentile) + absolute_threshold = noise_floor_db + noise_floor_delta_db + + # Safeguard: don't exceed peak (would detect nothing) + # Leave at least 1 dB below peak + absolute_threshold = min(absolute_threshold, peak_db - 1.0) + + logger.debug( + f"Noise-floor threshold: floor={noise_floor_db:.1f}dB (p{noise_floor_percentile}), " + f"delta={noise_floor_delta_db}dB, threshold={absolute_threshold:.1f}dB, peak={peak_db:.1f}dB" + ) + else: + # OLD: peak-relative threshold + absolute_threshold = peak_db + threshold_db # threshold_db is negative + logger.debug( + f"Peak-relative threshold: peak={peak_db:.1f}dB, offset={threshold_db}dB, " + f"threshold={absolute_threshold:.1f}dB" + ) + + # Find frames above threshold + above_threshold = db_values > absolute_threshold + + # Find contiguous regions + regions = [] + in_region = False + region_start = 0 + + for i, (is_above, time_ms) in enumerate(zip(above_threshold, time_stamps)): + if is_above and not in_region: + # Start of new region + in_region = True + region_start = time_ms + elif not is_above and in_region: + # End of region + in_region = False + region_end = time_ms + if region_end - region_start >= min_sound_duration_ms: + regions.append((int(region_start), int(region_end))) + + # Handle case where audio ends while still in a region + if in_region: + region_end = time_stamps[-1] + hop_size_ms + if region_end - region_start >= min_sound_duration_ms: + regions.append((int(region_start), int(region_end))) + + # Merge regions that are close together + if len(regions) > 1: + merged_regions = [regions[0]] + for start, end in regions[1:]: + prev_start, prev_end = merged_regions[-1] + if start - prev_end <= merge_gap_ms: + # Merge with previous region + merged_regions[-1] = (prev_start, end) + else: + merged_regions.append((start, end)) + regions = merged_regions + + return regions + + +def get_sound_regions( + audio: AudioSegment, + threshold_db: float = -40.0, + min_sound_duration_ms: int = 50, + threshold_strategy: str = 'noise_floor', + noise_floor_percentile: float = 10.0, + noise_floor_delta_db: float = 15.0 +) -> List[Tuple[int, int]]: + """ + Detect sound regions in audio using adaptive threshold. 
+
+    Args:
+        audio: Input audio segment
+        threshold_db: dB threshold below peak (used if strategy='peak_relative')
+        min_sound_duration_ms: Minimum duration of sound region to keep
+        threshold_strategy: 'peak_relative' or 'noise_floor'
+        noise_floor_percentile: Percentile for noise floor estimation
+        noise_floor_delta_db: dB above noise floor to set threshold
+
+    Returns:
+        List of (start_ms, end_ms) tuples for sound regions
+    """
+    return detect_sound_regions(
+        audio,
+        threshold_db=threshold_db,
+        min_sound_duration_ms=min_sound_duration_ms,
+        threshold_strategy=threshold_strategy,
+        noise_floor_percentile=noise_floor_percentile,
+        noise_floor_delta_db=noise_floor_delta_db
+    )
+
+
+def extract_sound_with_edges_trimmed(
+    audio: AudioSegment,
+    regions: List[Tuple[int, int]],
+    min_silence_to_trim_ms: int = 100,
+    buffer_ratio: float = 0.1
+) -> AudioSegment:
+    """
+    Extract audio with ONLY leftmost and rightmost silence removed IF present.
+
+    Trimming is ADAPTIVE:
+    - Only trims if edge silence >= min_silence_to_trim_ms
+    - Keeps part of the silence as a buffer to preserve transients
+    - Buffer size adapts to the actual silence duration (buffer_ratio of the
+      silence, floored at 200 ms)
+
+    Preserves all internal structure and silence between sounds.
+    Well suited to periodic sounds (clock ticks, footsteps, typing).
+
+    Args:
+        audio: Input audio segment
+        regions: List of (start_ms, end_ms) tuples for sound regions
+        min_silence_to_trim_ms: Only trim if edge silence is at least this long (default 100ms)
+        buffer_ratio: Keep this fraction of the silence as buffer (default 0.1 = 10%),
+            floored at 200 ms. Example: 500ms silence -> max(200, 50) = 200ms buffer
+
+    Returns:
+        Audio segment with edges trimmed (or original if no significant silence)
+    """
+    if not regions:
+        # No sound detected - return original
+        return audio
+
+    # Find the overall sound boundaries (first sound start, last sound end)
+    first_sound_start_ms = regions[0][0]
+    last_sound_end_ms = regions[-1][1]
+    audio_duration_ms = len(audio)
+
+    # Calculate actual silence durations at edges
+    leading_silence_ms = first_sound_start_ms
+    trailing_silence_ms = audio_duration_ms - last_sound_end_ms
+
+    # Adaptive trimming: only trim if there's significant silence
+    # Keep part of the silence as buffer to avoid cutting transients
+    if leading_silence_ms >= min_silence_to_trim_ms:
+        buffer_ms = max(200, int(leading_silence_ms * buffer_ratio))  # At least 200ms buffer
+        trim_start_ms = max(0, first_sound_start_ms - buffer_ms)
+    else:
+        # Not enough silence to trim - keep from start
+        trim_start_ms = 0
+
+    if trailing_silence_ms >= min_silence_to_trim_ms:
+        buffer_ms = max(200, int(trailing_silence_ms * buffer_ratio))
+        trim_end_ms = min(audio_duration_ms, last_sound_end_ms + buffer_ms)
+    else:
+        # Not enough silence to trim - keep to end
+        trim_end_ms = audio_duration_ms
+
+    # Extract the edge-trimmed portion (internal structure preserved)
+    trimmed_audio = audio[trim_start_ms:trim_end_ms]
+
+    logger.debug(
+        f"Edge trim: {audio_duration_ms}ms -> {len(trimmed_audio)}ms "
+        f"(leading_silence={leading_silence_ms}ms, trailing_silence={trailing_silence_ms}ms, "
+        f"trim_start={trim_start_ms}ms, trim_end={trim_end_ms}ms)"
+    )
+
+    return trimmed_audio
+
+
+def extract_all_sound_regions(
+    audio: AudioSegment,
+    regions: List[Tuple[int, int]],
+    crossfade_ms: int = 10,
+    padding_ms: int = 20
+) -> AudioSegment:
+    """
+    Extract ALL sound portions and join them, removing ALL silence.
+
+    WARNING: This destroys natural periodicity! Use extract_sound_with_edges_trimmed()
+    instead for most use cases.
+    This function is kept for backward compatibility.
+
+    Args:
+        audio: Input audio segment
+        regions: List of (start_ms, end_ms) tuples for sound regions
+        crossfade_ms: Crossfade duration when joining regions
+        padding_ms: Padding around each region to avoid cutting transients
+
+    Returns:
+        Audio segment containing only sound portions (internal silence removed)
+    """
+    if not regions:
+        return audio
+
+    # Extract each region
+    extracted_parts = []
+    for start_ms, end_ms in regions:
+        # Add padding to avoid cutting off transients
+        padded_start = max(0, start_ms - padding_ms)
+        padded_end = min(len(audio), end_ms + padding_ms)
+        part = audio[padded_start:padded_end]
+        extracted_parts.append(part)
+
+    # Concatenate with crossfade
+    if len(extracted_parts) == 1:
+        return extracted_parts[0]
+
+    result = extracted_parts[0]
+    for part in extracted_parts[1:]:
+        if len(result) > crossfade_ms and len(part) > crossfade_ms:
+            result = result.append(part, crossfade=crossfade_ms)
+        else:
+            result = result + part
+
+    return result
+
+
+def process_esc50_dataset(
+    audio_dir: str,
+    metadata_path: str,
+    output_dir: str,
+    threshold_db: float = -40.0,
+    min_sound_duration_ms: int = 50,
+    save_trimmed_audio: bool = True,
+    threshold_strategy: str = 'noise_floor',
+    noise_floor_percentile: float = 10.0,
+    noise_floor_delta_db: float = 15.0
+) -> pd.DataFrame:
+    """
+    Process entire ESC-50 dataset and compute effective durations.
+
+    Uses ADAPTIVE EDGE-ONLY trimming to preserve natural periodicity of sounds.
+    Only leading and trailing silence is removed IF significant (>=100ms).
+    Trimming is adaptive: it keeps buffer_ratio of the silence (at least 200 ms)
+    as a buffer for transients. All internal structure is preserved.
+
+    Supports two threshold strategies for adaptive per-clip thresholding:
+    - 'peak_relative': threshold = peak_db + threshold_db (fixed offset from peak)
+    - 'noise_floor': threshold = percentile(db, p) + delta (adapts to noise floor)
+
+    Args:
+        audio_dir: Path to ESC-50 audio directory
+        metadata_path: Path to ESC-50 metadata CSV
+        output_dir: Output directory for processed files
+        threshold_db: dB threshold for silence detection (peak_relative mode)
+        min_sound_duration_ms: Minimum sound duration to keep
+        save_trimmed_audio: Whether to save trimmed audio files
+        threshold_strategy: 'peak_relative' or 'noise_floor' (recommended)
+        noise_floor_percentile: Percentile for noise floor estimation (default 10)
+        noise_floor_delta_db: dB above noise floor to set threshold (default 15)
+
+    Returns:
+        DataFrame with processed metadata
+    """
+    # Load original metadata
+    original_metadata = pd.read_csv(metadata_path)
+    logger.info(f"Loaded metadata for {len(original_metadata)} clips")
+
+    # Create output directories
+    output_path = Path(output_dir)
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    if save_trimmed_audio:
+        trimmed_audio_dir = output_path / "trimmed_audio"
+        trimmed_audio_dir.mkdir(exist_ok=True)
+
+    # Process each audio file
+    results = []
+
+    for _, row in tqdm(original_metadata.iterrows(), total=len(original_metadata),
+                       desc="Processing ESC-50 clips"):
+        filename = row['filename']
+        category = row['category']
+        audio_path = Path(audio_dir) / filename
+
+        try:
+            # Load audio
+            audio = AudioSegment.from_file(str(audio_path), format="wav")
+            raw_duration_s = len(audio) / 1000.0
+
+            # Detect sound regions (using adaptive threshold)
+            regions = get_sound_regions(
+                audio,
+                threshold_db=threshold_db,
+                min_sound_duration_ms=min_sound_duration_ms,
+                threshold_strategy=threshold_strategy,
+                
noise_floor_percentile=noise_floor_percentile, + noise_floor_delta_db=noise_floor_delta_db + ) + + # Trim edges only (leftmost and rightmost silence) + # Adaptive trimming: only trims if silence >= 100ms, keeps 10% as buffer + trimmed_audio = extract_sound_with_edges_trimmed(audio, regions) + final_duration_s = len(trimmed_audio) / 1000.0 + + # Calculate peak amplitude and RMS from trimmed audio + samples = get_amplitude_array(trimmed_audio) + peak_amplitude = np.max(np.abs(samples)) + peak_amplitude_db = 20 * np.log10(peak_amplitude + 1e-10) + rms = np.sqrt(np.mean(samples ** 2)) + avg_rms_db = 20 * np.log10(rms + 1e-10) + + # Calculate effective duration (sum of sound regions) + effective_duration_s = sum(end - start for start, end in regions) / 1000.0 if regions else final_duration_s + + # Save trimmed audio + trimmed_filename = None + if save_trimmed_audio: + trimmed_filename = filename + trimmed_path = trimmed_audio_dir / trimmed_filename + trimmed_audio.export(str(trimmed_path), format="wav") + + # Store results + results.append({ + 'filename': filename, + 'category': category, + 'fold': row['fold'], + 'target': row['target'], + 'esc10': row['esc10'], + 'raw_duration_s': round(raw_duration_s, 4), + 'final_duration_s': round(final_duration_s, 4), + 'effective_duration_s': round(effective_duration_s, 4), + 'num_sound_regions': len(regions), + 'peak_amplitude_db': round(peak_amplitude_db, 2), + 'avg_rms_db': round(avg_rms_db, 2), + 'trimmed_filename': trimmed_filename if save_trimmed_audio else None, + 'threshold_strategy': threshold_strategy, + 'threshold_db_used': threshold_db if threshold_strategy == 'peak_relative' else None, + 'noise_floor_percentile': noise_floor_percentile if threshold_strategy == 'noise_floor' else None, + 'noise_floor_delta_db': noise_floor_delta_db if threshold_strategy == 'noise_floor' else None, + 'min_sound_duration_ms_used': min_sound_duration_ms + }) + + except Exception as e: + logger.error(f"Error processing {filename}: {e}") + results.append({ + 'filename': filename, + 'category': category, + 'fold': row['fold'], + 'target': row['target'], + 'esc10': row['esc10'], + 'raw_duration_s': None, + 'final_duration_s': None, + 'effective_duration_s': None, + 'num_sound_regions': 0, + 'peak_amplitude_db': None, + 'avg_rms_db': None, + 'trimmed_filename': None, + 'threshold_strategy': threshold_strategy, + 'threshold_db_used': threshold_db if threshold_strategy == 'peak_relative' else None, + 'noise_floor_percentile': noise_floor_percentile if threshold_strategy == 'noise_floor' else None, + 'noise_floor_delta_db': noise_floor_delta_db if threshold_strategy == 'noise_floor' else None, + 'min_sound_duration_ms_used': min_sound_duration_ms, + 'error': str(e) + }) + + # Create DataFrame + results_df = pd.DataFrame(results) + + # Save CSV + csv_path = output_path / "effective_durations.csv" + results_df.to_csv(csv_path, index=False) + logger.info(f"Saved effective durations to {csv_path}") + + # Print summary statistics + print_summary_statistics(results_df) + + return results_df + + +def print_summary_statistics(df: pd.DataFrame): + """Print summary statistics of the processed dataset.""" + print("\n" + "=" * 60) + print("ESC-50 Preprocessing Summary") + print("=" * 60) + + # Filter out errors + valid_df = df[df['effective_duration_s'].notna()] + + print(f"\nTotal clips processed: {len(df)}") + print(f"Successfully processed: {len(valid_df)}") + print(f"Errors: {len(df) - len(valid_df)}") + + print(f"\nRaw duration statistics:") + print(f" Mean: 
{valid_df['raw_duration_s'].mean():.3f}s") + print(f" Std: {valid_df['raw_duration_s'].std():.3f}s") + print(f" Min: {valid_df['raw_duration_s'].min():.3f}s") + print(f" Max: {valid_df['raw_duration_s'].max():.3f}s") + + print(f"\nFinal duration statistics (edges trimmed, internal structure preserved):") + print(f" Mean: {valid_df['final_duration_s'].mean():.3f}s") + print(f" Std: {valid_df['final_duration_s'].std():.3f}s") + print(f" Min: {valid_df['final_duration_s'].min():.3f}s") + print(f" Max: {valid_df['final_duration_s'].max():.3f}s") + + print(f"\nEffective duration statistics (sum of sound regions only):") + print(f" Mean: {valid_df['effective_duration_s'].mean():.3f}s") + print(f" Std: {valid_df['effective_duration_s'].std():.3f}s") + print(f" Min: {valid_df['effective_duration_s'].min():.3f}s") + print(f" Max: {valid_df['effective_duration_s'].max():.3f}s") + + # Compare effective vs final + print(f"\nComparison (final includes internal silences):") + print(f" Avg effective: {valid_df['effective_duration_s'].mean():.3f}s") + print(f" Avg final: {valid_df['final_duration_s'].mean():.3f}s") + print(f" Difference: {valid_df['final_duration_s'].mean() - valid_df['effective_duration_s'].mean():.3f}s (internal silences)") + + # Duration reduction + reduction = (1 - valid_df['final_duration_s'].mean() / valid_df['raw_duration_s'].mean()) * 100 + print(f"\nAverage edge trimming reduction: {reduction:.1f}%") + + # Per-category statistics + print("\nEffective duration by category (top 10 longest):") + category_stats = valid_df.groupby('category')['effective_duration_s'].agg(['mean', 'std', 'min', 'max']) + category_stats = category_stats.sort_values('mean', ascending=False) + print(category_stats.head(10).to_string()) + + print("\nEffective duration by category (top 10 shortest):") + print(category_stats.tail(10).to_string()) + + print("\n" + "=" * 60) + + +def load_config(config_path: str) -> dict: + """Load configuration from YAML file.""" + import yaml + with open(config_path, 'r') as f: + return yaml.safe_load(f) + + +def main(): + parser = argparse.ArgumentParser( + description="Preprocess ESC-50 dataset for duration task" + ) + parser.add_argument( + '--config', '-c', + type=str, + default='config.yaml', + help='Path to configuration file' + ) + parser.add_argument( + '--threshold-db', + type=float, + default=None, + help='dB threshold below peak for silence detection (default: -40)' + ) + parser.add_argument( + '--min-sound-ms', + type=int, + default=None, + help='Minimum sound duration in ms to keep (default: 50)' + ) + parser.add_argument( + '--output-dir', + type=str, + default=None, + help='Output directory (default: from config or ESC-50_preprocessed)' + ) + parser.add_argument( + '--no-trimmed-audio', + action='store_true', + help='Do not save trimmed audio files (only save CSV)' + ) + parser.add_argument( + '--threshold-strategy', + type=str, + choices=['peak_relative', 'noise_floor'], + default=None, + help='Threshold strategy: peak_relative (old) or noise_floor (adaptive, recommended)' + ) + parser.add_argument( + '--noise-floor-percentile', + type=float, + default=None, + help='Percentile for noise floor estimation (default: 10)' + ) + parser.add_argument( + '--noise-floor-delta-db', + type=float, + default=None, + help='dB above noise floor to set threshold (default: 15)' + ) + + args = parser.parse_args() + + # Load config + config = load_config(args.config) + + # Get ESC-50 paths from config + esc50_config = config.get('esc50', {}) + audio_dir = 
esc50_config.get('audio_path', '/home/debarpanb1/ESC-50_github/audio')
+    metadata_path = esc50_config.get('metadata_path', '/home/debarpanb1/ESC-50_github/meta/esc50.csv')
+
+    # Get duration task config for preprocessing parameters
+    duration_config = config.get('tasks', {}).get('duration', {})
+
+    # Determine threshold and min sound duration
+    threshold_db = args.threshold_db
+    if threshold_db is None:
+        threshold_db = duration_config.get('amplitude_threshold_db', -40.0)
+
+    min_sound_ms = args.min_sound_ms
+    if min_sound_ms is None:
+        min_sound_ms = duration_config.get('min_sound_duration_ms', 50)
+
+    # Determine output directory
+    output_dir = args.output_dir
+    if output_dir is None:
+        output_dir = duration_config.get(
+            'preprocessed_data_path',
+            '/home/debarpanb1/TREA_2.0/ESC-50_preprocessed'
+        )
+
+    # Determine threshold strategy (noise_floor is recommended/default)
+    threshold_strategy = args.threshold_strategy
+    if threshold_strategy is None:
+        threshold_strategy = duration_config.get('threshold_strategy', 'noise_floor')
+
+    # Determine noise floor percentile
+    noise_floor_percentile = args.noise_floor_percentile
+    if noise_floor_percentile is None:
+        noise_floor_percentile = duration_config.get('noise_floor_percentile', 10.0)
+
+    # Determine noise floor delta dB
+    noise_floor_delta_db = args.noise_floor_delta_db
+    if noise_floor_delta_db is None:
+        noise_floor_delta_db = duration_config.get('noise_floor_delta_db', 15.0)
+
+    # Log configuration
+    logger.info("=" * 60)
+    logger.info("ESC-50 Preprocessing Configuration")
+    logger.info("=" * 60)
+    logger.info(f"Audio directory: {audio_dir}")
+    logger.info(f"Metadata path: {metadata_path}")
+    logger.info(f"Output directory: {output_dir}")
+    logger.info(f"Threshold strategy: {threshold_strategy}")
+    if threshold_strategy == 'peak_relative':
+        logger.info(f"  Peak-relative threshold dB: {threshold_db}")
+    else:
+        logger.info(f"  Noise floor percentile: {noise_floor_percentile}")
+        logger.info(f"  Noise floor delta dB: {noise_floor_delta_db}")
+    logger.info(f"Min sound duration (ms): {min_sound_ms}")
+    logger.info("Adaptive edge trimming: only if silence >= 100ms, keep max(10%, 200ms) as buffer")
+    logger.info(f"Save trimmed audio: {not args.no_trimmed_audio}")
+    logger.info("=" * 60)
+
+    # Process dataset
+    results_df = process_esc50_dataset(
+        audio_dir=audio_dir,
+        metadata_path=metadata_path,
+        output_dir=output_dir,
+        threshold_db=threshold_db,
+        min_sound_duration_ms=min_sound_ms,
+        save_trimmed_audio=not args.no_trimmed_audio,
+        threshold_strategy=threshold_strategy,
+        noise_floor_percentile=noise_floor_percentile,
+        noise_floor_delta_db=noise_floor_delta_db
+    )
+
+    logger.info("\nPreprocessing complete!")
+    logger.info(f"Results saved to: {output_dir}")
+
+    return results_df
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5d0d5e001607cce8ab33c36652ded8e587e9cecc
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+pyyaml
+pandas
+pydub
+numpy
+pyloudnorm
+tqdm
+torch
+transformers
+
diff --git a/run_llm_answers_all.sh b/run_llm_answers_all.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2323b5503e9e1c3e1e84f74cb64c8c120b2564e6
--- /dev/null
+++ b/run_llm_answers_all.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+# Run llm_answer_generator.py across dataset folders and tasks
+# Processes open_text CSVs for tasks: count, duration, order, volume
+
+set -euo pipefail
+export CUDA_VISIBLE_DEVICES=7
+PY_SCRIPT="$(dirname
"$0")/llm_answer_generator.py" +BASE_DIR="$(dirname "$0")" + +DATA_SPLITS=(train validation test_large test_ood) +TASKS=(count duration order volume) + +echo "Running LLM answer generation script across splits: ${DATA_SPLITS[*]} and tasks: ${TASKS[*]}" + +for split in "${DATA_SPLITS[@]}"; do + for task in "${TASKS[@]}"; do + # open_text file + ot_csv="${BASE_DIR}/dataset_v2/${split}/${task}/${task}_open_text.csv" + if [ -f "${ot_csv}" ]; then + echo "[OPEN_TEXT] Processing ${ot_csv}" + python "${PY_SCRIPT}" --input "${ot_csv}" --mode open_text --task "${task}" + else + echo "[OPEN_TEXT] Not found: ${ot_csv}" + fi + done +done + +echo "All tasks processed." diff --git a/run_pipeline.sh b/run_pipeline.sh new file mode 100644 index 0000000000000000000000000000000000000000..ca0304b7efeff615bcc76a0aa2426ae25b8669b7 --- /dev/null +++ b/run_pipeline.sh @@ -0,0 +1,166 @@ +#!/bin/bash + +################################################################################ +# Temporal Reasoning Audio Dataset Generation Pipeline +# +# This script orchestrates the entire dataset creation process for all tasks. +################################################################################ + +set -e # Exit on error + +# Default configuration +CONFIG_FILE="config.yaml" +OUTPUT_DIR="" +TASKS="" +PYTHON_CMD="python" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Function to print colored messages +print_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +print_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Function to print usage +usage() { + cat << EOF +Usage: $0 [OPTIONS] + +Temporal Reasoning Audio Dataset Generation Pipeline + +OPTIONS: + -c, --config FILE Configuration file (default: config.yaml) + -o, --output DIR Output directory (overrides config) + -t, --tasks TASKS Specific tasks to run: count,duration,order,volume + (default: all enabled tasks) + -p, --python CMD Python command to use (default: python) + -h, --help Display this help message + +EXAMPLES: + # Run all tasks with default config + $0 + + # Run with custom config + $0 --config my_config.yaml + + # Run specific tasks only + $0 --tasks count,duration + + # Use custom output directory + $0 --output /path/to/output + + # Combine options + $0 --config custom.yaml --tasks count,order --output ./my_dataset + +EOF +} + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + -c|--config) + CONFIG_FILE="$2" + shift 2 + ;; + -o|--output) + OUTPUT_DIR="$2" + shift 2 + ;; + -t|--tasks) + TASKS="$2" + shift 2 + ;; + -p|--python) + PYTHON_CMD="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + print_error "Unknown option: $1" + usage + exit 1 + ;; + esac +done + +# Get script directory +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +# Check if config file exists +if [ ! 
-f "$SCRIPT_DIR/$CONFIG_FILE" ]; then + print_error "Config file not found: $CONFIG_FILE" + exit 1 +fi + +# Print header +echo "" +echo "================================================================================" +echo " TEMPORAL REASONING AUDIO DATASET GENERATION PIPELINE" +echo "================================================================================" +echo "" +print_info "Configuration: $CONFIG_FILE" +print_info "Python command: $PYTHON_CMD" +[ -n "$OUTPUT_DIR" ] && print_info "Output directory: $OUTPUT_DIR" +[ -n "$TASKS" ] && print_info "Tasks to run: $TASKS" +echo "" + +# Check Python dependencies +print_info "Checking Python dependencies..." +$PYTHON_CMD -c "import yaml, pandas, pydub" 2>/dev/null +if [ $? -ne 0 ]; then + print_error "Missing required Python packages. Please install:" + echo " pip install pyyaml pandas pydub" + exit 1 +fi +print_success "Dependencies OK" +echo "" + +# Build Python command arguments +PYTHON_ARGS="$SCRIPT_DIR/main.py --config $SCRIPT_DIR/$CONFIG_FILE" +[ -n "$OUTPUT_DIR" ] && PYTHON_ARGS="$PYTHON_ARGS --output $OUTPUT_DIR" +if [ -n "$TASKS" ]; then + # Convert comma-separated to space-separated for Python argparse + TASKS_SPACE=$(echo $TASKS | tr ',' ' ') + PYTHON_ARGS="$PYTHON_ARGS --tasks $TASKS_SPACE" +fi + +# Run the pipeline +print_info "Starting pipeline..." +echo "" + +$PYTHON_CMD $PYTHON_ARGS + +if [ $? -eq 0 ]; then + echo "" + echo "================================================================================" + print_success "PIPELINE COMPLETED SUCCESSFULLY!" + echo "================================================================================" + echo "" +else + echo "" + echo "================================================================================" + print_error "PIPELINE FAILED!" 
+ echo "================================================================================" + echo "" + exit 1 +fi diff --git a/synthetic_silences/silent_1.wav b/synthetic_silences/silent_1.wav new file mode 100644 index 0000000000000000000000000000000000000000..66c2e7638a753246a85deb470e2b3e1d75582997 --- /dev/null +++ b/synthetic_silences/silent_1.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed8ddf138c2c59409bb4f1dbbf3fc910b486752b0c389dbb5dac6a4e68b8cbe5 +size 263052 diff --git a/synthetic_silences/silent_10.wav b/synthetic_silences/silent_10.wav new file mode 100644 index 0000000000000000000000000000000000000000..4724ca6161cd57150bee6811f32ce9c46156ab43 --- /dev/null +++ b/synthetic_silences/silent_10.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35fab767d2262eb552485542c6e593a5d84b7080862c577b23c11385176c7767 +size 274840 diff --git a/synthetic_silences/silent_11.wav b/synthetic_silences/silent_11.wav new file mode 100644 index 0000000000000000000000000000000000000000..2ad586e586eedf828fb178cd846cf67aac788a72 --- /dev/null +++ b/synthetic_silences/silent_11.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b01397619b480a22261daa7b018b59b5fd1baf1e3d4ed81161908def25112f17 +size 324418 diff --git a/synthetic_silences/silent_12.wav b/synthetic_silences/silent_12.wav new file mode 100644 index 0000000000000000000000000000000000000000..30dcb1ac528fc33c2dcff4d09ff19dc0ff78dbd2 --- /dev/null +++ b/synthetic_silences/silent_12.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2da9f4814fd0c6d50aa68696079c8d0ee880ed37d583e88a20481fd88c54e612 +size 310108 diff --git a/synthetic_silences/silent_13.wav b/synthetic_silences/silent_13.wav new file mode 100644 index 0000000000000000000000000000000000000000..6cb9d4a234c1e97e8e631ce51f8b0da658784389 --- /dev/null +++ b/synthetic_silences/silent_13.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5999933e975cd5846ac152bf888a954ea9243fa2218429998d96ceffac54a7e0 +size 121474 diff --git a/synthetic_silences/silent_14.wav b/synthetic_silences/silent_14.wav new file mode 100644 index 0000000000000000000000000000000000000000..85f1dc7994d8ed041fb79697f4d38de130f0b7de --- /dev/null +++ b/synthetic_silences/silent_14.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42c8b935bd521534635cc4fea040023dbf420084b51d1e3529953d5d1593df48 +size 209182 diff --git a/synthetic_silences/silent_15.wav b/synthetic_silences/silent_15.wav new file mode 100644 index 0000000000000000000000000000000000000000..5fadc861261ebb5f1d4a5c15cb8f1695acfe129e --- /dev/null +++ b/synthetic_silences/silent_15.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00529829944fd650a368d6fe65e25a7f3d25d8d4ba932712b35dfa5608380c3e +size 160682 diff --git a/synthetic_silences/silent_16.wav b/synthetic_silences/silent_16.wav new file mode 100644 index 0000000000000000000000000000000000000000..632d3ff33b290e33be12c3d0c704bd0de921873a --- /dev/null +++ b/synthetic_silences/silent_16.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:469eb34930878ba69a3994da5c2160314ce0c8bf0157d83f4ad349052a0c197b +size 112534 diff --git a/synthetic_silences/silent_17.wav b/synthetic_silences/silent_17.wav new file mode 100644 index 0000000000000000000000000000000000000000..8deaf9e1dd5c48a1ee9619a9a20d7ac735c21424 --- /dev/null +++ b/synthetic_silences/silent_17.wav @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:5d788262618a55e51d12b0c2220ced172c0edf9072569ab010d48adc01607215 +size 165986 diff --git a/synthetic_silences/silent_18.wav b/synthetic_silences/silent_18.wav new file mode 100644 index 0000000000000000000000000000000000000000..540fb9b0de36ec8fa26de2d6a3192e6459fae317 --- /dev/null +++ b/synthetic_silences/silent_18.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83b6ef068680eacd83ac3d0b2f282fb37e2f4f018b03e89ab9a129aeac27a054 +size 257330 diff --git a/synthetic_silences/silent_19.wav b/synthetic_silences/silent_19.wav new file mode 100644 index 0000000000000000000000000000000000000000..e3f7844be1bb3313134ecbd20c24ffab83fbbb7d Binary files /dev/null and b/synthetic_silences/silent_19.wav differ diff --git a/synthetic_silences/silent_2.wav b/synthetic_silences/silent_2.wav new file mode 100644 index 0000000000000000000000000000000000000000..19963f7550e8be64e6c30b6ddd9acd7a07f75036 --- /dev/null +++ b/synthetic_silences/silent_2.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01bbeb6e0c14200b30be0eb57484450ba5807954333fede2e4c59d32a7042eaf +size 310850 diff --git a/synthetic_silences/silent_20.wav b/synthetic_silences/silent_20.wav new file mode 100644 index 0000000000000000000000000000000000000000..37a8ead28e1e50e8df80d9b126f558eb022f8b4a --- /dev/null +++ b/synthetic_silences/silent_20.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cda1e6a66b8cca7fc408f90cb6b8e8c13294fc33e8735a23dd72f1d36f9a991b +size 140232 diff --git a/synthetic_silences/silent_3.wav b/synthetic_silences/silent_3.wav new file mode 100644 index 0000000000000000000000000000000000000000..36b70313945eed4f7c31f381e983ab42e59da022 --- /dev/null +++ b/synthetic_silences/silent_3.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d49fadd497b9af43be5afbb08070a6317000f15edc8924bf3c11b3fcbb140616 +size 227846 diff --git a/synthetic_silences/silent_4.wav b/synthetic_silences/silent_4.wav new file mode 100644 index 0000000000000000000000000000000000000000..4e2f6fca8acb5b8ffcaa2877cbc05e061c18d576 --- /dev/null +++ b/synthetic_silences/silent_4.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:148c743ea43d3528a53395f579d4d337512de9d1fb3c5d5b66e55f3a5e9c4d0c +size 337068 diff --git a/synthetic_silences/silent_5.wav b/synthetic_silences/silent_5.wav new file mode 100644 index 0000000000000000000000000000000000000000..d3cf3e92fc6043841dff5c90db8682579383ec3e --- /dev/null +++ b/synthetic_silences/silent_5.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51976ce15c0272f14125acaa5529a88d6f085ce153ef64bdc662586e97cb5678 +size 205426 diff --git a/synthetic_silences/silent_6.wav b/synthetic_silences/silent_6.wav new file mode 100644 index 0000000000000000000000000000000000000000..4e6fd10a5b8c0e389a9bced498e6877d6c40cbb7 --- /dev/null +++ b/synthetic_silences/silent_6.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dda249ab269984ae15d0a78b582455c053b1cddafb78c792cafbcbf3f682a087 +size 329056 diff --git a/synthetic_silences/silent_7.wav b/synthetic_silences/silent_7.wav new file mode 100644 index 0000000000000000000000000000000000000000..3f5d956916aa183bdcb1da8d905e95d4ec36d36f --- /dev/null +++ b/synthetic_silences/silent_7.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27156909b1191624cff0b0478477f8c40e47581bbb0be24a84e9113bf88f36a1 +size 146876 diff --git 
a/synthetic_silences/silent_8.wav b/synthetic_silences/silent_8.wav new file mode 100644 index 0000000000000000000000000000000000000000..d7648541b0b4c02e45ae6a987c8809e2f69760b9 --- /dev/null +++ b/synthetic_silences/silent_8.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:815a88cc01def086ca4dc23c41359eea297ec39c179b114e6e608d27bd2d9a39 +size 216452 diff --git a/synthetic_silences/silent_9.wav b/synthetic_silences/silent_9.wav new file mode 100644 index 0000000000000000000000000000000000000000..7548a6b6a7b711be29fb62a46707dbab3bb27985 --- /dev/null +++ b/synthetic_silences/silent_9.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f88a22998b27a0e18e1801aa63ef4b83315c243762478c9cf149db4338ebafdb +size 307884 diff --git a/tasks/__pycache__/task_count.cpython-312.pyc b/tasks/__pycache__/task_count.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..060d76a8d6b961e73fc48a70c2e96c80c1beddc4 Binary files /dev/null and b/tasks/__pycache__/task_count.cpython-312.pyc differ diff --git a/tasks/__pycache__/task_duration.cpython-312.pyc b/tasks/__pycache__/task_duration.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b033e01fdb0abeb562cbe8197781a7d494a9115 Binary files /dev/null and b/tasks/__pycache__/task_duration.cpython-312.pyc differ diff --git a/tasks/__pycache__/task_order.cpython-312.pyc b/tasks/__pycache__/task_order.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b6b8ff6f573e32251f51b7cdd735e3575f51efe Binary files /dev/null and b/tasks/__pycache__/task_order.cpython-312.pyc differ diff --git a/tasks/__pycache__/task_volume.cpython-312.pyc b/tasks/__pycache__/task_volume.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..44592fa746500ae1017fa07425ca33ae6c81a651 Binary files /dev/null and b/tasks/__pycache__/task_volume.cpython-312.pyc differ diff --git a/tasks/task_count.py b/tasks/task_count.py new file mode 100644 index 0000000000000000000000000000000000000000..f259c8a64ae617596fd293524d71e5b0917f4e98 --- /dev/null +++ b/tasks/task_count.py @@ -0,0 +1,472 @@ +""" +Task 1: Count - Generate counting questions + +This task joins multiple audio sources and asks questions about counting +the number of unique sound sources in the audio. +""" + +import csv +import random +from pathlib import Path +from typing import Dict, List + +import sys +sys.path.append(str(Path(__file__).parent.parent)) + +from utils import ( + AudioProcessor, ESC50Dataset, QuestionGenerator, LLMQuestionGenerator, + setup_logger, set_random_seed, generate_sample_durations_for_task, + generate_single_clip_duration, build_count_task_audio, + get_max_clip_num_to_be_joined +) + + +class CountTaskGenerator: + """Generator for counting task dataset.""" + + def __init__(self, config: Dict, logger): + """ + Initialize count task generator. 
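+
+        Reads the `tasks.count` block of config.yaml (task_duration_size,
+        ordering_mode, max_clips_per_sample, question templates) together
+        with the shared `audio`, `mcq`, and `llm` settings used below.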
+ + Args: + config: Configuration dictionary + logger: Logger instance + """ + self.config = config + self.logger = logger + self.task_config = config['tasks']['count'] + + # Initialize components + self.dataset = ESC50Dataset( + config['esc50']['metadata_path'], + config['esc50']['audio_path'], + config # Pass config for class subset loading + ) + self.audio_processor = AudioProcessor( + crossfade_duration=config['audio']['crossfade_duration'], + silence_duration=config['audio']['silence_duration'], + with_silence=config['audio']['with_silence'], + normalize=config['audio']['normalize'], + normalize_target_dBFS=config['audio']['normalize_target_dBFS'], + synthetic_silence_path=config['synthetic_silence']['path'] + ) + self.question_generator = QuestionGenerator( + num_options=config['mcq']['num_options'], + option_labels=config['mcq']['option_labels'], + distractor_strategy=config['mcq']['distractor_strategy'] + ) + + # Initialize LLM question generator + self.llm_enabled = config.get('llm', {}).get('enabled', False) + self.llm_generator = LLMQuestionGenerator( + enabled=self.llm_enabled, + template_questions=self.task_config + ) + if self.llm_enabled: + logger.info("LLM question generation enabled (local Llama 3.1 8B)") + else: + logger.info("Using template-based question generation") + + # Duration settings from config + self.min_clip_duration = config['audio']['min_clip_duration'] + self.max_clip_duration = config['audio']['max_clip_duration'] + self.source_clip_duration = config['audio'].get('source_clip_duration', 5.0) + self.min_silence_ms = config['audio'].get('min_silence_duration', 100) + self.max_extra_silence_per_gap_ms = config['audio'].get('max_extra_silence_per_gap', 500) + # Small crossfade within same-source repetitions (for consecutive mode) + self.crossfade_within_source_ms = config['audio'].get('crossfade_within_source', 50) + self.task_duration_hours = self.task_config['task_duration_size'] + + # Ordering mode: "random" or "consecutive" + # random: Clips shuffled (A B A C B A C) - tests sound recognition + # consecutive: Same-source grouped (AAA BBB CCC) - easier + self.ordering_mode = self.task_config.get('ordering_mode', 'random') + logger.info(f"Count task ordering mode: {self.ordering_mode}") + + # Set up output paths + self.output_base = Path(config['output']['base_path']) / 'count' + self.output_base.mkdir(parents=True, exist_ok=True) + self.audio_output = self.output_base / 'audios' + self.audio_output.mkdir(parents=True, exist_ok=True) + + def create_sampling_list(self, parent_list: List, n_sampling: int) -> List: + """ + Sample elements from parent list with replacement. + + Args: + parent_list: List to sample from + n_sampling: Number of samples + + Returns: + List of sampled elements + """ + return [random.choice(parent_list) for _ in range(n_sampling)] + + def generate_sample(self, sample_id: int, target_unique_count: int = None, target_duration_seconds: float = None) -> Dict: + """ + Generate a single count task sample. + + Pipeline for COUNT task: + 1. Use pre-generated target duration (or generate if not provided) + 2. Calculate max clips that can fit + 3. Pick N unique classes (N <= max_clips, since each source needs at least 1 clip) + 4. For each class, sample one audio clip + 5. Calculate repetitions to fill target duration + 6. Based on ordering_mode: + - "random": Shuffle clips (A B A C B A C) - tests recognition + - "consecutive": Group same-class (AAA BBB CCC) - easier + 7. Insert silences between clips + 8. 
Distribute remainder as random extra silences + + Args: + sample_id: Sample ID number + target_unique_count: Target number of unique sounds (for balanced distribution) + target_duration_seconds: Pre-generated target duration (from generate_sample_durations_for_task) + + Returns: + Dictionary with sample metadata + """ + # Use pre-generated duration or generate one (backward compatibility) + if target_duration_seconds is not None: + clip_duration_seconds = target_duration_seconds + else: + clip_duration_seconds = generate_single_clip_duration( + self.min_clip_duration, + self.max_clip_duration + ) + + # Calculate max clips that can fit in target duration + max_clips, remainder_seconds = get_max_clip_num_to_be_joined( + clip_duration_seconds, + self.source_clip_duration, + self.min_silence_ms + ) + + # Ensure at least 1 clip + max_clips = max(1, max_clips) + + max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10) + + # Calculate valid range: n_unique_audios can be 1 to max_clips_per_sample + # but cannot exceed what physically fits or available categories + max_unique_for_sample = min(max_clips, max_clips_per_sample, len(self.dataset.CATEGORIES)) + + if max_unique_for_sample < 1: + raise ValueError( + f"Sample {sample_id}: Cannot generate sample - max_unique_for_sample={max_unique_for_sample}. " + f"max_clips={max_clips}, max_clips_per_sample={max_clips_per_sample}, " + f"available_categories={len(self.dataset.CATEGORIES)}, duration={clip_duration_seconds:.1f}s. " + f"Increase min_clip_duration or reduce max_clips_per_sample." + ) + + # Determine n_unique_audios - use target from balanced distribution or random + if target_unique_count is not None: + # Clamp target to what this specific sample duration can fit + # Short samples can't fit all possible answers, so we clamp down + n_unique_audios = min(target_unique_count, max_unique_for_sample) + + if n_unique_audios != target_unique_count: + self.logger.debug( + f"Sample {sample_id}: Clamped target from {target_unique_count} to {n_unique_audios} " + f"(duration={clip_duration_seconds:.1f}s can only fit {max_clips} clips)" + ) + else: + # No target specified - randomly select from valid range + n_unique_audios = random.randint(1, max_unique_for_sample) + + self.logger.debug( + f"Sample {sample_id}: target={clip_duration_seconds:.1f}s, max_clips={max_clips}, " + f"n_unique_audios={n_unique_audios}" + ) + + # Sample unique categories - use least-used categories for balanced distribution + selected_categories = self.dataset.get_least_used_categories(n_unique_audios) + + # Track usage of all selected categories + for cat in selected_categories: + self.dataset.category_usage_counts[cat] += 1 + + # Sample one file from each unique category + source_files = [] + source_paths = [] + source_categories = [] + + for category in selected_categories: + filename, filepath = self.dataset.sample_file_from_category(category) + source_files.append(filename) + source_paths.append(filepath) + source_categories.append(category) + + # Load unique source audios + source_audios = [] + for file_path in source_paths: + audio = self.audio_processor.load_audio(file_path) + source_audios.append(audio) + + # Build audio using configured ordering mode + final_audio, clip_sequence, build_metadata = build_count_task_audio( + source_audios, + source_categories, + clip_duration_seconds, + ordering_mode=self.ordering_mode, + source_clip_duration_seconds=self.source_clip_duration, + min_silence_ms=self.min_silence_ms, + 
max_extra_silence_per_gap_ms=self.max_extra_silence_per_gap_ms, + crossfade_within_source_ms=self.crossfade_within_source_ms + ) + + # Save the audio + output_audio_path = self.audio_output / f"{sample_id}.wav" + final_audio.export(str(output_audio_path), format="wav") + + # Generate questions (using LLM if enabled) + if self.llm_enabled and self.llm_generator: + llm_questions = self.llm_generator.generate_count_questions( + correct_count=n_unique_audios, + categories_present=list(set(clip_sequence)) + ) + mcq_question_text = llm_questions.get('mcq_question') + open_text_question_text = llm_questions.get('open_text_question') + else: + mcq_question_text = random.choice(self.task_config['mcq_questions']) + open_text_question_text = random.choice(self.task_config['open_text_questions']) + + # Generate MCQ with options + mcq_data = self.question_generator.generate_count_mcq( + mcq_question_text, + n_unique_audios, + self.dataset.CATEGORIES + ) + + # Generate open-text answer + open_text_data = self.question_generator.generate_count_open_text( + open_text_question_text, + n_unique_audios + ) + + # Create metadata + metadata = { + 'id': sample_id, + 'audio_path': str(output_audio_path.relative_to(self.output_base.parent)), + 'n_unique_sounds': n_unique_audios, + 'total_clips': build_metadata['total_clips'], + 'repetitions_per_source': build_metadata['repetitions_per_source'], + 'ordering_mode': self.ordering_mode, + 'source_files': source_files, + 'source_categories': source_categories, + 'clip_sequence': clip_sequence, + 'unique_categories': sorted(list(set(source_categories))), + 'target_duration_seconds': clip_duration_seconds, + 'actual_duration_seconds': len(final_audio) / 1000.0, + 'mcq_question': mcq_data['question'], + 'mcq_options': mcq_data['options'], + 'mcq_correct_answer': mcq_data['correct_answer'], + 'open_text_question': open_text_data['question'], + 'open_text_answer': open_text_data['correct_answer'], + 'llm_generated': self.llm_enabled + } + + self.logger.info( + f"Generated count sample {sample_id}: {n_unique_audios} unique sounds, " + f"{build_metadata['total_clips']} clips, {len(final_audio)/1000:.1f}s" + ) + + return metadata + + def generate_dataset(self) -> tuple: + """ + Generate the complete count task dataset. 
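+
+        Illustrative arithmetic (hypothetical config values): with
+        task_duration_size = 0.5 h (1800 s) and sample durations drawn
+        uniformly from [30 s, 60 s] (mean 45 s), roughly 1800 / 45 = 40
+        sample durations are generated upfront; the answer targets
+        1..max_clips_per_sample are then spread evenly across those
+        samples, clamped to each sample's clip capacity.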
+ + Returns: + Tuple of (mcq_csv_path, open_text_csv_path) + """ + # Generate sample durations upfront to exactly fill target duration + sample_durations = generate_sample_durations_for_task( + self.task_duration_hours, + self.min_clip_duration, + self.max_clip_duration + ) + num_samples = len(sample_durations) + self.logger.info(f"Generating {num_samples} count task samples (target: {self.task_duration_hours}h, actual: {sum(sample_durations)/3600:.2f}h)...") + + # Calculate max clips each sample can fit based on duration + max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10) + sample_max_clips = [] + for duration in sample_durations: + max_clips, _ = get_max_clip_num_to_be_joined( + duration, + self.source_clip_duration, + self.min_silence_ms + ) + # Limit to config max and available categories + max_for_sample = min(max_clips, max_clips_per_sample, len(self.dataset.CATEGORIES)) + sample_max_clips.append(max_for_sample) + + # Create balanced distribution by assigning targets based on sample capacity + # Sort samples by capacity to assign higher targets to samples that can fit them + possible_answers = list(range(1, max_clips_per_sample + 1)) + samples_per_answer = num_samples // len(possible_answers) + remainder = num_samples % len(possible_answers) + + # Create list of (sample_idx, duration, max_clips_capacity) + sample_info = [(i, sample_durations[i], sample_max_clips[i]) for i in range(num_samples)] + + # Sort by capacity (descending) - assign high targets to high-capacity samples + sample_info.sort(key=lambda x: x[2], reverse=True) + + # Assign targets: distribute each answer count across samples + balanced_assignments = [None] * num_samples + assignment_pool = [] + + for answer in possible_answers: + count = samples_per_answer + (1 if remainder > 0 else 0) + assignment_pool.extend([answer] * count) + remainder = max(0, remainder - 1) + + # Reverse pool so we assign high targets first (to high-capacity samples) + assignment_pool.sort(reverse=True) + + for idx, (sample_idx, duration, capacity) in enumerate(sample_info): + # Assign target, clamped to sample's capacity + target = min(assignment_pool[idx], capacity) + balanced_assignments[sample_idx] = target + + # Log the actual distribution after capacity clamping + from collections import Counter + distribution = Counter(balanced_assignments) + self.logger.info(f"Balanced answer distribution (after capacity-aware assignment): {dict(sorted(distribution.items()))}") + + all_metadata = [] + + for i in range(num_samples): + metadata = self.generate_sample( + i, + target_unique_count=balanced_assignments[i], + target_duration_seconds=sample_durations[i] + ) + all_metadata.append(metadata) + + # Save MCQ CSV + mcq_csv_path = self.output_base / 'count_mcq.csv' + self._save_mcq_csv(all_metadata, mcq_csv_path) + + # Save open-text CSV + open_text_csv_path = self.output_base / 'count_open_text.csv' + self._save_open_text_csv(all_metadata, open_text_csv_path) + + # Save metadata CSV + metadata_csv_path = self.output_base / 'count_metadata.csv' + self._save_metadata_csv(all_metadata, metadata_csv_path) + + self.logger.info(f"Count task dataset generation complete!") + self.logger.info(f" - MCQ CSV: {mcq_csv_path}") + self.logger.info(f" - Open-text CSV: {open_text_csv_path}") + self.logger.info(f" - Metadata CSV: {metadata_csv_path}") + self.logger.info(f" - Audio files: {self.audio_output}") + + return mcq_csv_path, open_text_csv_path + + def _save_mcq_csv(self, metadata_list: List[Dict], output_path: Path): + """Save MCQ 
format CSV.""" + with open(output_path, 'w', newline='') as f: + writer = csv.writer(f) + # Header + writer.writerow([ + 'question', 'id', 'audio_path', + 'optionA', 'optionB', 'optionC', 'optionD', + 'correct', 'source_wavs', 'source_categories' + ]) + + # Data rows + for meta in metadata_list: + writer.writerow([ + meta['mcq_question'], + meta['id'], + meta['audio_path'], + meta['mcq_options']['A'], + meta['mcq_options']['B'], + meta['mcq_options']['C'], + meta['mcq_options']['D'], + meta['mcq_correct_answer'], + str(meta['source_files']), + str(meta['unique_categories']) + ]) + + def _save_open_text_csv(self, metadata_list: List[Dict], output_path: Path): + """Save open-text format CSV.""" + with open(output_path, 'w', newline='') as f: + writer = csv.writer(f) + # Header + writer.writerow([ + 'question', 'id', 'audio_path', 'answer', + 'source_wavs', 'source_categories' + ]) + + # Data rows + for meta in metadata_list: + writer.writerow([ + meta['open_text_question'], + meta['id'], + meta['audio_path'], + meta['open_text_answer'], + str(meta['source_files']), + str(meta['unique_categories']) + ]) + + def _save_metadata_csv(self, metadata_list: List[Dict], output_path: Path): + """Save detailed metadata CSV.""" + with open(output_path, 'w', newline='') as f: + writer = csv.writer(f) + # Header + writer.writerow([ + 'id', 'audio_path', 'total_clips', 'n_unique_sounds', + 'source_files', 'source_categories', 'unique_categories', + 'ordering_mode', 'target_duration_s', 'actual_duration_s', 'llm_generated' + ]) + + # Data rows + for meta in metadata_list: + writer.writerow([ + meta['id'], + meta['audio_path'], + meta['total_clips'], + meta['n_unique_sounds'], + str(meta['source_files']), + str(meta['source_categories']), + str(meta['unique_categories']), + meta.get('ordering_mode', 'random'), + meta.get('target_duration_seconds', 0), + meta.get('actual_duration_seconds', 0), + meta.get('llm_generated', False) + ]) + + +def main(config_path: str = None): + """Main entry point for count task generation.""" + import yaml + + # Load configuration + if config_path is None: + config_path = Path(__file__).parent.parent / 'config.yaml' + + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + + # Set random seed + set_random_seed(config['random_seed']) + + # Setup logger + logger = setup_logger( + 'count_task', + log_file=str(Path(config['output']['base_path']) / config['logging']['log_file']), + level=config['logging']['level'], + console_output=config['logging']['console_output'] + ) + + # Generate dataset + generator = CountTaskGenerator(config, logger) + generator.generate_dataset() + + +if __name__ == '__main__': + main() diff --git a/tasks/task_duration.py b/tasks/task_duration.py new file mode 100644 index 0000000000000000000000000000000000000000..d22e58ddff2a88132bdb9ddb497ca8b7416c4867 --- /dev/null +++ b/tasks/task_duration.py @@ -0,0 +1,820 @@ +""" +Task 2: Duration - Generate duration comparison questions + +This task creates audio samples where sources have different effective durations +and asks questions about which sound is heard for the longest or shortest time. 
+ +Key features: +- Uses amplitude-filtered (preprocessed) audio clips with known effective durations +- First calculates max clips from total duration, then distributes slots +- Strategically distributes repetitions to ensure clear longest/shortest answers +- Consecutive ordering within sources, random order between sources +- Gap multipliers ensure unambiguous answers (e.g., longest is 1.5x longer than next) +- NO category preference - random selection to avoid bias +""" + +import csv +import random +import math +from pathlib import Path +from typing import Dict, List, Tuple, Optional +from collections import Counter + +import sys +sys.path.append(str(Path(__file__).parent.parent)) + +from utils import ( + AudioProcessor, PreprocessedESC50Dataset, QuestionGenerator, LLMQuestionGenerator, + setup_logger, set_random_seed, calculate_num_samples_for_task, + generate_single_clip_duration, get_max_clip_num_to_be_joined, + build_duration_task_audio, distribute_remainder_as_silences, + generate_sample_durations_for_task +) + + +class DurationTaskGenerator: + """Generator for duration comparison task dataset using preprocessed ESC-50.""" + + def __init__(self, config: Dict, logger): + """ + Initialize duration task generator. + + Args: + config: Configuration dictionary + logger: Logger instance + """ + self.config = config + self.logger = logger + self.task_config = config['tasks']['duration'] + + # Initialize preprocessed dataset (with effective durations) + self.dataset = PreprocessedESC50Dataset( + metadata_path=config['esc50']['metadata_path'], + audio_path=config['esc50']['audio_path'], + preprocessed_path=self.task_config['preprocessed_data_path'], + config=config # Pass config for class subset loading + ) + + # Calculate average effective duration from preprocessed data + self.avg_effective_duration = self.dataset.effective_df['effective_duration_s'].mean() + self.logger.info(f"Average effective duration: {self.avg_effective_duration:.2f}s") + + # Initialize audio processor + self.audio_processor = AudioProcessor( + crossfade_duration=config['audio']['crossfade_duration'], + silence_duration=config['audio']['silence_duration'], + with_silence=config['audio']['with_silence'], + normalize=config['audio']['normalize'], + normalize_target_dBFS=config['audio']['normalize_target_dBFS'], + synthetic_silence_path=config['synthetic_silence']['path'] + ) + + # Initialize question generator + self.question_generator = QuestionGenerator( + num_options=config['mcq']['num_options'], + option_labels=config['mcq']['option_labels'], + distractor_strategy=config['mcq']['distractor_strategy'] + ) + + # Initialize LLM question generator + self.llm_enabled = config.get('llm', {}).get('enabled', False) + self.llm_generator = LLMQuestionGenerator( + enabled=self.llm_enabled, + template_questions=self.task_config + ) + + # Duration settings from config + self.min_clip_duration = config['audio']['min_clip_duration'] + self.max_clip_duration = config['audio']['max_clip_duration'] + self.min_silence_ms = config['audio'].get('min_silence_duration', 100) + self.max_extra_silence_per_gap_ms = config['audio'].get('max_extra_silence_per_gap', 500) + self.crossfade_within_source_ms = config['audio'].get('crossfade_within_source', 50) + self.task_duration_hours = self.task_config['task_duration_size'] + + # Duration task specific settings + self.multiplier_longest = self.task_config.get('multiplier_longest', 1.5) + self.multiplier_shortest = self.task_config.get('multiplier_shortest', 0.75) + self.reject_if_gap_not_met = 
self.task_config.get('reject_if_gap_not_met', True) + self.sample_different_clips = self.task_config.get('sample_different_clips_same_class', True) + # Minimum effective duration per source (seconds) - clips shorter than this are harder to distinguish + self.min_effective_duration_per_source = self.task_config.get('min_effective_duration_per_source', 1.0) + + # Set up output paths + self.output_base = Path(config['output']['base_path']) / 'duration' + self.output_base.mkdir(parents=True, exist_ok=True) + self.audio_output = self.output_base / 'audios' + self.audio_output.mkdir(parents=True, exist_ok=True) + + # Statistics tracking + self.rejection_count = 0 + self.success_count = 0 + + def _calculate_max_clips_and_sources( + self, + target_duration_s: float, + question_type: str + ) -> Tuple[int, int, float]: + """ + Calculate max clips possible and choose n_sources from config that satisfies gap. + + Key principle: + 1. Calculate valid range of sources that can satisfy gap constraint + 2. Filter config values to only those within valid range + 3. Pick RANDOMLY from valid config values (ensures variety) + + For LONGEST: + - Target needs at least 2 clips to beat max_background by 1.5x + - max_sources = max_clips - 2 + 1 (backgrounds get 1 each) + - min_sources = 2 (need at least 1 background) + + For SHORTEST: + - Target gets 1 clip + - Each background needs at least 2 clips to be 2x target (1/0.5) + - max_sources = 1 + (max_clips - 1) // 2 + - min_sources = 2 + + Args: + target_duration_s: Target total audio duration + question_type: "longest" or "shortest" + + Returns: + Tuple of (max_clips, n_sources, remainder_s) + """ + # Get max clips using average effective duration + max_clips, remainder_s = get_max_clip_num_to_be_joined( + target_duration_s, + self.avg_effective_duration, + self.min_silence_ms + ) + + # Ensure at least 2 clips + max_clips = max(2, max_clips) + + # Get config values for n_sources + # If single int (e.g., 15), sample from [1, 15] like count/order tasks + # If list (e.g., [2,3,4]), sample from the list + num_sources_config = self.task_config.get('num_unique_sources', [2, 3, 4, 5]) + if isinstance(num_sources_config, int): + # Single int: create range [1, num_sources_config] + num_sources_config = list(range(1, num_sources_config + 1)) + + if question_type == "longest": + # Target needs at least 2 clips to reliably beat background by multiplier + # (with 1.5x multiplier, 2 clips of target vs 1 clip of background usually works) + min_target_clips = 2 + + # Minimum sources: need at least 1 background + target = 2 + min_valid_sources = 2 + + # Maximum sources: max_clips - min_target_clips + 1 + # (subtract target's clips, add 1 for the target itself) + max_valid_sources = max_clips - min_target_clips + 1 + + else: # shortest + # Target gets 1 clip + # Each background needs at least 2 clips to be >= 2x target (1/0.5 multiplier) + min_clips_per_background = 2 + + # Minimum sources: 2 (target + 1 background) + min_valid_sources = 2 + + # Maximum sources: how many backgrounds can we fit? + remaining_clips = max_clips - 1 # 1 for target + max_backgrounds = remaining_clips // min_clips_per_background + max_valid_sources = max_backgrounds + 1 # +1 for target + + # Filter config values to only valid ones + valid_config_sources = [ + n for n in num_sources_config + if min_valid_sources <= n <= max_valid_sources + ] + + if not valid_config_sources: + raise ValueError( + f"Duration task: No valid num_unique_sources for {question_type} question. 
" + f"Config values: {num_sources_config}, Valid range: [{min_valid_sources}, {max_valid_sources}]. " + f"max_clips={max_clips}, duration={target_duration_s:.1f}s. " + f"Increase min_clip_duration or adjust num_unique_sources config." + ) + + # Pick RANDOMLY from valid config values (ensures variety!) + n_sources = random.choice(valid_config_sources) + + # Validate final value + if n_sources < 2 or n_sources > len(self.dataset.CATEGORIES): + raise ValueError( + f"Duration task: Invalid n_sources={n_sources}. " + f"Must be in range [2, {len(self.dataset.CATEGORIES)}]" + ) + + self.logger.debug( + f"Max clips: {max_clips}, Question: {question_type}, " + f"Valid range: [{min_valid_sources}, {max_valid_sources}], " + f"Valid config: {valid_config_sources}, Selected: {n_sources}" + ) + + return max_clips, n_sources, remainder_s + + def _calculate_slot_distribution( + self, + max_clips: int, + n_sources: int, + effective_durations: Dict[str, float], + target_category: str, + question_type: str + ) -> Tuple[Dict[str, int], bool, Dict]: + """ + Calculate how many clips each source gets. + + For LONGEST: target gets (max_clips - n_backgrounds), backgrounds get 1 each + For SHORTEST: target gets 1, backgrounds share (max_clips - 1) + + Args: + max_clips: Maximum number of clips that fit + n_sources: Number of unique sources + effective_durations: Dict mapping category -> effective duration + target_category: The category that should be longest/shortest + question_type: "longest" or "shortest" + + Returns: + Tuple of (slot_distribution, gap_satisfied, metadata) + """ + categories = list(effective_durations.keys()) + background_categories = [c for c in categories if c != target_category] + n_backgrounds = len(background_categories) + + if question_type == "longest": + # Target gets max_clips - n_backgrounds + # Backgrounds get 1 each + target_clips = max_clips - n_backgrounds + target_clips = max(1, target_clips) # At least 1 + + slot_distribution = {target_category: target_clips} + for cat in background_categories: + slot_distribution[cat] = 1 + + # Verify gap: target_duration >= max_background × multiplier + target_duration = target_clips * effective_durations[target_category] + background_durations = [effective_durations[c] for c in background_categories] + max_background = max(background_durations) if background_durations else 0 + required_target = max_background * self.multiplier_longest + gap_satisfied = target_duration >= required_target + + metadata = { + 'target_clips': target_clips, + 'target_duration_s': target_duration, + 'max_background_s': max_background, + 'required_target_s': required_target, + 'multiplier': self.multiplier_longest + } + + else: # shortest + # Target gets 1 clip + # Backgrounds share (max_clips - 1) + remaining_clips = max_clips - 1 + clips_per_background = max(1, remaining_clips // n_backgrounds) + extra_clips = remaining_clips % n_backgrounds + + slot_distribution = {target_category: 1} + + for i, cat in enumerate(background_categories): + clips = clips_per_background + (1 if i < extra_clips else 0) + slot_distribution[cat] = clips + + # Verify gap: target_duration <= min_background × multiplier + target_duration = effective_durations[target_category] + background_durations = [ + slot_distribution[c] * effective_durations[c] + for c in background_categories + ] + min_background = min(background_durations) if background_durations else float('inf') + required_max_target = min_background * self.multiplier_shortest + + # CRITICAL: Target must still be at least 
min_effective_duration_per_source + # Otherwise clips that are too short (e.g., 0.03s) would be used and be indistinguishable + target_too_short = target_duration < self.min_effective_duration_per_source + gap_satisfied = (target_duration <= required_max_target) and (not target_too_short) + + metadata = { + 'target_clips': 1, + 'target_duration_s': target_duration, + 'min_background_s': min_background, + 'required_max_target_s': required_max_target, + 'multiplier': self.multiplier_shortest, + 'target_too_short': target_too_short + } + + return slot_distribution, gap_satisfied, metadata + + def _try_generate_sample( + self, + sample_id: int, + question_type: str, + max_retries: int = 5, + target_duration_seconds: float = None + ) -> Optional[Dict]: + """ + Try to generate a valid duration sample with retries. + + Args: + sample_id: Sample ID + question_type: "longest" or "shortest" + max_retries: Maximum retry attempts + target_duration_seconds: Pre-generated target duration + + Returns: + Metadata dict if successful, None if all retries failed + """ + for attempt in range(max_retries): + try: + result = self._generate_single_sample(sample_id, question_type, target_duration_seconds=target_duration_seconds) + if result is not None: + return result + except Exception as e: + self.logger.warning(f"Sample {sample_id} attempt {attempt+1} failed: {e}") + + return None + + def _generate_single_sample( + self, + sample_id: int, + question_type: str, + target_duration_seconds: float = None + ) -> Optional[Dict]: + """ + Generate a single duration task sample. + + Corrected Pipeline: + 1. Use pre-generated target duration (or generate if not provided) + 2. Calculate max_clips using get_max_clip_num_to_be_joined + 3. Based on max_clips and question_type, determine n_sources + 4. Select categories RANDOMLY (no bias toward short/long) + 5. Pick target category RANDOMLY from selected + 6. Get effective durations for all sources + 7. Calculate slot distribution based on max_clips + 8. Verify gap constraint + 9. Load audio clips and build final audio + + Args: + sample_id: Sample ID number + question_type: "longest" or "shortest" + target_duration_seconds: Pre-generated target duration (from generate_sample_durations_for_task) + + Returns: + Dictionary with sample metadata, or None if failed + """ + # Step 1: Use pre-generated duration or generate one (backward compatibility) + if target_duration_seconds is not None: + target_duration_s = target_duration_seconds + else: + target_duration_s = generate_single_clip_duration( + self.min_clip_duration, + self.max_clip_duration + ) + + # Step 2 & 3: Calculate max_clips and n_sources + max_clips, n_sources, remainder_s = self._calculate_max_clips_and_sources( + target_duration_s, + question_type + ) + + # Step 4: Select categories RANDOMLY (using least-used for balance, but no duration preference) + all_categories = self.dataset.get_least_used_categories(n_sources) + + # Step 5: Pick target category RANDOMLY from selected (no bias!) 
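+        # (Illustrative note: if the target were instead chosen by clip
+        # length, e.g. always the class with the longest effective clips,
+        # a model could exploit category priors instead of listening;
+        # uniform random choice removes that shortcut.)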
+        target_category = random.choice(all_categories)
+        self.dataset.category_usage_counts[target_category] += 1
+
+        # Step 6: Get effective durations by sampling one file per category
+        # Use min_effective_duration_per_source to avoid clips that are too short to distinguish
+        effective_durations = {}
+        selected_files = {}
+
+        for category in all_categories:
+            filename, filepath, eff_dur = self.dataset.sample_file_from_category_with_duration(
+                category,
+                min_effective_duration=self.min_effective_duration_per_source
+            )
+            effective_durations[category] = eff_dur
+            selected_files[category] = {
+                'filename': filename,
+                'filepath': filepath,
+                'effective_duration_s': eff_dur
+            }
+
+        # Step 7: Calculate slot distribution based on max_clips
+        slot_distribution, gap_satisfied, calc_metadata = self._calculate_slot_distribution(
+            max_clips=max_clips,
+            n_sources=n_sources,
+            effective_durations=effective_durations,
+            target_category=target_category,
+            question_type=question_type
+        )
+
+        # Step 8: If gap not satisfied, try adjustments
+        if not gap_satisfied:
+            # Try with different clips that have better durations
+            if self.sample_different_clips:
+                gap_satisfied = self._try_improve_gap_with_different_clips(
+                    question_type=question_type,
+                    target_category=target_category,
+                    all_categories=all_categories,
+                    max_clips=max_clips,
+                    n_sources=n_sources,
+                    effective_durations=effective_durations,
+                    selected_files=selected_files,
+                    slot_distribution=slot_distribution
+                )
+
+            if not gap_satisfied and self.reject_if_gap_not_met:
+                self.rejection_count += 1
+                self.logger.debug(
+                    f"Sample {sample_id} rejected: gap not satisfied "
+                    f"(type={question_type}, max_clips={max_clips}, sources={n_sources})"
+                )
+                return None
+
+        # Step 9: Load audio clips based on slot distribution
+        source_audio_lists = {}
+        files_used = {}
+
+        for category in all_categories:
+            reps = slot_distribution.get(category, 0)
+            if reps == 0:
+                continue
+
+            # Get files for this category
+            if self.sample_different_clips and reps > 1:
+                filenames, filepaths, total_dur = self.dataset.sample_files_from_category_to_reach_duration(
+                    category,
+                    reps * effective_durations[category],
+                    prefer_same_file=False
+                )
+            else:
+                # Use same file repeated
+                file_info = selected_files[category]
+                filenames = [file_info['filename']] * reps
+                filepaths = [file_info['filepath']] * reps
+
+            # Load audio segments
+            audio_list = []
+            for fp in filepaths[:reps]:
+                audio = self.audio_processor.load_audio(fp)
+                audio_list.append(audio)
+
+            # If we need more, cycle through the clips loaded so far
+            n_loaded = len(audio_list)
+            while len(audio_list) < reps:
+                audio_list.append(audio_list[len(audio_list) % n_loaded])
+
+            source_audio_lists[category] = audio_list[:reps]
+            files_used[category] = filenames[:reps]
+
+        # Step 10: Build final audio
+        final_audio, category_sequence, build_metadata = build_duration_task_audio(
+            source_audio_lists=source_audio_lists,
+            slot_distribution=slot_distribution,
+            effective_durations=effective_durations,
+            target_total_duration_s=target_duration_s,
+            min_silence_between_sources_ms=self.min_silence_ms,
+            max_extra_silence_per_gap_ms=self.max_extra_silence_per_gap_ms,
+            crossfade_within_source_ms=self.crossfade_within_source_ms
+        )
+
+        # Save audio
+        output_audio_path = self.audio_output / f"{sample_id}.wav"
+        final_audio.export(str(output_audio_path), format="wav")
+
+        # Step 11: Generate questions
+        correct_category = target_category
+        present_categories = all_categories
+
+        mcq_question = self.task_config['mcq_questions'][question_type]
+        mcq_data =
self.question_generator.generate_category_mcq( + mcq_question, + correct_category, + present_categories, + self.dataset.CATEGORIES + ) + + open_text_question = self.task_config['open_text_questions'][question_type] + open_text_data = self.question_generator.generate_category_open_text( + open_text_question, + correct_category + ) + + # Calculate actual effective durations + actual_effective_durations = { + cat: slot_distribution[cat] * effective_durations[cat] + for cat in all_categories + if cat in slot_distribution + } + + # Create metadata + metadata = { + 'id': sample_id, + 'audio_path': str(output_audio_path.relative_to(self.output_base.parent)), + 'question_type': question_type, + 'max_clips': max_clips, + 'n_unique_sources': n_sources, + 'target_category': target_category, + 'present_categories': present_categories, + 'source_order': build_metadata['source_order'], + 'slot_distribution': slot_distribution, + 'effective_durations_per_clip': effective_durations, + 'total_effective_durations': actual_effective_durations, + 'gap_satisfied': gap_satisfied, + 'multiplier_used': self.multiplier_longest if question_type == 'longest' else self.multiplier_shortest, + 'files_used': files_used, + 'target_duration_s': target_duration_s, + 'actual_duration_s': len(final_audio) / 1000.0, + 'timestamp_string': build_metadata.get('timestamp_string', ''), + 'source_timestamps': build_metadata.get('source_timestamps', []), + 'mcq_question': mcq_data['question'], + 'mcq_options': mcq_data['options'], + 'mcq_correct_answer': mcq_data['correct_answer'], + 'open_text_question': open_text_data['question'], + 'open_text_answer': open_text_data['correct_answer'], + 'calc_metadata': calc_metadata + } + + self.success_count += 1 + self.logger.info( + f"Generated duration sample {sample_id}: {question_type}, " + f"max_clips={max_clips}, sources={n_sources}, target={target_category}, " + f"slots={slot_distribution}, gap_satisfied={gap_satisfied}" + ) + + return metadata + + def _try_improve_gap_with_different_clips( + self, + question_type: str, + target_category: str, + all_categories: List[str], + max_clips: int, + n_sources: int, + effective_durations: Dict[str, float], + selected_files: Dict[str, Dict], + slot_distribution: Dict[str, int] + ) -> bool: + """ + Try to improve gap satisfaction by selecting different clips. 
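+
+        Worked example (hypothetical durations): with 2 target clips of 2.0 s
+        each (4.0 s total) against a 3.0 s background, the longest-gap check
+        needs 1.5 x 3.0 = 4.5 s; swapping in the category's longest clip at,
+        say, 2.4 s raises the target to 4.8 s and satisfies the gap.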
+ + For LONGEST: try clips with longer effective duration for target + For SHORTEST: try clips with shorter effective duration for target + + Args: + Various state from generate_sample + + Returns: + True if gap is now satisfied + """ + files = self.dataset.get_files_by_category_with_durations(target_category) + + if question_type == "longest": + # Try to find a longer clip for target category + files_sorted = sorted(files, key=lambda x: x['effective_duration_s'], reverse=True) + else: + # For shortest, try shorter clip for target + files_sorted = sorted(files, key=lambda x: x['effective_duration_s']) + + if files_sorted: + best = files_sorted[0] + effective_durations[target_category] = best['effective_duration_s'] + selected_files[target_category] = { + 'filename': best['filename'], + 'filepath': best['filepath'], + 'effective_duration_s': best['effective_duration_s'] + } + + # Recalculate slot distribution + new_slots, gap_satisfied, _ = self._calculate_slot_distribution( + max_clips=max_clips, + n_sources=n_sources, + effective_durations=effective_durations, + target_category=target_category, + question_type=question_type + ) + + if gap_satisfied: + slot_distribution.clear() + slot_distribution.update(new_slots) + + return gap_satisfied + + def generate_sample(self, sample_id: int, target_question_type: str = None, target_duration_seconds: float = None) -> Optional[Dict]: + """ + Generate a single duration task sample with retries. + + Args: + sample_id: Sample ID number + target_question_type: Target question type for balanced distribution + target_duration_seconds: Pre-generated target duration (from generate_sample_durations_for_task) + + Returns: + Dictionary with sample metadata, or None if failed + """ + question_type = target_question_type or random.choice( + self.task_config['question_types'] + ) + + return self._try_generate_sample(sample_id, question_type, target_duration_seconds=target_duration_seconds) + + def generate_dataset(self) -> tuple: + """ + Generate the complete duration task dataset. + + Uses generate_sample_durations_for_task() to pre-generate exact sample durations + that sum to exactly the target task duration. This guarantees: + - Exact coverage of target duration + - No estimation errors from average-based calculation + + Returns: + Tuple of (mcq_csv_path, open_text_csv_path) + """ + # Generate sample durations upfront (guarantees exact total duration) + sample_durations = generate_sample_durations_for_task( + self.task_duration_hours, + self.min_clip_duration, + self.max_clip_duration + ) + num_samples = len(sample_durations) + + self.logger.info( + f"Generating {num_samples} duration task samples " + f"(target: {self.task_duration_hours}h, exact fill)..." 
+ ) + + # Create balanced question type distribution + question_types = self.task_config['question_types'] + balanced_types = [] + samples_per_type = num_samples // len(question_types) + remainder = num_samples % len(question_types) + + for qtype in question_types: + count = samples_per_type + (1 if remainder > 0 else 0) + balanced_types.extend([qtype] * count) + remainder = max(0, remainder - 1) + + random.shuffle(balanced_types) + type_dist = Counter(balanced_types) + self.logger.info(f"Balanced question type distribution: {dict(sorted(type_dist.items()))}") + + all_metadata = [] + sample_idx = 0 + type_idx = 0 + + while len(all_metadata) < num_samples and type_idx < len(balanced_types) * 2: + question_type = balanced_types[type_idx % len(balanced_types)] + target_duration = sample_durations[sample_idx] if sample_idx < len(sample_durations) else None + + metadata = self.generate_sample(sample_idx, question_type, target_duration_seconds=target_duration) + + if metadata is not None: + all_metadata.append(metadata) + sample_idx += 1 + + type_idx += 1 + + # Log progress + if len(all_metadata) % 50 == 0: + self.logger.info( + f"Progress: {len(all_metadata)}/{num_samples} samples, " + f"{self.rejection_count} rejections" + ) + + self.logger.info( + f"Generation complete: {len(all_metadata)} samples, " + f"{self.rejection_count} rejections " + f"({self.rejection_count/(len(all_metadata)+self.rejection_count)*100:.1f}% rejection rate)" + ) + + # Save CSVs + mcq_csv_path = self.output_base / 'duration_mcq.csv' + self._save_mcq_csv(all_metadata, mcq_csv_path) + + open_text_csv_path = self.output_base / 'duration_open_text.csv' + self._save_open_text_csv(all_metadata, open_text_csv_path) + + metadata_csv_path = self.output_base / 'duration_metadata.csv' + self._save_metadata_csv(all_metadata, metadata_csv_path) + + self.logger.info(f"Duration task dataset generation complete!") + self.logger.info(f" - MCQ CSV: {mcq_csv_path}") + self.logger.info(f" - Open-text CSV: {open_text_csv_path}") + self.logger.info(f" - Metadata CSV: {metadata_csv_path}") + self.logger.info(f" - Audio files: {self.audio_output}") + + return mcq_csv_path, open_text_csv_path + + def _save_mcq_csv(self, metadata_list: List[Dict], output_path: Path): + """Save MCQ format CSV.""" + with open(output_path, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow([ + 'question', 'id', 'audio_path', + 'optionA', 'optionB', 'optionC', 'optionD', + 'correct', 'question_type', 'max_clips', 'n_sources', + 'target_category', 'slot_distribution', 'effective_durations' + ]) + + for meta in metadata_list: + writer.writerow([ + meta['mcq_question'], + meta['id'], + meta['audio_path'], + meta['mcq_options']['A'], + meta['mcq_options']['B'], + meta['mcq_options']['C'], + meta['mcq_options']['D'], + meta['mcq_correct_answer'], + meta['question_type'], + meta['max_clips'], + meta['n_unique_sources'], + meta['target_category'], + str(meta['slot_distribution']), + str(meta['total_effective_durations']) + ]) + + def _save_open_text_csv(self, metadata_list: List[Dict], output_path: Path): + """Save open-text format CSV.""" + with open(output_path, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow([ + 'question', 'id', 'audio_path', 'answer', + 'question_type', 'max_clips', 'n_sources', + 'target_category', 'effective_durations' + ]) + + for meta in metadata_list: + writer.writerow([ + meta['open_text_question'], + meta['id'], + meta['audio_path'], + meta['open_text_answer'], + meta['question_type'], + meta['max_clips'], + 
meta['n_unique_sources'], + meta['target_category'], + str(meta['total_effective_durations']) + ]) + + def _save_metadata_csv(self, metadata_list: List[Dict], output_path: Path): + """Save detailed metadata CSV with effective durations and timestamps.""" + with open(output_path, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow([ + 'id', 'audio_path', 'question_type', 'max_clips', 'n_sources', + 'target_category', 'present_categories', 'source_order', + 'slot_distribution', 'effective_durations_per_clip', + 'total_effective_durations', 'gap_satisfied', 'multiplier_used', + 'target_duration_s', 'actual_duration_s', 'clip_timestamps', 'files_used' + ]) + + for meta in metadata_list: + writer.writerow([ + meta['id'], + meta['audio_path'], + meta['question_type'], + meta['max_clips'], + meta['n_unique_sources'], + meta['target_category'], + str(meta['present_categories']), + str(meta['source_order']), + str(meta['slot_distribution']), + str(meta['effective_durations_per_clip']), + str(meta['total_effective_durations']), + meta['gap_satisfied'], + meta['multiplier_used'], + round(meta['target_duration_s'], 2), + round(meta['actual_duration_s'], 2), + meta.get('timestamp_string', ''), + str(meta['files_used']) + ]) + + +def main(config_path: str = None): + """Main entry point for duration task generation.""" + import yaml + + if config_path is None: + config_path = Path(__file__).parent.parent / 'config.yaml' + + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + + set_random_seed(config['random_seed']) + + logger = setup_logger( + 'duration_task', + log_file=str(Path(config['output']['base_path']) / config['logging']['log_file']), + level=config['logging']['level'], + console_output=config['logging']['console_output'] + ) + + generator = DurationTaskGenerator(config, logger) + generator.generate_dataset() + + +if __name__ == '__main__': + main() diff --git a/tasks/task_order.py b/tasks/task_order.py new file mode 100644 index 0000000000000000000000000000000000000000..436c201b9b55d940226cb6abbb576bb40d617e7a --- /dev/null +++ b/tasks/task_order.py @@ -0,0 +1,598 @@ +""" +Task 3: Order - Generate temporal ordering questions + +This task joins multiple audio sources and asks questions about their temporal order +(first, last, what comes after, what comes before). +""" + +import csv +import random +import math +from pathlib import Path +from typing import Dict, List + +import sys +sys.path.append(str(Path(__file__).parent.parent)) + +from utils import ( + AudioProcessor, ESC50Dataset, QuestionGenerator, LLMQuestionGenerator, + setup_logger, set_random_seed, calculate_num_samples_for_task, + generate_single_clip_duration, get_max_clip_num_to_be_joined, + build_clip_sequence_with_silences, generate_sample_durations_for_task +) + + +class OrderTaskGenerator: + """Generator for temporal ordering task dataset.""" + + def __init__(self, config: Dict, logger): + """ + Initialize order task generator. 
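+
+        Reads the `tasks.order` block of config.yaml (question_types,
+        min_clips_for_second_questions, allow_source_repetition,
+        task_duration_size) together with the shared `audio`, `mcq`, and
+        `llm` settings used below.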
+
+        Args:
+            config: Configuration dictionary
+            logger: Logger instance
+        """
+        self.config = config
+        self.logger = logger
+        self.task_config = config['tasks']['order']
+
+        # Initialize components
+        self.dataset = ESC50Dataset(
+            config['esc50']['metadata_path'],
+            config['esc50']['audio_path'],
+            config  # Pass config for class subset loading
+        )
+        self.audio_processor = AudioProcessor(
+            crossfade_duration=config['audio']['crossfade_duration'],
+            silence_duration=config['audio']['silence_duration'],
+            with_silence=config['audio']['with_silence'],
+            normalize=config['audio']['normalize'],
+            normalize_target_dBFS=config['audio']['normalize_target_dBFS'],
+            synthetic_silence_path=config['synthetic_silence']['path']
+        )
+        self.question_generator = QuestionGenerator(
+            num_options=config['mcq']['num_options'],
+            option_labels=config['mcq']['option_labels'],
+            distractor_strategy=config['mcq']['distractor_strategy']
+        )
+
+        # Initialize LLM question generator
+        self.llm_enabled = config.get('llm', {}).get('enabled', False)
+        self.llm_generator = LLMQuestionGenerator(
+            enabled=self.llm_enabled,
+            template_questions=self.task_config
+        )
+
+        # Duration settings from config
+        self.min_clip_duration = config['audio']['min_clip_duration']
+        self.max_clip_duration = config['audio']['max_clip_duration']
+        # Duration of individual source clips (ESC-50 default is 5s)
+        self.source_clip_duration = config['audio'].get('source_clip_duration', 5.0)
+        self.min_silence_ms = config['audio'].get('min_silence_duration', 100)
+        self.max_extra_silence_per_gap_ms = config['audio'].get('max_extra_silence_per_gap', 500)
+        self.crossfade_ms = config['audio'].get('crossfade_duration', 0)
+        self.task_duration_hours = self.task_config['task_duration_size']
+
+        # Order task specific settings
+        self.allow_source_repetition = self.task_config.get('allow_source_repetition', False)
+        self.min_clips_for_second = self.task_config.get('min_clips_for_second_questions', 4)
+
+        # Set up output paths
+        self.output_base = Path(config['output']['base_path']) / 'order'
+        self.output_base.mkdir(parents=True, exist_ok=True)
+        self.audio_output = self.output_base / 'audios'
+        self.audio_output.mkdir(parents=True, exist_ok=True)
+
+    def _get_valid_question_types(self, n_clips: int) -> List[str]:
+        """
+        Get question types valid for the given number of clips.
+
+        "second" and "second_last" require at least min_clips_for_second clips.
+
+        Args:
+            n_clips: Number of clips in the sample
+
+        Returns:
+            List of valid question types
+        """
+        all_types = self.task_config['question_types']
+
+        # Filter based on n_clips
+        valid_types = []
+        for qtype in all_types:
+            if qtype in ['second', 'second_last']:
+                if n_clips >= self.min_clips_for_second:
+                    valid_types.append(qtype)
+            elif qtype in ['after', 'before']:
+                if n_clips >= 2:
+                    valid_types.append(qtype)
+            else:  # first, last
+                valid_types.append(qtype)
+
+        return valid_types if valid_types else ['first', 'last']
+
+    def generate_sample(self, sample_id: int, target_question_type: str = None, target_duration_seconds: float = None) -> Dict:
+        """
+        Generate a single order task sample.
+
+        Pipeline: pick a target duration -> divide by the source clip length
+        to get how many clips fit -> pick one class and one clip per slot ->
+        concatenate the clips -> insert silences, spreading the leftover
+        seconds randomly across the gaps.
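+
+        Illustrative arithmetic (hypothetical numbers, assuming the helper
+        budgets source_clip_duration + min_silence per clip): for a 47 s
+        target with 5 s source clips and 100 ms minimum silences,
+        get_max_clip_num_to_be_joined yields max_clips = floor(47 / 5.1) = 9
+        with ~1.1 s left over, and n_clips is then drawn from
+        [max(2, 9 - 3), min(9, max_clips_per_sample)].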
+ + Args: + sample_id: Sample ID number + target_question_type: Target question type for balanced distribution + target_duration_seconds: Pre-generated target duration (from generate_sample_durations_for_task) + + Returns: + Dictionary with sample metadata + """ + # Use pre-generated duration or generate one (backward compatibility) + if target_duration_seconds is not None: + clip_duration_seconds = target_duration_seconds + else: + clip_duration_seconds = generate_single_clip_duration( + self.min_clip_duration, + self.max_clip_duration + ) + + # Calculate how many clips we need using the new helper + max_clips, remainder_seconds = get_max_clip_num_to_be_joined( + clip_duration_seconds, + self.source_clip_duration, + self.min_silence_ms + ) + + max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10) + + # Silence reduction strategy: subsample from [max(2, max_clips-3), min(max_clips, max_clips_per_sample)] + # This ensures we use close to max_clips that fit, reducing excessive silence + + # Calculate valid range for this sample's duration + min_clips_for_sample = max(2, max_clips - 3) # At least 2, preferably max_clips-3 + max_clips_for_sample = min(max_clips, max_clips_per_sample, len(self.dataset.CATEGORIES)) + + # Validate range + if max_clips_for_sample < 2: + raise ValueError( + f"Sample {sample_id}: Cannot generate order task - need at least 2 clips. " + f"max_clips={max_clips}, max_clips_per_sample={max_clips_per_sample}, " + f"duration={clip_duration_seconds:.1f}s. Increase min_clip_duration." + ) + + if min_clips_for_sample > max_clips_for_sample: + raise ValueError( + f"Sample {sample_id}: Invalid clip range - min_clips ({min_clips_for_sample}) > max_clips ({max_clips_for_sample}). " + f"max_clips={max_clips}, max_clips_per_sample={max_clips_per_sample}, duration={clip_duration_seconds:.1f}s" + ) + + # Randomly select from valid range (NO balanced pool for order task) + n_clips = random.randint(min_clips_for_sample, max_clips_for_sample) + + # Get valid question types for this n_clips + valid_question_types = self._get_valid_question_types(n_clips) + + if not valid_question_types: + raise ValueError( + f"Sample {sample_id}: No valid question types for n_clips={n_clips}. " + f"This should not happen - check _get_valid_question_types implementation." + ) + + # Pre-select question type to determine answer position + if target_question_type is not None: + if target_question_type not in valid_question_types: + raise ValueError( + f"Sample {sample_id}: target_question_type='{target_question_type}' not valid for n_clips={n_clips}. " + f"Valid types: {valid_question_types}. Balanced distribution should only assign valid types." 
+ ) + question_type = target_question_type + else: + question_type = random.choice(valid_question_types) + + # Determine answer position based on question type + if question_type == 'first': + answer_position = 0 + elif question_type == 'last': + answer_position = n_clips - 1 + elif question_type == 'second': + answer_position = 1 # 0-indexed, so position 1 is second + elif question_type == 'second_last': + answer_position = n_clips - 2 # Second to last + elif question_type == 'after': + # Answer is after a reference, so position 1 to n-1 + answer_position = random.randint(1, n_clips - 1) if n_clips >= 2 else 0 + else: # before + # Answer is before a reference, so position 0 to n-2 + answer_position = random.randint(0, n_clips - 2) if n_clips >= 2 else 0 + + # Select answer category from least-used categories + answer_category = self.dataset.get_least_used_categories(1)[0] + + # Sample remaining categories, ensuring balanced distribution + if n_clips <= len(self.dataset.CATEGORIES): + other_categories = self.dataset.get_least_used_categories( + n_clips - 1, + exclude=[answer_category] + ) + else: + # Need more clips than unique categories - sample with some repetition + other_categories = self.dataset.get_least_used_categories( + min(n_clips - 1, len(self.dataset.CATEGORIES) - 1), + exclude=[answer_category] + ) + # Add random repetitions if needed + while len(other_categories) < n_clips - 1: + other_categories.append(random.choice(self.dataset.CATEGORIES)) + + # Arrange categories with answer at correct position + selected_categories = [] + other_idx = 0 + for i in range(n_clips): + if i == answer_position: + selected_categories.append(answer_category) + else: + selected_categories.append(other_categories[other_idx]) + other_idx += 1 + + # Track usage of answer category + self.dataset.category_usage_counts[answer_category] += 1 + + # Sample one file from each category and load audio + audio_segments = [] + filenames_list = [] + + for category in selected_categories: + filename, filepath = self.dataset.sample_file_from_category(category) + audio = self.audio_processor.load_audio(filepath) + audio_segments.append(audio) + filenames_list.append(filename) + + # Build final audio with guaranteed silences between clips + output_audio_path = self.audio_output / f"{sample_id}.wav" + final_audio = build_clip_sequence_with_silences( + audio_segments, + clip_duration_seconds, + min_silence_ms=self.min_silence_ms, + max_extra_silence_per_gap_ms=self.max_extra_silence_per_gap_ms, + crossfade_ms=self.crossfade_ms + ) + + # Save the audio + final_audio.export(str(output_audio_path), format="wav") + + # Determine correct answer and generate questions based on question type + # CRITICAL BUG FIX: Verify answer_category is actually at answer_position + if selected_categories[answer_position] != answer_category: + self.logger.error(f"Sample {sample_id}: Answer mismatch! 
Expected {answer_category} at position {answer_position}, got {selected_categories[answer_position]}") + # Force correct by using actual category at answer_position + correct_category = selected_categories[answer_position] + else: + correct_category = answer_category + + if question_type == 'first': + mcq_question = self.task_config['mcq_questions']['first'] + open_text_question = self.task_config['open_text_questions']['first'] + + elif question_type == 'last': + mcq_question = self.task_config['mcq_questions']['last'] + open_text_question = self.task_config['open_text_questions']['last'] + + elif question_type == 'second': + mcq_question = self.task_config['mcq_questions']['second'] + open_text_question = self.task_config['open_text_questions']['second'] + + elif question_type == 'second_last': + mcq_question = self.task_config['mcq_questions']['second_last'] + open_text_question = self.task_config['open_text_questions']['second_last'] + + elif question_type == 'after': + # Reference is the sound before answer_position + if answer_position > 0: + reference_category = selected_categories[answer_position - 1] + mcq_question = self.task_config['mcq_questions']['after'].format(sound1=reference_category) + open_text_question = self.task_config['open_text_questions']['after'].format(sound1=reference_category) + else: + # Fallback shouldn't happen but handle gracefully + mcq_question = self.task_config['mcq_questions']['first'] + open_text_question = self.task_config['open_text_questions']['first'] + + else: # before + # Reference is the sound after answer_position + if answer_position < n_clips - 1: + reference_category = selected_categories[answer_position + 1] + mcq_question = self.task_config['mcq_questions']['before'].format(sound2=reference_category) + open_text_question = self.task_config['open_text_questions']['before'].format(sound2=reference_category) + else: + # Fallback to 'first' if only 1 clip + correct_category = selected_categories[0] + mcq_question = self.task_config['mcq_questions']['first'] + open_text_question = self.task_config['open_text_questions']['first'] + question_type = 'first' + + # Generate MCQ + mcq_data = self.question_generator.generate_category_mcq( + mcq_question, + correct_category, + selected_categories, + self.dataset.CATEGORIES + ) + + # Generate open-text question + open_text_data = self.question_generator.generate_category_open_text( + open_text_question, + correct_category + ) + + # Also generate a sequence question for open-text + sequence_question = self.task_config['open_text_questions']['sequence'] + sequence_data = self.question_generator.generate_sequence_open_text( + sequence_question, + selected_categories + ) + + # Create metadata + metadata = { + 'id': sample_id, + 'audio_path': str(output_audio_path.relative_to(self.output_base.parent)), + 'n_clips': n_clips, + 'question_type': question_type, + 'audio_sequence': selected_categories, + 'correct_answer_category': correct_category, + 'source_files': filenames_list, + 'mcq_question': mcq_data['question'], + 'mcq_options': mcq_data['options'], + 'mcq_correct_answer': mcq_data['correct_answer'], + 'open_text_question': open_text_data['question'], + 'open_text_answer': open_text_data['correct_answer'], + 'sequence_question': sequence_data['question'], + 'sequence_answer': sequence_data['correct_answer'] + } + + self.logger.info(f"Generated order sample {sample_id}: {question_type}, {n_clips} clips") + + return metadata + + def generate_dataset(self) -> tuple: + """ + Generate the complete order task 
dataset. + + Uses generate_sample_durations_for_task() to pre-generate exact sample durations + that sum to exactly the target task duration. This guarantees: + - Exact coverage of target duration + - No estimation errors from average-based calculation + + Returns: + Tuple of (mcq_csv_path, open_text_csv_path, sequence_csv_path) + """ + # Generate sample durations upfront (guarantees exact total duration) + sample_durations = generate_sample_durations_for_task( + self.task_duration_hours, + self.min_clip_duration, + self.max_clip_duration + ) + num_samples = len(sample_durations) + + self.logger.info(f"Generating {num_samples} order task samples (target: {self.task_duration_hours}h, exact fill)...") + + # Calculate effective max clips each sample can use (accounting for silence reduction) + # This matches the logic in generate_sample() + max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10) + sample_effective_max_clips = [] + + for duration in sample_durations: + max_clips, _ = get_max_clip_num_to_be_joined( + duration, + self.source_clip_duration, + self.min_silence_ms + ) + # Apply the same constraints as generate_sample() + effective_max = min(max_clips, max_clips_per_sample, len(self.dataset.CATEGORIES)) + sample_effective_max_clips.append(effective_max) + + # Create capacity-aware balanced question type distribution + # Categorize question types by clip requirements + question_types = self.task_config['question_types'] + + # Separate into tiers based on clip requirements + basic_types = ['first', 'last', 'after', 'before'] # Need >= 2 clips + advanced_types = ['second', 'second_last'] # Need >= min_clips_for_second + + # Count how many samples can support each tier (use effective max, not raw max) + samples_for_basic = sum(1 for emc in sample_effective_max_clips if emc >= 2) + samples_for_advanced = sum(1 for emc in sample_effective_max_clips if emc >= self.min_clips_for_second) + + # Create list of (sample_idx, duration, effective_max_clips) + sample_info = [(i, sample_durations[i], sample_effective_max_clips[i]) for i in range(num_samples)] + + # Sort by capacity (descending) - assign advanced types to high-capacity samples + sample_info.sort(key=lambda x: x[2], reverse=True) + + # Calculate distribution: prefer advanced types for longer clips + samples_per_type = num_samples // len(question_types) + remainder = num_samples % len(question_types) + + # Build assignment pool - advanced types first (for high-capacity samples) + assignment_pool = [] + for qtype in advanced_types: + count = samples_per_type + (1 if remainder > 0 else 0) + assignment_pool.extend([qtype] * count) + remainder = max(0, remainder - 1) + + for qtype in basic_types: + count = samples_per_type + (1 if remainder > 0 else 0) + assignment_pool.extend([qtype] * count) + remainder = max(0, remainder - 1) + + # Assign question types based on capacity + balanced_assignments = [None] * num_samples + + for idx, (sample_idx, duration, capacity) in enumerate(sample_info): + target_qtype = assignment_pool[idx] + + # Validate and adjust if needed + valid_types = self._get_valid_question_types(capacity) + + if target_qtype not in valid_types: + # Assign a valid alternative - prefer similar types + if target_qtype in advanced_types and any(t in valid_types for t in basic_types): + # Downgrade to basic type + target_qtype = random.choice([t for t in basic_types if t in valid_types]) + else: + # Fallback to any valid type + target_qtype = random.choice(valid_types) + + balanced_assignments[sample_idx] = 
target_qtype
+
+        # Log the actual distribution after capacity-aware assignment
+        from collections import Counter
+        type_dist = Counter(balanced_assignments)
+        self.logger.info(f"Balanced question type distribution (after capacity-aware assignment): {dict(sorted(type_dist.items()))}")
+
+        all_metadata = []
+
+        for i, target_duration in enumerate(sample_durations):
+            metadata = self.generate_sample(i, target_question_type=balanced_assignments[i], target_duration_seconds=target_duration)
+            all_metadata.append(metadata)
+
+        # Save MCQ CSV
+        mcq_csv_path = self.output_base / 'order_mcq.csv'
+        self._save_mcq_csv(all_metadata, mcq_csv_path)
+
+        # Save open-text CSV
+        open_text_csv_path = self.output_base / 'order_open_text.csv'
+        self._save_open_text_csv(all_metadata, open_text_csv_path)
+
+        # Save sequence CSV
+        sequence_csv_path = self.output_base / 'order_sequence.csv'
+        self._save_sequence_csv(all_metadata, sequence_csv_path)
+
+        # Save metadata CSV
+        metadata_csv_path = self.output_base / 'order_metadata.csv'
+        self._save_metadata_csv(all_metadata, metadata_csv_path)
+
+        self.logger.info("Order task dataset generation complete!")
+        self.logger.info(f"  - MCQ CSV: {mcq_csv_path}")
+        self.logger.info(f"  - Open-text CSV: {open_text_csv_path}")
+        self.logger.info(f"  - Sequence CSV: {sequence_csv_path}")
+        self.logger.info(f"  - Metadata CSV: {metadata_csv_path}")
+        self.logger.info(f"  - Audio files: {self.audio_output}")
+
+        return mcq_csv_path, open_text_csv_path, sequence_csv_path
+
+    def _save_mcq_csv(self, metadata_list: List[Dict], output_path: Path):
+        """Save MCQ format CSV."""
+        with open(output_path, 'w', newline='') as f:
+            writer = csv.writer(f)
+            # Header
+            writer.writerow([
+                'question', 'id', 'audio_path',
+                'optionA', 'optionB', 'optionC', 'optionD',
+                'correct', 'question_type', 'audio_sequence'
+            ])
+
+            # Data rows
+            for meta in metadata_list:
+                writer.writerow([
+                    meta['mcq_question'],
+                    meta['id'],
+                    meta['audio_path'],
+                    meta['mcq_options']['A'],
+                    meta['mcq_options']['B'],
+                    meta['mcq_options']['C'],
+                    meta['mcq_options']['D'],
+                    meta['mcq_correct_answer'],
+                    meta['question_type'],
+                    str(meta['audio_sequence'])
+                ])
+
+    def _save_open_text_csv(self, metadata_list: List[Dict], output_path: Path):
+        """Save open-text format CSV."""
+        with open(output_path, 'w', newline='') as f:
+            writer = csv.writer(f)
+            # Header
+            writer.writerow([
+                'question', 'id', 'audio_path', 'answer',
+                'question_type', 'audio_sequence'
+            ])
+
+            # Data rows
+            for meta in metadata_list:
+                writer.writerow([
+                    meta['open_text_question'],
+                    meta['id'],
+                    meta['audio_path'],
+                    meta['open_text_answer'],
+                    meta['question_type'],
+                    str(meta['audio_sequence'])
+                ])
+
+    def _save_sequence_csv(self, metadata_list: List[Dict], output_path: Path):
+        """Save sequence question CSV."""
+        with open(output_path, 'w', newline='') as f:
+            writer = csv.writer(f)
+            # Header
+            writer.writerow([
+                'question', 'id', 'audio_path', 'answer', 'audio_sequence'
+            ])
+
+            # Data rows
+            for meta in metadata_list:
+                writer.writerow([
+                    meta['sequence_question'],
+                    meta['id'],
+                    meta['audio_path'],
+                    meta['sequence_answer'],
+                    str(meta['audio_sequence'])
+                ])
+
+    def _save_metadata_csv(self, metadata_list: List[Dict], output_path: Path):
+        """Save detailed metadata CSV."""
+        with open(output_path, 'w', newline='') as f:
+            writer = csv.writer(f)
+            # Header
+            writer.writerow([
+                'id', 'audio_path', 'n_clips', 'question_type',
+                'audio_sequence', 'correct_answer', 'source_files'
+            ])
+
+            # Data rows
+            for meta in metadata_list:
writer.writerow([ + meta['id'], + meta['audio_path'], + meta['n_clips'], + meta['question_type'], + str(meta['audio_sequence']), + meta['correct_answer_category'], + str(meta['source_files']) + ]) + + +def main(config_path: str = None): + """Main entry point for order task generation.""" + import yaml + + # Load configuration + if config_path is None: + config_path = Path(__file__).parent.parent / 'config.yaml' + + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + + # Set random seed + set_random_seed(config['random_seed']) + + # Setup logger + logger = setup_logger( + 'order_task', + log_file=str(Path(config['output']['base_path']) / config['logging']['log_file']), + level=config['logging']['level'], + console_output=config['logging']['console_output'] + ) + + # Generate dataset + generator = OrderTaskGenerator(config, logger) + generator.generate_dataset() + + +if __name__ == '__main__': + main() diff --git a/tasks/task_volume.py b/tasks/task_volume.py new file mode 100644 index 0000000000000000000000000000000000000000..e0f6a6c20877948c9853936f70b6cbf414465d22 --- /dev/null +++ b/tasks/task_volume.py @@ -0,0 +1,732 @@ +""" +Task 4: Volume - Generate volume comparison questions + +This task joins multiple audio sources with different volume levels +and asks questions about the loudest or softest sound. +""" + +import csv +import random +import math +from pathlib import Path +from typing import Dict, List, Tuple, Optional + +import sys +sys.path.append(str(Path(__file__).parent.parent)) + +from utils import ( + AudioProcessor, ESC50Dataset, QuestionGenerator, LLMQuestionGenerator, + setup_logger, set_random_seed, calculate_num_samples_for_task, + generate_single_clip_duration, get_max_clip_num_to_be_joined, + build_clip_sequence_with_silences, generate_sample_durations_for_task, + get_lufs_loudness, normalize_to_lufs +) + + +class VolumeTaskGenerator: + """Generator for volume comparison task dataset.""" + + def __init__(self, config: Dict, logger): + """ + Initialize volume task generator. 
+ + Args: + config: Configuration dictionary + logger: Logger instance + """ + self.config = config + self.logger = logger + self.task_config = config['tasks']['volume'] + + # Initialize components + self.dataset = ESC50Dataset( + config['esc50']['metadata_path'], + config['esc50']['audio_path'], + config # Pass config for class subset loading + ) + self.audio_processor = AudioProcessor( + crossfade_duration=config['audio']['crossfade_duration'], + silence_duration=config['audio']['silence_duration'], + with_silence=config['audio']['with_silence'], + normalize=config['audio']['normalize'], + normalize_target_dBFS=config['audio']['normalize_target_dBFS'], + synthetic_silence_path=config['synthetic_silence']['path'] + ) + self.question_generator = QuestionGenerator( + num_options=config['mcq']['num_options'], + option_labels=config['mcq']['option_labels'], + distractor_strategy=config['mcq']['distractor_strategy'] + ) + + # Initialize LLM question generator + self.llm_enabled = config.get('llm', {}).get('enabled', False) + self.llm_generator = LLMQuestionGenerator( + enabled=self.llm_enabled, + template_questions=self.task_config + ) + + # Duration settings from config + self.min_clip_duration = config['audio']['min_clip_duration'] + self.max_clip_duration = config['audio']['max_clip_duration'] + # Duration of individual source clips (ESC-50 default is 5s) + self.source_clip_duration = config['audio'].get('source_clip_duration', 5.0) + self.min_silence_ms = config['audio'].get('min_silence_duration', 100) + self.max_extra_silence_per_gap_ms = config['audio'].get('max_extra_silence_per_gap', 500) + self.crossfade_ms = config['audio'].get('crossfade_duration', 0) + self.task_duration_hours = self.task_config['task_duration_size'] + + # Volume task specific settings + self.normalize_to_baseline = self.task_config.get('normalize_to_baseline', True) + self.baseline_dBFS = self.task_config.get('baseline_dBFS', -20.0) + self.use_same_clip_different_volumes = self.task_config.get('use_same_clip_different_volumes', False) + self.repetitions_per_source = self.task_config.get('repetitions_per_source', [2, 3, 4]) + if isinstance(self.repetitions_per_source, int): + self.repetitions_per_source = [self.repetitions_per_source] + + # Volume gap multipliers (similar to duration task) + self.multiplier_max_loudness = self.task_config.get('multiplier_max_loudness', 1.5) + self.multiplier_min_loudness = self.task_config.get('multiplier_min_loudness', 0.5) + self.reject_if_gap_not_met = self.task_config.get('reject_if_gap_not_met', True) + + # LUFS vs dBFS loudness measurement option + # LUFS (Loudness Units Full Scale) measures PERCEIVED loudness + # dBFS measures RMS amplitude - does NOT account for frequency sensitivity + # LUFS is recommended for comparing different sound types + self.use_lufs = self.task_config.get('use_lufs', True) + self.baseline_lufs = self.task_config.get('baseline_lufs', -23.0) # EBU R128 standard + + # Set up output paths + self.output_base = Path(config['output']['base_path']) / 'volume' + self.output_base.mkdir(parents=True, exist_ok=True) + self.audio_output = self.output_base / 'audios' + self.audio_output.mkdir(parents=True, exist_ok=True) + + # Create balanced sampling pool for num_clips + self.clips_count_pool = [] + + def _normalize_to_baseline(self, audio: "AudioSegment") -> "AudioSegment": + """ + Normalize audio to the baseline loudness level. + + Uses LUFS (perceived loudness) if use_lufs=True, otherwise dBFS. 
+ This ensures all clips start from the same perceived loudness before + applying volume adjustments. + + Args: + audio: Input audio segment + + Returns: + Normalized audio segment + """ + if not self.normalize_to_baseline: + return audio + + if self.use_lufs: + # Use LUFS-based normalization (perceived loudness) + normalized = normalize_to_lufs(audio, self.baseline_lufs) + self.logger.debug( + f"Normalized to baseline LUFS: {get_lufs_loudness(audio):.2f} -> {get_lufs_loudness(normalized):.2f} LUFS" + ) + return normalized + else: + # Use dBFS normalization (RMS amplitude) + change_in_dBFS = self.baseline_dBFS - audio.dBFS + normalized = audio.apply_gain(change_in_dBFS) + self.logger.debug( + f"Normalized to baseline dBFS: {audio.dBFS:.2f} -> {normalized.dBFS:.2f} dBFS" + ) + return normalized + + def _get_amplitude_loudness(self, audio: "AudioSegment") -> float: + """ + Get the loudness of an audio clip. + + Uses LUFS (perceived loudness) if use_lufs=True, otherwise dBFS. + + Args: + audio: Input audio segment + + Returns: + Loudness in LUFS or dBFS depending on configuration + """ + if self.use_lufs: + return get_lufs_loudness(audio) + else: + return audio.dBFS + + def _verify_loudness_gap( + self, + volume_levels: List[float], + question_type: str + ) -> Tuple[bool, int, Dict]: + """ + Verify that loudness gap constraint is satisfied. + + For MAX_LOUDNESS: max_volume >= second_max × multiplier_max + For MIN_LOUDNESS: min_volume <= second_min × multiplier_min + + Since we work with dB (logarithmic), the gap is in dB difference: + - For max: max_dB - second_max_dB >= required_gap_dB + - For min: second_min_dB - min_dB >= required_gap_dB + + The multiplier translates to dB: 1.5x linear = ~3.5dB, 2x = ~6dB + + Args: + volume_levels: List of volume adjustments in dB + question_type: "max_loudness" or "min_loudness" + + Returns: + Tuple of (gap_satisfied, answer_idx, metadata) + """ + import math + + sorted_levels = sorted(volume_levels, reverse=True) # Highest first + + if question_type == "max_loudness": + max_level = sorted_levels[0] + second_max = sorted_levels[1] if len(sorted_levels) > 1 else sorted_levels[0] + + # Convert multiplier to dB difference + # multiplier 1.5 means 1.5x louder in amplitude = 20*log10(1.5) ≈ 3.5 dB + required_gap_dB = 20 * math.log10(self.multiplier_max_loudness) + actual_gap_dB = max_level - second_max + + gap_satisfied = actual_gap_dB >= required_gap_dB + answer_idx = volume_levels.index(max_level) + + metadata = { + 'max_level_dB': max_level, + 'second_max_dB': second_max, + 'required_gap_dB': required_gap_dB, + 'actual_gap_dB': actual_gap_dB, + 'multiplier': self.multiplier_max_loudness + } + + else: # min_loudness + min_level = sorted_levels[-1] + second_min = sorted_levels[-2] if len(sorted_levels) > 1 else sorted_levels[-1] + + # For min, we want min to be multiplier times softer + # multiplier 0.5 means 0.5x amplitude = 20*log10(0.5) ≈ -6 dB + # So second_min - min_level should be >= 6 dB + required_gap_dB = abs(20 * math.log10(self.multiplier_min_loudness)) + actual_gap_dB = second_min - min_level + + gap_satisfied = actual_gap_dB >= required_gap_dB + answer_idx = volume_levels.index(min_level) + + metadata = { + 'min_level_dB': min_level, + 'second_min_dB': second_min, + 'required_gap_dB': required_gap_dB, + 'actual_gap_dB': actual_gap_dB, + 'multiplier': self.multiplier_min_loudness + } + + return gap_satisfied, answer_idx, metadata + + def generate_volume_levels(self, n_clips: int, question_type: str = None) -> List[float]: + """ + Generate volume 
levels dynamically based on multiplier constraints.
+
+        The levels are generated to ensure proper gap for the question type:
+        - For max_loudness: the loudest is clearly distinguishable (gap = multiplier_max)
+        - For min_loudness: the softest is clearly distinguishable (gap = multiplier_min)
+
+        Args:
+            n_clips: Number of clips
+            question_type: "max_loudness" or "min_loudness" to ensure proper gap
+
+        Returns:
+            List of volume adjustments in dB (integers)
+        """
+        # Base spacing between adjacent volume levels (minimum audible difference)
+        # 6 dB = 2x amplitude, 12 dB = 4x amplitude (clearly distinguishable)
+        min_diff = 12  # ~4x amplitude, roughly 2x perceived loudness (10 dB ~ 2x) - a very noticeable step
+
+        # Calculate required gap based on multiplier (round up to nearest int)
+        if question_type == "max_loudness":
+            required_gap = int(math.ceil(20 * math.log10(self.multiplier_max_loudness)))
+        elif question_type == "min_loudness":
+            required_gap = int(math.ceil(abs(20 * math.log10(self.multiplier_min_loudness))))
+        else:
+            required_gap = min_diff
+
+        # Ensure gap is at least min_diff
+        required_gap = max(required_gap, min_diff)
+
+        if question_type == "max_loudness":
+            # Generate levels where max has clear gap from others
+            # Max level (answer) at a high value - MUCH louder
+            max_level = 18  # dB adjustment = ~8x the baseline amplitude
+
+            # Other levels should be at least required_gap below max
+            # Spread them out with min_diff spacing
+            other_levels = []
+            current_level = max_level - required_gap
+            for i in range(n_clips - 1):
+                other_levels.append(current_level)
+                current_level -= min_diff
+
+            selected_levels = other_levels + [max_level]
+
+        elif question_type == "min_loudness":
+            # Generate levels where min has clear gap from others
+            # Min level (answer) at a low value - MUCH quieter
+            min_level = -24  # dB adjustment = ~1/16th of the baseline amplitude
+
+            # Other levels should be at least required_gap above min
+            # Spread them out with min_diff spacing
+            other_levels = []
+            current_level = min_level + required_gap
+            for i in range(n_clips - 1):
+                other_levels.append(current_level)
+                current_level += min_diff
+
+            selected_levels = [min_level] + other_levels
+
+        else:
+            # Default: evenly spaced levels centered around 0
+            total_range = (n_clips - 1) * min_diff
+            start_level = -total_range // 2
+            selected_levels = [start_level + i * min_diff for i in range(n_clips)]
+
+        # Shuffle to randomize order in the audio
+        random.shuffle(selected_levels)
+
+        return selected_levels
+
+    def generate_sample(self, sample_id: int, target_question_type: str = None, target_duration_seconds: float = None) -> Dict:
+        """
+        Generate a single volume task sample.
+
+        Pipeline:
+        1. Pick dataset -> pick class -> pick audio clip
+        2. NORMALIZE all clips to the baseline loudness - LUFS by default,
+           dBFS otherwise (critical for controlled comparison)
+        3. Apply different volume adjustments to each clip
+        4. Concatenate clips with silences
+
+        Optionally: use same clip with different volume levels if configured.
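+
+        Illustrative walk-through (a sketch assuming the config defaults of
+        multiplier_max_loudness=1.5 and a -23 LUFS baseline): with n_clips=3
+        and question_type='max_loudness', generate_volume_levels() produces
+        [6, -6, 18] (order then shuffled); every clip is normalized to the
+        baseline and offset by its assigned gain, so the +18 dB clip is the
+        unambiguous "loudest" answer.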
+ + Args: + sample_id: Sample ID number + target_question_type: Target question type for balanced distribution + target_duration_seconds: Pre-generated target duration (from generate_sample_durations_for_task) + + Returns: + Dictionary with sample metadata + """ + # Use pre-generated duration or generate one (backward compatibility) + if target_duration_seconds is not None: + clip_duration_seconds = target_duration_seconds + else: + clip_duration_seconds = generate_single_clip_duration( + self.min_clip_duration, + self.max_clip_duration + ) + + # Calculate how many clips we need using the new helper + max_clips, remainder_seconds = get_max_clip_num_to_be_joined( + clip_duration_seconds, + self.source_clip_duration, + self.min_silence_ms + ) + + max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10) + + # Silence reduction strategy: subsample from [max(2, max_clips-3), min(max_clips, max_clips_per_sample)] + # This ensures we use close to max_clips that fit, reducing excessive silence + + # Calculate valid range for this sample's duration + min_clips_for_sample = max(2, max_clips - 3) # At least 2, preferably max_clips-3 + max_clips_for_sample = min(max_clips, max_clips_per_sample, len(self.dataset.CATEGORIES)) + + # Validate range + if max_clips_for_sample < 2: + raise ValueError( + f"Sample {sample_id}: Cannot generate volume task - need at least 2 clips. " + f"max_clips={max_clips}, max_clips_per_sample={max_clips_per_sample}, " + f"duration={clip_duration_seconds:.1f}s. Increase min_clip_duration." + ) + + if min_clips_for_sample > max_clips_for_sample: + raise ValueError( + f"Sample {sample_id}: Invalid clip range - min_clips ({min_clips_for_sample}) > max_clips ({max_clips_for_sample}). " + f"max_clips={max_clips}, max_clips_per_sample={max_clips_per_sample}, duration={clip_duration_seconds:.1f}s" + ) + + # Randomly select from valid range (NO balanced pool for volume task) + n_clips = random.randint(min_clips_for_sample, max_clips_for_sample) + n_clips = max(2, n_clips) # Ensure at least 2 for volume comparison + + # Pre-select question type to determine answer position + # Use target question type if provided, otherwise randomly select + if target_question_type is not None: + question_type = target_question_type + else: + question_type = random.choice(self.task_config['question_types']) + + # Generate volume levels and verify gap constraint + max_attempts = 10 + gap_satisfied = False + volume_levels = None + gap_metadata = None + + for attempt in range(max_attempts): + volume_levels = self.generate_volume_levels(n_clips, question_type) + gap_satisfied, answer_idx, gap_metadata = self._verify_loudness_gap( + volume_levels, question_type + ) + + if gap_satisfied: + break + + self.logger.debug( + f"Sample {sample_id} attempt {attempt+1}: gap not satisfied, " + f"required={gap_metadata['required_gap_dB']:.1f}dB, " + f"actual={gap_metadata['actual_gap_dB']:.1f}dB" + ) + + if not gap_satisfied and self.reject_if_gap_not_met: + self.logger.warning( + f"Sample {sample_id} rejected: loudness gap not satisfied after {max_attempts} attempts" + ) + return None + + # Determine answer position based on question type + if question_type == 'max_loudness': + answer_idx = volume_levels.index(max(volume_levels)) + else: # min_loudness + answer_idx = volume_levels.index(min(volume_levels)) + + # Select answer category from least-used categories + answer_category = self.dataset.get_least_used_categories(1)[0] + + # Determine if using same clip with different volumes + if 
self.use_same_clip_different_volumes: + # Use ONE source clip repeated at different volume levels + selected_categories = [answer_category] * n_clips + # Track usage + self.dataset.category_usage_counts[answer_category] += 1 + correct_category = answer_category + else: + # Use different source clips (original behavior) + # Sample remaining categories, ensuring balanced distribution + if n_clips <= len(self.dataset.CATEGORIES): + other_categories = self.dataset.get_least_used_categories( + n_clips - 1, + exclude=[answer_category] + ) + else: + # Need more clips than unique categories + other_categories = self.dataset.get_least_used_categories( + min(n_clips - 1, len(self.dataset.CATEGORIES) - 1), + exclude=[answer_category] + ) + # Add random repetitions if needed + while len(other_categories) < n_clips - 1: + other_categories.append(random.choice(self.dataset.CATEGORIES)) + + # Arrange categories with answer at correct position + selected_categories = [] + other_idx = 0 + for i in range(n_clips): + if i == answer_idx: + selected_categories.append(answer_category) + else: + selected_categories.append(other_categories[other_idx]) + other_idx += 1 + + # Track usage of answer category + self.dataset.category_usage_counts[answer_category] += 1 + + # CRITICAL BUG FIX: Verify answer_category is actually at answer_idx + if selected_categories[answer_idx] != answer_category: + self.logger.error(f"Sample {sample_id}: Answer mismatch! Expected {answer_category} at index {answer_idx}, got {selected_categories[answer_idx]}") + correct_category = selected_categories[answer_idx] + else: + correct_category = answer_category + + # Sample files and process audio + audio_segments = [] + filenames_list = [] + original_loudness = [] + final_loudness = [] + + if self.use_same_clip_different_volumes: + # Load one file and repeat it with different volumes + filename, filepath = self.dataset.sample_file_from_category(answer_category) + base_audio = self.audio_processor.load_audio(filepath) + original_loudness_val = self._get_amplitude_loudness(base_audio) + + # Normalize to baseline first + base_audio_normalized = self._normalize_to_baseline(base_audio) + + for i in range(n_clips): + # Apply volume adjustment to normalized audio + audio_adjusted = self.audio_processor.adjust_volume( + base_audio_normalized, + volume_levels[i] + ) + audio_segments.append(audio_adjusted) + filenames_list.append(filename) + original_loudness.append(original_loudness_val) + final_loudness.append(self._get_amplitude_loudness(audio_adjusted)) + else: + # Use different files (original behavior but with normalization) + for i, category in enumerate(selected_categories): + filename, filepath = self.dataset.sample_file_from_category(category) + audio = self.audio_processor.load_audio(filepath) + + # Record original loudness + orig_loud = self._get_amplitude_loudness(audio) + original_loudness.append(orig_loud) + + # STEP 1: Normalize to baseline dBFS + audio_normalized = self._normalize_to_baseline(audio) + + # STEP 2: Apply volume adjustment (relative to baseline) + audio_adjusted = self.audio_processor.adjust_volume( + audio_normalized, + volume_levels[i] + ) + + audio_segments.append(audio_adjusted) + filenames_list.append(filename) + final_loudness.append(self._get_amplitude_loudness(audio_adjusted)) + + # Build final audio with guaranteed silences between clips + output_audio_path = self.audio_output / f"{sample_id}.wav" + final_audio = build_clip_sequence_with_silences( + audio_segments, + clip_duration_seconds, + 
min_silence_ms=self.min_silence_ms, + max_extra_silence_per_gap_ms=self.max_extra_silence_per_gap_ms, + crossfade_ms=self.crossfade_ms + ) + + # Save the audio + final_audio.export(str(output_audio_path), format="wav") + + # Generate MCQ + mcq_question = self.task_config['mcq_questions'][question_type] + mcq_data = self.question_generator.generate_category_mcq( + mcq_question, + correct_category, + selected_categories, + self.dataset.CATEGORIES + ) + + # Generate open-text question + open_text_question = self.task_config['open_text_questions'][question_type] + open_text_data = self.question_generator.generate_category_open_text( + open_text_question, + correct_category + ) + + # Create category to volume mapping + category_volumes = { + selected_categories[i]: volume_levels[i] + for i in range(n_clips) + } + + # Create metadata + metadata = { + 'id': sample_id, + 'audio_path': str(output_audio_path.relative_to(self.output_base.parent)), + 'n_clips': n_clips, + 'question_type': question_type, + 'audio_sequence': selected_categories, + 'volume_levels_db': volume_levels, + 'category_volumes': category_volumes, + 'correct_answer_category': correct_category, + 'correct_volume_db': volume_levels[answer_idx], + 'source_files': filenames_list, + 'use_same_clip': self.use_same_clip_different_volumes, + 'baseline_dBFS': self.baseline_dBFS if self.normalize_to_baseline else None, + 'original_loudness_dBFS': original_loudness, + 'final_loudness_dBFS': final_loudness, + 'gap_satisfied': gap_satisfied, + 'gap_metadata': gap_metadata, + 'mcq_question': mcq_data['question'], + 'mcq_options': mcq_data['options'], + 'mcq_correct_answer': mcq_data['correct_answer'], + 'open_text_question': open_text_data['question'], + 'open_text_answer': open_text_data['correct_answer'] + } + + self.logger.info( + f"Generated volume sample {sample_id}: {question_type}, {n_clips} clips, " + f"volumes={volume_levels}, gap_satisfied={gap_satisfied}, " + f"gap={gap_metadata['actual_gap_dB']:.1f}dB (required={gap_metadata['required_gap_dB']:.1f}dB)" + ) + + return metadata + + def generate_dataset(self) -> tuple: + """ + Generate the complete volume task dataset. + + Uses generate_sample_durations_for_task() to pre-generate exact sample durations + that sum to exactly the target task duration. 
This guarantees:
+        - Exact coverage of target duration
+        - No estimation errors from average-based calculation
+
+        Returns:
+            Tuple of (mcq_csv_path, open_text_csv_path)
+        """
+        # Generate sample durations upfront (guarantees exact total duration)
+        sample_durations = generate_sample_durations_for_task(
+            self.task_duration_hours,
+            self.min_clip_duration,
+            self.max_clip_duration
+        )
+        num_samples = len(sample_durations)
+
+        self.logger.info(f"Generating {num_samples} volume task samples (target: {self.task_duration_hours}h, exact fill)...")
+
+        # Create balanced question type distribution (NO clips balancing for volume task)
+        question_types = self.task_config['question_types']
+        balanced_question_types = []
+        samples_per_type = num_samples // len(question_types)
+        remainder = num_samples % len(question_types)
+
+        for qtype in question_types:
+            count = samples_per_type + (1 if remainder > 0 else 0)
+            balanced_question_types.extend([qtype] * count)
+            remainder = max(0, remainder - 1)
+
+        random.shuffle(balanced_question_types)
+        from collections import Counter
+        type_dist = Counter(balanced_question_types)
+        self.logger.info(f"Balanced question type distribution: {dict(sorted(type_dist.items()))}")
+
+        all_metadata = []
+
+        for i, target_duration in enumerate(sample_durations):
+            metadata = self.generate_sample(i, target_question_type=balanced_question_types[i], target_duration_seconds=target_duration)
+            # generate_sample() returns None when the loudness gap is rejected;
+            # skip those so the CSV writers never receive a None row
+            if metadata is not None:
+                all_metadata.append(metadata)
+
+        # Save MCQ CSV
+        mcq_csv_path = self.output_base / 'volume_mcq.csv'
+        self._save_mcq_csv(all_metadata, mcq_csv_path)
+
+        # Save open-text CSV
+        open_text_csv_path = self.output_base / 'volume_open_text.csv'
+        self._save_open_text_csv(all_metadata, open_text_csv_path)
+
+        # Save metadata CSV
+        metadata_csv_path = self.output_base / 'volume_metadata.csv'
+        self._save_metadata_csv(all_metadata, metadata_csv_path)
+
+        self.logger.info("Volume task dataset generation complete!")
+        self.logger.info(f"  - MCQ CSV: {mcq_csv_path}")
+        self.logger.info(f"  - Open-text CSV: {open_text_csv_path}")
+        self.logger.info(f"  - Metadata CSV: {metadata_csv_path}")
+        self.logger.info(f"  - Audio files: {self.audio_output}")
+
+        return mcq_csv_path, open_text_csv_path
+
+    def _save_mcq_csv(self, metadata_list: List[Dict], output_path: Path):
+        """Save MCQ format CSV."""
+        with open(output_path, 'w', newline='') as f:
+            writer = csv.writer(f)
+            # Header
+            writer.writerow([
+                'question', 'id', 'audio_path',
+                'optionA', 'optionB', 'optionC', 'optionD',
+                'correct', 'question_type', 'audio_sequence',
+                'category_volumes'
+            ])
+
+            # Data rows
+            for meta in metadata_list:
+                writer.writerow([
+                    meta['mcq_question'],
+                    meta['id'],
+                    meta['audio_path'],
+                    meta['mcq_options']['A'],
+                    meta['mcq_options']['B'],
+                    meta['mcq_options']['C'],
+                    meta['mcq_options']['D'],
+                    meta['mcq_correct_answer'],
+                    meta['question_type'],
+                    str(meta['audio_sequence']),
+                    str(meta['category_volumes'])
+                ])
+
+    def _save_open_text_csv(self, metadata_list: List[Dict], output_path: Path):
+        """Save open-text format CSV."""
+        with open(output_path, 'w', newline='') as f:
+            writer = csv.writer(f)
+            # Header
+            writer.writerow([
+                'question', 'id', 'audio_path', 'answer',
+                'question_type', 'audio_sequence', 'category_volumes'
+            ])
+
+            # Data rows
+            for meta in metadata_list:
+                writer.writerow([
+                    meta['open_text_question'],
+                    meta['id'],
+                    meta['audio_path'],
+                    meta['open_text_answer'],
+                    meta['question_type'],
+                    str(meta['audio_sequence']),
+                    str(meta['category_volumes'])
+                ])
+
+    def _save_metadata_csv(self,
metadata_list: List[Dict], output_path: Path):
+        """Save detailed metadata CSV."""
+        with open(output_path, 'w', newline='') as f:
+            writer = csv.writer(f)
+            # Header
+            writer.writerow([
+                'id', 'audio_path', 'n_clips', 'question_type',
+                'audio_sequence', 'volume_levels_db', 'correct_answer',
+                'correct_volume_db', 'source_files'
+            ])
+
+            # Data rows
+            for meta in metadata_list:
+                writer.writerow([
+                    meta['id'],
+                    meta['audio_path'],
+                    meta['n_clips'],
+                    meta['question_type'],
+                    str(meta['audio_sequence']),
+                    str(meta['volume_levels_db']),
+                    meta['correct_answer_category'],
+                    meta['correct_volume_db'],
+                    str(meta['source_files'])
+                ])
+
+
+def main(config_path: str = None):
+    """Main entry point for volume task generation."""
+    import yaml
+
+    # Load configuration
+    if config_path is None:
+        config_path = Path(__file__).parent.parent / 'config.yaml'
+
+    with open(config_path, 'r') as f:
+        config = yaml.safe_load(f)
+
+    # Set random seed
+    set_random_seed(config['random_seed'])
+
+    # Setup logger
+    logger = setup_logger(
+        'volume_task',
+        log_file=str(Path(config['output']['base_path']) / config['logging']['log_file']),
+        level=config['logging']['level'],
+        console_output=config['logging']['console_output']
+    )
+
+    # Generate dataset
+    generator = VolumeTaskGenerator(config, logger)
+    generator.generate_dataset()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4055395fae745a361a7c727ab53d2f8e600c2528
--- /dev/null
+++ b/utils/__init__.py
@@ -0,0 +1,50 @@
+"""
+Utility module initialization.
+"""
+
+from .audio_utils import (
+    AudioProcessor, set_random_seed,
+    calculate_num_samples_for_task, generate_sample_durations_for_task,
+    generate_single_clip_duration,
+    concatenate_to_target_duration,
+    get_max_clip_num_to_be_joined,
+    build_clip_sequence_with_silences,
+    distribute_remainder_as_silences,
+    repeat_clips_to_fill_duration,
+    build_consecutive_sources_for_count_task,
+    build_random_order_for_count_task,
+    build_count_task_audio,
+    calculate_duration_slot_distribution,
+    build_duration_task_audio,
+    get_lufs_loudness,
+    normalize_to_lufs
+)
+from .dataset_utils import ESC50Dataset, PreprocessedESC50Dataset
+from .logger import setup_logger
+from .question_utils import QuestionGenerator
+from .llm_utils import LLMQuestionGenerator
+
+__all__ = [
+    'AudioProcessor',
+    'ESC50Dataset',
+    'PreprocessedESC50Dataset',
+    'QuestionGenerator',
+    'LLMQuestionGenerator',
+    'setup_logger',
+    'set_random_seed',
+    'calculate_num_samples_for_task',
+    'generate_sample_durations_for_task',
+    'generate_single_clip_duration',
+    'concatenate_to_target_duration',
+    'get_max_clip_num_to_be_joined',
+    'build_clip_sequence_with_silences',
+    'distribute_remainder_as_silences',
+    'repeat_clips_to_fill_duration',
+    'build_consecutive_sources_for_count_task',
+    'build_random_order_for_count_task',
+    'build_count_task_audio',
+    'calculate_duration_slot_distribution',
+    'build_duration_task_audio',
+    'get_lufs_loudness',
+    'normalize_to_lufs'
+]
diff --git a/utils/audio_utils.py b/utils/audio_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..46492f9b3596a90013dbca0345e6cc16886dd475
--- /dev/null
+++ b/utils/audio_utils.py
@@ -0,0 +1,1388 @@
+"""
+Audio processing utilities for temporal reasoning dataset generation.
+"""
+
+import os
+import random
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union
+
+import numpy as np
+from pydub import AudioSegment
+
+try:
+    import pyloudnorm as pyln
+    PYLOUDNORM_AVAILABLE = True
+except ImportError:
+    PYLOUDNORM_AVAILABLE = False
+
+from .logger import setup_logger
+
+logger = setup_logger(__name__)
+
+
+def get_lufs_loudness(audio: AudioSegment) -> float:
+    """
+    Calculate integrated LUFS loudness (perceived loudness) of an audio segment.
+
+    LUFS (Loudness Units Full Scale) is the broadcast standard for measuring
+    perceived loudness. It accounts for human hearing sensitivity to different
+    frequencies using K-weighting.
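+
+    Illustrative usage (a sketch; "clip.wav" is a placeholder path):
+
+        audio = AudioSegment.from_file("clip.wav", format="wav")
+        loudness = get_lufs_loudness(audio)  # e.g. -23.4 for broadcast-levelled audio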
+ + Args: + audio: Input audio segment (pydub AudioSegment) + + Returns: + Loudness in LUFS (negative values, typically -70 to 0) + Returns dBFS if pyloudnorm is not available (fallback) + """ + if not PYLOUDNORM_AVAILABLE: + logger.warning("pyloudnorm not available, falling back to dBFS") + return audio.dBFS + + # Convert pydub AudioSegment to numpy array + samples = np.array(audio.get_array_of_samples()) + + # Handle stereo by reshaping + if audio.channels == 2: + samples = samples.reshape((-1, 2)) + + # Normalize to float [-1, 1] + if audio.sample_width == 1: + samples = samples.astype(np.float64) / 128.0 - 1.0 + elif audio.sample_width == 2: + samples = samples.astype(np.float64) / 32768.0 + elif audio.sample_width == 4: + samples = samples.astype(np.float64) / 2147483648.0 + else: + samples = samples.astype(np.float64) / 32768.0 # default to 16-bit + + # Create meter with sample rate + meter = pyln.Meter(audio.frame_rate) + + # Measure integrated loudness + try: + loudness = meter.integrated_loudness(samples) + # Handle -inf for silent audio + if np.isinf(loudness): + loudness = -70.0 # Return very quiet value instead of -inf + return loudness + except Exception as e: + logger.warning(f"LUFS measurement failed: {e}, falling back to dBFS") + return audio.dBFS + + +def normalize_to_lufs(audio: AudioSegment, target_lufs: float = -23.0) -> AudioSegment: + """ + Normalize audio to a target LUFS level (perceived loudness normalization). + + This is superior to dBFS normalization for comparing different sound types + because it accounts for human hearing sensitivity. + + Args: + audio: Input audio segment + target_lufs: Target loudness level in LUFS (default: -23 LUFS, EBU R128 standard) + + Returns: + Loudness-normalized audio segment + """ + if not PYLOUDNORM_AVAILABLE: + logger.warning("pyloudnorm not available, falling back to dBFS normalization") + change_db = target_lufs - audio.dBFS + return audio.apply_gain(change_db) + + current_lufs = get_lufs_loudness(audio) + + # Calculate required gain change + gain_db = target_lufs - current_lufs + + # Apply gain + normalized = audio.apply_gain(gain_db) + + logger.debug(f"Normalized LUFS: {current_lufs:.2f} -> {get_lufs_loudness(normalized):.2f} LUFS") + + return normalized + + +class AudioProcessor: + """Handles audio loading, processing, and concatenation.""" + + def __init__( + self, + crossfade_duration: int = 500, + silence_duration: int = 1000, + with_silence: bool = True, + normalize: bool = False, + normalize_target_dBFS: float = -20.0, + synthetic_silence_path: Optional[str] = None + ): + """ + Initialize the audio processor. + + Args: + crossfade_duration: Duration of crossfade in milliseconds + silence_duration: Duration of silence between clips in milliseconds + with_silence: Whether to add silence between clips + normalize: Whether to normalize audio levels + normalize_target_dBFS: Target dBFS level for normalization + synthetic_silence_path: Path to synthetic silence audio files + """ + self.crossfade_duration = crossfade_duration + self.silence_duration = silence_duration + self.with_silence = with_silence + self.normalize = normalize + self.normalize_target_dBFS = normalize_target_dBFS + self.synthetic_silence_path = synthetic_silence_path + self._silence_cache = {} + + def load_audio(self, audio_path: str) -> AudioSegment: + """ + Load an audio file. 
+ + Args: + audio_path: Path to the audio file + + Returns: + Loaded audio segment + """ + try: + audio = AudioSegment.from_file(audio_path, format="wav") + logger.debug(f"Loaded audio: {audio_path}, duration: {len(audio)}ms") + return audio + except Exception as e: + logger.error(f"Error loading audio {audio_path}: {e}") + raise + + def normalize_audio(self, audio: AudioSegment, target_dBFS: Optional[float] = None) -> AudioSegment: + """ + Normalize audio to a target dBFS level. + + Args: + audio: Input audio segment + target_dBFS: Target dBFS level (uses default if None) + + Returns: + Normalized audio segment + """ + if target_dBFS is None: + target_dBFS = self.normalize_target_dBFS + + change_in_dBFS = target_dBFS - audio.dBFS + normalized = audio.apply_gain(change_in_dBFS) + logger.debug(f"Normalized audio: {audio.dBFS:.2f} dBFS -> {normalized.dBFS:.2f} dBFS") + return normalized + + def adjust_volume(self, audio: AudioSegment, volume_db: float) -> AudioSegment: + """ + Adjust audio volume by a specific dB amount. + + Args: + audio: Input audio segment + volume_db: Volume adjustment in dB (positive = louder, negative = quieter) + + Returns: + Volume-adjusted audio segment + """ + adjusted = audio.apply_gain(volume_db) + logger.debug(f"Adjusted volume by {volume_db} dB: {audio.dBFS:.2f} -> {adjusted.dBFS:.2f} dBFS") + return adjusted + + def get_silence(self, duration: Optional[int] = None) -> AudioSegment: + """ + Get a silence audio segment, using synthetic silence if available. + + Args: + duration: Duration in milliseconds (uses default if None) + + Returns: + Silence audio segment + """ + if duration is None: + duration = self.silence_duration + + # Check cache first + if duration in self._silence_cache: + return self._silence_cache[duration] + + # Try to load synthetic silence + if self.synthetic_silence_path and os.path.exists(self.synthetic_silence_path): + silence_files = list(Path(self.synthetic_silence_path).glob("*.wav")) + if silence_files: + silence = self.load_audio(str(random.choice(silence_files))) + # Adjust duration if needed + if len(silence) < duration: + # Repeat the silence + repetitions = (duration // len(silence)) + 1 + silence = silence * repetitions + silence = silence[:duration] + self._silence_cache[duration] = silence + logger.debug(f"Using synthetic silence: {duration}ms") + return silence + + # Fall back to pure silence + silence = AudioSegment.silent(duration=duration) + self._silence_cache[duration] = silence + logger.debug(f"Using pure silence: {duration}ms") + return silence + + def concatenate_audios( + self, + audio_list: List[AudioSegment], + normalize_each: bool = False, + volume_adjustments: Optional[List[float]] = None + ) -> AudioSegment: + """ + Concatenate multiple audio segments with crossfade and optional silence. 
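+
+        Example (a sketch; clip paths are placeholders):
+
+            processor = AudioProcessor(crossfade_duration=100, silence_duration=500)
+            clips = [processor.load_audio(p) for p in ("dog.wav", "rain.wav")]
+            # second clip is attenuated by 6 dB relative to the first
+            mixed = processor.concatenate_audios(clips, volume_adjustments=[0.0, -6.0])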
+ + Args: + audio_list: List of audio segments to concatenate + normalize_each: Whether to normalize each audio before concatenation + volume_adjustments: Optional list of volume adjustments (in dB) for each audio + + Returns: + Concatenated audio segment + """ + if not audio_list: + raise ValueError("audio_list cannot be empty") + + if len(audio_list) == 1: + audio = audio_list[0] + if normalize_each and self.normalize: + audio = self.normalize_audio(audio) + if volume_adjustments and len(volume_adjustments) > 0: + audio = self.adjust_volume(audio, volume_adjustments[0]) + return audio + + # Process first audio + merged = audio_list[0] + if normalize_each and self.normalize: + merged = self.normalize_audio(merged) + if volume_adjustments and len(volume_adjustments) > 0: + merged = self.adjust_volume(merged, volume_adjustments[0]) + + # Concatenate remaining audios + for i, audio in enumerate(audio_list[1:], start=1): + # Process current audio + current = audio + if normalize_each and self.normalize: + current = self.normalize_audio(current) + if volume_adjustments and len(volume_adjustments) > i: + current = self.adjust_volume(current, volume_adjustments[i]) + + # Add silence if configured + if self.with_silence: + silence = self.get_silence() + # Crossfade between audio and silence for smooth transition + merged = merged.append(silence, crossfade=self.crossfade_duration) + + # Append current audio WITHOUT crossfade to avoid cutting it + # The crossfade with silence already provides smooth transition + merged = merged.append(current, crossfade=0) + + logger.debug(f"Concatenated {len(audio_list)} audio segments, total duration: {len(merged)}ms") + return merged + + def concatenate_audio_files( + self, + audio_paths: List[str], + output_path: str, + normalize_each: bool = False, + volume_adjustments: Optional[List[float]] = None, + target_durations: Optional[List[float]] = None + ) -> Tuple[AudioSegment, dict]: + """ + Load, concatenate, and save multiple audio files. 
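+
+        Example (a sketch; file names are placeholders):
+
+            merged, meta = processor.concatenate_audio_files(
+                ["a.wav", "b.wav"], "output/sample.wav",
+                target_durations=[3.0, 4.5]
+            )
+            print(meta["total_duration_s"])  # merged length in seconds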
+ + Args: + audio_paths: List of paths to audio files + output_path: Path to save the concatenated audio + normalize_each: Whether to normalize each audio before concatenation + volume_adjustments: Optional list of volume adjustments (in dB) for each audio + target_durations: Optional list of target durations (in seconds) for each clip + + Returns: + Tuple of (concatenated audio segment, metadata dict) + """ + # Load all audio files + audio_segments = [] + for i, path in enumerate(audio_paths): + audio = self.load_audio(path) + + # Adjust duration if specified + if target_durations and i < len(target_durations): + target_ms = int(target_durations[i] * 1000) + audio = trim_or_repeat_audio(audio, target_ms) + logger.debug(f"Adjusted clip {i} to {len(audio)}ms (target: {target_ms}ms)") + + audio_segments.append(audio) + + # Concatenate + merged = self.concatenate_audios(audio_segments, normalize_each, volume_adjustments) + + # Save + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + merged.export(str(output_path), format="wav") + logger.info(f"Saved concatenated audio: {output_path}") + + # Create metadata + metadata = { + "output_path": str(output_path), + "source_files": audio_paths, + "num_sources": len(audio_paths), + "total_duration_ms": len(merged), + "total_duration_s": len(merged) / 1000.0, + "individual_durations_ms": [len(a) for a in audio_segments], + "individual_durations_s": [len(a) / 1000.0 for a in audio_segments], + "target_durations_s": target_durations if target_durations else [], + "volume_adjustments_db": volume_adjustments if volume_adjustments else [] + } + + return merged, metadata + + +def generate_sample_durations_for_task( + task_duration_hours: float, + min_clip_duration: float, + max_clip_duration: float +) -> list: + """ + Generate sample durations that exactly fill the target task duration. + + Algorithm: + 1. Start with remaining = total_seconds + 2. While remaining >= min_clip_duration: + - Sample d ~ Uniform(min, min(max, remaining)) + - Append d to durations list + - Subtract d from remaining + 3. 
Return shuffled list of durations
+
+    This ensures:
+    - Total of all durations ≈ task_duration (within min_clip_duration tolerance)
+    - Each duration is uniformly sampled within valid range
+    - No overshoot of target duration
+
+    Args:
+        task_duration_hours: Total duration for the task in hours
+        min_clip_duration: Minimum duration per clip in seconds
+        max_clip_duration: Maximum duration per clip in seconds
+
+    Returns:
+        List of sample durations in seconds (shuffled)
+    """
+    task_duration_seconds = task_duration_hours * 3600
+    remaining = task_duration_seconds
+    durations = []
+
+    while remaining >= min_clip_duration:
+        # Cap max at remaining to avoid overshoot
+        effective_max = min(max_clip_duration, remaining)
+
+        # If remaining is less than min, we can't fit another sample
+        if effective_max < min_clip_duration:
+            break
+
+        # Sample uniformly within valid range
+        d = random.uniform(min_clip_duration, effective_max)
+        durations.append(d)
+        remaining -= d
+
+    # Shuffle to randomize order (durations were generated sequentially)
+    random.shuffle(durations)
+
+    # Guard against an empty result (target shorter than one min-length clip),
+    # which would otherwise crash the statistics below (min()/mean of an empty list)
+    if not durations:
+        logger.warning(
+            f"Task duration {task_duration_hours}h is shorter than "
+            f"min_clip_duration ({min_clip_duration}s); no sample durations generated"
+        )
+        return durations
+
+    total_duration = sum(durations)
+    logger.info(f"Task duration target: {task_duration_hours}h ({task_duration_seconds:.1f}s)")
+    logger.info(f"Generated {len(durations)} sample durations, total: {total_duration:.1f}s")
+    logger.info(f"Duration range: [{min(durations):.1f}s, {max(durations):.1f}s], "
+                f"mean: {total_duration/len(durations):.1f}s")
+    logger.info(f"Unused remainder: {remaining:.1f}s ({remaining/task_duration_seconds*100:.2f}%)")
+
+    return durations
+
+
+def calculate_num_samples_for_task(
+    task_duration_hours: float,
+    min_clip_duration: float,
+    max_clip_duration: float
+) -> int:
+    """
+    Calculate number of samples needed to fill the task duration.
+
+    DEPRECATED: Use generate_sample_durations_for_task() instead for exact duration filling.
+    This function is kept for backward compatibility but uses average-based estimation.
+
+    Args:
+        task_duration_hours: Total duration for the task in hours
+        min_clip_duration: Minimum duration per clip in seconds
+        max_clip_duration: Maximum duration per clip in seconds
+
+    Returns:
+        Number of samples to generate (estimate)
+    """
+    task_duration_seconds = task_duration_hours * 3600
+    avg_clip_duration = (min_clip_duration + max_clip_duration) / 2
+    num_samples = int(task_duration_seconds / avg_clip_duration)
+
+    logger.info(f"Task duration: {task_duration_hours}h ({task_duration_seconds}s)")
+    logger.info(f"Avg clip duration: {avg_clip_duration}s (min: {min_clip_duration}s, max: {max_clip_duration}s)")
+    logger.info(f"Calculated number of samples: {num_samples}")
+
+    return max(1, num_samples)  # At least 1 sample
+
+
+def generate_single_clip_duration(
+    min_duration: float,
+    max_duration: float
+) -> float:
+    """
+    Generate a random clip duration between min and max.
+
+    Args:
+        min_duration: Minimum duration in seconds
+        max_duration: Maximum duration in seconds
+
+    Returns:
+        Random duration in seconds
+    """
+    return random.uniform(min_duration, max_duration)
+
+
+def concatenate_to_target_duration(
+    base_audio: AudioSegment,
+    target_duration_seconds: float,
+    crossfade_ms: int = 0
+) -> AudioSegment:
+    """
+    Concatenate a base audio clip to reach target duration.
+
+    This takes a 5-second ESC-50 clip and repeats it to create a longer clip.
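+
+    Worked example (illustrative numbers): with a 5s base clip and a 12s
+    target, num_repetitions = (12000 // 5000) + 1 = 3, so the clip is repeated
+    to 15s and then trimmed to exactly 12s.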
+
+    Args:
+        base_audio: Original 5s audio segment
+        target_duration_seconds: Target duration in seconds
+        crossfade_ms: Crossfade between repetitions in milliseconds
+
+    Returns:
+        Audio segment of target duration
+    """
+    target_duration_ms = int(target_duration_seconds * 1000)
+    base_duration_ms = len(base_audio)
+
+    if target_duration_ms <= base_duration_ms:
+        # Just trim if target is shorter
+        return base_audio[:target_duration_ms]
+
+    # Calculate number of repetitions needed
+    num_repetitions = (target_duration_ms // base_duration_ms) + 1
+
+    # Concatenate with crossfade
+    result = base_audio
+    for i in range(1, num_repetitions):
+        if crossfade_ms > 0:
+            result = result.append(base_audio, crossfade=crossfade_ms)
+        else:
+            result = result + base_audio
+
+        # Stop if we've reached target
+        if len(result) >= target_duration_ms:
+            break
+
+    # Trim to exact duration
+    return result[:target_duration_ms]
+
+
+def set_random_seed(seed: int):
+    """Set random seed for reproducibility."""
+    random.seed(seed)
+    np.random.seed(seed)
+    logger.info(f"Random seed set to: {seed}")
+
+
+def get_max_clip_num_to_be_joined(
+    target_duration_seconds: float,
+    source_clip_duration_seconds: float,
+    min_silence_ms: int = 100
+) -> Tuple[int, float]:
+    """
+    Calculate the maximum number of source clips needed to reach target duration.
+
+    Pipeline: pick dataset -> pick class -> pick audio clip -> get duration ->
+    floor-divide the target by the clip duration to get the number of clips ->
+    insert silences randomly based on the remainder.
+
+    Args:
+        target_duration_seconds: Target total duration in seconds
+        source_clip_duration_seconds: Duration of each source clip (e.g., 5s for ESC-50)
+        min_silence_ms: Minimum silence between clips in milliseconds
+
+    Returns:
+        Tuple of (num_clips_needed, remainder_seconds_for_silences)
+        - num_clips_needed: How many source clips to concatenate
+        - remainder_seconds_for_silences: Extra time to distribute as random silences
+
+    Example (with the default min_silence_ms=100):
+        target=30s, source=5s -> (5, 4.6) - 6 clips would need 30s of audio plus
+            5 x 0.1s gaps (30.5s total), so only 5 clips fit; 4.6s remains
+        target=32s, source=5s -> (6, 1.5) - 6 clips + 5 x 0.1s gaps leave 1.5s
+            to distribute as extra silences
+    """
+    target_ms = target_duration_seconds * 1000
+    source_ms = source_clip_duration_seconds * 1000
+
+    # Account for minimum silence between each pair of clips
+    # If we have N clips, we have (N-1) gaps for silence
+    # Each gap needs at least min_silence_ms
+
+    # Start by computing raw number of clips (floor division)
+    num_clips = int(target_ms // source_ms)
+    num_clips = max(1, num_clips)  # At least 1 clip
+
+    # Total audio content from clips
+    clips_duration_ms = num_clips * source_ms
+
+    # Minimum required silence for gaps
+    num_gaps = max(0, num_clips - 1)
+    min_total_silence_ms = num_gaps * min_silence_ms
+
+    # Check if we need to reduce clips to fit silences
+    while num_clips > 1 and (clips_duration_ms + min_total_silence_ms) > target_ms:
+        num_clips -= 1
+        clips_duration_ms = num_clips * source_ms
+        num_gaps = num_clips - 1
+        min_total_silence_ms = num_gaps * min_silence_ms
+
+    # Calculate remainder for extra silences
+    remainder_ms = target_ms - clips_duration_ms - min_total_silence_ms
+    remainder_seconds = max(0, remainder_ms / 1000.0)
+
+    logger.debug(
+        f"get_max_clip_num: target={target_duration_seconds}s, source={source_clip_duration_seconds}s "
+        f"-> {num_clips} clips, {remainder_seconds:.3f}s remainder for extra silences"
+    )
+
+    return num_clips, remainder_seconds
+
+
+def build_clip_sequence_with_silences(
+    audio_segments: List[AudioSegment],
+    target_duration_seconds: float,
+    min_silence_ms: int = 100,
+    max_extra_silence_per_gap_ms: int = 500,
+    crossfade_ms: int = 0
+) -> AudioSegment:
+    """
+    Build a final audio clip by concatenating segments with guaranteed silences.
+
+    Ensures:
+    1. All clips are joined with at least min_silence_ms between them
+    2. Any remainder duration is distributed as random extra silences in gaps
+    3. Final duration matches target_duration_seconds exactly
+
+    Args:
+        audio_segments: List of audio segments to concatenate
+        target_duration_seconds: Target total duration in seconds
+        min_silence_ms: Minimum silence between each pair of clips (always inserted)
+        max_extra_silence_per_gap_ms: Maximum extra silence to add per gap
+        crossfade_ms: Crossfade duration in ms (applied when joining)
+
+    Returns:
+        Concatenated audio segment of exact target duration
+    """
+    if not audio_segments:
+        raise ValueError("audio_segments cannot be empty")
+
+    target_ms = int(target_duration_seconds * 1000)
+
+    if len(audio_segments) == 1:
+        # Single clip: just trim/repeat to target
+        audio = audio_segments[0]
+        if len(audio) >= target_ms:
+            return audio[:target_ms]
+        else:
+            # Repeat to reach target
+            return concatenate_to_target_duration(audio, target_duration_seconds, crossfade_ms)
+
+    # Calculate total audio content duration
+    total_audio_ms = sum(len(seg) for seg in audio_segments)
+    num_gaps = len(audio_segments) - 1
+
+    # Minimum silence needed
+    min_total_silence_ms = num_gaps * min_silence_ms
+
+    # Available time for extra silences
+    available_extra_ms = target_ms - total_audio_ms - min_total_silence_ms
+
+    if available_extra_ms < 0:
+        # Not enough room - need to trim clips
+        logger.warning(
+            f"Clips too long for target duration. Total audio: {total_audio_ms}ms, "
+            f"target: {target_ms}ms. Will trim final result."
+        )
+        available_extra_ms = 0
+
+    # Distribute extra silence randomly across gaps
+    extra_silences_ms = distribute_remainder_as_silences(
+        available_extra_ms,
+        num_gaps,
+        max_extra_silence_per_gap_ms
+    )
+
+    # Build the final audio
+    result = audio_segments[0]
+
+    for i, audio in enumerate(audio_segments[1:]):
+        # Calculate total silence for this gap
+        gap_silence_ms = min_silence_ms + extra_silences_ms[i]
+
+        # Add silence
+        silence = AudioSegment.silent(duration=gap_silence_ms)
+
+        if crossfade_ms > 0 and crossfade_ms < gap_silence_ms:
+            # Crossfade audio->silence for smooth transition, but NOT silence->audio
+            result = result.append(silence, crossfade=crossfade_ms)
+            result = result.append(audio, crossfade=0)  # No crossfade to avoid cutting audio
+        else:
+            result = result + silence + audio
+
+    # Trim to exact target duration
+    if len(result) > target_ms:
+        result = result[:target_ms]
+    elif len(result) < target_ms:
+        # Pad with silence if slightly short
+        padding = AudioSegment.silent(duration=target_ms - len(result))
+        result = result + padding
+
+    logger.debug(
+        f"Built clip sequence: {len(audio_segments)} segments, "
+        f"final duration: {len(result)}ms (target: {target_ms}ms)"
+    )
+
+    return result
+
+
+def distribute_remainder_as_silences(
+    remainder_ms: float,
+    num_gaps: int,
+    max_per_gap_ms: int = 500
+) -> List[int]:
+    """
+    Distribute remainder time as random silences across gaps.
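+
+    Illustrative example (weights are random, so exact values vary): with
+    remainder_ms=900, num_gaps=3, and max_per_gap_ms=500, a possible result is
+    [400, 350, 150]; the values sum to the full 900ms whenever the per-gap cap
+    allows, and no gap ever exceeds 500ms.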
+ + Args: + remainder_ms: Total extra time to distribute (in ms) + num_gaps: Number of gaps between clips + max_per_gap_ms: Maximum extra silence per gap + + Returns: + List of extra silence durations (in ms) for each gap + """ + if num_gaps <= 0: + return [] + + remainder_ms = int(max(0, remainder_ms)) + + if remainder_ms == 0: + return [0] * num_gaps + + # Generate random weights for distribution + weights = [random.random() for _ in range(num_gaps)] + total_weight = sum(weights) + + if total_weight == 0: + # Fallback to uniform distribution + weights = [1.0] * num_gaps + total_weight = num_gaps + + # Distribute proportionally, respecting max_per_gap + extra_silences = [] + remaining = remainder_ms + + for i, w in enumerate(weights): + if i == num_gaps - 1: + # Last gap gets whatever is left + extra = min(remaining, max_per_gap_ms) + else: + proportion = w / total_weight + extra = int(remainder_ms * proportion) + extra = min(extra, max_per_gap_ms, remaining) + + extra_silences.append(extra) + remaining -= extra + total_weight -= w + + # If there's still remainder (due to max_per_gap limits), do another pass + while remaining > 0: + for i in range(num_gaps): + if extra_silences[i] < max_per_gap_ms and remaining > 0: + add = min(remaining, max_per_gap_ms - extra_silences[i]) + extra_silences[i] += add + remaining -= add + if remaining > 0: + # Can't distribute more (all gaps at max) + break + + logger.debug(f"Distributed {remainder_ms}ms across {num_gaps} gaps: {extra_silences}") + + return extra_silences + + +def repeat_clips_to_fill_duration( + source_audios: List[AudioSegment], + source_categories: List[str], + target_duration_seconds: float, + source_clip_duration_seconds: float = 5.0, + min_silence_ms: int = 100 +) -> Tuple[List[AudioSegment], List[str], int]: + """ + Repeat source clips to fill target duration, cycling through all sources. + + This ensures all unique sources appear and are repeated proportionally. + + Args: + source_audios: List of unique source audio segments + source_categories: List of category names corresponding to source_audios + target_duration_seconds: Target total duration + source_clip_duration_seconds: Duration of each source clip + min_silence_ms: Minimum silence between clips + + Returns: + Tuple of (expanded_audio_list, expanded_categories, num_clips) + """ + num_clips, remainder = get_max_clip_num_to_be_joined( + target_duration_seconds, + source_clip_duration_seconds, + min_silence_ms + ) + + num_sources = len(source_audios) + + if num_sources == 0: + raise ValueError("source_audios cannot be empty") + + # Build expanded lists by cycling through sources + expanded_audios = [] + expanded_categories = [] + + for i in range(num_clips): + idx = i % num_sources + expanded_audios.append(source_audios[idx]) + expanded_categories.append(source_categories[idx]) + + logger.debug( + f"Repeated {num_sources} sources to {num_clips} clips for " + f"{target_duration_seconds}s target duration" + ) + + return expanded_audios, expanded_categories, num_clips + + +def build_consecutive_sources_for_count_task( + source_audios: List[AudioSegment], + source_categories: List[str], + target_duration_seconds: float, + source_clip_duration_seconds: float = 5.0, + min_silence_between_sources_ms: int = 100, + max_extra_silence_per_gap_ms: int = 500, + crossfade_within_source_ms: int = 50 +) -> Tuple[AudioSegment, List[str], dict]: + """ + Build audio for COUNT task with consecutive same-class clips. 
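+
+    Illustrative output layout for three sources A, B, C (block widths and
+    silence lengths vary per sample):
+
+        [A A A] --silence-- [B B] --silence-- [C C C]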
+ + For count task, same-class clips must be consecutive (AAA BBB CCC) so they + are perceived as ONE sound source. Silences are only inserted BETWEEN + different classes, not within same-class repetitions. + + Pipeline: pick classes -> for each class concatenate clips consecutively -> + insert silences only between different classes -> distribute remainder + + Args: + source_audios: List of unique source audio segments (one per class) + source_categories: List of category names + target_duration_seconds: Target total duration + source_clip_duration_seconds: Duration of each source clip + min_silence_between_sources_ms: Minimum silence between different sources + max_extra_silence_per_gap_ms: Max extra silence per gap for remainder distribution + crossfade_within_source_ms: Small crossfade within same-source repetitions + + Returns: + Tuple of (final_audio, category_sequence, metadata_dict) + """ + target_ms = int(target_duration_seconds * 1000) + source_ms = int(source_clip_duration_seconds * 1000) + num_sources = len(source_audios) + + if num_sources == 0: + raise ValueError("source_audios cannot be empty") + + # Calculate total clips needed + num_clips, remainder_seconds = get_max_clip_num_to_be_joined( + target_duration_seconds, + source_clip_duration_seconds, + min_silence_between_sources_ms + ) + + # Safety check: if more sources than clips can fit, warn + if num_sources > num_clips: + logger.warning( + f"More sources ({num_sources}) than clips that fit ({num_clips}). " + f"Each source needs at least 1 clip, so output may exceed target duration. " + f"Consider capping n_unique_audios <= max_clips in task_count.py" + ) + # Each source gets exactly 1 rep if there are more sources than clips + num_clips = num_sources # This will exceed target but ensures each source is included + + # Distribute clips across sources as evenly as possible + # Each source gets at least 1 clip since num_sources <= num_clips + base_reps = num_clips // num_sources + extra_reps = num_clips % num_sources + + repetitions_per_source = [] + for i in range(num_sources): + reps = base_reps + (1 if i < extra_reps else 0) + repetitions_per_source.append(reps) + + # Shuffle repetition assignment to add variety + random.shuffle(repetitions_per_source) + + # Build each source's audio block (consecutive clips of same class) + source_blocks = [] + category_sequence = [] + + for i, (audio, category, reps) in enumerate(zip(source_audios, source_categories, repetitions_per_source)): + if reps == 0: + continue + + # Concatenate same-source clips with minimal/no gap (just small crossfade) + block = audio + for _ in range(reps - 1): + if crossfade_within_source_ms > 0: + block = block.append(audio, crossfade=crossfade_within_source_ms) + else: + block = block + audio + + source_blocks.append(block) + category_sequence.append(category) + + # Now we have N source blocks, need to join them with silences + # Number of gaps = num_source_blocks - 1 + num_gaps = len(source_blocks) - 1 + + if num_gaps <= 0: + # Only one source block + final_audio = source_blocks[0] + else: + # Calculate total audio duration from blocks + total_blocks_ms = sum(len(block) for block in source_blocks) + min_total_silence_ms = num_gaps * min_silence_between_sources_ms + + # Available for extra silences + available_extra_ms = target_ms - total_blocks_ms - min_total_silence_ms + available_extra_ms = max(0, available_extra_ms) + + # Distribute extra silence across gaps + extra_silences = distribute_remainder_as_silences( + available_extra_ms, + num_gaps, + 
max_extra_silence_per_gap_ms + ) + + # Build final audio with silences between source blocks + final_audio = source_blocks[0] + for i, block in enumerate(source_blocks[1:]): + gap_silence_ms = min_silence_between_sources_ms + extra_silences[i] + silence = AudioSegment.silent(duration=gap_silence_ms) + final_audio = final_audio + silence + block + + # Trim or pad to exact target duration + if len(final_audio) > target_ms: + final_audio = final_audio[:target_ms] + elif len(final_audio) < target_ms: + padding = AudioSegment.silent(duration=target_ms - len(final_audio)) + final_audio = final_audio + padding + + # Create metadata + metadata = { + 'num_unique_sources': num_sources, + 'total_clips': num_clips, + 'ordering_mode': 'consecutive', + 'repetitions_per_source': dict(zip(source_categories, repetitions_per_source)), + 'target_duration_ms': target_ms, + 'actual_duration_ms': len(final_audio), + 'num_gaps_between_sources': num_gaps + } + + logger.debug( + f"Count task (consecutive): {num_sources} sources, {num_clips} total clips, " + f"reps={repetitions_per_source}, duration={len(final_audio)}ms" + ) + + return final_audio, category_sequence, metadata + + +def build_random_order_for_count_task( + source_audios: List[AudioSegment], + source_categories: List[str], + target_duration_seconds: float, + source_clip_duration_seconds: float = 5.0, + min_silence_ms: int = 100, + max_extra_silence_per_gap_ms: int = 500 +) -> Tuple[AudioSegment, List[str], dict]: + """ + Build audio for COUNT task with RANDOM ordering of clips. + + Clips from different sources are shuffled randomly (A B A C B A C...). + This tests whether the model can recognize recurring sounds as the same source. + Silences are inserted between ALL clips (same or different source). + + Pipeline: + 1. Calculate total clips needed + 2. Distribute clips across sources + 3. Create expanded list with all clip instances + 4. Shuffle randomly + 5. Insert silences between ALL clips + 6. Distribute remainder as extra random silences + + Args: + source_audios: List of unique source audio segments (one per class) + source_categories: List of category names + target_duration_seconds: Target total duration + source_clip_duration_seconds: Duration of each source clip + min_silence_ms: Minimum silence between ALL clips + max_extra_silence_per_gap_ms: Max extra silence per gap + + Returns: + Tuple of (final_audio, clip_sequence, metadata_dict) + """ + target_ms = int(target_duration_seconds * 1000) + source_ms = int(source_clip_duration_seconds * 1000) + num_sources = len(source_audios) + + if num_sources == 0: + raise ValueError("source_audios cannot be empty") + + # Calculate total clips needed + num_clips, remainder_seconds = get_max_clip_num_to_be_joined( + target_duration_seconds, + source_clip_duration_seconds, + min_silence_ms + ) + + # Safety check: if more sources than clips can fit, warn and cap sources + if num_sources > num_clips: + logger.warning( + f"More sources ({num_sources}) than clips that fit ({num_clips}). " + f"Each source needs at least 1 clip, so output may exceed target duration. 
" + f"Consider capping n_unique_audios <= max_clips in task_count.py" + ) + # Each source gets exactly 1 rep if there are more sources than clips + num_clips = num_sources # This will exceed target but ensures each source is included + + # Distribute clips across sources as evenly as possible + base_reps = num_clips // num_sources # At least 1 since num_sources <= num_clips (after cap) + extra_reps = num_clips % num_sources + + repetitions_per_source = [] + for i in range(num_sources): + reps = base_reps + (1 if i < extra_reps else 0) + repetitions_per_source.append(reps) + + # Build expanded list of (audio, category) pairs + expanded_clips = [] + for audio, category, reps in zip(source_audios, source_categories, repetitions_per_source): + for _ in range(reps): + expanded_clips.append((audio, category)) + + # Shuffle the clips randomly + random.shuffle(expanded_clips) + + # Extract shuffled audios and categories + shuffled_audios = [clip[0] for clip in expanded_clips] + clip_sequence = [clip[1] for clip in expanded_clips] + + # Build final audio with silences between ALL clips + final_audio = build_clip_sequence_with_silences( + shuffled_audios, + target_duration_seconds, + min_silence_ms=min_silence_ms, + max_extra_silence_per_gap_ms=max_extra_silence_per_gap_ms, + crossfade_ms=0 # No crossfade for random ordering + ) + + # Create metadata + metadata = { + 'num_unique_sources': num_sources, + 'total_clips': len(expanded_clips), + 'ordering_mode': 'random', + 'repetitions_per_source': dict(zip(source_categories, repetitions_per_source)), + 'clip_sequence': clip_sequence, + 'target_duration_ms': target_ms, + 'actual_duration_ms': len(final_audio), + 'num_gaps': len(expanded_clips) - 1 + } + + logger.debug( + f"Count task (random): {num_sources} sources, {len(expanded_clips)} clips, " + f"sequence={clip_sequence[:5]}..., duration={len(final_audio)}ms" + ) + + return final_audio, clip_sequence, metadata + + +def build_count_task_audio( + source_audios: List[AudioSegment], + source_categories: List[str], + target_duration_seconds: float, + ordering_mode: str = "random", + source_clip_duration_seconds: float = 5.0, + min_silence_ms: int = 100, + max_extra_silence_per_gap_ms: int = 500, + crossfade_within_source_ms: int = 50 +) -> Tuple[AudioSegment, List[str], dict]: + """ + Build audio for COUNT task with configurable ordering mode. 
+ + Args: + source_audios: List of unique source audio segments (one per class) + source_categories: List of category names + target_duration_seconds: Target total duration + ordering_mode: "random" or "consecutive" + - "random": Clips shuffled (A B A C B A C) - tests sound recognition + - "consecutive": Same-source grouped (AAA BBB CCC) - easier + source_clip_duration_seconds: Duration of each source clip + min_silence_ms: Minimum silence between clips + max_extra_silence_per_gap_ms: Max extra silence per gap + crossfade_within_source_ms: Crossfade for consecutive mode only + + Returns: + Tuple of (final_audio, clip_sequence, metadata_dict) + """ + if ordering_mode == "consecutive": + return build_consecutive_sources_for_count_task( + source_audios, + source_categories, + target_duration_seconds, + source_clip_duration_seconds, + min_silence_ms, + max_extra_silence_per_gap_ms, + crossfade_within_source_ms + ) + else: # random (default) + return build_random_order_for_count_task( + source_audios, + source_categories, + target_duration_seconds, + source_clip_duration_seconds, + min_silence_ms, + max_extra_silence_per_gap_ms + ) + + +# ============================================================================= +# DURATION TASK FUNCTIONS +# ============================================================================= + +def calculate_duration_slot_distribution( + target_total_duration_s: float, + effective_durations: Dict[str, float], + target_category: str, + question_type: str, + multiplier_longest: float = 1.5, + multiplier_shortest: float = 0.5, + min_silence_between_sources_ms: int = 100 +) -> Tuple[Dict[str, int], bool, Dict]: + """ + Calculate how many repetitions each source gets for duration task. + + For LONGEST: target gets max repetitions, backgrounds get 1 each + For SHORTEST: target gets 1, backgrounds share remaining duration + + Args: + target_total_duration_s: Target total audio duration + effective_durations: Dict mapping category -> effective duration in seconds + target_category: The category that should be longest/shortest + question_type: "longest" or "shortest" + multiplier_longest: target >= max_background * this + multiplier_shortest: target <= min_background * this + min_silence_between_sources_ms: Minimum silence between different sources + + Returns: + Tuple of (slot_distribution, gap_satisfied, metadata) + slot_distribution: Dict mapping category -> number of repetitions + gap_satisfied: Whether the duration gap constraint is met + metadata: Additional info about the calculation + """ + categories = list(effective_durations.keys()) + n_sources = len(categories) + + if n_sources < 2: + # Single source - always satisfies constraint + reps = max(1, int(target_total_duration_s / effective_durations[target_category])) + return {target_category: reps}, True, {'note': 'single_source'} + + # Total silence between sources + total_silence_s = (n_sources - 1) * min_silence_between_sources_ms / 1000.0 + available_for_audio_s = target_total_duration_s - total_silence_s + + background_categories = [c for c in categories if c != target_category] + + if question_type == "longest": + # Backgrounds get 1 rep each + background_duration_s = sum(effective_durations[c] for c in background_categories) + + # Remaining for target + remaining_for_target_s = available_for_audio_s - background_duration_s + target_duration_per_rep = effective_durations[target_category] + + # Calculate reps for target + target_reps = max(1, int(remaining_for_target_s / target_duration_per_rep)) + 
actual_target_duration = target_reps * target_duration_per_rep + + # Verify gap + max_background_duration = max(effective_durations[c] for c in background_categories) + required_target_duration = max_background_duration * multiplier_longest + gap_satisfied = actual_target_duration >= required_target_duration + + slot_distribution = {c: 1 for c in background_categories} + slot_distribution[target_category] = target_reps + + metadata = { + 'available_for_audio_s': available_for_audio_s, + 'background_duration_s': background_duration_s, + 'remaining_for_target_s': remaining_for_target_s, + 'target_reps': target_reps, + 'actual_target_duration_s': actual_target_duration, + 'max_background_duration_s': max_background_duration, + 'required_target_duration_s': required_target_duration, + 'multiplier_used': multiplier_longest + } + + else: # shortest + # Target gets 1 rep + target_duration_s = effective_durations[target_category] + + # Remaining for backgrounds + remaining_for_backgrounds_s = available_for_audio_s - target_duration_s + + # Distribute remaining to backgrounds as evenly as possible + # while ensuring each background is longer than target * 1/multiplier + slot_distribution = {target_category: 1} + + # Calculate minimum required duration for each background + min_background_required = target_duration_s / multiplier_shortest + + background_reps = {} + for cat in background_categories: + eff_dur = effective_durations[cat] + # How many reps needed to exceed min_background_required? + min_reps = max(1, int(min_background_required / eff_dur) + 1) + background_reps[cat] = min_reps + + # Check if we have room for all backgrounds + total_background_needed = sum( + background_reps[c] * effective_durations[c] + for c in background_categories + ) + + if total_background_needed <= remaining_for_backgrounds_s: + # Distribute extra reps + extra_available = remaining_for_backgrounds_s - total_background_needed + + # Add extra reps to backgrounds proportionally + while extra_available > 0: + added_any = False + for cat in background_categories: + eff_dur = effective_durations[cat] + if extra_available >= eff_dur: + background_reps[cat] += 1 + extra_available -= eff_dur + added_any = True + if not added_any: + break + + slot_distribution.update(background_reps) + gap_satisfied = True + else: + # Not enough room - use minimum reps anyway + slot_distribution.update(background_reps) + gap_satisfied = False + + # Calculate actual durations + actual_durations = { + cat: slot_distribution[cat] * effective_durations[cat] + for cat in categories + } + min_background_actual = min( + actual_durations[c] for c in background_categories + ) + + # Re-verify gap + gap_satisfied = actual_durations[target_category] <= min_background_actual * multiplier_shortest + + metadata = { + 'available_for_audio_s': available_for_audio_s, + 'target_duration_s': target_duration_s, + 'remaining_for_backgrounds_s': remaining_for_backgrounds_s, + 'min_background_required_s': min_background_required, + 'actual_durations_s': actual_durations, + 'min_background_actual_s': min_background_actual, + 'multiplier_used': multiplier_shortest + } + + return slot_distribution, gap_satisfied, metadata + + +def build_duration_task_audio( + source_audio_lists: Dict[str, List[AudioSegment]], + slot_distribution: Dict[str, int], + effective_durations: Dict[str, float], + target_total_duration_s: float, + min_silence_between_sources_ms: int = 100, + max_extra_silence_per_gap_ms: int = 500, + crossfade_within_source_ms: int = 50 +) -> 
Tuple[AudioSegment, List[str], Dict]: + """ + Build audio for DURATION task with consecutive ordering per source. + + Structure: [SourceA × n] + silence + [SourceB × m] + silence + ... + Order of sources is randomized to avoid patterns. + + Args: + source_audio_lists: Dict mapping category -> list of audio segments + slot_distribution: Dict mapping category -> number of repetitions + effective_durations: Dict mapping category -> effective duration per clip + target_total_duration_s: Target total duration + min_silence_between_sources_ms: Min silence between different sources + max_extra_silence_per_gap_ms: Max extra silence per gap + crossfade_within_source_ms: Crossfade between same-source repetitions + + Returns: + Tuple of (final_audio, category_sequence, metadata) + """ + categories = list(slot_distribution.keys()) + + # Randomize source order + random.shuffle(categories) + + # Build audio blocks for each source + source_blocks = [] + category_sequence = [] + actual_durations = {} + block_durations_ms = [] # Track duration of each block for timestamp calculation + + for category in categories: + reps = slot_distribution[category] + audio_list = source_audio_lists[category] + + if reps == 0: + continue + + # Build block for this source + block = audio_list[0] + for i in range(1, reps): + # Use same clip or cycle through available clips + next_clip = audio_list[i % len(audio_list)] + + # Crossfade within same source + if crossfade_within_source_ms > 0: + if len(block) > crossfade_within_source_ms and len(next_clip) > crossfade_within_source_ms: + block = block.append(next_clip, crossfade=crossfade_within_source_ms) + else: + block = block + next_clip + else: + block = block + next_clip + + source_blocks.append((category, block)) + block_durations_ms.append(len(block)) + category_sequence.extend([category] * reps) + actual_durations[category] = len(block) / 1000.0 + + # Calculate total audio duration and available extra silence + total_audio_ms = sum(len(block) for _, block in source_blocks) + num_gaps = len(source_blocks) - 1 + min_total_silence_ms = num_gaps * min_silence_between_sources_ms + + target_ms = int(target_total_duration_s * 1000) + available_extra_ms = target_ms - total_audio_ms - min_total_silence_ms + + # Distribute extra silence + if available_extra_ms > 0 and num_gaps > 0: + extra_silences = distribute_remainder_as_silences( + available_extra_ms, + num_gaps, + max_extra_silence_per_gap_ms + ) + else: + extra_silences = [0] * max(num_gaps, 1) + + # Concatenate with silences and track timestamps + source_timestamps = [] # List of (category, start_ms, end_ms) + current_position_ms = 0 + + if len(source_blocks) == 1: + final_audio = source_blocks[0][1] + cat, block = source_blocks[0] + source_timestamps.append((cat, 0, len(block))) + else: + final_audio = source_blocks[0][1] + cat, block = source_blocks[0] + source_timestamps.append((cat, 0, len(block))) + current_position_ms = len(block) + + for i, (cat, block) in enumerate(source_blocks[1:]): + gap_silence_ms = min_silence_between_sources_ms + extra_silences[i] + silence = AudioSegment.silent(duration=gap_silence_ms) + + # Prefer crossfading from audio -> silence for a smooth transition, + # but avoid crossfading silence -> audio (it cuts the start of the next clip). 
+            # Conditions for safe crossfade:
+            # - crossfade length should be less than gap silence
+            # - both segments must be longer than crossfade
+            crossfade_ms = min(500, gap_silence_ms)
+            if crossfade_ms > 0 and crossfade_ms < gap_silence_ms and len(final_audio) > crossfade_ms and len(block) > crossfade_ms:
+                final_audio = final_audio.append(silence, crossfade=crossfade_ms)
+                # Append next block without crossfade to avoid trimming its start
+                final_audio = final_audio.append(block, crossfade=0)
+                # Track timestamp after silence: the audio->silence crossfade
+                # overlaps the previous audio by crossfade_ms, so the block
+                # starts that much earlier than a plain concatenation would
+                start_ms = current_position_ms + gap_silence_ms - crossfade_ms
+                end_ms = start_ms + len(block)
+                source_timestamps.append((cat, start_ms, end_ms))
+                current_position_ms = end_ms
+            else:
+                # Fall back to simple concatenation
+                final_audio = final_audio + silence + block
+                start_ms = current_position_ms + gap_silence_ms
+                end_ms = start_ms + len(block)
+                source_timestamps.append((cat, start_ms, end_ms))
+                current_position_ms = end_ms
+
+    # Adjust to target duration
+    if len(final_audio) > target_ms:
+        final_audio = final_audio[:target_ms]
+    elif len(final_audio) < target_ms:
+        padding = AudioSegment.silent(duration=target_ms - len(final_audio))
+        final_audio = final_audio + padding
+
+    # Build timestamp string: "category1 start-end, category2 start-end, ..."
+    timestamp_parts = []
+    for cat, start_ms, end_ms in source_timestamps:
+        start_s = round(start_ms / 1000.0, 2)
+        end_s = round(end_ms / 1000.0, 2)
+        duration_s = round((end_ms - start_ms) / 1000.0, 2)
+        timestamp_parts.append(f"{cat} {start_s}s-{end_s}s ({duration_s}s)")
+    timestamp_string = ", ".join(timestamp_parts)
+
+    metadata = {
+        'source_order': [cat for cat, _ in source_blocks],
+        'slot_distribution': slot_distribution,
+        'actual_durations_s': actual_durations,
+        'total_audio_ms': total_audio_ms,
+        'num_gaps': num_gaps,
+        'final_duration_ms': len(final_audio),
+        'source_timestamps': source_timestamps,  # List of (category, start_ms, end_ms)
+        'timestamp_string': timestamp_string  # Human-readable format
+    }
+
+    logger.debug(
+        f"Duration task audio: {len(source_blocks)} sources, "
+        f"order={metadata['source_order']}, duration={len(final_audio)}ms"
+    )
+
+    return final_audio, category_sequence, metadata
diff --git a/utils/dataset_utils.py b/utils/dataset_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..49a7465aee42369fce4761a4d5527d3a5f02817f
--- /dev/null
+++ b/utils/dataset_utils.py
@@ -0,0 +1,536 @@
+"""
+ESC-50 dataset utilities for loading and sampling audio data.
+"""
+
+import csv
+import json
+import random
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import pandas as pd
+
+from .logger import setup_logger
+
+logger = setup_logger(__name__)
+
+
+def load_or_create_class_subset(config: dict, all_categories: List[str]) -> List[str]:
+    """
+    Load persisted class subset or create a new one.
+
+    Args:
+        config: Configuration dictionary with dataset.use_class_subset, etc.
+ all_categories: List of all available categories + + Returns: + List of category names to use (either subset or all) + """ + dataset_config = config.get('dataset', {}) + use_subset = dataset_config.get('use_class_subset', False) + + if not use_subset: + logger.info(f"Using all {len(all_categories)} classes") + return all_categories + + num_classes = dataset_config.get('num_classes_subset', len(all_categories)) + persist_path = Path(dataset_config.get('subset_persist_path', 'class_subset.json')) + subset_seed = dataset_config.get('subset_seed', 42) + + # Try to load existing subset + if persist_path.exists(): + try: + with open(persist_path, 'r') as f: + data = json.load(f) + subset = data.get('classes', []) + + # Validate subset + if len(subset) == num_classes and all(c in all_categories for c in subset): + logger.info(f"Loaded persisted class subset from {persist_path}: {len(subset)} classes") + return subset + else: + logger.warning(f"Invalid persisted subset, regenerating...") + except Exception as e: + logger.warning(f"Failed to load persisted subset: {e}, regenerating...") + + # Create new subset + random.seed(subset_seed) + subset = random.sample(all_categories, min(num_classes, len(all_categories))) + subset.sort() # Sort for consistency + + # Persist subset + persist_path.parent.mkdir(parents=True, exist_ok=True) + with open(persist_path, 'w') as f: + json.dump({ + 'classes': subset, + 'num_classes': len(subset), + 'seed': subset_seed, + 'total_available': len(all_categories) + }, f, indent=2) + + logger.info(f"Created and persisted new class subset: {len(subset)} classes to {persist_path}") + return subset + + +class ESC50Dataset: + """Handler for ESC-50 dataset.""" + + # All 50 ESC-50 sound categories + ALL_CATEGORIES = [ + 'dog', 'chirping_birds', 'vacuum_cleaner', 'thunderstorm', 'door_wood_knock', + 'can_opening', 'crow', 'clapping', 'fireworks', 'chainsaw', 'airplane', + 'mouse_click', 'pouring_water', 'train', 'sheep', 'water_drops', 'church_bells', + 'clock_alarm', 'keyboard_typing', 'wind', 'footsteps', 'frog', 'cow', + 'brushing_teeth', 'car_horn', 'crackling_fire', 'helicopter', 'drinking_sipping', + 'rain', 'insects', 'laughing', 'hen', 'engine', 'breathing', 'crying_baby', + 'hand_saw', 'coughing', 'glass_breaking', 'snoring', 'toilet_flush', 'pig', + 'washing_machine', 'clock_tick', 'sneezing', 'rooster', 'sea_waves', 'siren', + 'cat', 'door_wood_creaks', 'crickets' + ] + + def __init__(self, metadata_path: str, audio_path: str, config: Optional[dict] = None): + """ + Initialize ESC-50 dataset handler. 
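+
+        Construction sketch (the paths are illustrative; they follow the
+        standard ESC-50 repository layout):
+
+            dataset = ESC50Dataset(
+                metadata_path="ESC-50/meta/esc50.csv",
+                audio_path="ESC-50/audio",
+                config=config,
+            )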
+ + Args: + metadata_path: Path to esc50.csv metadata file + audio_path: Path to audio directory + config: Optional configuration dict with dataset.use_class_subset settings + """ + self.metadata_path = Path(metadata_path) + self.audio_path = Path(audio_path) + self.config = config or {} + self.df = None + self.category_to_target = {} + self.target_to_category = {} + + # Load class subset if configured + self.CATEGORIES = load_or_create_class_subset(self.config, self.ALL_CATEGORIES) + self.category_usage_counts = {cat: 0 for cat in self.CATEGORIES} + + self.load_metadata() + + def load_metadata(self): + """Load ESC-50 metadata CSV.""" + try: + self.df = pd.read_csv(self.metadata_path) + logger.info(f"Loaded ESC-50 metadata: {len(self.df)} files") + + # Create category mappings + for target, category in zip(self.df['target'], self.df['category']): + self.category_to_target[category] = target + self.target_to_category[target] = category + + logger.info(f"Found {len(self.category_to_target)} unique categories") + except Exception as e: + logger.error(f"Error loading metadata: {e}") + raise + + def get_files_by_category(self, category: str) -> List[str]: + """ + Get all audio files for a specific category. + + Args: + category: Sound category name + + Returns: + List of filenames for the category + """ + if category not in self.category_to_target: + raise ValueError(f"Unknown category: {category}") + + target = self.category_to_target[category] + files = self.df[self.df['target'] == target]['filename'].tolist() + return files + + def get_files_by_target(self, target: int) -> List[str]: + """ + Get all audio files for a specific target ID. + + Args: + target: Target class ID (0-49) + + Returns: + List of filenames for the target + """ + files = self.df[self.df['target'] == target]['filename'].tolist() + return files + + def sample_categories(self, n: int, exclude: Optional[List[str]] = None) -> List[str]: + """ + Sample n unique random categories from the active subset. + + Args: + n: Number of categories to sample + exclude: Optional list of categories to exclude + + Returns: + List of sampled category names + """ + available = [c for c in self.CATEGORIES if c not in (exclude or [])] + if n > len(available): + raise ValueError(f"Cannot sample {n} categories from subset, only {len(available)} available (subset size: {len(self.CATEGORIES)})") + return random.sample(available, n) + + def sample_targets(self, n: int, exclude: Optional[List[int]] = None) -> List[int]: + """ + Sample n unique random targets from the active subset. + + Args: + n: Number of targets to sample + exclude: Optional list of targets to exclude + + Returns: + List of sampled target IDs corresponding to categories in the subset + """ + # Get targets corresponding to categories in the subset + available_targets = [self.category_to_target[cat] for cat in self.CATEGORIES] + available = [t for t in available_targets if t not in (exclude or [])] + if n > len(available): + raise ValueError(f"Cannot sample {n} targets from subset, only {len(available)} available (subset size: {len(self.CATEGORIES)})") + return random.sample(available, n) + + def sample_file_from_category(self, category: str) -> Tuple[str, str]: + """ + Sample a random audio file from a category. 
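+
+        Example return value (the filename is illustrative of ESC-50 naming):
+
+            ("1-100032-A-0.wav", "/path/to/ESC-50/audio/1-100032-A-0.wav")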
+
+        Args:
+            category: Sound category name
+
+        Returns:
+            Tuple of (filename, full_path)
+        """
+        files = self.get_files_by_category(category)
+        filename = random.choice(files)
+        full_path = str(self.audio_path / filename)
+        return filename, full_path
+
+    def sample_file_from_target(self, target: int) -> Tuple[str, str, str]:
+        """
+        Sample a random audio file from a target.
+
+        Args:
+            target: Target class ID
+
+        Returns:
+            Tuple of (filename, category, full_path)
+        """
+        files = self.get_files_by_target(target)
+        filename = random.choice(files)
+        category = self.target_to_category[target]
+        full_path = str(self.audio_path / filename)
+        return filename, category, full_path
+
+    def get_category_from_filename(self, filename: str) -> str:
+        """Get category name from filename."""
+        row = self.df[self.df['filename'] == filename]
+        if len(row) == 0:
+            raise ValueError(f"Unknown filename: {filename}")
+        return row.iloc[0]['category']
+
+    def get_file_path(self, filename: str) -> str:
+        """Get full path for a filename."""
+        return str(self.audio_path / filename)
+
+    def sample_categories_balanced(self, n: int, exclude: Optional[List[str]] = None,
+                                   answer_category: Optional[str] = None) -> List[str]:
+        """
+        Sample n unique categories with balanced usage tracking.
+
+        This method records how often each category is used as the answer;
+        combined with get_least_used_categories(), callers can keep answers
+        roughly evenly distributed across categories over many samples.
+
+        Args:
+            n: Number of categories to sample
+            exclude: Optional list of categories to exclude
+            answer_category: If provided, ensures this category is included and tracks it
+
+        Returns:
+            List of sampled category names with answer_category first if provided
+        """
+        available = [c for c in self.CATEGORIES if c not in (exclude or [])]
+        if n > len(available):
+            raise ValueError(f"Cannot sample {n} categories, only {len(available)} available")
+
+        if answer_category:
+            # Track answer category usage
+            self.category_usage_counts[answer_category] += 1
+
+            # Remove answer category from available and sample the rest
+            available = [c for c in available if c != answer_category]
+            other_categories = random.sample(available, n - 1)
+            return [answer_category] + other_categories
+        else:
+            # Sample without specific answer category
+            return random.sample(available, n)
+
+    def get_least_used_categories(self, n: int, exclude: Optional[List[str]] = None) -> List[str]:
+        """
+        Get n categories that have been used least as answers.
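+
+        Illustrative example: with usage counts {"dog": 2, "rain": 0, "wind": 0}
+        and n=1, the result is sampled from {"rain", "wind"}, the least-used tier.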
+ + Args: + n: Number of categories to get + exclude: Optional list of categories to exclude + + Returns: + List of least-used category names + """ + available = [c for c in self.CATEGORIES if c not in (exclude or [])] + if n > len(available): + raise ValueError(f"Cannot get {n} categories, only {len(available)} available") + + # Sort by usage count (ascending) and take n least used + sorted_categories = sorted(available, key=lambda c: self.category_usage_counts[c]) + + # Among least used, get all with same minimum count + min_count = self.category_usage_counts[sorted_categories[0]] + candidates = [c for c in sorted_categories if self.category_usage_counts[c] == min_count] + + if len(candidates) >= n: + # Randomly sample from least used + return random.sample(candidates, n) + else: + # Take all minimum and fill with next tier + result = candidates.copy() + remaining = n - len(result) + next_tier = [c for c in sorted_categories if c not in candidates][:remaining] + result.extend(next_tier) + return result + + def get_category_usage_stats(self) -> Dict[str, int]: + """Get current category usage statistics.""" + return self.category_usage_counts.copy() + + def reset_category_usage(self): + """Reset category usage tracking.""" + self.category_usage_counts = {cat: 0 for cat in self.CATEGORIES} + logger.info("Reset category usage tracking") + + +class PreprocessedESC50Dataset(ESC50Dataset): + """ + Handler for preprocessed ESC-50 dataset with effective durations. + + Extends ESC50Dataset to use trimmed audio files and effective duration + metadata from amplitude-based preprocessing. + """ + + def __init__( + self, + metadata_path: str, + audio_path: str, + preprocessed_path: str, + config: Optional[dict] = None + ): + """ + Initialize preprocessed ESC-50 dataset handler. + + Args: + metadata_path: Path to original esc50.csv metadata file + audio_path: Path to original audio directory (fallback) + preprocessed_path: Path to preprocessed data directory + config: Optional configuration dict with dataset.use_class_subset settings + """ + super().__init__(metadata_path, audio_path, config) + + self.preprocessed_path = Path(preprocessed_path) + self.trimmed_audio_path = self.preprocessed_path / "trimmed_audio" + self.effective_durations_path = self.preprocessed_path / "effective_durations.csv" + + # Load effective durations + self.effective_df = None + self.load_effective_durations() + + def load_effective_durations(self): + """Load effective durations from preprocessed CSV.""" + try: + self.effective_df = pd.read_csv(self.effective_durations_path) + logger.info(f"Loaded effective durations for {len(self.effective_df)} clips") + + # Create quick lookup dictionaries + self.filename_to_effective = dict( + zip(self.effective_df['filename'], self.effective_df['effective_duration_s']) + ) + self.filename_to_category = dict( + zip(self.effective_df['filename'], self.effective_df['category']) + ) + + # Category-level statistics + self.category_effective_stats = self.effective_df.groupby('category').agg({ + 'effective_duration_s': ['mean', 'std', 'min', 'max', 'count'] + }).round(4) + self.category_effective_stats.columns = ['mean', 'std', 'min', 'max', 'count'] + + logger.info("Created effective duration lookup tables") + + except Exception as e: + logger.error(f"Error loading effective durations: {e}") + raise + + def get_effective_duration(self, filename: str) -> float: + """ + Get effective duration for a specific file. 
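+
+        Example (illustrative): a raw 5.0s ESC-50 clip whose leading and
+        trailing silence was trimmed away, leaving 3.2s of audible content,
+        returns 3.2.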
+ + Args: + filename: Audio filename + + Returns: + Effective duration in seconds + """ + if filename not in self.filename_to_effective: + logger.warning(f"No effective duration for {filename}, using default 5.0s") + return 5.0 + return self.filename_to_effective[filename] + + def get_category_effective_stats(self, category: str) -> Dict: + """ + Get effective duration statistics for a category. + + Args: + category: Category name + + Returns: + Dict with mean, std, min, max, count + """ + if category not in self.category_effective_stats.index: + return {'mean': 5.0, 'std': 0.0, 'min': 5.0, 'max': 5.0, 'count': 0} + + stats = self.category_effective_stats.loc[category] + return { + 'mean': stats['mean'], + 'std': stats['std'], + 'min': stats['min'], + 'max': stats['max'], + 'count': int(stats['count']) + } + + def get_files_by_category_with_durations(self, category: str) -> List[Dict]: + """ + Get all files for a category with their effective durations. + + Args: + category: Category name + + Returns: + List of dicts with filename, effective_duration_s, filepath + """ + cat_df = self.effective_df[self.effective_df['category'] == category] + + results = [] + for _, row in cat_df.iterrows(): + results.append({ + 'filename': row['filename'], + 'effective_duration_s': row['effective_duration_s'], + 'filepath': str(self.trimmed_audio_path / row['filename']), + 'raw_duration_s': row['raw_duration_s'], + 'peak_amplitude_db': row['peak_amplitude_db'] + }) + + return results + + def sample_file_from_category_with_duration( + self, + category: str, + min_effective_duration: float = None, + max_effective_duration: float = None + ) -> Tuple[str, str, float]: + """ + Sample a file from category with optional duration constraints. + + Args: + category: Category name + min_effective_duration: Minimum effective duration (optional) + max_effective_duration: Maximum effective duration (optional) + + Returns: + Tuple of (filename, filepath, effective_duration_s) + """ + files = self.get_files_by_category_with_durations(category) + + # Filter by duration if constraints provided + if min_effective_duration is not None: + files = [f for f in files if f['effective_duration_s'] >= min_effective_duration] + if max_effective_duration is not None: + files = [f for f in files if f['effective_duration_s'] <= max_effective_duration] + + if not files: + # Fallback to any file from category + logger.warning(f"No files match duration constraints for {category}, using any file") + files = self.get_files_by_category_with_durations(category) + + selected = random.choice(files) + return selected['filename'], selected['filepath'], selected['effective_duration_s'] + + def sample_files_from_category_to_reach_duration( + self, + category: str, + target_duration_s: float, + prefer_same_file: bool = True + ) -> Tuple[List[str], List[str], float]: + """ + Sample files from a category to reach a target total effective duration. 
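+
+        Worked example (illustrative numbers, with prefer_same_file=True): if
+        the longest clip lasts 3.5s and target_duration_s=12.0, up to
+        int(12.0 / 3.5) + 1 = 4 repetitions are scheduled; the loop stops once
+        the running total (here 14.0s) reaches the target.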
+ + Args: + category: Category name + target_duration_s: Target total effective duration + prefer_same_file: If True, try repeating same file first + + Returns: + Tuple of (filenames_list, filepaths_list, actual_total_duration_s) + """ + files = self.get_files_by_category_with_durations(category) + + if not files: + raise ValueError(f"No files found for category: {category}") + + selected_filenames = [] + selected_filepaths = [] + total_duration = 0.0 + + if prefer_same_file: + # Sort by effective duration descending (prefer longer clips) + files_sorted = sorted(files, key=lambda x: x['effective_duration_s'], reverse=True) + selected_file = files_sorted[0] + + # Calculate how many repetitions needed + reps_needed = max(1, int(target_duration_s / selected_file['effective_duration_s']) + 1) + + for _ in range(reps_needed): + selected_filenames.append(selected_file['filename']) + selected_filepaths.append(selected_file['filepath']) + total_duration += selected_file['effective_duration_s'] + + if total_duration >= target_duration_s: + break + else: + # Use different files + random.shuffle(files) + file_idx = 0 + + while total_duration < target_duration_s: + selected_file = files[file_idx % len(files)] + selected_filenames.append(selected_file['filename']) + selected_filepaths.append(selected_file['filepath']) + total_duration += selected_file['effective_duration_s'] + file_idx += 1 + + # Safety limit + if file_idx > 100: + logger.warning(f"Hit safety limit when sampling files for {category}") + break + + return selected_filenames, selected_filepaths, total_duration + + def get_categories_sorted_by_effective_duration(self, ascending: bool = True) -> List[str]: + """ + Get categories sorted by their mean effective duration. + + Args: + ascending: If True, shortest first; if False, longest first + + Returns: + List of category names sorted by mean effective duration + """ + sorted_stats = self.category_effective_stats.sort_values('mean', ascending=ascending) + return sorted_stats.index.tolist() + diff --git a/utils/llm_utils.py b/utils/llm_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f75706f7aa3124af1093ed793a438b1585817a72 --- /dev/null +++ b/utils/llm_utils.py @@ -0,0 +1,144 @@ +""" +LLM-based question generation utilities. + +Supports multiple LLM providers for generating natural, lexically consistent questions. +""" + +import os +import random +from typing import Dict, List, Optional, Tuple +import json + +from .logger import setup_logger + +logger = setup_logger(__name__) + + +class LLMQuestionGenerator: + """Generate questions using local Llama 3.1 8B Instruct LLM.""" + + def __init__( + self, + enabled: bool = False, + template_questions: Optional[Dict] = None + ): + """ + Initialize LLM question generator. + + Args: + enabled: Whether LLM generation is enabled + template_questions: Template questions for fallback + """ + self.enabled = enabled + self.template_questions = template_questions or {} + + if not self.enabled: + logger.info("LLM generation disabled, using templates") + return + + # TODO: Initialize local Llama 3.1 8B model connection + # This will be implemented based on your local LLM setup + logger.info("LLM generation enabled (local Llama 3.1 8B)") + logger.warning("Local LLM integration not yet implemented, falling back to templates") + + + def generate_count_questions( + self, + correct_count: int, + categories_present: List[str], + generate_both: bool = True + ) -> Dict: + """ + Generate count task questions. 
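+
+        Illustrative return value (the exact template text varies):
+
+            {
+                "mcq_question": "How many different types of sounds can be identified in this recording?",
+                "open_text_question": "Count the number of unique sounds in this recording."
+            }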
+ + Args: + correct_count: Correct number of unique sounds + categories_present: List of sound categories in the audio + generate_both: Whether to generate both MCQ and open-text + + Returns: + Dictionary with mcq_question and/or open_text_question + """ + # TODO: Implement LLM generation when enabled + # For now, always use templates + return self._generate_count_template(correct_count) + + def generate_category_questions( + self, + task_type: str, + correct_category: str, + categories_present: List[str], + context: Optional[Dict] = None + ) -> Dict: + """ + Generate questions where the answer is a sound category. + + Args: + task_type: Type of task (duration, order, volume) + correct_category: Correct answer category + categories_present: All categories in the audio + context: Additional context (e.g., question_type, reference_sound) + + Returns: + Dictionary with mcq_question and open_text_question + """ + # TODO: Implement LLM generation when enabled + # For now, always use templates + return self._generate_category_template(task_type, correct_category, context) + + def _generate_count_template(self, correct_count: int) -> Dict: + """Generate count questions from templates.""" + mcq_templates = self.template_questions.get("count", {}).get("mcq", [ + "What is the number of distinct sound sources in the audio file?", + "How many different types of sounds can be identified in this recording?" + ]) + open_templates = self.template_questions.get("count", {}).get("open_text", [ + "How many distinct sound sources are present in the audio?", + "Count the number of unique sounds in this recording." + ]) + + return { + "mcq_question": random.choice(mcq_templates), + "open_text_question": random.choice(open_templates) + } + + def _generate_category_template( + self, + task_type: str, + correct_category: str, + context: Optional[Dict] + ) -> Dict: + """Generate category questions from templates.""" + context = context or {} + + if task_type == "duration": + q_type = context.get("question_type", "shortest") + mcq_q = f"Which of the following sounds is heard for the {q_type} duration?" + open_q = f"Which sound is heard for the {q_type} duration in the audio?" + + elif task_type == "order": + q_subtype = context.get("question_subtype", "first") + if q_subtype == "first": + mcq_q = "Which sound appears first in the audio clip?" + open_q = "What is the first sound you hear in the audio?" + elif q_subtype == "last": + mcq_q = "Which sound appears last in the audio clip?" + open_q = "What is the last sound you hear in the audio?" + elif q_subtype == "after": + ref = context.get("reference_sound", "") + mcq_q = f"Which sound comes after {ref}?" + open_q = f"What sound comes after {ref}?" + else: + ref = context.get("reference_sound", "") + mcq_q = f"Which sound comes before {ref}?" + open_q = f"What sound comes before {ref}?" + + else: # volume + q_type = context.get("question_type", "loudest") + mcq_q = f"Which sound is the {q_type} in the audio?" + open_q = f"Identify the {q_type} sound in the audio clip." + + return { + "mcq_question": mcq_q, + "open_text_question": open_q + } diff --git a/utils/logger.py b/utils/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..56757f0620ddd902d4e4fe837b890daa4b71fa09 --- /dev/null +++ b/utils/logger.py @@ -0,0 +1,57 @@ +""" +Logging utilities for the temporal reasoning dataset pipeline. 
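+
+Typical usage (the log file path is illustrative):
+
+    from utils.logger import setup_logger
+
+    logger = setup_logger(__name__, log_file="output/pipeline.log", level="DEBUG")
+    logger.info("Pipeline started")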
+""" + +import logging +import sys +from pathlib import Path +from typing import Optional + + +def setup_logger( + name: str, + log_file: Optional[str] = None, + level: str = "INFO", + console_output: bool = True +) -> logging.Logger: + """ + Set up a logger with file and/or console handlers. + + Args: + name: Name of the logger + log_file: Path to log file (if None, only console logging) + level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) + console_output: Whether to output logs to console + + Returns: + Configured logger instance + """ + logger = logging.getLogger(name) + logger.setLevel(getattr(logging, level.upper())) + + # Remove existing handlers to avoid duplicates + logger.handlers = [] + + # Create formatter + formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + + # Console handler + if console_output: + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setLevel(getattr(logging, level.upper())) + console_handler.setFormatter(formatter) + logger.addHandler(console_handler) + + # File handler + if log_file: + log_path = Path(log_file) + log_path.parent.mkdir(parents=True, exist_ok=True) + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(getattr(logging, level.upper())) + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + + return logger diff --git a/utils/question_utils.py b/utils/question_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..fbe1a2193ca2079523a115a1b514ac6c4caaedf3 --- /dev/null +++ b/utils/question_utils.py @@ -0,0 +1,263 @@ +""" +Question generation utilities for MCQ and open-text formats. +""" + +import random +from typing import Dict, List, Optional, Tuple + +from .logger import setup_logger + +logger = setup_logger(__name__) + + +class QuestionGenerator: + """Generates questions in MCQ and open-text formats.""" + + def __init__( + self, + num_options: int = 4, + option_labels: Optional[List[str]] = None, + distractor_strategy: str = "balanced" + ): + """ + Initialize question generator. + + Args: + num_options: Number of MCQ options + option_labels: Labels for options (e.g., ['A', 'B', 'C', 'D']) + distractor_strategy: Strategy for generating distractor options + - "present_only": only use sounds present in audio + - "mixed": mix of present and absent sounds + - "balanced": balanced distribution + """ + self.num_options = num_options + self.option_labels = option_labels or ["A", "B", "C", "D"] + self.distractor_strategy = distractor_strategy + + if len(self.option_labels) != num_options: + raise ValueError(f"Number of option labels must match num_options ({num_options})") + + def generate_count_mcq( + self, + question_template: str, + correct_count: int, + all_categories: List[str] + ) -> Dict: + """ + Generate an MCQ for counting task. 
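+
+        Illustrative usage (option order is shuffled, so the correct label
+        varies between runs; `all_categories` is not used by the current
+        implementation, so an empty list works here):
+
+            >>> qg = QuestionGenerator()
+            >>> mcq = qg.generate_count_mcq("How many unique sounds do you hear?", 3, [])
+            >>> mcq["correct_value"]
+            3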
+ + Args: + question_template: Question text template + correct_count: Correct number of unique sounds + all_categories: List of all available categories + + Returns: + Dictionary with question, options, and correct answer + """ + # Generate options (including the correct answer) + options = self._generate_count_options(correct_count) + + # Shuffle options + random.shuffle(options) + + # Find correct answer label + correct_label = self.option_labels[options.index(correct_count)] + + # Create option mapping + option_map = {label: value for label, value in zip(self.option_labels, options)} + + return { + "question": question_template, + "options": option_map, + "correct_answer": correct_label, + "correct_value": correct_count + } + + def generate_count_open_text( + self, + question_template: str, + correct_count: int + ) -> Dict: + """ + Generate an open-text question for counting task. + + Args: + question_template: Question text template + correct_count: Correct number of unique sounds + + Returns: + Dictionary with question and correct answer + """ + return { + "question": question_template, + "correct_answer": str(correct_count) + } + + def generate_category_mcq( + self, + question_template: str, + correct_category: str, + present_categories: List[str], + all_categories: List[str] + ) -> Dict: + """ + Generate an MCQ where answer is a sound category. + + Args: + question_template: Question text template + correct_category: Correct category + present_categories: Categories present in the audio + all_categories: All available categories + + Returns: + Dictionary with question, options, and correct answer + """ + # Generate distractor options + distractors = self._generate_category_distractors( + correct_category, + present_categories, + all_categories, + self.num_options - 1 + ) + + # Combine with correct answer + options = [correct_category] + distractors + random.shuffle(options) + + # Find correct answer label + correct_label = self.option_labels[options.index(correct_category)] + + # Create option mapping + option_map = {label: value for label, value in zip(self.option_labels, options)} + + return { + "question": question_template, + "options": option_map, + "correct_answer": correct_label, + "correct_value": correct_category + } + + def generate_category_open_text( + self, + question_template: str, + correct_category: str + ) -> Dict: + """ + Generate an open-text question where answer is a sound category. + + Args: + question_template: Question text template + correct_category: Correct category + + Returns: + Dictionary with question and correct answer + """ + return { + "question": question_template, + "correct_answer": correct_category + } + + def generate_sequence_open_text( + self, + question_template: str, + sequence: List[str] + ) -> Dict: + """ + Generate an open-text question for sequence/ordering. + + Args: + question_template: Question text template + sequence: List of categories in order + + Returns: + Dictionary with question and correct answer + """ + return { + "question": question_template, + "correct_answer": ", ".join(sequence) + } + + def _generate_count_options(self, correct_count: int) -> List[int]: + """ + Generate count options including the correct count. 
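+
+        Worked example (illustrative): for correct_count=3 and num_options=4,
+        distractors are sampled from {1, ..., 11} with 3 excluded, so one
+        possible return value is [3, 7, 1, 10]. The correct count always
+        appears first in the returned list; shuffling happens in
+        generate_count_mcq.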
+ + Args: + correct_count: Correct count value + + Returns: + List of count options + """ + options = [correct_count] + + # Generate distractors (minimum count is 1, not 0) + possible_values = list(range(1, max(correct_count + 3, 12))) + possible_values = [v for v in possible_values if v != correct_count] + + distractors = random.sample(possible_values, min(self.num_options - 1, len(possible_values))) + options.extend(distractors) + + return options[:self.num_options] + + def _generate_category_distractors( + self, + correct_category: str, + present_categories: List[str], + all_categories: List[str], + num_distractors: int + ) -> List[str]: + """ + Generate distractor categories based on strategy. + + Args: + correct_category: Correct category + present_categories: Categories present in audio + all_categories: All available categories + num_distractors: Number of distractors to generate + + Returns: + List of distractor categories + """ + present_non_answer = [c for c in present_categories if c != correct_category] + absent_categories = [c for c in all_categories if c not in present_categories] + + distractors = [] + + if self.distractor_strategy == "present_only": + # Only use categories present in the audio + if len(present_non_answer) >= num_distractors: + distractors = random.sample(present_non_answer, num_distractors) + else: + distractors = present_non_answer.copy() + # Fill remaining with random absent categories + remaining = num_distractors - len(distractors) + distractors.extend(random.sample(absent_categories, min(remaining, len(absent_categories)))) + + elif self.distractor_strategy == "mixed": + # Mix of present and absent (random proportion) + num_present = random.randint(0, min(len(present_non_answer), num_distractors)) + num_absent = num_distractors - num_present + + if num_present > 0: + distractors.extend(random.sample(present_non_answer, min(num_present, len(present_non_answer)))) + if num_absent > 0: + distractors.extend(random.sample(absent_categories, min(num_absent, len(absent_categories)))) + + else: # balanced + # Balanced distribution: 0, 1, or 2 present sounds as distractors + num_present_distractor = random.choice([0, 1, 2]) + num_present_distractor = min(num_present_distractor, len(present_non_answer), num_distractors) + num_absent_distractor = num_distractors - num_present_distractor + + if num_present_distractor > 0: + distractors.extend(random.sample(present_non_answer, num_present_distractor)) + if num_absent_distractor > 0: + distractors.extend(random.sample(absent_categories, min(num_absent_distractor, len(absent_categories)))) + + # Fill remaining slots if needed + while len(distractors) < num_distractors: + remaining_options = [c for c in all_categories if c not in distractors and c != correct_category] + if not remaining_options: + break + distractors.append(random.choice(remaining_options)) + + return distractors[:num_distractors]
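+
+
+# Illustrative smoke test, not part of the pipeline; the category names below
+# are ESC-50-style examples. Run with `python -m utils.question_utils` (a plain
+# `python utils/question_utils.py` would fail on the relative import above).
+if __name__ == "__main__":
+    qg = QuestionGenerator(num_options=4, distractor_strategy="balanced")
+    mcq = qg.generate_category_mcq(
+        question_template="Which sound is the loudest in the audio?",
+        correct_category="dog",
+        present_categories=["dog", "rain", "siren"],
+        all_categories=["dog", "rain", "siren", "clock_tick", "sea_waves", "crow"],
+    )
+    # Example output: {'A': 'rain', 'B': 'dog', 'C': 'crow', 'D': 'sea_waves'} -> B
+    print(mcq["options"], "->", mcq["correct_answer"])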