malay-36 committed on
Commit fec9168 · verified · 1 Parent(s): 1140d11

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +19 -0
  2. DOCS.md +1296 -0
  3. README.md +112 -0
  4. config.yaml +348 -0
  5. llm_answer_generator.py +268 -0
  6. main.py +272 -0
  7. preprocess_esc50.py +714 -0
  8. requirements.txt +6 -0
  9. run_llm_answers_all.sh +28 -0
  10. run_pipeline.sh +166 -0
  11. synthetic_silences/silent_1.wav +3 -0
  12. synthetic_silences/silent_10.wav +3 -0
  13. synthetic_silences/silent_11.wav +3 -0
  14. synthetic_silences/silent_12.wav +3 -0
  15. synthetic_silences/silent_13.wav +3 -0
  16. synthetic_silences/silent_14.wav +3 -0
  17. synthetic_silences/silent_15.wav +3 -0
  18. synthetic_silences/silent_16.wav +3 -0
  19. synthetic_silences/silent_17.wav +3 -0
  20. synthetic_silences/silent_18.wav +3 -0
  21. synthetic_silences/silent_19.wav +0 -0
  22. synthetic_silences/silent_2.wav +3 -0
  23. synthetic_silences/silent_20.wav +3 -0
  24. synthetic_silences/silent_3.wav +3 -0
  25. synthetic_silences/silent_4.wav +3 -0
  26. synthetic_silences/silent_5.wav +3 -0
  27. synthetic_silences/silent_6.wav +3 -0
  28. synthetic_silences/silent_7.wav +3 -0
  29. synthetic_silences/silent_8.wav +3 -0
  30. synthetic_silences/silent_9.wav +3 -0
  31. tasks/__pycache__/task_count.cpython-312.pyc +0 -0
  32. tasks/__pycache__/task_duration.cpython-312.pyc +0 -0
  33. tasks/__pycache__/task_order.cpython-312.pyc +0 -0
  34. tasks/__pycache__/task_volume.cpython-312.pyc +0 -0
  35. tasks/task_count.py +472 -0
  36. tasks/task_duration.py +820 -0
  37. tasks/task_order.py +598 -0
  38. tasks/task_volume.py +732 -0
  39. utils/__init__.py +50 -0
  40. utils/__pycache__/__init__.cpython-312.pyc +0 -0
  41. utils/__pycache__/__init__.cpython-314.pyc +0 -0
  42. utils/__pycache__/audio_utils.cpython-312.pyc +0 -0
  43. utils/__pycache__/audio_utils.cpython-314.pyc +0 -0
  44. utils/__pycache__/dataset_utils.cpython-312.pyc +0 -0
  45. utils/__pycache__/llm_utils.cpython-312.pyc +0 -0
  46. utils/__pycache__/logger.cpython-312.pyc +0 -0
  47. utils/__pycache__/question_utils.cpython-312.pyc +0 -0
  48. utils/audio_utils.py +1388 -0
  49. utils/dataset_utils.py +536 -0
  50. utils/llm_utils.py +144 -0
.gitattributes CHANGED
@@ -33,3 +33,22 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ synthetic_silences/silent_1.wav filter=lfs diff=lfs merge=lfs -text
37
+ synthetic_silences/silent_10.wav filter=lfs diff=lfs merge=lfs -text
38
+ synthetic_silences/silent_11.wav filter=lfs diff=lfs merge=lfs -text
39
+ synthetic_silences/silent_12.wav filter=lfs diff=lfs merge=lfs -text
40
+ synthetic_silences/silent_13.wav filter=lfs diff=lfs merge=lfs -text
41
+ synthetic_silences/silent_14.wav filter=lfs diff=lfs merge=lfs -text
42
+ synthetic_silences/silent_15.wav filter=lfs diff=lfs merge=lfs -text
43
+ synthetic_silences/silent_16.wav filter=lfs diff=lfs merge=lfs -text
44
+ synthetic_silences/silent_17.wav filter=lfs diff=lfs merge=lfs -text
45
+ synthetic_silences/silent_18.wav filter=lfs diff=lfs merge=lfs -text
46
+ synthetic_silences/silent_2.wav filter=lfs diff=lfs merge=lfs -text
47
+ synthetic_silences/silent_20.wav filter=lfs diff=lfs merge=lfs -text
48
+ synthetic_silences/silent_3.wav filter=lfs diff=lfs merge=lfs -text
49
+ synthetic_silences/silent_4.wav filter=lfs diff=lfs merge=lfs -text
50
+ synthetic_silences/silent_5.wav filter=lfs diff=lfs merge=lfs -text
51
+ synthetic_silences/silent_6.wav filter=lfs diff=lfs merge=lfs -text
52
+ synthetic_silences/silent_7.wav filter=lfs diff=lfs merge=lfs -text
53
+ synthetic_silences/silent_8.wav filter=lfs diff=lfs merge=lfs -text
54
+ synthetic_silences/silent_9.wav filter=lfs diff=lfs merge=lfs -text
DOCS.md ADDED
@@ -0,0 +1,1296 @@
1
+ # TREA 2.0 - Technical Documentation
2
+
3
+ Comprehensive technical documentation for the TREA 2.0 audio dataset generation pipeline. This document covers the complete implementation including algorithms, mathematical formulations, configuration parameters, preprocessing details, and capacity-aware balancing mechanisms.
4
+
5
+ **For Quick Start Guide**: See [README.md](README.md)
6
+
7
+ ---
8
+
9
+ ## Table of Contents
10
+
11
+ 1. [Pipeline Overview](#pipeline-overview)
12
+ 2. [How Sample Durations Are Generated](#how-sample-durations-are-generated)
13
+ 3. [Configuration Reference](#configuration-reference)
14
+ 4. [ESC-50 Preprocessing](#esc-50-preprocessing-duration-task-only)
15
+ 5. [Audio Utilities](#audio-utilities)
16
+ 6. [Task: COUNT](#task-count)
17
+ 7. [Task: DURATION](#task-duration)
18
+ 8. [Task: ORDER](#task-order)
19
+ 9. [Task: VOLUME](#task-volume)
20
+ 10. [Deterministic Balancing Mechanisms](#deterministic-balancing-mechanisms)
21
+ 11. [Rejection Logic and Retry Mechanisms](#rejection-logic-and-retry-mechanisms)
22
+ 12. [Command-Line Arguments](#command-line-arguments)
23
+ 13. [Summary](#summary)
24
+
25
+ ---
26
+
27
+ ## Pipeline Overview
28
+
29
+ ### Architecture
30
+
31
+ The pipeline generates four types of audio-based question-answering samples:
32
+
33
+ | Task | Question Type | Example Question |
34
+ |------|---------------|------------------|
35
+ | **COUNT** | Counting unique sounds | "How many unique sounds do you hear?" |
36
+ | **DURATION** | Temporal comparison | "Which sound plays for the longest duration?" |
37
+ | **ORDER** | Temporal ordering | "Which sound plays first/last/after X?" |
38
+ | **VOLUME** | Loudness comparison | "Which sound is the loudest/softest?" |
39
+
40
+ ### Directory Structure
41
+
42
+ ```
43
+ pipeline/
44
+ ├── main.py # Entry point - orchestrates all tasks
45
+ ├── config.yaml # All configuration parameters
46
+ ├── tasks/
47
+ │ ├── task_count.py # CountTaskGenerator class
48
+ │ ├── task_duration.py # DurationTaskGenerator class
49
+ │ ├── task_order.py # OrderTaskGenerator class
50
+ │ └── task_volume.py # VolumeTaskGenerator class
51
+ ├── utils/
52
+ │ ├── __init__.py # Exports all utilities
53
+ │ ├── audio_utils.py # Audio processing functions
54
+ │ ├── dataset_utils.py # ESC50Dataset, PreprocessedESC50Dataset
55
+ │ ├── question_utils.py # QuestionGenerator
56
+ │ ├── llm_utils.py # LLMQuestionGenerator
57
+ │ └── logger.py # setup_logger
58
+ └── output/ # Generated outputs
59
+ ```
60
+
61
+ ### Data Flow
62
+
63
+ ```
64
+ ESC-50 Dataset (2000 clips, 50 categories, 5s each)
65
+
66
+ [DURATION TASK ONLY] Preprocessing Script (preprocess_esc50.py)
67
+ ├── Detects sound regions using adaptive noise-floor thresholding
68
+ ├── Trims leading/trailing silence (keeps internal structure)
69
+ ├── Calculates effective durations
70
+
71
+ ESC-50_preprocessed/
72
+ ├── effective_durations.csv (metadata with effective durations)
73
+ └── trimmed_audio/*.wav (edge-trimmed clips)
74
+
75
+ Pipeline (task-specific generation with balancing)
76
+ ├── COUNT: Uses raw ESC-50 clips
77
+ ├── DURATION: Uses preprocessed clips with effective durations
78
+ ├── ORDER: Uses raw ESC-50 clips
79
+ └── VOLUME: Uses raw ESC-50 clips (normalized then volume-adjusted)
80
+
81
+ output/{task}/
82
+ ├── audios/*.wav (generated audio samples)
83
+ ├── {task}_mcq.csv (multiple choice questions)
84
+ ├── {task}_open_text.csv (open-ended questions)
85
+ └── {task}_metadata.csv (detailed metadata)
86
+ ```
87
+
88
+ ### Entry Point: `main.py`
89
+
90
+ The main orchestration happens via individual task runner functions:
91
+
92
+ ```python
93
+ def run_count_task(config: dict, logger):
94
+ generator = CountTaskGenerator(config, logger)
95
+ generator.dataset.reset_category_usage()
96
+ generator.generate_dataset()
97
+
98
+ def run_duration_task(config: dict, logger):
99
+ generator = DurationTaskGenerator(config, logger)
100
+ generator.dataset.reset_category_usage()
101
+ generator.generate_dataset()
102
+
103
+ def run_order_task(config: dict, logger):
104
+ generator = OrderTaskGenerator(config, logger)
105
+ generator.dataset.reset_category_usage()
106
+ generator.generate_dataset()
107
+
108
+ def run_volume_task(config: dict, logger):
109
+ generator = VolumeTaskGenerator(config, logger)
110
+ generator.dataset.reset_category_usage()
111
+ generator.generate_dataset()
112
+ ```
113
+
114
+ ---
115
+
116
+ ## How Sample Durations Are Generated
117
+
118
+ **IMPORTANT**: Sample durations are generated upfront to **exactly fill the target task duration** (up to a remainder smaller than one minimum clip length).
119
+
120
+ ### The Algorithm
121
+
122
+ Located in `utils/audio_utils.py`:
123
+
124
+ ```python
125
+ def generate_sample_durations_for_task(
126
+     task_duration_hours: float,
127
+     min_clip_duration: float,
128
+     max_clip_duration: float
129
+ ) -> list:
130
+     """
131
+     Generate sample durations that exactly fill the target task duration.
132
+     """
133
+     task_duration_seconds = task_duration_hours * 3600
134
+     remaining = task_duration_seconds
135
+     durations = []
136
+
137
+     while remaining >= min_clip_duration:
138
+         # Cap max at remaining to avoid overshoot
139
+         effective_max = min(max_clip_duration, remaining)
140
+
141
+         # If remaining is less than min, we can't fit another sample
142
+         if effective_max < min_clip_duration:
143
+             break
144
+
145
+         # Sample uniformly within valid range
146
+         d = random.uniform(min_clip_duration, effective_max)
147
+         durations.append(d)
148
+         remaining -= d
149
+
150
+     # Shuffle to randomize order
151
+     random.shuffle(durations)
152
+
153
+     return durations
154
+ ```
155
+
156
+ 1. Start with `remaining = total_seconds`
157
+ 2. While `remaining >= min_clip_duration`:
158
+ - Sample `d ~ Uniform(min, min(max, remaining))`
159
+ - Append `d` to durations list
160
+ - Subtract `d` from remaining
161
+ 3. Shuffle and return
162
+
163
+ ### Mathematical Properties
164
+
165
+ **Guarantee**: $\sum_{i=1}^{N} d_i \leq T$ and $T - \sum d_i < d_{\min}$
166
+
167
+ Where:
168
+ - $T$ = total task duration
169
+ - $d_i$ = duration of sample $i$
170
+ - $d_{\min}$ = minimum clip duration
171
+ - $N$ = number of samples generated (variable, not fixed!)
172
+
173
+ **Each duration**: $d_i \sim \text{Uniform}(d_{\min}, \min(d_{\max}, \text{remaining}_i))$
174
+
175
+ ### Example
176
+
177
+ With `task_duration_size = 1.0` hours (3600s), `min = 20s`, `max = 60s`:
178
+
179
+ ```
180
+ remaining=3600 → d₁=45.2s → remaining=3554.8
181
+ remaining=3554.8 → d₂=28.7s → remaining=3526.1
182
+ remaining=3526.1 → d₃=52.1s → remaining=3474.0
183
+ ...
184
+ remaining=35.2 → d₈₉ ~ Uniform(20, 35.2), e.g. d₈₉=27.9s → remaining=7.3 (< 20s, loop exits)
185
+ ```
186
+
187
+ Result: 89 samples totaling ≈3600s (instead of a fixed estimate of 90); the unfilled remainder is always below min_clip_duration (20s)
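+
+ A quick sanity check of these properties (a minimal sketch; the seed and printout are illustrative, not part of the pipeline):
+
+ ```python
+ import random
+
+ random.seed(42)  # the pipeline seeds all randomness (random_seed: 42)
+ durations = generate_sample_durations_for_task(
+     task_duration_hours=1.0,
+     min_clip_duration=20.0,
+     max_clip_duration=60.0,
+ )
+ total = sum(durations)
+ assert total <= 3600.0 and 3600.0 - total < 20.0  # the guarantee above
+ print(f"{len(durations)} samples, {total:.1f}s total")
+ ```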
188
+
189
+ ### Where It's Called
190
+
191
+ Each task's `generate_dataset()` method uses this:
192
+
193
+ ```python
194
+ def generate_dataset(self) -> tuple:
195
+     # Generate all durations upfront
196
+     sample_durations = generate_sample_durations_for_task(
197
+         self.task_duration_hours,
198
+         self.min_clip_duration,
199
+         self.max_clip_duration
200
+     )
201
+     num_samples = len(sample_durations)
202
+
203
+     self.logger.info(f"Generating {num_samples} samples...")
204
+
205
+     # Each sample uses its pre-assigned duration
206
+     for i, target_duration in enumerate(sample_durations):
207
+         metadata = self.generate_sample(i, target_duration=target_duration, ...)
208
+ ```
210
+
211
+ ---
212
+
213
+ ## Configuration Reference
214
+
215
+ All parameters are defined in `config.yaml`.
216
+
217
+ ### Dataset Class Subset Configuration
218
+
219
+ ```yaml
220
+ dataset:
221
+ use_class_subset: false # Enable to use only a subset of ESC-50 classes
222
+ num_classes_subset: 40 # Number of classes for train/val/test (e.g., 40 of 50)
223
+ subset_persist_path: "output/class_subset.json" # Path to save/load class subset
224
+ subset_seed: 42 # Random seed for subset selection (persisted)
225
+ ```
226
+
227
+ **Purpose**: Create in-distribution (ID) splits using a subset of classes, then optionally test on out-of-distribution (OOD) using all classes.
228
+
229
+ **Workflow**:
230
+ 1. Set `use_class_subset: true` and `num_classes_subset: 40`
231
+ 2. Run pipeline - 40 classes randomly selected and saved to `class_subset.json`
232
+ 3. Generate train/val/test splits - all use same 40 classes
233
+ 4. For OOD test: Set `use_class_subset: false`, use different output path
234
+
235
+ ### Global Audio Parameters
236
+
237
+ ```yaml
238
+ audio:
239
+ min_clip_duration: 20.0 # Minimum generated clip duration (seconds)
240
+ max_clip_duration: 60.0 # Maximum generated clip duration (seconds)
241
+ source_clip_duration: 5.0 # ESC-50 clip length (seconds)
242
+
243
+ # Silence and crossfade parameters (applied to ALL tasks)
244
+ min_silence_duration: 100 # Minimum silence ALWAYS between clips (ms)
245
+ max_extra_silence_per_gap: 500 # Max extra silence per gap when distributing remainder (ms)
246
+ crossfade_duration: 500 # Crossfade between audio-silence transitions (ms) for smooth joins
247
+ crossfade_within_source: 50 # Small crossfade within same-source repetitions (ms) for COUNT task
248
+ with_silence: true # Enable silence insertion between clips
249
+
250
+ normalize: false
251
+ normalize_target_dBFS: -20.0
252
+ ```
253
+
254
+ ### Task-Specific Parameters
255
+
256
+ #### COUNT Task
257
+ ```yaml
258
+ count:
259
+ enabled: true
260
+ task_duration_size: 2.0 # Hours of total audio to generate
261
+ max_clips_per_sample: 10 # Maximum unique sounds per sample (1 to 10)
262
+ ordering_mode: "random" # "random" (shuffled clips) or "consecutive" (grouped by source)
263
+
264
+ # CAPACITY-AWARE ANSWER BALANCING:
265
+ # - Creates balanced distribution of answers from 1 to max_clips_per_sample
266
+ # - Sorts samples by capacity (max_clips each can fit)
267
+ # - Assigns higher targets to high-capacity samples
268
+ # - Clamps targets to what actually fits (reduces excessive silence)
269
+ ```
270
+
271
+ #### DURATION Task
272
+ ```yaml
273
+ duration:
274
+ enabled: true
275
+ task_duration_size: 2.0
276
+ preprocessed_data_path: "/home/debarpanb1/TREA_2.0/ESC-50_preprocessed"
277
+ question_types: ["shortest", "longest"]
278
+ num_unique_sources: 10 # Can be int or list (e.g., [2,3,4,5])
279
+ ordering_methods: ["consecutive"] # Only consecutive for duration task
280
+
281
+ # Preprocessing parameters (adaptive noise-floor thresholding)
282
+ threshold_strategy: "noise_floor" # Adaptive per-clip (recommended)
283
+ noise_floor_percentile: 2.0 # Use 2nd percentile as noise floor
284
+ noise_floor_delta_db: 5.0 # Threshold = noise_floor + 5dB
285
+ min_sound_duration_ms: 25 # Filter transient spikes
286
+
287
+ # Gap multipliers
288
+ multiplier_longest: 1.5 # Target must be ≥ 1.5x max background
289
+ multiplier_shortest: 0.75 # Target must be ≤ 0.75x min background (changed from 0.5)
290
+ min_effective_duration_per_source: 1.0 # Minimum duration per source (seconds)
291
+
292
+ reject_if_gap_not_met: true
293
+ sample_different_clips_same_class: true
294
+ ```
295
+
296
+ #### ORDER Task
297
+ ```yaml
298
+ order:
299
+ enabled: true
300
+ task_duration_size: 2.0
301
+ max_clips_per_sample: 10 # Cap for maximum clips to join
302
+ question_types: ["first", "last", "second", "second_last", "after", "before"]
303
+ min_clips_for_second_questions: 3 # "second" and "second_last" require ≥3 clips
304
+ allow_source_repetition: false # Each clip from unique source
305
+
306
+ # CAPACITY-AWARE QUESTION TYPE BALANCING:
307
+ # - Each question type appears equally across samples
308
+ # - Advanced types (second, second_last) assigned to high-capacity samples
309
+ # - Basic types (first, last, after, before) for lower-capacity samples
310
+ # - NO n_clips balancing: randomly samples from [max(2, max_clips-3), max_clips_per_sample]
311
+ ```
312
+
313
+ #### VOLUME Task
314
+ ```yaml
315
+ volume:
316
+ enabled: true
317
+ task_duration_size: 2.0
318
+ max_clips_per_sample: 10 # Cap for maximum clips with different volumes
319
+ question_types: ["max_loudness", "min_loudness"]
320
+
321
+ # Normalization (CRITICAL for controlled volume comparison)
322
+ normalize_to_baseline: true
323
+ baseline_dBFS: -20.0 # All clips normalized to this level first
324
+ use_lufs: false # DISABLED - LUFS makes everything same perceived loudness!
325
+ baseline_lufs: -23.0 # EBU R128 standard (not used when use_lufs=false)
326
+
327
+ # Volume gap constraints (multipliers)
328
+ multiplier_max_loudness: 4.0 # Max must be ≥ 4x second-loudest (~12 dB)
329
+ multiplier_min_loudness: 0.25 # Min must be ≤ 0.25x second-softest (~12 dB)
330
+ reject_if_gap_not_met: true
331
+
332
+ # Source clip options
333
+ use_same_clip_different_volumes: false # Use different clips (not same clip repeated)
334
+ repetitions_per_source: [2, 3, 4] # If same clip used, how many repetitions
335
+
336
+ # QUESTION TYPE BALANCING: Each question type appears equally across samples
337
+ # NO n_clips balancing: randomly samples from [max(2, max_clips-3), max_clips_per_sample]
338
+ ```
339
+
340
+ ---
341
+
342
+ ## ESC-50 Preprocessing (Duration Task Only)
343
+
344
+ **File**: `preprocess_esc50.py`
345
+ **Purpose**: Preprocess ESC-50 clips for duration task by detecting actual sound regions and trimming silence.
346
+
347
+ ### Why Preprocessing?
348
+
349
+ The DURATION task compares sound durations. Raw ESC-50 clips have variable amounts of leading/trailing silence, which would make duration comparisons ambiguous. Preprocessing:
350
+
351
+ 1. **Detects actual sound regions** using adaptive amplitude thresholding
352
+ 2. **Trims leading and trailing silence** (preserves internal structure)
353
+ 3. **Calculates effective duration** (sum of all sound regions)
354
+ 4. **Generates metadata CSV** with per-clip durations
355
+
356
+ ### Preprocessing Pipeline
357
+
358
+ ```
359
+ Raw ESC-50 clip (5s with silence)
360
+
361
+ 1. Load audio and convert to amplitude array
362
+ 2. Compute RMS envelope (frame-by-frame energy)
363
+ 3. Convert RMS to dB values
364
+ 4. Apply adaptive threshold strategy
365
+ 5. Detect contiguous sound regions
366
+ 6. Trim edges (only if silence >= 100ms)
367
+ 7. Calculate effective duration
368
+ 8. Save trimmed audio + metadata
369
+ ```
370
+
371
+ ### Adaptive Noise-Floor Thresholding
372
+
373
+ The preprocessing uses an **adaptive per-clip threshold** strategy:
374
+
375
+ ```python
376
+ # Strategy: 'noise_floor' (adaptive, recommended)
377
+ noise_floor_db = np.percentile(db_values, noise_floor_percentile) # e.g., 2nd percentile
378
+ absolute_threshold = noise_floor_db + noise_floor_delta_db # e.g., +5 dB above noise floor
379
+ ```
380
+
381
+ **Key Parameters** (from `config.yaml`):
382
+ ```yaml
383
+ duration:
384
+ threshold_strategy: "noise_floor" # Adaptive per-clip (recommended)
385
+ noise_floor_percentile: 2.0 # Use 2nd percentile as noise floor estimate
386
+ noise_floor_delta_db: 5.0 # Threshold = noise_floor + 5 dB
387
+ min_sound_duration_ms: 25 # Filter out transient spikes < 25ms
388
+ ```
389
+
390
+ **Why Adaptive?**
391
+ - Each clip has different background noise levels
392
+ - Fixed threshold (e.g., -40 dB) works poorly across diverse sounds
393
+ - Adaptive threshold adjusts per-clip based on its own noise floor
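+
+ A minimal NumPy sketch of steps 2-5 of the preprocessing pipeline (the function name, the 10 ms frame size, and the region bookkeeping are illustrative assumptions; only the threshold rule comes from the snippet above):
+
+ ```python
+ import numpy as np
+
+ def detect_sound_regions_sketch(samples, sr, frame_ms=10,
+                                 noise_floor_percentile=2.0,
+                                 noise_floor_delta_db=5.0,
+                                 min_sound_duration_ms=25):
+     # Frame-wise RMS envelope, converted to dB
+     frame = max(1, int(sr * frame_ms / 1000))
+     n = len(samples) // frame
+     rms = np.sqrt(np.mean(samples[:n * frame].reshape(n, frame) ** 2, axis=1))
+     db = 20 * np.log10(np.maximum(rms, 1e-10))
+     # Adaptive threshold: noise floor + delta
+     threshold = np.percentile(db, noise_floor_percentile) + noise_floor_delta_db
+     active = db > threshold
+     # Contiguous active runs -> (start_ms, end_ms) regions
+     regions, start = [], None
+     for i, a in enumerate(active):
+         if a and start is None:
+             start = i
+         elif not a and start is not None:
+             regions.append((start * frame_ms, i * frame_ms))
+             start = None
+     if start is not None:
+         regions.append((start * frame_ms, len(active) * frame_ms))
+     # Drop transient spikes shorter than min_sound_duration_ms
+     return [(s, e) for s, e in regions if e - s >= min_sound_duration_ms]
+ ```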
394
+
395
+ **Alternative** (legacy):
396
+ ```yaml
397
+ threshold_strategy: "peak_relative" # threshold = peak_dB - 20 dB (fixed offset)
398
+ amplitude_threshold_db: -20.0
399
+ ```
400
+
401
+ ### Edge Trimming Strategy
402
+
403
+ **ADAPTIVE EDGE-ONLY TRIMMING** - preserves natural periodicity:
404
+
405
+ ```python
406
+ def extract_sound_with_edges_trimmed(audio, regions, min_silence_to_trim_ms=100, buffer_ratio=0.1):
407
+ """
408
+ Trim ONLY leftmost and rightmost silence IF significant.
409
+ Preserves ALL internal structure (perfect for periodic sounds).
410
+ """
411
+ leading_silence_ms = regions[0][0] # Time before first sound
412
+ trailing_silence_ms = len(audio) - regions[-1][1] # Time after last sound
413
+
414
+ # Only trim if silence >= 100ms
415
+ if leading_silence_ms >= min_silence_to_trim_ms:
416
+ buffer_ms = max(200, int(leading_silence_ms * 0.1)) # Keep 10% as buffer
417
+ trim_start_ms = max(0, regions[0][0] - buffer_ms)
418
+ else:
419
+ trim_start_ms = 0 # Keep from start
420
+
421
+ # Similar for trailing silence
422
+ ...
423
+
424
+ return audio[trim_start_ms:trim_end_ms]
425
+ ```
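+
+ Worked example: 400 ms of leading silence → buffer = max(200, 40) = 200 ms, so trimming starts at 400 - 200 = 200 ms. A clip with only 80 ms of leading silence (< 100 ms) is left untouched.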
426
+
427
+ **Why Edge-Only?**
428
+ - Clock ticks, footsteps, typing have periodic silence between sounds
429
+ - Removing internal silences destroys natural rhythm
430
+ - Edge trimming removes irrelevant silence while preserving periodicity
431
+
432
+ ### Output Files
433
+
434
+ ```
435
+ ESC-50_preprocessed/
436
+ ├── effective_durations.csv
437
+ │ ├── filename
438
+ │ ├── category
439
+ │ ├── raw_duration_s (original 5.0s)
440
+ │ ├── final_duration_s (after edge trimming)
441
+ │ ├── effective_duration_s (sum of sound regions)
442
+ │ ├── num_sound_regions
443
+ │ ├── peak_amplitude_db
444
+ │ ├── avg_rms_db
445
+ │ └── threshold_strategy, noise_floor_percentile, noise_floor_delta_db
446
+ └── trimmed_audio/
447
+ ├── 1-100032-A-0.wav (edge-trimmed clips)
448
+ └── ...
449
+ ```
450
+
451
+ ### Running Preprocessing
452
+
453
+ ```bash
454
+ # Using config defaults
455
+ python preprocess_esc50.py --config config.yaml
456
+
457
+ # Override parameters
458
+ python preprocess_esc50.py --config config.yaml \
459
+ --threshold-strategy noise_floor \
460
+ --noise-floor-percentile 2.0 \
461
+ --noise-floor-delta-db 5.0 \
462
+ --min-sound-ms 25
463
+
464
+ # Don't save trimmed audio (only CSV)
465
+ python preprocess_esc50.py --config config.yaml --no-trimmed-audio
466
+ ```
467
+
468
+ ### Preprocessing Statistics Example
469
+
470
+ ```
471
+ ESC-50 Preprocessing Summary
472
+ ============================================================
473
+ Total clips processed: 2000
474
+ Successfully processed: 2000
475
+
476
+ Raw duration statistics:
477
+ Mean: 5.000s Std: 0.000s Min: 5.000s Max: 5.000s
478
+
479
+ Final duration statistics (edges trimmed):
480
+ Mean: 4.723s Std: 0.412s Min: 2.134s Max: 5.000s
481
+
482
+ Effective duration statistics (sum of sound regions):
483
+ Mean: 3.856s Std: 0.823s Min: 0.542s Max: 4.982s
484
+
485
+ Comparison:
486
+ Avg effective: 3.856s
487
+ Avg final: 4.723s
488
+ Difference: 0.867s (internal silences preserved)
489
+
490
+ Average edge trimming reduction: 5.5%
491
+ ```
492
+
493
+ ### How Duration Task Uses Preprocessed Data
494
+
495
+ The `DurationTaskGenerator` loads preprocessed data:
496
+
497
+ ```python
498
+ self.preprocessed_dataset = PreprocessedESC50Dataset(
499
+ metadata_csv=config['tasks']['duration']['preprocessed_data_path'] + '/effective_durations.csv',
500
+ audio_dir=config['tasks']['duration']['preprocessed_data_path'] + '/trimmed_audio'
501
+ )
502
+
503
+ # Calculate average effective duration for slot distribution
504
+ effective_durations = self.preprocessed_dataset.metadata_df['effective_duration_s']
505
+ self.avg_effective_duration = effective_durations.mean() # ~3.856s
506
+ ```
507
+
508
+ ---
509
+
510
+ ## Audio Utilities
511
+
512
+ Located in `utils/audio_utils.py`.
513
+
514
+ ### `generate_single_clip_duration(min_duration, max_duration) → float`
515
+
516
+ **Purpose**: Generate a random target clip duration using UNIFORM sampling.
517
+
518
+ **Implementation**:
519
+ ```python
520
+ def generate_single_clip_duration(min_duration: float, max_duration: float) -> float:
521
+ return random.uniform(min_duration, max_duration)
522
+ ```
523
+
524
+ **Mathematical Formulation**:
525
+ $$d \sim \text{Uniform}(d_{\min}, d_{\max})$$
526
+
527
+ With default values (20s, 60s):
528
+ - Mean: $\mu = \frac{20 + 60}{2} = 40$ seconds
529
+ - Standard Deviation: $\sigma = \frac{60 - 20}{\sqrt{12}} \approx 11.5$ seconds
530
+
531
+ ---
532
+
533
+ ### `get_max_clip_num_to_be_joined(target_duration_s, source_duration_s, min_silence_ms) → Tuple[int, float]`
534
+
535
+ **Purpose**: Calculate maximum number of source clips that can fit in target duration.
536
+
537
+ **Returns**: Tuple of (max_clips, remainder_seconds)
538
+
539
+ **Implementation** (conceptual):
540
+ ```python
541
+ def get_max_clip_num_to_be_joined(target_s, source_s, min_silence_ms):
542
+     silence_s = min_silence_ms / 1000.0
543
+     # Each clip + silence except last
544
+     effective_unit = source_s + silence_s
545
+     max_clips = int((target_s + silence_s) / effective_unit)
546
+     remainder = target_s - (max_clips * source_s + (max_clips - 1) * silence_s)
547
+     return max_clips, remainder
548
+ ```
549
+
550
+ **Mathematical Formula**:
551
+ $$N_{\max} = \left\lfloor \frac{T + g}{S + g} \right\rfloor$$
552
+
553
+ Where:
554
+ - $T$ = target duration (seconds)
555
+ - $S$ = source clip duration (5.0s for ESC-50)
556
+ - $g$ = minimum silence gap (seconds)
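+
+ Worked example: with $T = 45$s, $S = 5$s, $g = 0.1$s: $N_{\max} = \lfloor 45.1 / 5.1 \rfloor = 8$, with remainder $45 - (8 \times 5 + 7 \times 0.1) = 4.3$s.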
557
+
558
+ ---
559
+
560
+ ### `build_count_task_audio(source_audios, source_categories, target_duration, ...)`
561
+
562
+ **Purpose**: Build the final audio for COUNT task.
563
+
564
+ **Parameters**:
565
+ - `source_audios`: List of AudioSegment objects (one per category)
566
+ - `source_categories`: List of category names
567
+ - `target_duration`: Target total duration in seconds
568
+ - `ordering_mode`: "random" or "consecutive"
569
+ - `source_clip_duration_seconds`: Duration of each source clip
570
+ - `min_silence_ms`, `max_extra_silence_per_gap_ms`: Silence parameters
571
+
572
+ **Returns**: Tuple of (final_audio, clip_sequence, build_metadata)
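+
+ A hypothetical call using the parameters listed above (the heading elides the exact signature, so the keyword names are assumptions drawn from the parameter list):
+
+ ```python
+ # dog_clip, rain_clip, siren_clip: pydub AudioSegments loaded per category
+ final_audio, clip_sequence, build_meta = build_count_task_audio(
+     source_audios=[dog_clip, rain_clip, siren_clip],
+     source_categories=["dog", "rain", "siren"],
+     target_duration=40.0,
+     ordering_mode="random",
+     source_clip_duration_seconds=5.0,
+     min_silence_ms=100,
+     max_extra_silence_per_gap_ms=500,
+ )
+ ```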
573
+
574
+ ---
575
+
576
+ ### `build_duration_task_audio(...)`
577
+
578
+ **Purpose**: Build audio for DURATION task with slot distribution.
579
+
580
+ ---
581
+
582
+ ### `build_clip_sequence_with_silences(clips, target_duration_s, min_silence_ms, max_extra_silence_per_gap_ms, crossfade_ms)`
583
+
584
+ **Purpose**: Concatenate clips with random silence gaps and smooth crossfades.
585
+
586
+ **Algorithm**:
587
+ 1. Calculate total audio content duration
588
+ 2. Calculate minimum required silence: `(n_clips - 1) × min_silence_ms`
589
+ 3. Calculate available extra time: `target_duration - total_audio - min_silence`
590
+ 4. Distribute extra time randomly across gaps (up to `max_extra_silence_per_gap_ms` per gap)
591
+ 5. Build sequence with crossfades:
592
+ - Audio → Silence: crossfade for smooth transition
593
+ - Silence → Audio: No crossfade (preserves audio start)
594
+
595
+ **Crossfade Benefits**:
596
+ - Smooth transitions between audio and silence
597
+ - Reduces clicks/pops at audio boundaries
598
+ - Preserves natural sound attack (no crossfade at audio start)
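+
+ A minimal pydub sketch of steps 1-5 (the function name, the per-gap budget handling, and the crossfade capping are assumptions for illustration, not the pipeline's actual implementation):
+
+ ```python
+ import random
+ from pydub import AudioSegment
+
+ def build_clip_sequence_sketch(clips, target_duration_s, min_silence_ms=100,
+                                max_extra_silence_per_gap_ms=500, crossfade_ms=500):
+     # Steps 1-3: audio content, minimum silence, leftover budget (all in ms)
+     total_audio_ms = sum(len(c) for c in clips)
+     n_gaps = len(clips) - 1
+     budget_ms = max(0, int(target_duration_s * 1000)
+                     - total_audio_ms - n_gaps * min_silence_ms)
+     # Step 4: distribute extra silence randomly, capped per gap
+     gaps = []
+     for _ in range(n_gaps):
+         extra = random.randint(0, min(max_extra_silence_per_gap_ms, budget_ms))
+         budget_ms -= extra
+         gaps.append(min_silence_ms + extra)
+     # Step 5: crossfade into silence, hard-join out of it
+     out = clips[0]
+     for clip, gap_ms in zip(clips[1:], gaps):
+         silence = AudioSegment.silent(duration=gap_ms)
+         fade = min(crossfade_ms, gap_ms, len(out))
+         out = out.append(silence, crossfade=fade)  # audio → silence: smooth
+         out = out + clip                           # silence → audio: keep attack
+     return out
+ ```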
599
+
600
+ ---
601
+
602
+ ## Task: COUNT
603
+
604
+ **File**: `tasks/task_count.py`
605
+ **Class**: `CountTaskGenerator`
606
+
607
+ ### Complete Flow
608
+
609
+ ```
610
+ CountTaskGenerator.__init__(config, logger)
611
+
612
+ Initialize:
613
+ - ESC50Dataset (loads metadata, tracks category usage)
614
+ - AudioProcessor
615
+ - QuestionGenerator
616
+ - LLMQuestionGenerator (if enabled)
617
+
618
+ generate_dataset()
619
+
620
+ 1. num_samples = calculate_num_samples_for_task(task_duration_hours, min, max)
621
+ 2. Create balanced_answers list from num_clips_per_sample
622
+ 3. Shuffle balanced_answers
623
+ 4. For each sample:
624
+ generate_sample(sample_id, target_unique_count=balanced_answers[i])
625
+ 5. Save CSVs
626
+ ```
627
+
628
+ ### Key Method: `generate_sample(sample_id, target_unique_count)`
629
+
630
+ **Pipeline**:
631
+ 1. Generate random target duration: `clip_duration_seconds = generate_single_clip_duration(min, max)`
632
+ 2. Calculate max clips: `max_clips, remainder = get_max_clip_num_to_be_joined(...)`
633
+ 3. Cap `n_unique_audios` at min(target_unique_count, max_clips, 50)
634
+ 4. Select categories: `selected_categories = dataset.get_least_used_categories(n_unique_audios)`
635
+ 5. Track usage: Increment `category_usage_counts` for each selected category
636
+ 6. Sample one file per category: `dataset.sample_file_from_category(category)`
637
+ 7. Load source audios
638
+ 8. Build final audio: `build_count_task_audio(source_audios, categories, target_duration, ordering_mode, ...)`
639
+ 9. Export audio file
640
+ 10. Generate MCQ and open-text questions
641
+ 11. Return metadata dict
642
+
643
+ ### Balanced Answer Distribution (Updated with max_clips_per_sample)
644
+
645
+ ```python
646
+ # In generate_dataset()
647
+ max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10) # Single number: 10
648
+ possible_answers = list(range(1, max_clips_per_sample + 1)) # [1, 2, 3, ..., 10]
649
+
650
+ samples_per_answer = num_samples // len(possible_answers)
651
+ remainder = num_samples % len(possible_answers)
652
+
653
+ balanced_answers = []
654
+ for answer in possible_answers:
655
+ count = samples_per_answer + (1 if remainder > 0 else 0)
656
+ balanced_answers.extend([answer] * count)
657
+ remainder = max(0, remainder - 1)
658
+
659
+ random.shuffle(balanced_answers)
660
+ ```
661
+
662
+ **For 90 samples, max_clips_per_sample=10**: Each answer (1-10) appears exactly 9 times.
663
+
664
+ ### Silence Reduction Strategy (NEW)
665
+
666
+ Each sample's target answer is capped at what actually fits in the duration:
667
+
668
+ ```python
669
+ # In generate_sample()
670
+ max_clips, _ = get_max_clip_num_to_be_joined(clip_duration_seconds, source_clip_duration, min_silence_ms)
671
+
672
+ if target_unique_count is not None:
673
+ # Cap target at what actually fits (reduces silence)
674
+ n_unique_audios = min(target_unique_count, max_clips, len(CATEGORIES))
675
+ ```
676
+
677
+ **Example**:
678
+ - Target answer from balanced pool: **8 unique sounds**
679
+ - Duration allows: **max_clips = 7**
680
+ - Actual n_unique_audios: **min(8, 7) = 7** ✓ (uses max possible, reduces silence)
681
+
682
+ **Why?** Prevents excessive silence when target exceeds what fits in duration.
683
+
684
+ ---
685
+
686
+ ## Task: DURATION
687
+
688
+ **File**: `tasks/task_duration.py`
689
+ **Class**: `DurationTaskGenerator`
690
+
691
+ ### Complete Flow
692
+
693
+ ```
694
+ DurationTaskGenerator.__init__(config, logger)
695
+
696
+ Initialize:
697
+ - PreprocessedESC50Dataset (uses effective_durations.csv)
698
+ - Calculate avg_effective_duration from preprocessed data
699
+ - AudioProcessor, QuestionGenerator
700
+ - Load multiplier_longest, multiplier_shortest from config
701
+
702
+ generate_dataset()
703
+
704
+ 1. num_samples = calculate_num_samples_for_task(...)
705
+ 2. Create balanced question types: ["longest"] * 45 + ["shortest"] * 45
706
+ 3. Shuffle balanced_types
707
+ 4. While len(samples) < num_samples:
708
+ generate_sample(sample_idx, question_type=balanced_types[idx])
709
+ If returns None → increment rejection_count, continue
710
+ 5. Save CSVs
711
+ ```
712
+
713
+ ### Key Methods
714
+
715
+ #### `_calculate_max_clips_and_sources(target_duration_s, question_type)`
716
+
717
+ **Purpose**: Determine valid number of sources based on question type and duration.
718
+
719
+ **For LONGEST**:
720
+ - Target needs ≥2 clips to beat backgrounds by 1.5x
721
+ - `min_valid_sources = 2`
722
+ - `max_valid_sources = max_clips - 2 + 1` (i.e., `max_clips - 1`: the target takes at least 2 clips and each background takes 1)
723
+
724
+ **For SHORTEST**:
725
+ - Target gets 1 clip
726
+ - Each background needs ≥2 clips to be 2x target
727
+ - `max_valid_sources = 1 + (max_clips - 1) // 2`
728
+
729
+ ```python
730
+ # Filter config values to valid range, then pick RANDOMLY
731
+ valid_config_sources = [n for n in num_sources_config if min_valid <= n <= max_valid]
732
+ n_sources = random.choice(valid_config_sources)
733
+ ```
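+
+ Worked example: with `max_clips = 12`, LONGEST allows `n_sources` from 2 to 11, while SHORTEST allows at most `1 + (12 - 1) // 2 = 6` sources.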
734
+
735
+ #### `_try_generate_sample(sample_id, question_type)`
736
+
737
+ **Full Algorithm**:
738
+ 1. Generate target duration: `generate_single_clip_duration(min, max)`
739
+ 2. Calculate max_clips and n_sources: `_calculate_max_clips_and_sources(...)`
740
+ 3. Select target category (least used)
741
+ 4. Select background categories (from remaining least used)
742
+ 5. Calculate slot distribution based on question_type
743
+ 6. For each category, select source files and generate clip durations
744
+ 7. Load and trim clips
745
+ 8. Calculate total effective duration per category
746
+ 9. Verify gap constraint
747
+ 10. If gap not satisfied, try `_try_improve_slot_distribution()`
748
+ 11. If still not satisfied, return None (triggers retry)
749
+ 12. Build audio and generate questions
750
+ 13. Return metadata
751
+
752
+ #### `_try_improve_slot_distribution(slot_distribution, durations, question_type, max_clips)`
753
+
754
+ **Purpose**: Redistribute slots to satisfy gap constraint.
755
+
756
+ ---
757
+
758
+ ## Task: ORDER
759
+
760
+ **File**: `tasks/task_order.py`
761
+ **Class**: `OrderTaskGenerator`
762
+
763
+ ### Complete Flow
764
+
765
+ ```
766
+ OrderTaskGenerator.__init__(config, logger)
767
+
768
+ Initialize ESC50Dataset, AudioProcessor, QuestionGenerator
769
+
770
+ generate_dataset()
771
+
772
+ 1. Generate sample durations upfront (exact fill)
773
+ 2. num_samples = len(sample_durations)
774
+ 3. Create balanced question_types distribution
775
+ 4. For each sample:
776
+ generate_sample(sample_id, target_question_type=balanced_types[i])
777
+ → n_clips randomly selected from [max(2, max_clips-3), min(max_clips, max_clips_per_sample)]
778
+ 5. Save CSVs
779
+ ```
780
+
781
+ ### Key Method: `_get_valid_question_types(n_clips)`
782
+
783
+ Filters question types based on clip count:
784
+ - `second`, `second_last`: require `n_clips >= min_clips_for_second_questions` (default: 3)
785
+ - `after`, `before`: require `n_clips >= 2`
786
+ - `first`, `last`: always valid
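+
+ A sketch of the filtering these rules imply (the body is a reconstruction, not the actual source):
+
+ ```python
+ def get_valid_question_types_sketch(n_clips, min_clips_for_second=3):
+     valid = ['first', 'last']            # always valid
+     if n_clips >= 2:
+         valid += ['after', 'before']
+     if n_clips >= min_clips_for_second:  # config: min_clips_for_second_questions
+         valid += ['second', 'second_last']
+     return valid
+ ```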
787
+
788
+ ### Key Method: `generate_sample(sample_id, target_question_type, target_duration_seconds)`
789
+
790
+ **Algorithm**:
791
+ 1. Use pre-generated `target_duration_seconds` (from sample_durations)
792
+ 2. Calculate max_clips from duration: `get_max_clip_num_to_be_joined(...)`
793
+ 3. **Silence reduction - randomly select n_clips**:
794
+ ```python
795
+ min_clips = max(2, max_clips - 3)
796
+ max_clips_allowed = min(max_clips, max_clips_per_sample, len(CATEGORIES))
797
+ if min_clips > max_clips_allowed: # Handle edge case
798
+ min_clips = max_clips_allowed
799
+ n_clips = random.randint(min_clips, max_clips_allowed)
800
+ ```
801
+ 4. Get valid question types for n_clips
802
+ 5. Select answer position based on question type:
803
+ - `first` → position 0
804
+ - `last` → position n_clips - 1
805
+ - `second` → position 1
806
+ - `second_last` → position n_clips - 2
807
+ - `after` → random position 1 to n-1
808
+ - `before` → random position 0 to n-2
809
+ 6. Select categories using least-used balancing (answer first, then others)
810
+ 7. Build audio with `build_clip_sequence_with_silences` (includes crossfade)
811
+ 8. Generate questions including sequence question
812
+ 9. Return metadata
813
+
814
+ **Silence Reduction**: Target n_clips is capped at `max_clips` to avoid excessive silence.
815
+
816
+ ---
817
+
818
+ ## Task: VOLUME
819
+
820
+ **File**: `tasks/task_volume.py`
821
+ **Class**: `VolumeTaskGenerator`
822
+
823
+ ### Complete Flow
824
+
825
+ ```
826
+ VolumeTaskGenerator.__init__(config, logger)
827
+
828
+ Initialize ESC50Dataset, AudioProcessor, QuestionGenerator
829
+ Load multiplier_max_loudness, multiplier_min_loudness, baseline normalization settings
830
+
831
+ generate_dataset()
832
+
833
+ 1. Generate sample durations upfront (exact fill)
834
+ 2. num_samples = len(sample_durations)
835
+ 3. Create balanced clips_count_pool from 2 to max_clips_per_sample
836
+ 4. Create balanced question_types: ["max_loudness"] * N/2 + ["min_loudness"] * N/2
837
+ 5. Shuffle both pools
838
+ 6. Store clips_count_pool as instance variable
839
+ 7. For each sample:
840
+ generate_sample(sample_id, target_question_type=balanced_types[i])
841
+ → Uses clips_count_pool.pop(0) internally, capped at max_clips_that_fit
842
+ → Normalizes clips to baseline, applies volume adjustments
843
+ → Verifies gap constraints (up to 10 attempts)
844
+ 8. Save CSVs
845
+ ```
846
+
847
+ ### Key Methods
848
+
849
+ #### `_normalize_to_baseline(audio)`
850
+
851
+ ```python
852
+ def _normalize_to_baseline(self, audio):
853
+     if not self.normalize_to_baseline:
854
+         return audio
855
+     change_in_dBFS = self.baseline_dBFS - audio.dBFS
856
+     return audio.apply_gain(change_in_dBFS)
857
+ ```
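+
+ For example, a clip measuring -27.4 dBFS receives `apply_gain(+7.4)` to land on the -20 dBFS baseline.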
858
+
859
+ #### `_verify_loudness_gap(volume_levels, question_type)`
860
+
861
+ **For MAX_LOUDNESS**:
862
+ ```python
863
+ required_gap_dB = 20 * math.log10(self.multiplier_max_loudness)  # ≈ 12.04 dB for multiplier 4.0
864
+ actual_gap_dB = max_level - second_max
865
+ gap_satisfied = actual_gap_dB >= required_gap_dB
866
+ ```
867
+
868
+ **For MIN_LOUDNESS**:
869
+ ```python
870
+ required_gap_dB = abs(20 * math.log10(self.multiplier_min_loudness))  # ≈ 12.04 dB for multiplier 0.25
871
+ actual_gap_dB = second_min - min_level
872
+ gap_satisfied = actual_gap_dB >= required_gap_dB
873
+ ```
874
+
875
+ #### Volume Level Generation
876
+
877
+ Volume levels are generated to satisfy gap constraints:
878
+ - For `max_loudness`: target gets +gap_dB above baseline, backgrounds at/below baseline
879
+ - For `min_loudness`: target gets -gap_dB below baseline, backgrounds at/above baseline
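+
+ A minimal sketch of this scheme (the function name, the background band, and the answer placement are illustrative assumptions):
+
+ ```python
+ import math
+ import random
+
+ def generate_volume_levels_sketch(n_clips, question_type,
+                                   multiplier=4.0, answer_idx=0):
+     gap_db = 20 * math.log10(multiplier)  # ≈ 12.04 dB for multiplier 4.0
+     # Backgrounds sit in a narrow band around the baseline (0 dB adjustment)
+     backgrounds = [random.uniform(-3.0, 0.0) for _ in range(n_clips - 1)]
+     if question_type == 'max_loudness':
+         target = max(backgrounds) + gap_db  # clearly loudest
+     else:  # 'min_loudness'
+         target = min(backgrounds) - gap_db  # clearly softest
+     return backgrounds[:answer_idx] + [target] + backgrounds[answer_idx:]
+ ```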
880
+
881
+ ---
882
+
883
+ ## Deterministic Balancing Mechanisms
884
+
885
+ ### Overview
886
+
887
+ The pipeline ensures balanced distributions across multiple dimensions with **capacity-aware assignment**.
888
+
889
+ ### 1. Capacity-Aware Answer Balancing (COUNT Task)
890
+
891
+ Each possible answer (1-10) appears equally often, but **higher targets are assigned to samples with higher capacity**.
892
+
893
+ ```python
894
+ # Calculate capacity for each sample
895
+ for duration in sample_durations:
896
+ max_clips, _ = get_max_clip_num_to_be_joined(duration, source_clip_duration, min_silence_ms)
897
+ max_for_sample = min(max_clips, max_clips_per_sample, len(CATEGORIES))
898
+ sample_max_clips.append(max_for_sample)
899
+
900
+ # Create balanced pool
901
+ possible_answers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
902
+ samples_per_answer = num_samples // len(possible_answers)
903
+ remainder = num_samples % len(possible_answers)
904
+
905
+ assignment_pool = []
906
+ for answer in possible_answers:
907
+ count = samples_per_answer + (1 if remainder > 0 else 0)
908
+ assignment_pool.extend([answer] * count)
909
+ remainder = max(0, remainder - 1)
910
+
911
+ # Sort samples by capacity (descending)
912
+ sample_info.sort(key=lambda x: x[2], reverse=True)
913
+
914
+ # Sort pool descending - assign high targets first
915
+ assignment_pool.sort(reverse=True)
916
+
917
+ # Assign targets, clamped to capacity
918
+ for idx, (sample_idx, duration, capacity) in enumerate(sample_info):
919
+ target = min(assignment_pool[idx], capacity)
920
+ balanced_assignments[sample_idx] = target
921
+ ```
922
+
923
+ **Guarantee**: Each answer value appears equally, and high targets go to samples that can fit them.
924
+
925
+ ### 2. Capacity-Aware Question Type Balancing (ORDER Task)
926
+
927
+ ORDER task uses **capacity-aware balancing** - advanced question types assigned to high-capacity samples.
928
+
929
+ ```python
930
+ # Separate question types by requirements
931
+ basic_types = ['first', 'last', 'after', 'before'] # Need >= 2 clips
932
+ advanced_types = ['second', 'second_last'] # Need >= min_clips_for_second (e.g., 3)
933
+
934
+ # Sort samples by capacity (descending)
935
+ sample_info.sort(key=lambda x: x[2], reverse=True)
936
+
937
+ # Build assignment pool - advanced types first
938
+ samples_per_type = num_samples // len(question_types)
939
+ remainder = num_samples % len(question_types)
940
+
941
+ assignment_pool = []
942
+ # Add advanced types first (for high-capacity samples)
943
+ for qtype in advanced_types:
944
+ count = samples_per_type + (1 if remainder > 0 else 0)
945
+ assignment_pool.extend([qtype] * count)
946
+ remainder = max(0, remainder - 1)
947
+
948
+ # Then basic types
949
+ for qtype in basic_types:
950
+ count = samples_per_type + (1 if remainder > 0 else 0)
951
+ assignment_pool.extend([qtype] * count)
952
+ remainder = max(0, remainder - 1)
953
+
954
+ # Assign with validation
955
+ for idx, (sample_idx, duration, capacity) in enumerate(sample_info):
956
+ target_qtype = assignment_pool[idx]
957
+ valid_types = _get_valid_question_types(capacity)
958
+
959
+ if target_qtype not in valid_types:
960
+ # Downgrade to valid type
961
+ target_qtype = random.choice(valid_types)
962
+
963
+ balanced_assignments[sample_idx] = target_qtype
964
+ ```
965
+
966
+ ### 3. Simple Question Type Balancing (DURATION, VOLUME Tasks)
967
+
968
+ ```python
969
+ # DURATION: 2 types → N/2 each
970
+ # VOLUME: 2 types → N/2 each
971
+
972
+ samples_per_type = num_samples // len(question_types)
973
+ remainder = num_samples % len(question_types)
974
+
975
+ balanced_types = []
976
+ for qtype in question_types:
977
+ count = samples_per_type + (1 if remainder > 0 else 0)
978
+ balanced_types.extend([qtype] * count)
979
+ remainder = max(0, remainder - 1)
980
+
981
+ random.shuffle(balanced_types)
982
+ ```
983
+
984
+ ### 4. Category Usage Balancing
985
+
986
+ All 50 ESC-50 categories are used equally via least-used selection:
987
+
988
+ ```python
989
+ def get_least_used_categories(self, n: int, exclude: List[str] = None) -> List[str]:
990
+ # Sort categories by usage count
991
+ sorted_cats = sorted(
992
+ self.category_usage_counts.items(),
993
+ key=lambda x: (x[1], x[0]) # Sort by count, then alphabetically for ties
994
+ )
995
+ # Filter excluded and return first n
996
+ available = [cat for cat, _ in sorted_cats if cat not in (exclude or [])]
997
+ return available[:n]
998
+ ```
999
+
1000
+ Each task calls `reset_category_usage()` at the start to ensure independent balancing.
1001
+
1002
+ ### 5. N_Clips Selection Strategy
1003
+
1004
+ **COUNT Task**: Uses capacity-aware answer balancing (see #1 above)
1005
+
1006
+ **ORDER and VOLUME Tasks**: Use **silence reduction strategy** (NOT balanced):
1007
+ ```python
1008
+ # Randomly sample n_clips from valid range to minimize silence
1009
+ min_clips = max(2, max_clips - 3)
1010
+ max_clips_allowed = min(max_clips, max_clips_per_sample, len(CATEGORIES))
1011
+
1012
+ if min_clips > max_clips_allowed:
1013
+ min_clips = max_clips_allowed # Handle edge case
1014
+
1015
+ n_clips = random.randint(min_clips, max_clips_allowed)
1016
+ ```
1017
+
1018
+ This maximizes clip usage within the allowed range, minimizing excessive silence.
1019
+
1020
+ ---
1021
+
1022
+ ## Rejection Logic and Retry Mechanisms
1023
+
1024
+ ### When Samples Are Rejected
1025
+
1026
+ Rejections occur only in tasks with gap constraints:
1027
+
1028
+ 1. **DURATION Task**: Gap constraint not satisfied
1029
+ - LONGEST: target_duration < max_background × 1.5
1030
+ - SHORTEST: target_duration > min_background × 0.75
1031
+
1032
+ 2. **VOLUME Task**: Gap constraint not satisfied
1033
+ - MAX_LOUDNESS: actual_gap_dB < required_gap_dB (≈12 dB for multiplier 4.0)
1034
+ - MIN_LOUDNESS: actual_gap_dB < required_gap_dB (≈12 dB for multiplier 0.25)
1035
+
1036
+ ### DURATION Task Retry Logic
1037
+
1038
+ ```python
1039
+ def generate_dataset(self):
1040
+ all_metadata = []
1041
+ sample_idx = 0
1042
+ type_idx = 0
1043
+
1044
+ while len(all_metadata) < num_samples and type_idx < len(balanced_types) * 2:
1045
+ question_type = balanced_types[type_idx % len(balanced_types)]
1046
+
1047
+ metadata = self.generate_sample(sample_idx, question_type)
1048
+
1049
+ if metadata is not None:
1050
+ all_metadata.append(metadata)
1051
+ sample_idx += 1
1052
+ # If None, sample was rejected - just move to next
1053
+
1054
+ type_idx += 1
1055
+ ```
1056
+
1057
+ ### Rejection Rate Calculation
1058
+
1059
+ $$\text{Rejection Rate} = \frac{\text{rejections}}{\text{rejections} + \text{successes}} \times 100\%$$
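+
+ For example, 15 rejections alongside 90 accepted samples gives $\frac{15}{15 + 90} \times 100\% \approx 14.3\%$.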
1060
+
1061
+ ---
1062
+
1063
+ ## Complete Task Creation Explanation
1064
+
1065
+ ### How Each Task Is Generated (Step-by-Step)
1066
+
1067
+ #### COUNT TASK - "How many unique sounds?"
1068
+
1069
+ **Goal**: Create audio with N unique sound sources, ask how many distinct sounds exist.
1070
+
1071
+ **Process**:
1072
+ 1. **Preprocessing**: None (uses raw ESC-50 clips)
1073
+ 2. **Duration Generation**: `target_duration ~ Uniform(20s, 60s)` per sample
1074
+ 3. **Calculate Max Clips**: `max_clips = get_max_clip_num_to_be_joined(target_duration, 5s, 100ms)`
1075
+ - Example: 45s duration → ~8 clips of 5s each with 100ms silence between
1076
+ 4. **Balanced Answer Selection**: Pre-generated pool of answers [1,2,3,...,10] balanced equally
1077
+ - Target answer (e.g., 5 unique sounds) selected from pool
1078
+ 5. **Silence Reduction**: Cap target at `min(target_answer, max_clips)`
1079
+ - If target=8 but max_clips=6 → use 6 (prevents excessive silence)
1080
+ 6. **Category Selection**: Pick N least-used categories from ESC-50 (balancing)
1081
+ 7. **Audio Construction**:
1082
+ - Load one file per category
1083
+ - Calculate repetitions needed: `total_clips = max_clips`
1084
+ - Distribute repetitions across N sources
1085
+ - **Ordering mode**:
1086
+ - `random`: Shuffle clips (A B A C B...) - harder, tests recognition
1087
+ - `consecutive`: Group same-source (AAA BBB CCC) - easier
1088
+ 8. **Silence Insertion**:
1089
+ - Minimum 100ms silence between EVERY clip
1090
+ - Extra silence (up to 500ms per gap) distributed from remainder
1091
+ - **Crossfade**: 50ms within same-source, 500ms at audio-silence boundaries
1092
+ 9. **Question Generation**: MCQ + open-text asking "How many unique sounds?"
1093
+ 10. **Export**: Save audio WAV + metadata
1094
+
1095
+ **Example**:
1096
+ - Target duration: 40s
1097
+ - Max clips that fit: 7 clips (7×5s + 6×0.1s = 35.6s)
1098
+ - Target answer: 3 unique sounds
1099
+ - Actual: 3 unique sounds (7 total clips: 3+2+2 repetitions)
1100
+ - Ordering: Random shuffle → [A B A C B A C]
1101
+ - Result: Audio with 3 distinct sounds, some repeated, with silences and crossfades
1102
+
1103
+ #### DURATION TASK - "Which sound is longest/shortest?"
1104
+
1105
+ **Goal**: Create audio where one sound has clearly longest/shortest duration compared to others.
1106
+
1107
+ **Process**:
1108
+ 1. **Preprocessing** (preprocess_esc50.py - REQUIRED):
1109
+ - Load raw ESC-50 clips
1110
+ - Detect sound regions using adaptive noise-floor thresholding
1111
+ - Trim leading/trailing silence (preserve internal structure)
1112
+ - Calculate effective duration per clip
1113
+ - Save trimmed audio + effective_durations.csv
1114
+ 2. **Duration Generation**: `target_duration ~ Uniform(20s, 60s)` per sample
1115
+ 3. **Calculate Max Clips**: Based on average effective duration (~3.86s)
1116
+ 4. **Determine N Sources**: Based on question type and max_clips
1117
+ - **LONGEST**: Target needs ≥2 clips, backgrounds get 1 each → `n_sources ≤ max_clips - 1`
1118
+ - **SHORTEST**: Target gets 1 clip, backgrounds need ≥2 each → `n_sources ≤ 1 + (max_clips-1)//2`
1119
+ 5. **Category Selection**: Pick target + backgrounds from least-used categories
1120
+ 6. **Slot Distribution**: Allocate clips to each source
1121
+ - LONGEST: Give most clips to target, 1 to each background
1122
+ - SHORTEST: Give 1 to target, multiple to each background
1123
+ 7. **Clip Selection**: For each source, select clips from preprocessed dataset
1124
+ 8. **Gap Verification**:
1125
+ - LONGEST: `target_duration ≥ max_background × 1.5` ✓
1126
+ - SHORTEST: `target_duration ≤ min_background × 0.75` ✓
1127
+ - If gap not satisfied: Try redistributing slots, or reject sample
1128
+ 9. **Audio Construction**:
1129
+ - Load trimmed clips
1130
+ - Concatenate with consecutive ordering (preserve periodicity)
1131
+ - Insert silences with crossfades
1132
+ 10. **Question Generation**: "Which sound is longest/shortest?"
1133
+ 11. **Export**: Audio + metadata
1134
+
1135
+ **Example**:
1136
+ - Question type: LONGEST
1137
+ - Target duration: 50s, max_clips: 12
1138
+ - N sources: 4 (target + 3 backgrounds)
1139
+ - Slot distribution: Target=6 clips (6×3.8s=22.8s), Backgrounds=2 clips each (2×3.8s=7.6s)
1140
+ - Gap check: 22.8s ≥ 7.6s × 1.5 = 11.4s ✓
1141
+ - Result: Target sound clearly longest
1142
+
1143
+ #### ORDER TASK - "Which sound is first/last/after X?"
1144
+
1145
+ **Goal**: Create ordered sequence of sounds, ask about temporal relationships.
1146
+
1147
+ **Process**:
1148
+ 1. **Preprocessing**: None (uses raw ESC-50)
1149
+ 2. **Duration Generation**: Pre-generated durations to exactly fill task duration
1150
+ 3. **Calculate Max Clips**: `get_max_clip_num_to_be_joined(target_duration, 5s, 100ms)`
1151
+ 4. **N_Clips Selection (silence reduction, not balanced)**: n_clips sampled randomly from the valid range
1152
+ - `n_clips = random.randint(max(2, max_clips - 3), min(max_clips, max_clips_per_sample))`
1153
+ - Keeps n_clips close to what the duration allows, minimizing excessive silence
1154
+ 5. **Question Type Selection**: From balanced pool (first, last, second, after, before, second_last)
1155
+ 6. **Answer Position Determination**: Based on question type
1156
+ - `first` → position 0
1157
+ - `last` → position n_clips-1
1158
+ - `second` → position 1
1159
+ - `second_last` → position n_clips-2
1160
+ - `after`/`before` → random valid position
1161
+ 7. **Category Selection**: Answer category at determined position, others from least-used
1162
+ 8. **Audio Construction**:
1163
+ - Load one clip per position
1164
+ - Build sequence with silences (min 100ms + random extra up to 500ms per gap)
1165
+ - **Crossfade**: 500ms at audio-silence boundaries for smooth transitions
1166
+ 9. **Question Generation**:
1167
+ - MCQ: "Which sound is first?" with 4 options
1168
+ - Open-text: "What is the first sound?" + full sequence
1169
+ 10. **Export**: Audio + metadata
1170
+
1171
+ **Example**:
1172
+ - Target n_clips: 4, max_clips: 8 → use 4 ✓
1173
+ - Question: "Which sound is second?"
1174
+ - Answer position: 1 (0-indexed)
1175
+ - Sequence: [dog, cat, bird, rain] → Answer: cat
1176
+ - Audio: 4 clips in order with silences and crossfades
1177
+
1178
+ #### VOLUME TASK - "Which sound is loudest/softest?"
1179
+
1180
+ **Goal**: Create audio with clips at different volume levels, ask about loudness comparison.
1181
+
1182
+ **Process**:
1183
+ 1. **Preprocessing**: None (uses raw ESC-50)
1184
+ 2. **Duration Generation**: Pre-generated durations
1185
+ 3. **Calculate Max Clips**: `get_max_clip_num_to_be_joined(...)`
1186
+ 4. **Balanced N_Clips Selection**: From pool [2,3,...,10], capped at max_clips
1187
+ 5. **Question Type Selection**: "max_loudness" or "min_loudness" (balanced 50/50)
1188
+ 6. **Volume Level Generation**: Create n_clips volume adjustments (in dB)
1189
+ - Ensure gap constraint (multiplier 4.0 for max, 0.25 for min)
1190
+ - Example: [+12dB, 0dB, -6dB] → max at +12dB has ≥12dB gap from second
1191
+ 7. **Gap Verification** (up to 10 attempts):
1192
+ - MAX: `max_level - second_max ≥ 20×log10(4.0) ≈ 12dB`
1193
+ - MIN: `second_min - min_level ≥ 20×log10(4.0) ≈ 12dB`
1194
+ - If not satisfied: Regenerate levels or reject
1195
+ 8. **Category Selection**: Answer at determined position, others from least-used
1196
+ 9. **Audio Construction**:
1197
+ - Load clips
1198
+ - **CRITICAL: Normalize all to baseline (-20 dBFS)** → ensures controlled comparison
1199
+ - Apply volume adjustments to normalized clips
1200
+ - Concatenate with silences and crossfades
1201
+ 10. **Question Generation**: "Which sound has maximum/minimum loudness?"
1202
+ 11. **Export**: Audio + metadata with volume levels
1203
+
1204
+ **Example**:
1205
+ - Target n_clips: 3, max_clips: 6 → use 3 ✓
1206
+ - Question: "max_loudness"
1207
+ - Volume levels: [+12dB, 0dB, -6dB]
1208
+ - Gap check: 12 - 0 = 12dB ≥ 12dB ✓
1209
+ - Process: Normalize all clips to -20dBFS, then adjust to [-8dBFS, -20dBFS, -26dBFS]
1210
+ - Result: First sound clearly loudest
1211
+
1212
+ ### Key Innovations
1213
+
1214
+ 1. **Crossfade Everywhere**: Smooth transitions at audio-silence boundaries (500ms), small crossfade within same-source repetitions (50ms)
1215
+ 2. **Adaptive Preprocessing**: Noise-floor thresholding adapts per-clip (duration task)
1216
+ 3. **Silence Reduction**: ORDER/VOLUME tasks sample n_clips from [max_clips-3, max_clips_per_sample] to minimize silence
1217
+ 4. **Balanced Distribution**:
1218
+ - **COUNT**: Balances answers (1 to max_clips_per_sample) + question types
1219
+ - **ORDER/VOLUME**: Balances question types only (n_clips uses silence reduction)
1220
+ 5. **Category Balancing**: Least-used selection ensures all 50 ESC-50 categories used evenly
1221
+ 6. **Gap Constraints**: Mathematical guarantees for duration/volume comparisons
1222
+ 7. **Exact Duration Filling**: Pre-generate sample durations to exactly fill task duration (no wasted time)
1223
+
1224
+ ---
1225
+
1226
+ ## Command-Line Arguments
1227
+
1228
+ ### Main Pipeline (`main.py`)
1229
+
1230
+ ```bash
1231
+ python main.py [OPTIONS]
1232
+
1233
+ Options:
1234
+ --config, -c PATH Path to config YAML (default: config.yaml)
1235
+ --tasks, -t TASKS Specific tasks to run (choices: count, duration, order, volume)
1236
+ --output, -o PATH Custom output directory (overrides config)
1237
+
1238
+ Examples:
1239
+ # Run all enabled tasks with default config
1240
+ python main.py
1241
+
1242
+ # Run specific tasks only
1243
+ python main.py --tasks count order
1244
+
1245
+ # Use custom config and output
1246
+ python main.py --config my_config.yaml --output ./my_dataset
1247
+ ```
1248
+
1249
+ ### Preprocessing Script (`preprocess_esc50.py`)
1250
+
1251
+ ```bash
1252
+ python preprocess_esc50.py [OPTIONS]
1253
+
1254
+ Options:
1255
+ --config PATH Path to config YAML (default: config.yaml)
1256
+ --threshold-strategy STRATEGY "noise_floor" or "peak_relative"
1257
+ --threshold-db FLOAT Threshold in dB (for peak_relative)
1258
+ --noise-floor-percentile FLOAT Percentile for noise floor estimation
1259
+ --noise-floor-delta-db FLOAT Delta above noise floor in dB
1260
+ --min-sound-ms INT Minimum sound duration in ms
1261
+ --no-trimmed-audio Skip saving trimmed audio files
1262
+ --output-dir PATH Custom output directory
1263
+
1264
+ Examples:
1265
+ # Use config defaults
1266
+ python preprocess_esc50.py --config config.yaml
1267
+
1268
+ # Override threshold parameters
1269
+ python preprocess_esc50.py --config config.yaml \
1270
+ --threshold-strategy noise_floor \
1271
+ --noise-floor-percentile 2.0 \
1272
+ --noise-floor-delta-db 5.0 \
1273
+ --min-sound-ms 25
1274
+
1275
+ # Generate metadata only (no trimmed audio)
1276
+ python preprocess_esc50.py --config config.yaml --no-trimmed-audio
1277
+ ```
1278
+
1279
+ ---
1280
+
1281
+ ## Summary
1282
+
1283
+ The TREA 2.0 pipeline generates balanced, constraint-satisfying audio QA samples through:
1284
+
1285
+ 1. **Preprocessing** (Duration only): Adaptive noise-floor thresholding + edge trimming
1286
+ 2. **Exact Duration Filling**: Pre-generate sample durations to sum exactly to task duration (see the sketch after this list)
1287
+ 3. **Capacity-Aware Balancing**:
1288
+ - **COUNT**: High answer targets → high-capacity samples
1289
+ - **ORDER**: Advanced question types → high-capacity samples
1290
+ 4. **Silence Reduction**: ORDER/VOLUME randomly sample n_clips from [max_clips-3, max_clips_per_sample]
1291
+ 5. **Crossfade Transitions**: Smooth audio-silence boundaries (500ms) + within-source (50ms)
1292
+ 6. **Category Balancing**: Least-used selection ensures even ESC-50 category distribution
1293
+ 7. **Gap Constraints**: Mathematical guarantees (1.5x for longest, 0.75x for shortest, 4.0x/0.25x for volume)
1294
+ 8. **Retry Mechanisms**: Failed samples rejected, pipeline continues until target count reached
1295
+
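+ Item 2 amounts to drawing clip durations until the time budget is consumed exactly. A sketch of the assumed logic (not the pipeline's exact code):
+
+ ```python
+ import random
+
+ def fill_durations(total_s, min_s=20.0, max_s=60.0, seed=42):
+     """Draw sample durations that sum exactly to total_s (assumed logic)."""
+     rng = random.Random(seed)
+     durations, remaining = [], total_s
+     while remaining > 0:
+         if remaining <= max_s:
+             d = remaining  # consume the budget exactly
+         else:
+             # keep the leftover large enough to host another sample
+             d = rng.uniform(min_s, min(max_s, remaining - min_s))
+         durations.append(d)
+         remaining -= d
+     return durations
+
+ ds = fill_durations(2.0 * 3600)   # 2-hour task budget
+ print(len(ds), sum(ds))           # sums to 7200.0 (up to float rounding)
+ ```
+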
1296
+ All randomness is seeded (`random_seed: 42`) for reproducibility.
README.md ADDED
@@ -0,0 +1,112 @@
1
+ # TREA 2.0 Pipeline
2
+
3
+ Audio question-answering dataset generator using ESC-50. Creates four task types: COUNT, DURATION, ORDER, and VOLUME.
4
+
5
+ ## Quick Start
6
+
7
+ ```bash
8
+ # 1. Install dependencies
9
+ pip install -r requirements.txt
10
+
11
+ # 2. Preprocess ESC-50 (required for DURATION task only)
12
+ python preprocess_esc50.py --config config.yaml
13
+
14
+ # 3. Generate datasets
15
+ python main.py --config config.yaml
16
+ ```
17
+
18
+ ## Configuration
19
+
20
+ Edit `config.yaml` to set:
21
+ - **Task duration**: `task_duration_size` (hours) per task
22
+ - **Clip duration range**: `min_clip_duration` to `max_clip_duration` (seconds)
23
+ - **ESC-50 paths**: Point to your ESC-50 dataset location
24
+ - **Enable/disable tasks**: Set `enabled: true/false` for each task
25
+
26
+ ## Key Files
27
+
28
+ - **`config.yaml`** - All configuration parameters
29
+ - **`main.py`** - Pipeline entry point (runs all tasks)
30
+ - **`preprocess_esc50.py`** - Preprocess ESC-50 for duration task
31
+ - **`tasks/task_*.py`** - Individual task generators
32
+
33
+ ## Tasks
34
+
35
+ | Task | Question | Example |
36
+ |------|----------|---------|
37
+ | **COUNT** | "How many unique sounds?" | Audio with 5 distinct sound types |
38
+ | **DURATION** | "Which sound is longest/shortest?" | Compare sound durations |
39
+ | **ORDER** | "Which sound is first/last/after X?" | Temporal sequence questions |
40
+ | **VOLUME** | "Which sound is loudest/softest?" | Loudness comparison |
41
+
42
+ ## Output Structure
43
+
44
+ ```
45
+ output/{task}/
46
+ ├── audios/*.wav # Generated audio files
47
+ ├── {task}_mcq.csv # Multiple choice questions
48
+ ├── {task}_open_text.csv # Open-ended questions
49
+ └── {task}_metadata.csv # Detailed metadata
50
+ ```
51
+
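+ The CSVs load directly with pandas. The column names below follow what `llm_answer_generator.py` expects for the MCQ format (`id`, `audio_path`, `question`, `optionA`-`optionD`, `correct`); the path is illustrative:
+
+ ```python
+ import pandas as pd
+
+ # Peek at a generated MCQ split (illustrative path)
+ df = pd.read_csv("output/count/count_mcq.csv")
+ print(df[["question", "correct"]].head())
+ ```
+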
52
+ ## Shell scripts (quick)
53
+
54
+ Use the provided shell helpers for simple runs.
55
+
56
+ Run full pipeline (uses `python main.py` under the hood):
57
+
58
+ ```bash
59
+ # Make executable and run (from pipeline/)
60
+ ./run_pipeline.sh
61
+
62
+ # With custom config, tasks, and output
63
+ ./run_pipeline.sh --config my_config.yaml --tasks count,order --output ./my_dataset
64
+ ```
65
+
66
+ Run the LLM answer generation across splits (uses `llm_answer_generator.py`):
67
+
68
+ ```bash
69
+ # Processes open_text CSVs across splits/tasks defined in the script
70
+ ./run_llm_answers_all.sh
71
+
72
+ # Or run per-file with the helper script directly
73
+ python llm_answer_generator.py --input /path/to/count_open_text.csv --mode open_text --task count
74
+ ```
75
+
76
+
77
+ ## Advanced Usage
78
+
79
+ ```bash
80
+ # Run specific tasks only
81
+ python main.py --tasks count order
82
+
83
+ # Use custom config
84
+ python main.py --config my_config.yaml
85
+
86
+ # Custom output directory
87
+ python main.py --output /path/to/output
88
+
89
+ # Preprocess with custom parameters
90
+ python preprocess_esc50.py --config config.yaml \
91
+ --threshold-strategy noise_floor \
92
+ --noise-floor-percentile 2.0 \
93
+ --noise-floor-delta-db 5.0
94
+ ```
95
+
96
+ ## Documentation
97
+
98
+ See **`DOCS.md`** for complete technical documentation including:
99
+ - Mathematical formulations
100
+ - Detailed algorithm explanations
101
+ - Configuration parameter reference
102
+ - Preprocessing pipeline details
103
+ - Balancing mechanisms
104
+
105
+ ## Requirements
106
+
107
+ - Python 3.8+
108
+ - pydub
109
+ - numpy
110
+ - pandas
111
+ - tqdm
112
+ - pyyaml
config.yaml ADDED
@@ -0,0 +1,348 @@
1
+ # Temporal Reasoning Audio Dataset Pipeline Configuration
2
+ ##uniform distribution for clip duration
3
+ ##not mixing datasets
4
+
5
+ ##count
6
+ ##pick dataset -> pick class -> pick audio clip -> get duration -> concatenate clips to reach target duration -> modulo to get num clips -> inserting silences randomly based on remainder
7
+
8
+ ##duration
9
+ ##amplitude based filtering -> normalize -> threshold based selection
10
+ ##gap between audio clips - x2/1.5 the shorter one -> add as param
11
+ ##different clips of the same class can be concatenated to reach target duration
12
+ ##consecutive ordering only
13
+ ##based on n unique sources and total clips we can have -> shortest and longest duration calculation
14
+
15
+ ##reject datapoint if same target audio clip cannot be repeated to maintain the gap - arg
16
+ ##sample different clip from the same class -> check if different clips can be used to fill the gap - arg
17
+
18
+ ##amplitude filtered durations in metadata csv
19
+
20
+ ##get_max_clip_num_to_be_joined()
21
+ ##pick dataset -> pick class -> pick audio clip -> get duration -> concatenate clips to reach target duration -> modulo to get num clips -> inserting silences randomly based on remainder
22
+
23
+ ##ensure_silence_between_clips()
24
+ ##silence should always be there between two clips
25
+
26
+ ##order
27
+ ##repeat target clips
28
+ ##second and second last - modify question types
29
+
30
+ ##volume
31
+ ##amplitude-based average loudness per audio clip -> repetitions of the same clip (argument) -> different volume levels based on dB levels
32
+
33
+ ##add crossfade
34
+
35
+ ##trimming - threshold separately for each audio clip - normalize to 0-1 - get threshold -> trim -> concatenate
36
+ ##leftmost and rightmost silence trimming
37
+ ##buffer for trimming - cutting early and cutting a bit late to avoid cutting important parts
38
+ ##periodicity effect
39
+
40
+ ##volume - trim and get average loudness -> normalize -> adjust volume levels
41
+
42
+ ##number of clips per samples to avoid silence
43
+
44
+
45
+ # ESC-50 Dataset paths (each clip is 5 seconds)
46
+ esc50:
47
+ audio_path: "/path/to/ESC-50_github/audio"
48
+ metadata_path: "/path/to/ESC-50_github/meta/esc50.csv"
49
+
50
+ # Synthetic silence audio for concatenation
51
+ synthetic_silence:
52
+ path: "/path/to/synthetic_silences"
53
+
54
+ # Output configuration
55
+ output:
56
+ base_path: "/path/to/pipeline/test_ood"
57
+ # Dataset class-subset configuration
58
+ # Use this to create datasets (train/val/test) from a persistent subset
59
+ # of classes (e.g. use 40 of 50 classes for in-distribution splits and
60
+ # optionally create an OOD test set using all 50 classes).
61
+ dataset:
62
+ use_class_subset: false # if false, use all available classes
63
+ num_classes_subset: 40 # number of classes to use for train/val/test
64
+ subset_persist_path: "/path/to/class_subset.json"
65
+ subset_seed: 42 # RNG seed when sampling the subset (persisted)
66
+
67
+ # Audio generation parameters
68
+ audio:
69
+ # Duration range for each GENERATED clip (in seconds)
70
+ # Original ESC-50 clips are 5s and will be concatenated to create clips in this range
71
+ min_clip_duration: 20.0 # Minimum duration for each generated clip
72
+ max_clip_duration: 60.0 # Maximum duration for each generated clip
73
+
74
+ # Crossfade and silence
75
+ crossfade_duration: 500 # Crossfade between audio and silence (milliseconds) for smooth transitions
76
+ silence_duration: 1000 # Default silence between clips (milliseconds)
77
+ min_silence_duration: 100 # Minimum silence ALWAYS inserted between clips (milliseconds)
78
+ max_extra_silence_per_gap: 500 # Maximum extra silence per gap when distributing remainder
79
+ crossfade_within_source: 50 # Small crossfade within same-source repetitions (count task)
80
+ with_silence: true # Add silence between clips
81
+ # Duration (seconds) of individual source clips (ESC-50 are 5s by default).
82
+ # Used to compute how many source clips are concatenated to reach a target
83
+ # generated clip duration. Change only if your source clips differ.
84
+ source_clip_duration: 5.0
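+   # Example: a 20 s target generated clip uses 20 / 5 = 4 source clips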
85
+
86
+ # Audio normalization
87
+ normalize: false
88
+ normalize_target_dBFS: -20.0
89
+
90
+ # Random seed for reproducibility
91
+ random_seed: 42
92
+
93
+ # LLM for question generation (local Llama 3.1 8B)
94
+ llm:
95
+ enabled: false # Set to true to use LLM for question generation
96
+
97
+ # Task-specific configurations
98
+ tasks:
99
+ count:
100
+ enabled: true
101
+ # Total duration for ALL samples in this task combined (in hours)
102
+ # Pipeline will calculate number of samples based on min/max clip durations
103
+ task_duration_size: 2.0 # hours
104
+
105
+ # Maximum unique sound sources per sample (single number)
106
+ # Actual number will be subsampled from max(1, max_clips-3) to min(max_clips, max_clips_per_sample)
107
+ max_clips_per_sample: 10
108
+
109
+ # Ordering mode for repeated clips of same source:
110
+ # "random": Clips are shuffled randomly (A B A C B A C...) - tests recognition of recurring sounds
111
+ # "consecutive": Same-source clips grouped together (AAA BBB CCC) - easier, just count blocks
112
+ ordering_mode: "random"
113
+
114
+ # Question templates for MCQ
115
+ mcq_questions:
116
+ - "What is the number of distinct sound sources in the audio file?"
117
+ - "How many different types of sounds can be identified in this recording?"
118
+ - "How many unique types of sound are present in this audio?"
119
+ - "Identify the count of different sound sources in this clip."
120
+ - "What is the total number of unique sounds heard in this audio?"
121
+ - "How many distinct sound categories are there in this audio file?"
122
+ - "Determine the number of unique sound sources in this recording."
123
+ - "How many separate sound sources are included in the audio?"
124
+ - "What is the total number of unique sound types in this audio?"
125
+ - "How many different sound sources can be heard in this clip?"
126
+ # Question templates for open-text
127
+ open_text_questions:
128
+ - "How many distinct sound sources are present in the audio?"
129
+ - "Count the number of unique sounds in this recording."
130
+ - "What is the total count of different sound categories heard?"
131
+ - "Identify and count all unique sound types in the clip."
132
+
133
+ duration:
134
+ enabled: true
135
+ # Total duration for ALL samples in this task combined (in hours)
136
+ task_duration_size: 2.0 # hours
137
+
138
+ # Number of unique sound sources per sample (can be single int or list)
139
+ # Single int (e.g., 15): randomly samples from 1 to 15 (like count/order tasks)
140
+ # List (e.g., [2,3,4]): randomly picks from the list
141
+ # The script will automatically generate repetition patterns to create
142
+ # shortest/longest variations based on the target clip duration
143
+ num_unique_sources: 10
144
+
145
+ # Ordering: only keep "consecutive" so repeated segments of the same
146
+ # source remain grouped together, ensuring that multiple consecutive
147
+ # clips of the same audio yield the longest duration unambiguously.
148
+ ordering_methods: ["consecutive"]
149
+
150
+ # =====================================================
151
+ # Amplitude-based filtering parameters (preprocessing)
152
+ # =====================================================
153
+ # RELATIVE dB threshold below peak to consider as silence
154
+ # For each clip: silence_threshold = clip_peak_dB + amplitude_threshold_db
155
+ # Example: If clip peak is -5 dB and threshold is -20, silence threshold = -25 dB
156
+ # Based on ESC-50 analysis: -20 dB gives ~60% effective duration (good balance)
157
+ # More aggressive (removes more silence): -15 dB
158
+ # More conservative (keeps more sound): -25 dB
159
+ amplitude_threshold_db: -20.0
160
+
161
+ # Minimum duration of sound region to keep (milliseconds)
162
+ # Filters out very short transient noise spikes
163
+ # ESC-50 is curated, so 20-30ms is sufficient
164
+ min_sound_duration_ms: 25
165
+
166
+ # =====================================================
167
+ # Adaptive threshold strategy
168
+ # =====================================================
169
+ # "peak_relative": threshold = peak_dB + amplitude_threshold_db (fixed offset from peak)
170
+ # - Simple but not adaptive to actual noise levels
171
+ # "noise_floor": threshold = percentile(dB, N) + delta_dB (RECOMMENDED)
172
+ # - Fully adaptive per-clip based on its own noise floor
173
+ # - Each clip analyzed independently - no fixed dB values needed
174
+ # - Better for diverse audio with varying noise levels
175
+ threshold_strategy: "noise_floor"
176
+
177
+ # Noise floor estimation percentile (used when threshold_strategy = noise_floor)
178
+ # Lower percentile = more conservative estimate of background noise
179
+ # 2 = use 2nd percentile of dB values as noise floor estimate (better for sparse sounds)
180
+ noise_floor_percentile: 2.0
181
+
182
+ # Delta above noise floor (dB) to set as threshold
183
+ # This is relative to EACH clip's own noise floor, not a fixed dB value
184
+ # A delta of 5-8 dB above the clip's own noise floor works well for most ESC-50 clips
185
+ # Higher = more conservative (keeps more), Lower = more aggressive (removes more)
186
+ noise_floor_delta_db: 5.0
187
+
188
+ # Path to preprocessed ESC-50 data (effective durations + trimmed audio)
189
+ preprocessed_data_path: "/path/to/ESC-50_preprocessed"
190
+
191
+ # =====================================================
192
+ # Duration gap multipliers
193
+ # =====================================================
194
+ # For LONGEST questions: target_effective >= max_background × multiplier_longest
195
+ multiplier_longest: 1.5
196
+ # For SHORTEST questions: target_effective <= min_background × multiplier_shortest
197
+ # Using 0.75 instead of 0.5 for smaller gaps (easier to distinguish)
198
+ multiplier_shortest: 0.75
199
+
200
+ # Minimum effective duration per source (seconds)
201
+ # Clips with less than this duration are harder to distinguish
202
+ min_effective_duration_per_source: 1.0
203
+
204
+ # =====================================================
205
+ # Fallback/rejection options
206
+ # =====================================================
207
+ # Reject sample if duration gap cannot be satisfied
208
+ reject_if_gap_not_met: true
209
+ # Try different clips from same class if one clip isn't enough
210
+ sample_different_clips_same_class: true
211
+
212
+ # Question types
213
+ question_types: ["shortest", "longest"]
214
+ # MCQ questions
215
+ mcq_questions:
216
+ shortest: "Which of the following sounds is heard for the shortest duration?"
217
+ longest: "Which of the following sounds is heard for the longest duration?"
218
+ # Open-text questions
219
+ open_text_questions:
220
+ shortest: "Which sound is heard for the shortest duration in the audio?"
221
+ longest: "Which sound is heard for the longest duration in the audio?"
222
+
223
+ order:
224
+ enabled: true
225
+ # Total duration for ALL samples in this task combined (in hours)
226
+ task_duration_size: 2.0 # hours
227
+
228
+ # Maximum clips to join per sample (minimum 2 for ordering)
229
+ # Actual number will be subsampled from max(2, max_clips-3) to min(max_clips, max_clips_per_sample)
230
+ max_clips_per_sample: 10
231
+
232
+ # Whether to allow repeating clips from the same source category
233
+ # If true: sequence could be [dog, dog, cat, bird] (same clip repeated)
234
+ # If false: sequence is always unique sources
235
+ allow_source_repetition: false
236
+
237
+ # Minimum clips needed for "second" and "second_last" questions
238
+ # With 3 clips, "second" and "second_last" both refer to the middle clip
239
+ # (position 1); set this to 4 to guarantee they target different positions
240
+ min_clips_for_second_questions: 3
241
+
242
+ # Question types: "first", "last", "after", "before", "second", "second_last"
243
+ # "second" and "second_last" only generated when n_clips >= min_clips_for_second_questions
244
+ question_types: ["first", "last", "after", "before", "second", "second_last"]
245
+
246
+ # MCQ question templates
247
+ mcq_questions:
248
+ first: "Which sound appears first in the audio clip?"
249
+ last: "Which sound appears last in the audio clip?"
250
+ after: "Which sound comes after {sound1}?"
251
+ before: "Which sound comes before {sound2}?"
252
+ second: "Which sound appears second in the audio clip?"
253
+ second_last: "Which sound appears second to last in the audio clip?"
254
+ # Open-text question templates
255
+ open_text_questions:
256
+ first: "What is the first sound you hear in the audio?"
257
+ last: "What is the last sound you hear in the audio?"
258
+ after: "What sound comes after {sound1}?"
259
+ before: "What sound comes before {sound2}?"
260
+ second: "What is the second sound you hear in the audio?"
261
+ second_last: "What sound is second to last in the audio?"
262
+ sequence: "List the sounds in the order they appear in the audio."
263
+
264
+ volume:
265
+ enabled: true
266
+ # Total duration for ALL samples in this task combined (in hours)
267
+ task_duration_size: 2.0 # hours
268
+
269
+ # Maximum clips with different volumes per sample
270
+ # Actual number will be subsampled from max(2, max_clips-3) to min(max_clips, max_clips_per_sample)
271
+ max_clips_per_sample: 10
272
+
273
+ # =====================================================
274
+ # Normalization settings (CRITICAL for volume comparison)
275
+ # =====================================================
276
+ # All clips are FIRST normalized to baseline, THEN volume adjusted
277
+ # This ensures volume differences are controlled and comparable
278
+ normalize_to_baseline: true
279
+ baseline_dBFS: -20.0 # Normalize all clips to this level first (used if use_lufs=false)
280
+
281
+ # =====================================================
282
+ # LUFS (Perceived Loudness) Settings
283
+ # =====================================================
284
+ # LUFS (Loudness Units Full Scale) measures PERCEIVED loudness
285
+ # Unlike dBFS which only measures RMS amplitude, LUFS accounts for
286
+ # human hearing sensitivity to different frequencies (K-weighting)
287
+ #
288
+ # IMPORTANT: For volume comparison task, we DISABLE LUFS normalization!
289
+ # LUFS makes everything the same perceived loudness, defeating the purpose.
290
+ # Instead, we normalize to a baseline dBFS then apply LARGE volume adjustments.
291
+ use_lufs: false # DISABLED for audible volume differences
292
+ baseline_lufs: -23.0 # EBU R128 standard (not used when use_lufs=false)
293
+
294
+ # =====================================================
295
+ # Volume gap multipliers (similar to duration task)
296
+ # =====================================================
297
+ # For MAX_LOUDNESS questions: target_loudness >= second_loudest × multiplier_max
298
+ # Multiplier 2.5 = ~8dB difference = clearly audible
299
+ # Multiplier 4.0 = ~12dB difference = very obvious (4x amplitude, roughly 2x perceived loudness)
300
+ multiplier_max_loudness: 4.0
301
+
302
+ # For MIN_LOUDNESS questions: target_loudness <= second_softest × multiplier_min
303
+ # Multiplier 0.25 = ~12dB quieter = clearly distinguishable
304
+ multiplier_min_loudness: 0.25
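+   # Gap math: gap_dB = 20 * log10(multiplier) -> 4.0 gives ~+12 dB, 0.25 gives ~-12 dB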
305
+
306
+ # Reject sample if loudness gap cannot be satisfied
307
+ reject_if_gap_not_met: true
308
+
309
+ # =====================================================
310
+ # Source clip options
311
+ # =====================================================
312
+ # If true: same clip can be repeated at different volumes
313
+ # If false: always use different source clips (default behavior)
314
+ use_same_clip_different_volumes: false
315
+
316
+ # If use_same_clip_different_volumes is true, how many repetitions per source?
317
+ # Can be a single int or list for variety
318
+ repetitions_per_source: [2, 3, 4]
319
+
320
+ # Question types: "max_loudness", "min_loudness"
321
+ question_types: ["max_loudness", "min_loudness"]
322
+
323
+ # MCQ questions
324
+ mcq_questions:
325
+ max_loudness: "Which sound has the maximum loudness in the audio?"
326
+ min_loudness: "Which sound has the minimum loudness in the audio?"
327
+ # Open-text questions
328
+ open_text_questions:
329
+ max_loudness: "Identify the sound with maximum loudness in the audio clip."
330
+ min_loudness: "Identify the sound with minimum loudness in the audio clip."
331
+ order_volume: "List the sounds in order from maximum to minimum loudness."
332
+
333
+ # MCQ options configuration
334
+ mcq:
335
+ num_options: 4
336
+ option_labels: ["A", "B", "C", "D"]
337
+ # Strategy for generating distractor options
338
+ # "present_only": only use sounds present in audio
339
+ # "mixed": mix of present and absent sounds
340
+ # "balanced": balanced distribution
341
+ distractor_strategy: "balanced"
342
+
343
+ # Logging configuration
344
+ logging:
345
+ level: "INFO" # DEBUG, INFO, WARNING, ERROR, CRITICAL
346
+ log_file: "pipeline.log"
347
+ console_output: true
348
+
llm_answer_generator.py ADDED
@@ -0,0 +1,268 @@
1
+ import pandas as pd
2
+ import argparse
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM
4
+ import torch
5
+ import random
6
+
7
+ # Convert MCQ CSV to NL answers using a text-only LLM (meta-llama/Llama-3.1-8B-Instruct)
8
+ # Adds: (1) stronger LLM-driven variability for duration/volume in open_text mode via system prompt
9
+ # (2) --one_word_ratio (default 0.2) to skip forward pass for a fraction of rows,
10
+ # outputting the normalized (underscore-removed) answer only.
11
+
12
+
13
+ def convert_to_natural_phrase(val):
14
+ """Convert underscore-separated tokens to natural phrases."""
15
+ if isinstance(val, str) and "_" in val:
16
+ val = val.replace("_", " ")
17
+ return val
18
+
19
+
20
+ def generate_answer(tokenizer, model, question, correct_value, device, mode="mcq"):
21
+ """Generate a natural language answer using a text-only LLM.
22
+
23
+ mode: "mcq" (default) uses the original MCQ-oriented prompt.
24
+ "open_text" uses a direct rewrite prompt for provided question/answer pairs.
25
+ """
26
+ correct_value = convert_to_natural_phrase(correct_value)
27
+
28
+ if mode == "open_text":
29
+ system_preamble = (
30
+ "You convert (Question, short Answer) into EXACTLY ONE natural English sentence that answers the Question.\n\n"
31
+ "HARD RULES:\n"
32
+ "- Output exactly ONE sentence. No newlines, no bullet points, no labels, no quotes.\n"
33
+ "- Use ONLY the provided Answer content as the factual answer; do not add any new facts.\n"
34
+ "- Be concise and direct.\n"
35
+ "- Do NOT include any numbers unless the question is a COUNT question.\n"
36
+ "- Vary phrasing strongly across items; avoid repeating the same structure.\n\n"
37
+ "VARIABILITY REQUIREMENT (IMPORTANT):\n"
38
+ "- For all questions, you MUST vary sentence structure.\n"
39
+ "- Randomly choose ONE of these patterns each time:\n"
40
+ " (A) Start with the sound name (Answer) -> then the relation.\n"
41
+ " (B) Start with the relation -> then the sound name (Answer).\n"
42
+ " (C) Use an 'it`s...' style clause after the Answer.\n"
43
+ " (D) Use a short, natural rephrase with different verbs (e.g., lasts, continues, stands out, comes through).\n"
44
+ "- Do not always use 'The sound with the ... is ...' — that pattern should be rare.\n\n"
45
+ "TASK HANDLING (infer from the Question):\n"
46
+ "- COUNT questions (how many / count / number):\n"
47
+ " * If Answer is numeric, write it EITHER as digits (e.g., 10) OR as a word (e.g., ten). Do NOT include both.\n"
48
+ "- DURATION questions (longest/shortest):\n"
49
+ " * Clearly state longest vs shortest, and use the Answer as the sound name. Do not include any numbers.\n"
50
+ "- VOLUME questions (minimum/maximum loudness, quietest/loudest):\n"
51
+ " * Match minimum vs maximum loudness and use the Answer as the sound name. No dB values.\n"
52
+ "- ORDER questions (first/second/before/after/second-to-last):\n"
53
+ " * Match the requested relation and use the Answer as the sound name.\n\n"
54
+ "Return only the sentence."
55
+ )
56
+
57
+ user_prompt = (
58
+ f"Question: {question}\n"
59
+ f"Answer: {correct_value}\n"
60
+ "Rewrite the answer as a single, natural sentence that directly answers the question."
61
+ )
62
+ else:
63
+ system_preamble = (
64
+ "You are a helpful assistant that converts multiple-choice QA pairs into natural language answers.\n"
65
+ "CRITICAL RULES:\n"
66
+ "1. Write as a human would naturally speak - vary sentence structure and avoid repetitive patterns\n"
67
+ "2. Keep responses concise but natural and affirmative avoiding words like 'might/may' or 'could' - one clear sentence\n"
68
+ "3. Do not mention 'among the options/among the following' even if the question mentions it. This natural language statement is supposed to be a direct answer.\n"
69
+ "4. Do NOT invent sounds.\n"
70
+ "5. Do not reason to answer the question, you're just supposed to provide the correct mcq answer as a natural language answer in a single sentence.\n"
71
+ "Return only the natural language answer, nothing else."
72
+ )
73
+ user_prompt = (
74
+ f"Now, given the question: '{question}' and the correct answer: '{correct_value}', "
75
+ f"write one natural-language answer as you would expect from a human."
76
+ )
77
+
78
+ # Chat format
79
+ messages = [
80
+ {"role": "system", "content": system_preamble},
81
+ {"role": "user", "content": user_prompt},
82
+ ]
83
+ inputs = tokenizer.apply_chat_template(
84
+ messages,
85
+ tokenize=True,
86
+ add_generation_prompt=True,
87
+ return_tensors="pt",
88
+ ).to(device)
89
+
90
+ input_length = inputs.shape[1]
91
+
92
+ with torch.no_grad():
93
+ output = model.generate(
94
+ inputs,
95
+ max_new_tokens=64,
96
+ do_sample=True,
97
+ temperature=0.8,
98
+ top_p=0.9,
99
+ repetition_penalty=1.05,
100
+ no_repeat_ngram_size=3,
101
+ pad_token_id=tokenizer.eos_token_id,
102
+ eos_token_id=tokenizer.eos_token_id,
103
+ )
104
+
105
+ generated_ids = output[0, input_length:]
106
+ response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
107
+ print(f"Model response: {response}")
108
+ return response
109
+
110
+
111
+ def detect_csv_format(df):
112
+ """
113
+ Detect CSV layout and return column mappings.
114
+ Supports:
115
+ - original MCQ format
116
+ - perturbed MCQ format
117
+ - open-text format (question/answer present)
118
+ """
119
+ columns = df.columns.tolist()
120
+
121
+ if "correct" in columns and "id" in columns and "audio_path" in columns:
122
+ # Original format (count.csv)
123
+ return {
124
+ "id_col": "id",
125
+ "audio_path_col": "audio_path",
126
+ "answer_col": "correct",
127
+ "question_col": "question",
128
+ "format_type": "original",
129
+ }
130
+ if "answer" in columns and "idx" in columns and "new_audio_path" in columns:
131
+ # Perturbed format (count_perturbed.csv)
132
+ return {
133
+ "id_col": "idx",
134
+ "audio_path_col": "new_audio_path",
135
+ "answer_col": "answer",
136
+ "question_col": "question",
137
+ "format_type": "perturbed",
138
+ }
139
+ if "answer" in columns and "question" in columns:
140
+ # Open-text format
141
+ return {
142
+ "id_col": "id" if "id" in columns else None,
143
+ "audio_path_col": "audio_path" if "audio_path" in columns else None,
144
+ "answer_col": "answer",
145
+ "question_col": "question",
146
+ "format_type": "open_text",
147
+ }
148
+
149
+ raise ValueError(f"Unknown CSV format. Columns found: {columns}")
150
+
151
+
152
+ def main():
153
+ parser = argparse.ArgumentParser(
154
+ description="Convert CSV to NL answers (MCQ or open-text) using meta-llama/Llama-3.1-8B-Instruct"
155
+ )
156
+ parser.add_argument("--input", required=True, help="Input CSV file")
157
+ parser.add_argument("--output", required=False, help="Output CSV file (defaults to input for in-place append)")
158
+ parser.add_argument(
159
+ "--mode",
160
+ required=True,
161
+ choices=["mcq", "open_text"],
162
+ help="Conversion mode: mcq -> convert MCQ correct option to natural answer; open_text -> rewrite provided short answer to a natural sentence",
163
+ )
164
+ parser.add_argument(
165
+ "--task",
166
+ required=True,
167
+ choices=["count", "duration", "order", "volume"],
168
+ help="Task type this CSV belongs to (used for bookkeeping/logging)",
169
+ )
170
+
171
+ # NEW: one-word skipping
172
+ parser.add_argument(
173
+ "--one_word_ratio",
174
+ type=float,
175
+ default=0.2,
176
+ help="Fraction of samples to output as just the normalized one-word/phrase answer (no LLM forward pass). Default 0.2",
177
+ )
178
+ parser.add_argument(
179
+ "--seed",
180
+ type=int,
181
+ default=123,
182
+ help="Random seed for reproducible one_word sampling.",
183
+ )
184
+
185
+ args = parser.parse_args()
186
+ random.seed(args.seed)
187
+
188
+ print("Loading meta-llama/Llama-3.1-8B-Instruct tokenizer and model...")
189
+ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct", use_fast=False)
190
+ model = AutoModelForCausalLM.from_pretrained(
191
+ "meta-llama/Llama-3.1-8B-Instruct",
192
+ torch_dtype="auto",
193
+ device_map="auto",
194
+ )
195
+ model.eval()
196
+
197
+ df = pd.read_csv(args.input)
198
+
199
+ # Detect CSV format and get column mappings
200
+ format_info = detect_csv_format(df)
201
+ print(f"Detected CSV format: {format_info['format_type']}")
202
+
203
+ # Validate requested mode against detected CSV format
204
+ if args.mode == "mcq" and format_info["format_type"] == "open_text":
205
+ raise ValueError(
206
+ "Requested mode=mcq but input appears to be open_text format. Use --mode open_text or supply an MCQ CSV."
207
+ )
208
+ if args.mode == "open_text" and format_info["format_type"] != "open_text":
209
+ raise ValueError(
210
+ "Requested mode=open_text but input does not appear to be open_text format. Use --mode mcq or supply an open_text CSV."
211
+ )
212
+
213
+ output_path = args.output if args.output else args.input
214
+
215
+ nl_rows = []
216
+ device = model.device
217
+
218
+ for i, row in df.iterrows():
219
+ question = row[format_info["question_col"]]
220
+
221
+ # Resolve correct_value from CSV format
222
+ if format_info["format_type"] == "open_text":
223
+ correct_value = row[format_info["answer_col"]]
224
+ else:
225
+ correct_letter = row[format_info["answer_col"]]
226
+ option_map = {"A": "optionA", "B": "optionB", "C": "optionC", "D": "optionD"}
227
+ correct_value = row[option_map[correct_letter]]
228
+
229
+ # Normalize underscores BEFORE deciding one_word skip
230
+ correct_value = convert_to_natural_phrase(correct_value)
231
+
232
+ print(f"[{i+1}/{len(df)}] Q: {question} | Ans: {correct_value}")
233
+
234
+ # With probability one_word_ratio (default 0.2): output the normalized one-word/phrase answer, skipping the LLM forward pass
235
+ if random.random() < args.one_word_ratio:
236
+ nl_answer = correct_value
237
+ print(f"Skipped LLM (one_word_ratio). Output: {nl_answer}")
238
+ else:
239
+ nl_answer = generate_answer(
240
+ tokenizer,
241
+ model,
242
+ question,
243
+ correct_value,
244
+ device,
245
+ mode=("open_text" if format_info["format_type"] == "open_text" else "mcq"),
246
+ )
247
+
248
+ nl_rows.append(
249
+ {
250
+ "question": question,
251
+ "id": row[format_info["id_col"]] if format_info.get("id_col") and format_info["id_col"] in row else None,
252
+ "audio_path": row[format_info["audio_path_col"]]
253
+ if format_info.get("audio_path_col")
254
+ else None,
255
+ "original_answer": correct_value,
256
+ "open_text_answer": nl_answer,
257
+ }
258
+ )
259
+
260
+ # Merge back as new column to the original CSV to preserve all fields
261
+ nl_df = pd.DataFrame(nl_rows)
262
+ df["open_text_answer"] = nl_df["open_text_answer"]
263
+ df.to_csv(output_path, index=False)
264
+ print(f"Appended natural language answers to {output_path}")
265
+
266
+
267
+ if __name__ == "__main__":
268
+ main()
main.py ADDED
@@ -0,0 +1,272 @@
1
+ """
2
+ Main pipeline runner for temporal reasoning audio dataset generation.
3
+
4
+ This script orchestrates the generation of all task datasets.
5
+ """
6
+
7
+ import argparse
8
+ import sys
9
+ import yaml
10
+ from pathlib import Path
11
+ from typing import List, Optional
12
+
13
+ # Add project root to path
14
+ sys.path.append(str(Path(__file__).parent))
15
+
16
+ from utils import setup_logger, set_random_seed
17
+ from tasks.task_count import CountTaskGenerator
18
+ from tasks.task_duration import DurationTaskGenerator
19
+ from tasks.task_order import OrderTaskGenerator
20
+ from tasks.task_volume import VolumeTaskGenerator
21
+
22
+
23
+ def load_config(config_path: str) -> dict:
24
+ """Load configuration from YAML file."""
25
+ with open(config_path, 'r') as f:
26
+ config = yaml.safe_load(f)
27
+ return config
28
+
29
+
30
+ def run_count_task(config: dict, logger):
31
+ """Run the count task generation."""
32
+ if not config['tasks']['count']['enabled']:
33
+ logger.info("Count task is disabled, skipping...")
34
+ return
35
+
36
+ logger.info("=" * 80)
37
+ logger.info("STARTING COUNT TASK GENERATION")
38
+ logger.info("=" * 80)
39
+
40
+ generator = CountTaskGenerator(config, logger)
41
+ generator.dataset.reset_category_usage() # Reset counter for this task
42
+ generator.generate_dataset()
43
+
44
+ # Log category usage statistics
45
+ usage_stats = generator.dataset.get_category_usage_stats()
46
+ sorted_stats = sorted(usage_stats.items(), key=lambda x: x[1], reverse=True)
47
+ logger.info("Category usage statistics (as answers):")
48
+ logger.info(f" Min usage: {sorted_stats[-1][1]} (category: {sorted_stats[-1][0]})")
49
+ logger.info(f" Max usage: {sorted_stats[0][1]} (category: {sorted_stats[0][0]})")
50
+ logger.info(f" Mean usage: {sum(usage_stats.values()) / len(usage_stats):.2f}")
51
+
52
+ logger.info("Count task completed successfully!")
53
+
54
+
55
+ def run_duration_task(config: dict, logger):
56
+ """Run the duration task generation."""
57
+ if not config['tasks']['duration']['enabled']:
58
+ logger.info("Duration task is disabled, skipping...")
59
+ return
60
+
61
+ logger.info("=" * 80)
62
+ logger.info("STARTING DURATION TASK GENERATION")
63
+ logger.info("=" * 80)
64
+
65
+ generator = DurationTaskGenerator(config, logger)
66
+ generator.dataset.reset_category_usage() # Reset counter for this task
67
+ generator.generate_dataset()
68
+
69
+ # Log category usage statistics
70
+ usage_stats = generator.dataset.get_category_usage_stats()
71
+ sorted_stats = sorted(usage_stats.items(), key=lambda x: x[1], reverse=True)
72
+ logger.info("Category usage statistics (as longest/shortest answers):")
73
+ logger.info(f" Min usage: {sorted_stats[-1][1]} (category: {sorted_stats[-1][0]})")
74
+ logger.info(f" Max usage: {sorted_stats[0][1]} (category: {sorted_stats[0][0]})")
75
+ logger.info(f" Mean usage: {sum(usage_stats.values()) / len(usage_stats):.2f}")
76
+
77
+ logger.info("Duration task completed successfully!")
78
+
79
+
80
+ def run_order_task(config: dict, logger):
81
+ """Run the order task generation."""
82
+ if not config['tasks']['order']['enabled']:
83
+ logger.info("Order task is disabled, skipping...")
84
+ return
85
+
86
+ logger.info("=" * 80)
87
+ logger.info("STARTING ORDER TASK GENERATION")
88
+ logger.info("=" * 80)
89
+
90
+ generator = OrderTaskGenerator(config, logger)
91
+ generator.dataset.reset_category_usage() # Reset counter for this task
92
+ generator.generate_dataset()
93
+
94
+ # Log category usage statistics
95
+ usage_stats = generator.dataset.get_category_usage_stats()
96
+ sorted_stats = sorted(usage_stats.items(), key=lambda x: x[1], reverse=True)
97
+ logger.info("Category usage statistics (as first/last/after/before answers):")
98
+ logger.info(f" Min usage: {sorted_stats[-1][1]} (category: {sorted_stats[-1][0]})")
99
+ logger.info(f" Max usage: {sorted_stats[0][1]} (category: {sorted_stats[0][0]})")
100
+ logger.info(f" Mean usage: {sum(usage_stats.values()) / len(usage_stats):.2f}")
101
+
102
+ logger.info("Order task completed successfully!")
103
+
104
+
105
+ def run_volume_task(config: dict, logger):
106
+ """Run the volume task generation."""
107
+ if not config['tasks']['volume']['enabled']:
108
+ logger.info("Volume task is disabled, skipping...")
109
+ return
110
+
111
+ logger.info("=" * 80)
112
+ logger.info("STARTING VOLUME TASK GENERATION")
113
+ logger.info("=" * 80)
114
+
115
+ generator = VolumeTaskGenerator(config, logger)
116
+ generator.dataset.reset_category_usage() # Reset counter for this task
117
+ generator.generate_dataset()
118
+
119
+ # Log category usage statistics
120
+ usage_stats = generator.dataset.get_category_usage_stats()
121
+ sorted_stats = sorted(usage_stats.items(), key=lambda x: x[1], reverse=True)
122
+ logger.info("Category usage statistics (as loudest/softest answers):")
123
+ logger.info(f" Min usage: {sorted_stats[-1][1]} (category: {sorted_stats[-1][0]})")
124
+ logger.info(f" Max usage: {sorted_stats[0][1]} (category: {sorted_stats[0][0]})")
125
+ logger.info(f" Mean usage: {sum(usage_stats.values()) / len(usage_stats):.2f}")
126
+
127
+ logger.info("Volume task completed successfully!")
128
+
129
+
130
+ def run_pipeline(
131
+ config_path: str,
132
+ tasks: Optional[List[str]] = None,
133
+ output_path: Optional[str] = None
134
+ ):
135
+ """
136
+ Run the complete dataset generation pipeline.
137
+
138
+ Args:
139
+ config_path: Path to configuration YAML file
140
+ tasks: Optional list of specific tasks to run (default: all enabled tasks)
141
+ output_path: Optional custom output path (overrides config)
142
+ """
143
+ # Load configuration
144
+ config = load_config(config_path)
145
+
146
+ # Override output path if provided
147
+ if output_path:
148
+ config['output']['base_path'] = output_path
149
+
150
+ # Create output directory
151
+ output_base = Path(config['output']['base_path'])
152
+ output_base.mkdir(parents=True, exist_ok=True)
153
+
154
+ # Set random seed
155
+ set_random_seed(config['random_seed'])
156
+
157
+ # Setup main logger
158
+ logger = setup_logger(
159
+ 'pipeline',
160
+ log_file=str(output_base / config['logging']['log_file']),
161
+ level=config['logging']['level'],
162
+ console_output=config['logging']['console_output']
163
+ )
164
+
165
+ logger.info("=" * 80)
166
+ logger.info("TEMPORAL REASONING AUDIO DATASET GENERATION PIPELINE")
167
+ logger.info("=" * 80)
168
+ logger.info(f"Configuration: {config_path}")
169
+ logger.info(f"Output directory: {output_base}")
170
+ logger.info(f"Random seed: {config['random_seed']}")
171
+ logger.info(f"ESC-50 audio path: {config['esc50']['audio_path']}")
172
+ logger.info(f"ESC-50 metadata path: {config['esc50']['metadata_path']}")
173
+
174
+ # Determine which tasks to run
175
+ task_map = {
176
+ 'count': run_count_task,
177
+ 'duration': run_duration_task,
178
+ 'order': run_order_task,
179
+ 'volume': run_volume_task
180
+ }
181
+
182
+ if tasks:
183
+ tasks_to_run = {k: v for k, v in task_map.items() if k in tasks}
184
+ logger.info(f"Running specific tasks: {', '.join(tasks)}")
185
+ else:
186
+ tasks_to_run = task_map
187
+ logger.info("Running all enabled tasks")
188
+
189
+ # Run tasks
190
+ for task_name, task_func in tasks_to_run.items():
191
+ try:
192
+ task_func(config, logger)
193
+ except Exception as e:
194
+ logger.error(f"Error running {task_name} task: {e}", exc_info=True)
195
+ raise
196
+
197
+ logger.info("=" * 80)
198
+ logger.info("PIPELINE COMPLETED SUCCESSFULLY!")
199
+ logger.info("=" * 80)
200
+ logger.info(f"All outputs saved to: {output_base}")
201
+
202
+
203
+ def main():
204
+ """Main entry point with argument parsing."""
205
+ parser = argparse.ArgumentParser(
206
+ description="Temporal Reasoning Audio Dataset Generation Pipeline",
207
+ formatter_class=argparse.RawDescriptionHelpFormatter,
208
+ epilog="""
209
+ Examples:
210
+ # Run all tasks with default config
211
+ python main.py
212
+
213
+ # Run with custom config
214
+ python main.py --config my_config.yaml
215
+
216
+ # Run specific tasks only
217
+ python main.py --tasks count duration
218
+
219
+ # Use custom output directory
220
+ python main.py --output /path/to/output
221
+
222
+ # Combine options
223
+ python main.py --config custom.yaml --tasks count order --output ./my_dataset
224
+ """
225
+ )
226
+
227
+ parser.add_argument(
228
+ '--config', '-c',
229
+ type=str,
230
+ default='config.yaml',
231
+ help='Path to configuration YAML file (default: config.yaml)'
232
+ )
233
+
234
+ parser.add_argument(
235
+ '--tasks', '-t',
236
+ nargs='+',
237
+ choices=['count', 'duration', 'order', 'volume'],
238
+ help='Specific tasks to run (default: all enabled tasks)'
239
+ )
240
+
241
+ parser.add_argument(
242
+ '--output', '-o',
243
+ type=str,
244
+ help='Custom output directory (overrides config)'
245
+ )
246
+
247
+ args = parser.parse_args()
248
+
249
+ # Check if config file exists
250
+ config_path = Path(args.config)
251
+ if not config_path.exists():
252
+ # Try relative to script directory
253
+ script_dir = Path(__file__).parent
254
+ config_path = script_dir / args.config
255
+ if not config_path.exists():
256
+ print(f"Error: Config file not found: {args.config}")
257
+ sys.exit(1)
258
+
259
+ # Run pipeline
260
+ try:
261
+ run_pipeline(
262
+ config_path=str(config_path),
263
+ tasks=args.tasks,
264
+ output_path=args.output
265
+ )
266
+ except Exception as e:
267
+ print(f"Pipeline failed with error: {e}")
268
+ sys.exit(1)
269
+
270
+
271
+ if __name__ == '__main__':
272
+ main()
preprocess_esc50.py ADDED
@@ -0,0 +1,714 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ ESC-50 Preprocessing Script for Duration Task
4
+
5
+ This script processes all ESC-50 audio clips to:
6
+ 1. Apply amplitude-based filtering to detect actual sound regions
7
+ 2. Calculate effective duration (portion containing actual sound)
8
+ 3. Save trimmed audio files (with silence removed)
9
+ 4. Generate a CSV with all metadata including effective durations
10
+
11
+ Usage:
12
+ python preprocess_esc50.py --config config.yaml
13
+ python preprocess_esc50.py --config config.yaml --threshold-db -40 --min-sound-ms 50
14
+ """
15
+
16
+ import argparse
17
+ import os
18
+ import sys
19
+ from pathlib import Path
20
+ from typing import Dict, List, Optional, Tuple
21
+
22
+ import numpy as np
23
+ import pandas as pd
24
+ from pydub import AudioSegment
25
+ from tqdm import tqdm
26
+
27
+ # Add parent directory to path for imports
28
+ sys.path.insert(0, str(Path(__file__).parent))
29
+
30
+ from utils.logger import setup_logger
31
+
32
+ logger = setup_logger(__name__)
33
+
34
+
35
+ def get_amplitude_array(audio: AudioSegment) -> np.ndarray:
36
+ """
37
+ Convert AudioSegment to numpy array of amplitudes.
38
+
39
+ Args:
40
+ audio: Input audio segment
41
+
42
+ Returns:
43
+ Numpy array of amplitude values (normalized to -1 to 1)
44
+ """
45
+ samples = np.array(audio.get_array_of_samples())
46
+
47
+ # Handle stereo by averaging channels
48
+ if audio.channels == 2:
49
+ samples = samples.reshape((-1, 2)).mean(axis=1)
50
+
51
+ # Normalize to -1 to 1 range
52
+ max_val = float(2 ** (audio.sample_width * 8 - 1))
53
+ samples = samples / max_val
54
+
55
+ return samples
56
+
57
+
58
+ def compute_rms_envelope(samples: np.ndarray, frame_size_ms: int, hop_size_ms: int,
59
+ sample_rate: int) -> Tuple[np.ndarray, np.ndarray]:
60
+ """
61
+ Compute RMS envelope of audio signal.
62
+
63
+ Args:
64
+ samples: Audio samples as numpy array
65
+ frame_size_ms: Frame size in milliseconds
66
+ hop_size_ms: Hop size in milliseconds
67
+ sample_rate: Audio sample rate
68
+
69
+ Returns:
70
+ Tuple of (rms_values, time_stamps_ms)
71
+ """
72
+ frame_size = int(sample_rate * frame_size_ms / 1000)
73
+ hop_size = int(sample_rate * hop_size_ms / 1000)
74
+
75
+ rms_values = []
76
+ time_stamps = []
77
+
78
+ for i in range(0, len(samples) - frame_size + 1, hop_size):
79
+ frame = samples[i:i + frame_size]
80
+ rms = np.sqrt(np.mean(frame ** 2))
81
+ rms_values.append(rms)
82
+ time_stamps.append(i / sample_rate * 1000) # Convert to ms
83
+
84
+ return np.array(rms_values), np.array(time_stamps)
85
+
86
+
87
+ def rms_to_db(rms: np.ndarray, reference: float = 1.0) -> np.ndarray:
88
+ """
89
+ Convert RMS values to decibels.
90
+
91
+ Args:
92
+ rms: RMS values
93
+ reference: Reference value (default 1.0 for normalized audio)
94
+
95
+ Returns:
96
+ dB values
97
+ """
98
+ # Avoid log(0) by using a small epsilon
99
+ epsilon = 1e-10
100
+ return 20 * np.log10(np.maximum(rms, epsilon) / reference)
101
+
102
+
103
+ def detect_sound_regions(
104
+ audio: AudioSegment,
105
+ threshold_db: float = -40.0,
106
+ min_sound_duration_ms: int = 50,
107
+ frame_size_ms: int = 20,
108
+ hop_size_ms: int = 10,
109
+ merge_gap_ms: int = 100,
110
+ threshold_strategy: str = 'noise_floor',
111
+ noise_floor_percentile: float = 10.0,
112
+ noise_floor_delta_db: float = 15.0
113
+ ) -> List[Tuple[int, int]]:
114
+ """
115
+ Detect regions in audio that contain actual sound (above threshold).
116
+
117
+ Supports two threshold strategies:
118
+ - 'peak_relative': threshold = peak_db + threshold_db (old behavior)
119
+ - 'noise_floor': threshold = percentile(db_values, p) + delta_db (adaptive per-clip)
120
+
121
+ The 'noise_floor' strategy is recommended as it adapts to each clip's
122
+ actual background noise level rather than using a fixed offset from peak.
123
+
124
+ Args:
125
+ audio: Input audio segment
126
+ threshold_db: dB threshold below peak (used if strategy='peak_relative')
127
+ min_sound_duration_ms: Minimum duration of sound region to keep
128
+ frame_size_ms: Frame size for RMS computation
129
+ hop_size_ms: Hop size for RMS computation
130
+ merge_gap_ms: Merge regions separated by less than this gap
131
+ threshold_strategy: 'peak_relative' or 'noise_floor'
132
+ noise_floor_percentile: Percentile for noise floor estimation (default 10)
133
+ noise_floor_delta_db: dB above noise floor to set threshold (default 15)
134
+
135
+ Returns:
136
+ List of (start_ms, end_ms) tuples for sound regions
137
+ """
138
+ samples = get_amplitude_array(audio)
139
+ sample_rate = audio.frame_rate
140
+
141
+ # Compute RMS envelope
142
+ rms_values, time_stamps = compute_rms_envelope(
143
+ samples, frame_size_ms, hop_size_ms, sample_rate
144
+ )
145
+
146
+ if len(rms_values) == 0:
147
+ return []
148
+
149
+ # Convert to dB
150
+ db_values = rms_to_db(rms_values)
151
+
152
+ # Compute threshold based on strategy
153
+ peak_db = np.max(db_values)
154
+
155
+ if threshold_strategy == 'noise_floor':
156
+ # ADAPTIVE: Use noise floor (low percentile) + delta
157
+ # This adapts to each clip's actual background noise level
158
+ noise_floor_db = np.percentile(db_values, noise_floor_percentile)
159
+ absolute_threshold = noise_floor_db + noise_floor_delta_db
160
+
161
+ # Safeguard: don't exceed peak (would detect nothing)
162
+ # Leave at least 1 dB below peak
163
+ absolute_threshold = min(absolute_threshold, peak_db - 1.0)
164
+
165
+ logger.debug(
166
+ f"Noise-floor threshold: floor={noise_floor_db:.1f}dB (p{noise_floor_percentile}), "
167
+ f"delta={noise_floor_delta_db}dB, threshold={absolute_threshold:.1f}dB, peak={peak_db:.1f}dB"
168
+ )
169
+ else:
170
+ # OLD: peak-relative threshold
171
+ absolute_threshold = peak_db + threshold_db # threshold_db is negative
172
+ logger.debug(
173
+ f"Peak-relative threshold: peak={peak_db:.1f}dB, offset={threshold_db}dB, "
174
+ f"threshold={absolute_threshold:.1f}dB"
175
+ )
176
+
177
+ # Find frames above threshold
178
+ above_threshold = db_values > absolute_threshold
179
+
180
+ # Find contiguous regions
181
+ regions = []
182
+ in_region = False
183
+ region_start = 0
184
+
185
+ for is_above, time_ms in zip(above_threshold, time_stamps):
186
+ if is_above and not in_region:
187
+ # Start of new region
188
+ in_region = True
189
+ region_start = time_ms
190
+ elif not is_above and in_region:
191
+ # End of region
192
+ in_region = False
193
+ region_end = time_ms
194
+ if region_end - region_start >= min_sound_duration_ms:
195
+ regions.append((int(region_start), int(region_end)))
196
+
197
+ # Handle case where audio ends while still in a region
198
+ if in_region:
199
+ region_end = time_stamps[-1] + hop_size_ms
200
+ if region_end - region_start >= min_sound_duration_ms:
201
+ regions.append((int(region_start), int(region_end)))
202
+
203
+ # Merge regions that are close together
204
+ if len(regions) > 1:
205
+ merged_regions = [regions[0]]
206
+ for start, end in regions[1:]:
207
+ prev_start, prev_end = merged_regions[-1]
208
+ if start - prev_end <= merge_gap_ms:
209
+ # Merge with previous region
210
+ merged_regions[-1] = (prev_start, end)
211
+ else:
212
+ merged_regions.append((start, end))
213
+ regions = merged_regions
214
+
215
+ return regions
216
+
217
+
218
+ def get_sound_regions(
219
+ audio: AudioSegment,
220
+ threshold_db: float = -40.0,
221
+ min_sound_duration_ms: int = 50,
222
+ threshold_strategy: str = 'noise_floor',
223
+ noise_floor_percentile: float = 10.0,
224
+ noise_floor_delta_db: float = 15.0
225
+ ) -> List[Tuple[int, int]]:
226
+ """
227
+ Detect sound regions in audio using adaptive threshold.
228
+
229
+ Args:
230
+ audio: Input audio segment
231
+ threshold_db: dB threshold below peak (used if strategy='peak_relative')
232
+ min_sound_duration_ms: Minimum duration of sound region to keep
233
+ threshold_strategy: 'peak_relative' or 'noise_floor'
234
+ noise_floor_percentile: Percentile for noise floor estimation
235
+ noise_floor_delta_db: dB above noise floor to set threshold
236
+
237
+ Returns:
238
+ List of (start_ms, end_ms) tuples for sound regions
239
+ """
240
+ return detect_sound_regions(
241
+ audio,
242
+ threshold_db=threshold_db,
243
+ min_sound_duration_ms=min_sound_duration_ms,
244
+ threshold_strategy=threshold_strategy,
245
+ noise_floor_percentile=noise_floor_percentile,
246
+ noise_floor_delta_db=noise_floor_delta_db
247
+ )
248
+
249
+
250
+ def extract_sound_with_edges_trimmed(
251
+ audio: AudioSegment,
252
+ regions: List[Tuple[int, int]],
253
+ min_silence_to_trim_ms: int = 100,
254
+ buffer_ratio: float = 0.1
255
+ ) -> AudioSegment:
256
+ """
257
+ Extract audio with ONLY leftmost and rightmost silence removed IF present.
258
+
259
+ Trimming is ADAPTIVE:
260
+ - Only trims if edge silence >= min_silence_to_trim_ms
261
+ - Keeps a small percentage (buffer_ratio) of the silence to preserve transients
262
+ - Buffer size adapts to actual silence duration (not fixed)
263
+
264
+ Preserves all internal structure and silence between sounds.
265
+ Perfect for periodic sounds (clock ticks, footsteps, typing).
266
+
267
+ Args:
268
+ audio: Input audio segment
269
+ regions: List of (start_ms, end_ms) tuples for sound regions
270
+ min_silence_to_trim_ms: Only trim if edge silence is at least this long (default 100ms)
271
+ buffer_ratio: Keep this fraction of the silence as buffer (default 0.1 = 10%)
272
+ Example: 500ms silence -> keep max(200ms, 10% of 500ms) = 200ms buffer
273
+
274
+ Returns:
275
+ Audio segment with edges trimmed (or original if no significant silence)
276
+ """
277
+ if not regions:
278
+ # No sound detected - return original
279
+ return audio
280
+
281
+ # Find the overall sound boundaries (first sound start, last sound end)
282
+ first_sound_start_ms = regions[0][0]
283
+ last_sound_end_ms = regions[-1][1]
284
+ audio_duration_ms = len(audio)
285
+
286
+ # Calculate actual silence durations at edges
287
+ leading_silence_ms = first_sound_start_ms
288
+ trailing_silence_ms = audio_duration_ms - last_sound_end_ms
289
+
290
+ # Adaptive trimming: only trim if there's significant silence
291
+ # Keep a small percentage as buffer to avoid cutting transients
292
+ if leading_silence_ms >= min_silence_to_trim_ms:
293
+ buffer_ms = max(200, int(leading_silence_ms * buffer_ratio)) # At least 200ms buffer
294
+ trim_start_ms = max(0, first_sound_start_ms - buffer_ms)
295
+ else:
296
+ # Not enough silence to trim - keep from start
297
+ trim_start_ms = 0
298
+
299
+ if trailing_silence_ms >= min_silence_to_trim_ms:
300
+ buffer_ms = max(200, int(trailing_silence_ms * buffer_ratio))
301
+ trim_end_ms = min(audio_duration_ms, last_sound_end_ms + buffer_ms)
302
+ else:
303
+ # Not enough silence to trim - keep to end
304
+ trim_end_ms = audio_duration_ms
305
+
306
+ # Extract the edge-trimmed portion (internal structure preserved)
307
+ trimmed_audio = audio[trim_start_ms:trim_end_ms]
308
+
309
+ logger.debug(
310
+ f"Edge trim: {audio_duration_ms}ms -> {len(trimmed_audio)}ms "
311
+ f"(leading_silence={leading_silence_ms}ms, trailing_silence={trailing_silence_ms}ms, "
312
+ f"trim_start={trim_start_ms}ms, trim_end={trim_end_ms}ms)"
313
+ )
314
+
315
+ return trimmed_audio
316
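+ # Worked example of the adaptive trim (numbers hypothetical): a 5000ms clip
+ # whose first region starts at 800ms and last region ends at 4600ms:
+ #   leading_silence_ms  = 800 -> buffer = max(200, 80) = 200 -> trim_start = 600
+ #   trailing_silence_ms = 400 -> buffer = max(200, 40) = 200 -> trim_end  = 4800
+ #   result: audio[600:4800]; all internal gaps are preserved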
+
317
+
318
+ def extract_all_sound_regions(
319
+ audio: AudioSegment,
320
+ regions: List[Tuple[int, int]],
321
+ crossfade_ms: int = 10,
322
+ padding_ms: int = 20
323
+ ) -> AudioSegment:
324
+ """
325
+ Extract ALL sound portions and join them, removing ALL silence.
326
+
327
+ WARNING: This destroys natural periodicity! Use extract_sound_with_edges_trimmed() instead
328
+ for most use cases. This function is kept for backward compatibility.
329
+
330
+ Args:
331
+ audio: Input audio segment
332
+ regions: List of (start_ms, end_ms) tuples for sound regions
333
+ crossfade_ms: Crossfade duration when joining regions
334
+ padding_ms: Padding around each region to avoid cutting transients
335
+
336
+ Returns:
337
+ Audio segment containing only sound portions (internal silence removed)
338
+ """
339
+ if not regions:
340
+ return audio
341
+
342
+ # Extract each region
343
+ extracted_parts = []
344
+ for start_ms, end_ms in regions:
345
+ # Add padding to avoid cutting off transients
346
+ padded_start = max(0, start_ms - padding_ms)
347
+ padded_end = min(len(audio), end_ms + padding_ms)
348
+ part = audio[padded_start:padded_end]
349
+ extracted_parts.append(part)
350
+
351
+ # Concatenate with crossfade
352
+ if len(extracted_parts) == 1:
353
+ return extracted_parts[0]
354
+
355
+ result = extracted_parts[0]
356
+ for part in extracted_parts[1:]:
357
+ if len(result) > crossfade_ms and len(part) > crossfade_ms:
358
+ result = result.append(part, crossfade=crossfade_ms)
359
+ else:
360
+ result = result + part
361
+
362
+ return result
363
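+ # Illustrative (values hypothetical): regions [(0, 1000), (3000, 4000)] on a
+ # 5000ms clip yield two padded parts (1020ms and 1040ms) joined with a 10ms
+ # crossfade -> ~2050ms, with the 2s internal gap removed (hence the warning).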
+
364
+
365
+ def process_esc50_dataset(
366
+ audio_dir: str,
367
+ metadata_path: str,
368
+ output_dir: str,
369
+ threshold_db: float = -40.0,
370
+ min_sound_duration_ms: int = 50,
371
+ save_trimmed_audio: bool = True,
372
+ threshold_strategy: str = 'noise_floor',
373
+ noise_floor_percentile: float = 10.0,
374
+ noise_floor_delta_db: float = 15.0
375
+ ) -> pd.DataFrame:
376
+ """
377
+ Process entire ESC-50 dataset and compute effective durations.
378
+
379
+ Uses ADAPTIVE EDGE-ONLY trimming to preserve natural periodicity of sounds.
380
+ Only leading and trailing silence is removed IF significant (>=100ms).
381
+ Trimming is adaptive: keeps a small percentage of silence as buffer for transients.
382
+ All internal structure is preserved.
383
+
384
+ Supports two threshold strategies for adaptive per-clip thresholding:
385
+ - 'peak_relative': threshold = peak_db + threshold_db (fixed offset from peak)
386
+ - 'noise_floor': threshold = percentile(db, p) + delta (adapts to noise floor)
387
+
388
+ Args:
389
+ audio_dir: Path to ESC-50 audio directory
390
+ metadata_path: Path to ESC-50 metadata CSV
391
+ output_dir: Output directory for processed files
392
+ threshold_db: dB threshold for silence detection (peak_relative mode)
393
+ min_sound_duration_ms: Minimum sound duration to keep
394
+ save_trimmed_audio: Whether to save trimmed audio files
395
+ threshold_strategy: 'peak_relative' or 'noise_floor' (recommended)
396
+ noise_floor_percentile: Percentile for noise floor estimation (default 10)
397
+ noise_floor_delta_db: dB above noise floor to set threshold (default 15)
398
+
399
+ Returns:
400
+ DataFrame with processed metadata
401
+ """
402
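+ # Threshold semantics, restating the two strategies above (the frame-level
+ # details live in detect_sound_regions in utils/audio_utils.py):
+ #   peak_relative: cutoff = peak_db + threshold_db            # threshold_db is negative
+ #   noise_floor:   cutoff = percentile(db, 10.0) + 15.0       # defaults shown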
+ # Load original metadata
403
+ original_metadata = pd.read_csv(metadata_path)
404
+ logger.info(f"Loaded metadata for {len(original_metadata)} clips")
405
+
406
+ # Create output directories
407
+ output_path = Path(output_dir)
408
+ output_path.mkdir(parents=True, exist_ok=True)
409
+
410
+ if save_trimmed_audio:
411
+ trimmed_audio_dir = output_path / "trimmed_audio"
412
+ trimmed_audio_dir.mkdir(exist_ok=True)
413
+
414
+ # Process each audio file
415
+ results = []
416
+
417
+ for _, row in tqdm(original_metadata.iterrows(), total=len(original_metadata),
418
+ desc="Processing ESC-50 clips"):
419
+ filename = row['filename']
420
+ category = row['category']
421
+ audio_path = Path(audio_dir) / filename
422
+
423
+ try:
424
+ # Load audio
425
+ audio = AudioSegment.from_file(str(audio_path), format="wav")
426
+ raw_duration_s = len(audio) / 1000.0
427
+
428
+ # Detect sound regions (using adaptive threshold)
429
+ regions = get_sound_regions(
430
+ audio,
431
+ threshold_db=threshold_db,
432
+ min_sound_duration_ms=min_sound_duration_ms,
433
+ threshold_strategy=threshold_strategy,
434
+ noise_floor_percentile=noise_floor_percentile,
435
+ noise_floor_delta_db=noise_floor_delta_db
436
+ )
437
+
438
+ # Trim edges only (leftmost and rightmost silence)
439
+ # Adaptive trimming: only trims if silence >= 100ms, keeps 10% as buffer
440
+ trimmed_audio = extract_sound_with_edges_trimmed(audio, regions)
441
+ final_duration_s = len(trimmed_audio) / 1000.0
442
+
443
+ # Calculate peak amplitude and RMS from trimmed audio
444
+ samples = get_amplitude_array(trimmed_audio)
445
+ peak_amplitude = np.max(np.abs(samples))
446
+ peak_amplitude_db = 20 * np.log10(peak_amplitude + 1e-10)
447
+ rms = np.sqrt(np.mean(samples ** 2))
448
+ avg_rms_db = 20 * np.log10(rms + 1e-10)
449
+
450
+ # Calculate effective duration (sum of sound regions)
451
+ effective_duration_s = sum(end - start for start, end in regions) / 1000.0 if regions else final_duration_s
452
+
453
+ # Save trimmed audio
454
+ trimmed_filename = None
455
+ if save_trimmed_audio:
456
+ trimmed_filename = filename
457
+ trimmed_path = trimmed_audio_dir / trimmed_filename
458
+ trimmed_audio.export(str(trimmed_path), format="wav")
459
+
460
+ # Store results
461
+ results.append({
462
+ 'filename': filename,
463
+ 'category': category,
464
+ 'fold': row['fold'],
465
+ 'target': row['target'],
466
+ 'esc10': row['esc10'],
467
+ 'raw_duration_s': round(raw_duration_s, 4),
468
+ 'final_duration_s': round(final_duration_s, 4),
469
+ 'effective_duration_s': round(effective_duration_s, 4),
470
+ 'num_sound_regions': len(regions),
471
+ 'peak_amplitude_db': round(peak_amplitude_db, 2),
472
+ 'avg_rms_db': round(avg_rms_db, 2),
473
+ 'trimmed_filename': trimmed_filename,  # already None when not saving
474
+ 'threshold_strategy': threshold_strategy,
475
+ 'threshold_db_used': threshold_db if threshold_strategy == 'peak_relative' else None,
476
+ 'noise_floor_percentile': noise_floor_percentile if threshold_strategy == 'noise_floor' else None,
477
+ 'noise_floor_delta_db': noise_floor_delta_db if threshold_strategy == 'noise_floor' else None,
478
+ 'min_sound_duration_ms_used': min_sound_duration_ms
479
+ })
480
+
481
+ except Exception as e:
482
+ logger.error(f"Error processing {filename}: {e}")
483
+ results.append({
484
+ 'filename': filename,
485
+ 'category': category,
486
+ 'fold': row['fold'],
487
+ 'target': row['target'],
488
+ 'esc10': row['esc10'],
489
+ 'raw_duration_s': None,
490
+ 'final_duration_s': None,
491
+ 'effective_duration_s': None,
492
+ 'num_sound_regions': 0,
493
+ 'peak_amplitude_db': None,
494
+ 'avg_rms_db': None,
495
+ 'trimmed_filename': None,
496
+ 'threshold_strategy': threshold_strategy,
497
+ 'threshold_db_used': threshold_db if threshold_strategy == 'peak_relative' else None,
498
+ 'noise_floor_percentile': noise_floor_percentile if threshold_strategy == 'noise_floor' else None,
499
+ 'noise_floor_delta_db': noise_floor_delta_db if threshold_strategy == 'noise_floor' else None,
500
+ 'min_sound_duration_ms_used': min_sound_duration_ms,
501
+ 'error': str(e)
502
+ })
503
+
504
+ # Create DataFrame
505
+ results_df = pd.DataFrame(results)
506
+
507
+ # Save CSV
508
+ csv_path = output_path / "effective_durations.csv"
509
+ results_df.to_csv(csv_path, index=False)
510
+ logger.info(f"Saved effective durations to {csv_path}")
511
+
512
+ # Print summary statistics
513
+ print_summary_statistics(results_df)
514
+
515
+ return results_df
516
+
517
+
518
+ def print_summary_statistics(df: pd.DataFrame):
519
+ """Print summary statistics of the processed dataset."""
520
+ print("\n" + "=" * 60)
521
+ print("ESC-50 Preprocessing Summary")
522
+ print("=" * 60)
523
+
524
+ # Filter out errors
525
+ valid_df = df[df['effective_duration_s'].notna()]
526
+
527
+ print(f"\nTotal clips processed: {len(df)}")
528
+ print(f"Successfully processed: {len(valid_df)}")
529
+ print(f"Errors: {len(df) - len(valid_df)}")
530
+
531
+ print(f"\nRaw duration statistics:")
532
+ print(f" Mean: {valid_df['raw_duration_s'].mean():.3f}s")
533
+ print(f" Std: {valid_df['raw_duration_s'].std():.3f}s")
534
+ print(f" Min: {valid_df['raw_duration_s'].min():.3f}s")
535
+ print(f" Max: {valid_df['raw_duration_s'].max():.3f}s")
536
+
537
+ print(f"\nFinal duration statistics (edges trimmed, internal structure preserved):")
538
+ print(f" Mean: {valid_df['final_duration_s'].mean():.3f}s")
539
+ print(f" Std: {valid_df['final_duration_s'].std():.3f}s")
540
+ print(f" Min: {valid_df['final_duration_s'].min():.3f}s")
541
+ print(f" Max: {valid_df['final_duration_s'].max():.3f}s")
542
+
543
+ print(f"\nEffective duration statistics (sum of sound regions only):")
544
+ print(f" Mean: {valid_df['effective_duration_s'].mean():.3f}s")
545
+ print(f" Std: {valid_df['effective_duration_s'].std():.3f}s")
546
+ print(f" Min: {valid_df['effective_duration_s'].min():.3f}s")
547
+ print(f" Max: {valid_df['effective_duration_s'].max():.3f}s")
548
+
549
+ # Compare effective vs final
550
+ print(f"\nComparison (final includes internal silences):")
551
+ print(f" Avg effective: {valid_df['effective_duration_s'].mean():.3f}s")
552
+ print(f" Avg final: {valid_df['final_duration_s'].mean():.3f}s")
553
+ print(f" Difference: {valid_df['final_duration_s'].mean() - valid_df['effective_duration_s'].mean():.3f}s (internal silences)")
554
+
555
+ # Duration reduction
556
+ reduction = (1 - valid_df['final_duration_s'].mean() / valid_df['raw_duration_s'].mean()) * 100
557
+ print(f"\nAverage edge trimming reduction: {reduction:.1f}%")
558
+
559
+ # Per-category statistics
560
+ print("\nEffective duration by category (top 10 longest):")
561
+ category_stats = valid_df.groupby('category')['effective_duration_s'].agg(['mean', 'std', 'min', 'max'])
562
+ category_stats = category_stats.sort_values('mean', ascending=False)
563
+ print(category_stats.head(10).to_string())
564
+
565
+ print("\nEffective duration by category (top 10 shortest):")
566
+ print(category_stats.tail(10).to_string())
567
+
568
+ print("\n" + "=" * 60)
569
+
570
+
571
+ def load_config(config_path: str) -> dict:
572
+ """Load configuration from YAML file."""
573
+ import yaml
574
+ with open(config_path, 'r') as f:
575
+ return yaml.safe_load(f)
576
+
577
+
578
+ def main():
579
+ parser = argparse.ArgumentParser(
580
+ description="Preprocess ESC-50 dataset for duration task"
581
+ )
582
+ parser.add_argument(
583
+ '--config', '-c',
584
+ type=str,
585
+ default='config.yaml',
586
+ help='Path to configuration file'
587
+ )
588
+ parser.add_argument(
589
+ '--threshold-db',
590
+ type=float,
591
+ default=None,
592
+ help='dB threshold below peak for silence detection (default: -40)'
593
+ )
594
+ parser.add_argument(
595
+ '--min-sound-ms',
596
+ type=int,
597
+ default=None,
598
+ help='Minimum sound duration in ms to keep (default: 50)'
599
+ )
600
+ parser.add_argument(
601
+ '--output-dir',
602
+ type=str,
603
+ default=None,
604
+ help='Output directory (default: from config or ESC-50_preprocessed)'
605
+ )
606
+ parser.add_argument(
607
+ '--no-trimmed-audio',
608
+ action='store_true',
609
+ help='Do not save trimmed audio files (only save CSV)'
610
+ )
611
+ parser.add_argument(
612
+ '--threshold-strategy',
613
+ type=str,
614
+ choices=['peak_relative', 'noise_floor'],
615
+ default=None,
616
+ help='Threshold strategy: peak_relative (old) or noise_floor (adaptive, recommended)'
617
+ )
618
+ parser.add_argument(
619
+ '--noise-floor-percentile',
620
+ type=float,
621
+ default=None,
622
+ help='Percentile for noise floor estimation (default: 10)'
623
+ )
624
+ parser.add_argument(
625
+ '--noise-floor-delta-db',
626
+ type=float,
627
+ default=None,
628
+ help='dB above noise floor to set threshold (default: 15)'
629
+ )
630
+
631
+ args = parser.parse_args()
632
+
633
+ # Load config
634
+ config = load_config(args.config)
635
+
636
+ # Get ESC-50 paths from config
637
+ esc50_config = config.get('esc50', {})
638
+ audio_dir = esc50_config.get('audio_path', '/home/debarpanb1/ESC-50_github/audio')
639
+ metadata_path = esc50_config.get('metadata_path', '/home/debarpanb1/ESC-50_github/meta/esc50.csv')
640
+
641
+ # Get duration task config for preprocessing parameters
642
+ duration_config = config.get('tasks', {}).get('duration', {})
643
+
644
+ # Determine threshold and min sound duration
645
+ threshold_db = args.threshold_db
646
+ if threshold_db is None:
647
+ threshold_db = duration_config.get('amplitude_threshold_db', -40.0)
648
+
649
+ min_sound_ms = args.min_sound_ms
650
+ if min_sound_ms is None:
651
+ min_sound_ms = duration_config.get('min_sound_duration_ms', 50)
652
+
653
+ # Determine output directory
654
+ output_dir = args.output_dir
655
+ if output_dir is None:
656
+ output_dir = duration_config.get(
657
+ 'preprocessed_data_path',
658
+ '/home/debarpanb1/TREA_2.0/ESC-50_preprocessed'
659
+ )
660
+
661
+ # Determine threshold strategy (noise_floor is recommended/default)
662
+ threshold_strategy = args.threshold_strategy
663
+ if threshold_strategy is None:
664
+ threshold_strategy = duration_config.get('threshold_strategy', 'noise_floor')
665
+
666
+ # Determine noise floor percentile
667
+ noise_floor_percentile = args.noise_floor_percentile
668
+ if noise_floor_percentile is None:
669
+ noise_floor_percentile = duration_config.get('noise_floor_percentile', 10.0)
670
+
671
+ # Determine noise floor delta dB
672
+ noise_floor_delta_db = args.noise_floor_delta_db
673
+ if noise_floor_delta_db is None:
674
+ noise_floor_delta_db = duration_config.get('noise_floor_delta_db', 15.0)
675
+
676
+ # Log configuration
677
+ logger.info("=" * 60)
678
+ logger.info("ESC-50 Preprocessing Configuration")
679
+ logger.info("=" * 60)
680
+ logger.info(f"Audio directory: {audio_dir}")
681
+ logger.info(f"Metadata path: {metadata_path}")
682
+ logger.info(f"Output directory: {output_dir}")
683
+ logger.info(f"Threshold strategy: {threshold_strategy}")
684
+ if threshold_strategy == 'peak_relative':
685
+ logger.info(f" Peak-relative threshold dB: {threshold_db}")
686
+ else:
687
+ logger.info(f" Noise floor percentile: {noise_floor_percentile}")
688
+ logger.info(f" Noise floor delta dB: {noise_floor_delta_db}")
689
+ logger.info(f"Min sound duration (ms): {min_sound_ms}")
690
+ logger.info(f"Adaptive edge trimming: only if silence >= 100ms, keep 10% buffer")
691
+ logger.info(f"Save trimmed audio: {not args.no_trimmed_audio}")
692
+ logger.info("=" * 60)
693
+
694
+ # Process dataset
695
+ results_df = process_esc50_dataset(
696
+ audio_dir=audio_dir,
697
+ metadata_path=metadata_path,
698
+ output_dir=output_dir,
699
+ threshold_db=threshold_db,
700
+ min_sound_duration_ms=min_sound_ms,
701
+ save_trimmed_audio=not args.no_trimmed_audio,
702
+ threshold_strategy=threshold_strategy,
703
+ noise_floor_percentile=noise_floor_percentile,
704
+ noise_floor_delta_db=noise_floor_delta_db
705
+ )
706
+
707
+ logger.info(f"\nPreprocessing complete!")
708
+ logger.info(f"Results saved to: {output_dir}")
709
+
710
+ return results_df
711
+
712
+
713
+ if __name__ == "__main__":
714
+ main()
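
Illustrative programmatic use of the preprocessor (a sketch; the paths are
hypothetical and would normally come from config.yaml):

    from preprocess_esc50 import process_esc50_dataset

    df = process_esc50_dataset(
        audio_dir="ESC-50/audio",
        metadata_path="ESC-50/meta/esc50.csv",
        output_dir="ESC-50_preprocessed",
        threshold_strategy="noise_floor",   # adaptive per-clip threshold
    )
    print(df[["filename", "effective_duration_s"]].head())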
requirements.txt ADDED
@@ -0,0 +1,6 @@
1
+ pyyaml
2
+ pandas
3
+ pydub
4
+ numpy
5
+ pyloudnorm
6
+
run_llm_answers_all.sh ADDED
@@ -0,0 +1,28 @@
1
+ #!/usr/bin/env bash
2
+ # Run llm_answer_generator.py across dataset folders and tasks
3
+ # Processes the open_text CSVs for tasks: count, duration, order, volume
4
+
5
+ set -euo pipefail
6
+ export CUDA_VISIBLE_DEVICES=7
7
+ PY_SCRIPT="$(dirname "$0")/llm_answer_generator.py"
8
+ BASE_DIR="$(dirname "$0")"
9
+
10
+ DATA_SPLITS=(train validation test_large test_ood)
11
+ TASKS=(count duration order volume)
12
+
13
+ echo "Running LLM answer generation script across splits: ${DATA_SPLITS[*]} and tasks: ${TASKS[*]}"
14
+
15
+ for split in "${DATA_SPLITS[@]}"; do
16
+ for task in "${TASKS[@]}"; do
17
+ # open_text file
18
+ ot_csv="${BASE_DIR}/dataset_v2/${split}/${task}/${task}_open_text.csv"
19
+ if [ -f "${ot_csv}" ]; then
20
+ echo "[OPEN_TEXT] Processing ${ot_csv}"
21
+ python "${PY_SCRIPT}" --input "${ot_csv}" --mode open_text --task "${task}"
22
+ else
23
+ echo "[OPEN_TEXT] Not found: ${ot_csv}"
24
+ fi
25
+ done
26
+ done
27
+
28
+ echo "All tasks processed."
run_pipeline.sh ADDED
@@ -0,0 +1,166 @@
1
+ #!/bin/bash
2
+
3
+ ################################################################################
4
+ # Temporal Reasoning Audio Dataset Generation Pipeline
5
+ #
6
+ # This script orchestrates the entire dataset creation process for all tasks.
7
+ ################################################################################
8
+
9
+ set -e # Exit on error
10
+
11
+ # Default configuration
12
+ CONFIG_FILE="config.yaml"
13
+ OUTPUT_DIR=""
14
+ TASKS=""
15
+ PYTHON_CMD="python"
16
+
17
+ # Colors for output
18
+ RED='\033[0;31m'
19
+ GREEN='\033[0;32m'
20
+ YELLOW='\033[1;33m'
21
+ BLUE='\033[0;34m'
22
+ NC='\033[0m' # No Color
23
+
24
+ # Function to print colored messages
25
+ print_info() {
26
+ echo -e "${BLUE}[INFO]${NC} $1"
27
+ }
28
+
29
+ print_success() {
30
+ echo -e "${GREEN}[SUCCESS]${NC} $1"
31
+ }
32
+
33
+ print_warning() {
34
+ echo -e "${YELLOW}[WARNING]${NC} $1"
35
+ }
36
+
37
+ print_error() {
38
+ echo -e "${RED}[ERROR]${NC} $1"
39
+ }
40
+
41
+ # Function to print usage
42
+ usage() {
43
+ cat << EOF
44
+ Usage: $0 [OPTIONS]
45
+
46
+ Temporal Reasoning Audio Dataset Generation Pipeline
47
+
48
+ OPTIONS:
49
+ -c, --config FILE Configuration file (default: config.yaml)
50
+ -o, --output DIR Output directory (overrides config)
51
+ -t, --tasks TASKS Specific tasks to run: count,duration,order,volume
52
+ (default: all enabled tasks)
53
+ -p, --python CMD Python command to use (default: python)
54
+ -h, --help Display this help message
55
+
56
+ EXAMPLES:
57
+ # Run all tasks with default config
58
+ $0
59
+
60
+ # Run with custom config
61
+ $0 --config my_config.yaml
62
+
63
+ # Run specific tasks only
64
+ $0 --tasks count,duration
65
+
66
+ # Use custom output directory
67
+ $0 --output /path/to/output
68
+
69
+ # Combine options
70
+ $0 --config custom.yaml --tasks count,order --output ./my_dataset
71
+
72
+ EOF
73
+ }
74
+
75
+ # Parse command line arguments
76
+ while [[ $# -gt 0 ]]; do
77
+ case $1 in
78
+ -c|--config)
79
+ CONFIG_FILE="$2"
80
+ shift 2
81
+ ;;
82
+ -o|--output)
83
+ OUTPUT_DIR="$2"
84
+ shift 2
85
+ ;;
86
+ -t|--tasks)
87
+ TASKS="$2"
88
+ shift 2
89
+ ;;
90
+ -p|--python)
91
+ PYTHON_CMD="$2"
92
+ shift 2
93
+ ;;
94
+ -h|--help)
95
+ usage
96
+ exit 0
97
+ ;;
98
+ *)
99
+ print_error "Unknown option: $1"
100
+ usage
101
+ exit 1
102
+ ;;
103
+ esac
104
+ done
105
+
106
+ # Get script directory
107
+ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
108
+
109
+ # Check if config file exists
110
+ if [ ! -f "$SCRIPT_DIR/$CONFIG_FILE" ]; then
111
+ print_error "Config file not found: $CONFIG_FILE"
112
+ exit 1
113
+ fi
114
+
115
+ # Print header
116
+ echo ""
117
+ echo "================================================================================"
118
+ echo " TEMPORAL REASONING AUDIO DATASET GENERATION PIPELINE"
119
+ echo "================================================================================"
120
+ echo ""
121
+ print_info "Configuration: $CONFIG_FILE"
122
+ print_info "Python command: $PYTHON_CMD"
123
+ [ -n "$OUTPUT_DIR" ] && print_info "Output directory: $OUTPUT_DIR"
124
+ [ -n "$TASKS" ] && print_info "Tasks to run: $TASKS"
125
+ echo ""
126
+
127
+ # Check Python dependencies
128
+ print_info "Checking Python dependencies..."
129
+ # Guard with `if !` so the error message is reachable under `set -e`
130
+ if ! $PYTHON_CMD -c "import yaml, pandas, pydub" 2>/dev/null; then
131
+ print_error "Missing required Python packages. Please install:"
132
+ echo " pip install pyyaml pandas pydub"
133
+ exit 1
134
+ fi
135
+ print_success "Dependencies OK"
136
+ echo ""
137
+
138
+ # Build Python command arguments
139
+ PYTHON_ARGS="$SCRIPT_DIR/main.py --config $SCRIPT_DIR/$CONFIG_FILE"
140
+ [ -n "$OUTPUT_DIR" ] && PYTHON_ARGS="$PYTHON_ARGS --output $OUTPUT_DIR"
141
+ if [ -n "$TASKS" ]; then
142
+ # Convert comma-separated to space-separated for Python argparse
143
+ TASKS_SPACE=$(echo $TASKS | tr ',' ' ')
144
+ PYTHON_ARGS="$PYTHON_ARGS --tasks $TASKS_SPACE"
145
+ fi
146
+
147
+ # Run the pipeline
148
+ print_info "Starting pipeline..."
149
+ echo ""
150
+
151
+ # Run inside the `if` itself so the failure branch survives `set -e`
152
+
153
+ if $PYTHON_CMD $PYTHON_ARGS; then
154
+ echo ""
155
+ echo "================================================================================"
156
+ print_success "PIPELINE COMPLETED SUCCESSFULLY!"
157
+ echo "================================================================================"
158
+ echo ""
159
+ else
160
+ echo ""
161
+ echo "================================================================================"
162
+ print_error "PIPELINE FAILED!"
163
+ echo "================================================================================"
164
+ echo ""
165
+ exit 1
166
+ fi
synthetic_silences/silent_1.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed8ddf138c2c59409bb4f1dbbf3fc910b486752b0c389dbb5dac6a4e68b8cbe5
3
+ size 263052
synthetic_silences/silent_10.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35fab767d2262eb552485542c6e593a5d84b7080862c577b23c11385176c7767
3
+ size 274840
synthetic_silences/silent_11.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b01397619b480a22261daa7b018b59b5fd1baf1e3d4ed81161908def25112f17
3
+ size 324418
synthetic_silences/silent_12.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2da9f4814fd0c6d50aa68696079c8d0ee880ed37d583e88a20481fd88c54e612
3
+ size 310108
synthetic_silences/silent_13.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5999933e975cd5846ac152bf888a954ea9243fa2218429998d96ceffac54a7e0
3
+ size 121474
synthetic_silences/silent_14.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42c8b935bd521534635cc4fea040023dbf420084b51d1e3529953d5d1593df48
3
+ size 209182
synthetic_silences/silent_15.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00529829944fd650a368d6fe65e25a7f3d25d8d4ba932712b35dfa5608380c3e
3
+ size 160682
synthetic_silences/silent_16.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:469eb34930878ba69a3994da5c2160314ce0c8bf0157d83f4ad349052a0c197b
3
+ size 112534
synthetic_silences/silent_17.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d788262618a55e51d12b0c2220ced172c0edf9072569ab010d48adc01607215
3
+ size 165986
synthetic_silences/silent_18.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83b6ef068680eacd83ac3d0b2f282fb37e2f4f018b03e89ab9a129aeac27a054
3
+ size 257330
synthetic_silences/silent_19.wav ADDED
Binary file (96.9 kB). View file
 
synthetic_silences/silent_2.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01bbeb6e0c14200b30be0eb57484450ba5807954333fede2e4c59d32a7042eaf
3
+ size 310850
synthetic_silences/silent_20.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cda1e6a66b8cca7fc408f90cb6b8e8c13294fc33e8735a23dd72f1d36f9a991b
3
+ size 140232
synthetic_silences/silent_3.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d49fadd497b9af43be5afbb08070a6317000f15edc8924bf3c11b3fcbb140616
3
+ size 227846
synthetic_silences/silent_4.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:148c743ea43d3528a53395f579d4d337512de9d1fb3c5d5b66e55f3a5e9c4d0c
3
+ size 337068
synthetic_silences/silent_5.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51976ce15c0272f14125acaa5529a88d6f085ce153ef64bdc662586e97cb5678
3
+ size 205426
synthetic_silences/silent_6.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dda249ab269984ae15d0a78b582455c053b1cddafb78c792cafbcbf3f682a087
3
+ size 329056
synthetic_silences/silent_7.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27156909b1191624cff0b0478477f8c40e47581bbb0be24a84e9113bf88f36a1
3
+ size 146876
synthetic_silences/silent_8.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:815a88cc01def086ca4dc23c41359eea297ec39c179b114e6e608d27bd2d9a39
3
+ size 216452
synthetic_silences/silent_9.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f88a22998b27a0e18e1801aa63ef4b83315c243762478c9cf149db4338ebafdb
3
+ size 307884
tasks/__pycache__/task_count.cpython-312.pyc ADDED
Binary file (19.7 kB). View file
 
tasks/__pycache__/task_duration.cpython-312.pyc ADDED
Binary file (30.9 kB). View file
 
tasks/__pycache__/task_order.cpython-312.pyc ADDED
Binary file (23.7 kB). View file
 
tasks/__pycache__/task_volume.cpython-312.pyc ADDED
Binary file (27.7 kB). View file
 
tasks/task_count.py ADDED
@@ -0,0 +1,472 @@
1
+ """
2
+ Task 1: Count - Generate counting questions
3
+
4
+ This task joins multiple audio sources and asks questions about counting
5
+ the number of unique sound sources in the audio.
6
+ """
7
+
8
+ import csv
9
+ import random
10
+ from pathlib import Path
11
+ from typing import Dict, List
12
+
13
+ import sys
14
+ sys.path.append(str(Path(__file__).parent.parent))
15
+
16
+ from utils import (
17
+ AudioProcessor, ESC50Dataset, QuestionGenerator, LLMQuestionGenerator,
18
+ setup_logger, set_random_seed, generate_sample_durations_for_task,
19
+ generate_single_clip_duration, build_count_task_audio,
20
+ get_max_clip_num_to_be_joined
21
+ )
22
+
23
+
24
+ class CountTaskGenerator:
25
+ """Generator for counting task dataset."""
26
+
27
+ def __init__(self, config: Dict, logger):
28
+ """
29
+ Initialize count task generator.
30
+
31
+ Args:
32
+ config: Configuration dictionary
33
+ logger: Logger instance
34
+ """
35
+ self.config = config
36
+ self.logger = logger
37
+ self.task_config = config['tasks']['count']
38
+
39
+ # Initialize components
40
+ self.dataset = ESC50Dataset(
41
+ config['esc50']['metadata_path'],
42
+ config['esc50']['audio_path'],
43
+ config # Pass config for class subset loading
44
+ )
45
+ self.audio_processor = AudioProcessor(
46
+ crossfade_duration=config['audio']['crossfade_duration'],
47
+ silence_duration=config['audio']['silence_duration'],
48
+ with_silence=config['audio']['with_silence'],
49
+ normalize=config['audio']['normalize'],
50
+ normalize_target_dBFS=config['audio']['normalize_target_dBFS'],
51
+ synthetic_silence_path=config['synthetic_silence']['path']
52
+ )
53
+ self.question_generator = QuestionGenerator(
54
+ num_options=config['mcq']['num_options'],
55
+ option_labels=config['mcq']['option_labels'],
56
+ distractor_strategy=config['mcq']['distractor_strategy']
57
+ )
58
+
59
+ # Initialize LLM question generator
60
+ self.llm_enabled = config.get('llm', {}).get('enabled', False)
61
+ self.llm_generator = LLMQuestionGenerator(
62
+ enabled=self.llm_enabled,
63
+ template_questions=self.task_config
64
+ )
65
+ if self.llm_enabled:
66
+ logger.info("LLM question generation enabled (local Llama 3.1 8B)")
67
+ else:
68
+ logger.info("Using template-based question generation")
69
+
70
+ # Duration settings from config
71
+ self.min_clip_duration = config['audio']['min_clip_duration']
72
+ self.max_clip_duration = config['audio']['max_clip_duration']
73
+ self.source_clip_duration = config['audio'].get('source_clip_duration', 5.0)
74
+ self.min_silence_ms = config['audio'].get('min_silence_duration', 100)
75
+ self.max_extra_silence_per_gap_ms = config['audio'].get('max_extra_silence_per_gap', 500)
76
+ # Small crossfade within same-source repetitions (for consecutive mode)
77
+ self.crossfade_within_source_ms = config['audio'].get('crossfade_within_source', 50)
78
+ self.task_duration_hours = self.task_config['task_duration_size']
79
+
80
+ # Ordering mode: "random" or "consecutive"
81
+ # random: Clips shuffled (A B A C B A C) - tests sound recognition
82
+ # consecutive: Same-source grouped (AAA BBB CCC) - easier
83
+ self.ordering_mode = self.task_config.get('ordering_mode', 'random')
84
+ logger.info(f"Count task ordering mode: {self.ordering_mode}")
85
+
86
+ # Set up output paths
87
+ self.output_base = Path(config['output']['base_path']) / 'count'
88
+ self.output_base.mkdir(parents=True, exist_ok=True)
89
+ self.audio_output = self.output_base / 'audios'
90
+ self.audio_output.mkdir(parents=True, exist_ok=True)
91
+
92
+ def create_sampling_list(self, parent_list: List, n_sampling: int) -> List:
93
+ """
94
+ Sample elements from parent list with replacement.
95
+
96
+ Args:
97
+ parent_list: List to sample from
98
+ n_sampling: Number of samples
99
+
100
+ Returns:
101
+ List of sampled elements
102
+ """
103
+ return [random.choice(parent_list) for _ in range(n_sampling)]
104
+
105
+ def generate_sample(self, sample_id: int, target_unique_count: int = None, target_duration_seconds: float = None) -> Dict:
106
+ """
107
+ Generate a single count task sample.
108
+
109
+ Pipeline for COUNT task:
110
+ 1. Use pre-generated target duration (or generate if not provided)
111
+ 2. Calculate max clips that can fit
112
+ 3. Pick N unique classes (N <= max_clips, since each source needs at least 1 clip)
113
+ 4. For each class, sample one audio clip
114
+ 5. Calculate repetitions to fill target duration
115
+ 6. Based on ordering_mode:
116
+ - "random": Shuffle clips (A B A C B A C) - tests recognition
117
+ - "consecutive": Group same-class (AAA BBB CCC) - easier
118
+ 7. Insert silences between clips
119
+ 8. Distribute remainder as random extra silences
120
+
121
+ Args:
122
+ sample_id: Sample ID number
123
+ target_unique_count: Target number of unique sounds (for balanced distribution)
124
+ target_duration_seconds: Pre-generated target duration (from generate_sample_durations_for_task)
125
+
126
+ Returns:
127
+ Dictionary with sample metadata
128
+ """
129
+ # Use pre-generated duration or generate one (backward compatibility)
130
+ if target_duration_seconds is not None:
131
+ clip_duration_seconds = target_duration_seconds
132
+ else:
133
+ clip_duration_seconds = generate_single_clip_duration(
134
+ self.min_clip_duration,
135
+ self.max_clip_duration
136
+ )
137
+
138
+ # Calculate max clips that can fit in target duration
139
+ max_clips, remainder_seconds = get_max_clip_num_to_be_joined(
140
+ clip_duration_seconds,
141
+ self.source_clip_duration,
142
+ self.min_silence_ms
143
+ )
144
+
145
+ # Ensure at least 1 clip
146
+ max_clips = max(1, max_clips)
147
+
148
+ max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10)
149
+
150
+ # Calculate valid range: n_unique_audios can be 1 to max_clips_per_sample
151
+ # but cannot exceed what physically fits or available categories
152
+ max_unique_for_sample = min(max_clips, max_clips_per_sample, len(self.dataset.CATEGORIES))
153
+
154
+ if max_unique_for_sample < 1:
155
+ raise ValueError(
156
+ f"Sample {sample_id}: Cannot generate sample - max_unique_for_sample={max_unique_for_sample}. "
157
+ f"max_clips={max_clips}, max_clips_per_sample={max_clips_per_sample}, "
158
+ f"available_categories={len(self.dataset.CATEGORIES)}, duration={clip_duration_seconds:.1f}s. "
159
+ f"Increase min_clip_duration or reduce max_clips_per_sample."
160
+ )
161
+
162
+ # Determine n_unique_audios - use target from balanced distribution or random
163
+ if target_unique_count is not None:
164
+ # Clamp target to what this specific sample duration can fit
165
+ # Short samples can't fit all possible answers, so we clamp down
166
+ n_unique_audios = min(target_unique_count, max_unique_for_sample)
167
+
168
+ if n_unique_audios != target_unique_count:
169
+ self.logger.debug(
170
+ f"Sample {sample_id}: Clamped target from {target_unique_count} to {n_unique_audios} "
171
+ f"(duration={clip_duration_seconds:.1f}s can only fit {max_clips} clips)"
172
+ )
173
+ else:
174
+ # No target specified - randomly select from valid range
175
+ n_unique_audios = random.randint(1, max_unique_for_sample)
176
+
177
+ self.logger.debug(
178
+ f"Sample {sample_id}: target={clip_duration_seconds:.1f}s, max_clips={max_clips}, "
179
+ f"n_unique_audios={n_unique_audios}"
180
+ )
181
+
182
+ # Sample unique categories - use least-used categories for balanced distribution
183
+ selected_categories = self.dataset.get_least_used_categories(n_unique_audios)
184
+
185
+ # Track usage of all selected categories
186
+ for cat in selected_categories:
187
+ self.dataset.category_usage_counts[cat] += 1
188
+
189
+ # Sample one file from each unique category
190
+ source_files = []
191
+ source_paths = []
192
+ source_categories = []
193
+
194
+ for category in selected_categories:
195
+ filename, filepath = self.dataset.sample_file_from_category(category)
196
+ source_files.append(filename)
197
+ source_paths.append(filepath)
198
+ source_categories.append(category)
199
+
200
+ # Load unique source audios
201
+ source_audios = []
202
+ for file_path in source_paths:
203
+ audio = self.audio_processor.load_audio(file_path)
204
+ source_audios.append(audio)
205
+
206
+ # Build audio using configured ordering mode
207
+ final_audio, clip_sequence, build_metadata = build_count_task_audio(
208
+ source_audios,
209
+ source_categories,
210
+ clip_duration_seconds,
211
+ ordering_mode=self.ordering_mode,
212
+ source_clip_duration_seconds=self.source_clip_duration,
213
+ min_silence_ms=self.min_silence_ms,
214
+ max_extra_silence_per_gap_ms=self.max_extra_silence_per_gap_ms,
215
+ crossfade_within_source_ms=self.crossfade_within_source_ms
216
+ )
217
+
218
+ # Save the audio
219
+ output_audio_path = self.audio_output / f"{sample_id}.wav"
220
+ final_audio.export(str(output_audio_path), format="wav")
221
+
222
+ # Generate questions (using LLM if enabled)
223
+ if self.llm_enabled and self.llm_generator:
224
+ llm_questions = self.llm_generator.generate_count_questions(
225
+ correct_count=n_unique_audios,
226
+ categories_present=list(set(clip_sequence))
227
+ )
228
+ mcq_question_text = llm_questions.get('mcq_question')
229
+ open_text_question_text = llm_questions.get('open_text_question')
230
+ else:
231
+ mcq_question_text = random.choice(self.task_config['mcq_questions'])
232
+ open_text_question_text = random.choice(self.task_config['open_text_questions'])
233
+
234
+ # Generate MCQ with options
235
+ mcq_data = self.question_generator.generate_count_mcq(
236
+ mcq_question_text,
237
+ n_unique_audios,
238
+ self.dataset.CATEGORIES
239
+ )
240
+
241
+ # Generate open-text answer
242
+ open_text_data = self.question_generator.generate_count_open_text(
243
+ open_text_question_text,
244
+ n_unique_audios
245
+ )
246
+
247
+ # Create metadata
248
+ metadata = {
249
+ 'id': sample_id,
250
+ 'audio_path': str(output_audio_path.relative_to(self.output_base.parent)),
251
+ 'n_unique_sounds': n_unique_audios,
252
+ 'total_clips': build_metadata['total_clips'],
253
+ 'repetitions_per_source': build_metadata['repetitions_per_source'],
254
+ 'ordering_mode': self.ordering_mode,
255
+ 'source_files': source_files,
256
+ 'source_categories': source_categories,
257
+ 'clip_sequence': clip_sequence,
258
+ 'unique_categories': sorted(list(set(source_categories))),
259
+ 'target_duration_seconds': clip_duration_seconds,
260
+ 'actual_duration_seconds': len(final_audio) / 1000.0,
261
+ 'mcq_question': mcq_data['question'],
262
+ 'mcq_options': mcq_data['options'],
263
+ 'mcq_correct_answer': mcq_data['correct_answer'],
264
+ 'open_text_question': open_text_data['question'],
265
+ 'open_text_answer': open_text_data['correct_answer'],
266
+ 'llm_generated': self.llm_enabled
267
+ }
268
+
269
+ self.logger.info(
270
+ f"Generated count sample {sample_id}: {n_unique_audios} unique sounds, "
271
+ f"{build_metadata['total_clips']} clips, {len(final_audio)/1000:.1f}s"
272
+ )
273
+
274
+ return metadata
275
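+ # Example metadata shape (values hypothetical): with ordering_mode="random"
+ # and 3 unique sources, clip_sequence might be
+ # ['dog', 'rain', 'dog', 'siren', 'rain', 'dog'] with n_unique_sounds == 3.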
+
276
+ def generate_dataset(self) -> tuple:
277
+ """
278
+ Generate the complete count task dataset.
279
+
280
+ Returns:
281
+ Tuple of (mcq_csv_path, open_text_csv_path)
282
+ """
283
+ # Generate sample durations upfront to exactly fill target duration
284
+ sample_durations = generate_sample_durations_for_task(
285
+ self.task_duration_hours,
286
+ self.min_clip_duration,
287
+ self.max_clip_duration
288
+ )
289
+ num_samples = len(sample_durations)
290
+ self.logger.info(f"Generating {num_samples} count task samples (target: {self.task_duration_hours}h, actual: {sum(sample_durations)/3600:.2f}h)...")
291
+
292
+ # Calculate max clips each sample can fit based on duration
293
+ max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10)
294
+ sample_max_clips = []
295
+ for duration in sample_durations:
296
+ max_clips, _ = get_max_clip_num_to_be_joined(
297
+ duration,
298
+ self.source_clip_duration,
299
+ self.min_silence_ms
300
+ )
301
+ # Limit to config max and available categories
302
+ max_for_sample = min(max_clips, max_clips_per_sample, len(self.dataset.CATEGORIES))
303
+ sample_max_clips.append(max_for_sample)
304
+
305
+ # Create balanced distribution by assigning targets based on sample capacity
306
+ # Sort samples by capacity to assign higher targets to samples that can fit them
307
+ possible_answers = list(range(1, max_clips_per_sample + 1))
308
+ samples_per_answer = num_samples // len(possible_answers)
309
+ remainder = num_samples % len(possible_answers)
310
+
311
+ # Create list of (sample_idx, duration, max_clips_capacity)
312
+ sample_info = [(i, sample_durations[i], sample_max_clips[i]) for i in range(num_samples)]
313
+
314
+ # Sort by capacity (descending) - assign high targets to high-capacity samples
315
+ sample_info.sort(key=lambda x: x[2], reverse=True)
316
+
317
+ # Assign targets: distribute each answer count across samples
318
+ balanced_assignments = [None] * num_samples
319
+ assignment_pool = []
320
+
321
+ for answer in possible_answers:
322
+ count = samples_per_answer + (1 if remainder > 0 else 0)
323
+ assignment_pool.extend([answer] * count)
324
+ remainder = max(0, remainder - 1)
325
+
326
+ # Sort the pool descending so high targets are assigned first (to high-capacity samples)
327
+ assignment_pool.sort(reverse=True)
328
+
329
+ for idx, (sample_idx, duration, capacity) in enumerate(sample_info):
330
+ # Assign target, clamped to sample's capacity
331
+ target = min(assignment_pool[idx], capacity)
332
+ balanced_assignments[sample_idx] = target
333
+
334
+ # Log the actual distribution after capacity clamping
335
+ from collections import Counter
336
+ distribution = Counter(balanced_assignments)
337
+ self.logger.info(f"Balanced answer distribution (after capacity-aware assignment): {dict(sorted(distribution.items()))}")
338
+
339
+ all_metadata = []
340
+
341
+ for i in range(num_samples):
342
+ metadata = self.generate_sample(
343
+ i,
344
+ target_unique_count=balanced_assignments[i],
345
+ target_duration_seconds=sample_durations[i]
346
+ )
347
+ all_metadata.append(metadata)
348
+
349
+ # Save MCQ CSV
350
+ mcq_csv_path = self.output_base / 'count_mcq.csv'
351
+ self._save_mcq_csv(all_metadata, mcq_csv_path)
352
+
353
+ # Save open-text CSV
354
+ open_text_csv_path = self.output_base / 'count_open_text.csv'
355
+ self._save_open_text_csv(all_metadata, open_text_csv_path)
356
+
357
+ # Save metadata CSV
358
+ metadata_csv_path = self.output_base / 'count_metadata.csv'
359
+ self._save_metadata_csv(all_metadata, metadata_csv_path)
360
+
361
+ self.logger.info(f"Count task dataset generation complete!")
362
+ self.logger.info(f" - MCQ CSV: {mcq_csv_path}")
363
+ self.logger.info(f" - Open-text CSV: {open_text_csv_path}")
364
+ self.logger.info(f" - Metadata CSV: {metadata_csv_path}")
365
+ self.logger.info(f" - Audio files: {self.audio_output}")
366
+
367
+ return mcq_csv_path, open_text_csv_path
368
+
369
+ def _save_mcq_csv(self, metadata_list: List[Dict], output_path: Path):
370
+ """Save MCQ format CSV."""
371
+ with open(output_path, 'w', newline='') as f:
372
+ writer = csv.writer(f)
373
+ # Header
374
+ writer.writerow([
375
+ 'question', 'id', 'audio_path',
376
+ 'optionA', 'optionB', 'optionC', 'optionD',
377
+ 'correct', 'source_wavs', 'source_categories'
378
+ ])
379
+
380
+ # Data rows
381
+ for meta in metadata_list:
382
+ writer.writerow([
383
+ meta['mcq_question'],
384
+ meta['id'],
385
+ meta['audio_path'],
386
+ meta['mcq_options']['A'],
387
+ meta['mcq_options']['B'],
388
+ meta['mcq_options']['C'],
389
+ meta['mcq_options']['D'],
390
+ meta['mcq_correct_answer'],
391
+ str(meta['source_files']),
392
+ str(meta['unique_categories'])
393
+ ])
394
+
395
+ def _save_open_text_csv(self, metadata_list: List[Dict], output_path: Path):
396
+ """Save open-text format CSV."""
397
+ with open(output_path, 'w', newline='') as f:
398
+ writer = csv.writer(f)
399
+ # Header
400
+ writer.writerow([
401
+ 'question', 'id', 'audio_path', 'answer',
402
+ 'source_wavs', 'source_categories'
403
+ ])
404
+
405
+ # Data rows
406
+ for meta in metadata_list:
407
+ writer.writerow([
408
+ meta['open_text_question'],
409
+ meta['id'],
410
+ meta['audio_path'],
411
+ meta['open_text_answer'],
412
+ str(meta['source_files']),
413
+ str(meta['unique_categories'])
414
+ ])
415
+
416
+ def _save_metadata_csv(self, metadata_list: List[Dict], output_path: Path):
417
+ """Save detailed metadata CSV."""
418
+ with open(output_path, 'w', newline='') as f:
419
+ writer = csv.writer(f)
420
+ # Header
421
+ writer.writerow([
422
+ 'id', 'audio_path', 'total_clips', 'n_unique_sounds',
423
+ 'source_files', 'source_categories', 'unique_categories',
424
+ 'ordering_mode', 'target_duration_s', 'actual_duration_s', 'llm_generated'
425
+ ])
426
+
427
+ # Data rows
428
+ for meta in metadata_list:
429
+ writer.writerow([
430
+ meta['id'],
431
+ meta['audio_path'],
432
+ meta['total_clips'],
433
+ meta['n_unique_sounds'],
434
+ str(meta['source_files']),
435
+ str(meta['source_categories']),
436
+ str(meta['unique_categories']),
437
+ meta.get('ordering_mode', 'random'),
438
+ meta.get('target_duration_seconds', 0),
439
+ meta.get('actual_duration_seconds', 0),
440
+ meta.get('llm_generated', False)
441
+ ])
442
+
443
+
444
+ def main(config_path: str = None):
445
+ """Main entry point for count task generation."""
446
+ import yaml
447
+
448
+ # Load configuration
449
+ if config_path is None:
450
+ config_path = Path(__file__).parent.parent / 'config.yaml'
451
+
452
+ with open(config_path, 'r') as f:
453
+ config = yaml.safe_load(f)
454
+
455
+ # Set random seed
456
+ set_random_seed(config['random_seed'])
457
+
458
+ # Setup logger
459
+ logger = setup_logger(
460
+ 'count_task',
461
+ log_file=str(Path(config['output']['base_path']) / config['logging']['log_file']),
462
+ level=config['logging']['level'],
463
+ console_output=config['logging']['console_output']
464
+ )
465
+
466
+ # Generate dataset
467
+ generator = CountTaskGenerator(config, logger)
468
+ generator.generate_dataset()
469
+
470
+
471
+ if __name__ == '__main__':
472
+ main()
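
A minimal standalone sketch of the capacity-aware balanced assignment used in
generate_dataset above (the helper name and the toy capacities are hypothetical):

    from collections import Counter

    def assign_balanced_targets(capacities, max_answer):
        """Spread answers 1..max_answer evenly; give large targets to large samples."""
        n = len(capacities)
        base, rem = divmod(n, max_answer)
        pool = []
        for answer in range(1, max_answer + 1):
            pool.extend([answer] * (base + (1 if answer <= rem else 0)))
        pool.sort(reverse=True)                              # hand out high targets first
        order = sorted(range(n), key=lambda i: capacities[i], reverse=True)
        targets = [0] * n
        for rank, idx in enumerate(order):
            targets[idx] = min(pool[rank], capacities[idx])  # clamp to capacity
        return targets

    # e.g. Counter({1: 2, 2: 1, 3: 1, 4: 1, 5: 1}) for these toy capacities
    print(Counter(assign_balanced_targets([3, 8, 10, 2, 6, 9], 5)))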
tasks/task_duration.py ADDED
@@ -0,0 +1,820 @@
1
+ """
2
+ Task 2: Duration - Generate duration comparison questions
3
+
4
+ This task creates audio samples where sources have different effective durations
5
+ and asks questions about which sound is heard for the longest or shortest time.
6
+
7
+ Key features:
8
+ - Uses amplitude-filtered (preprocessed) audio clips with known effective durations
9
+ - First calculates max clips from total duration, then distributes slots
10
+ - Strategically distributes repetitions to ensure clear longest/shortest answers
11
+ - Consecutive ordering within sources, random order between sources
12
+ - Gap multipliers ensure unambiguous answers (e.g., longest is 1.5x longer than next)
13
+ - NO category preference - random selection to avoid bias
14
+ """
15
+
16
+ import csv
17
+ import random
18
+ import math
19
+ from pathlib import Path
20
+ from typing import Dict, List, Tuple, Optional
21
+ from collections import Counter
22
+
23
+ import sys
24
+ sys.path.append(str(Path(__file__).parent.parent))
25
+
26
+ from utils import (
27
+ AudioProcessor, PreprocessedESC50Dataset, QuestionGenerator, LLMQuestionGenerator,
28
+ setup_logger, set_random_seed, calculate_num_samples_for_task,
29
+ generate_single_clip_duration, get_max_clip_num_to_be_joined,
30
+ build_duration_task_audio, distribute_remainder_as_silences,
31
+ generate_sample_durations_for_task
32
+ )
33
+
34
+
35
+ class DurationTaskGenerator:
36
+ """Generator for duration comparison task dataset using preprocessed ESC-50."""
37
+
38
+ def __init__(self, config: Dict, logger):
39
+ """
40
+ Initialize duration task generator.
41
+
42
+ Args:
43
+ config: Configuration dictionary
44
+ logger: Logger instance
45
+ """
46
+ self.config = config
47
+ self.logger = logger
48
+ self.task_config = config['tasks']['duration']
49
+
50
+ # Initialize preprocessed dataset (with effective durations)
51
+ self.dataset = PreprocessedESC50Dataset(
52
+ metadata_path=config['esc50']['metadata_path'],
53
+ audio_path=config['esc50']['audio_path'],
54
+ preprocessed_path=self.task_config['preprocessed_data_path'],
55
+ config=config # Pass config for class subset loading
56
+ )
57
+
58
+ # Calculate average effective duration from preprocessed data
59
+ self.avg_effective_duration = self.dataset.effective_df['effective_duration_s'].mean()
60
+ self.logger.info(f"Average effective duration: {self.avg_effective_duration:.2f}s")
61
+
62
+ # Initialize audio processor
63
+ self.audio_processor = AudioProcessor(
64
+ crossfade_duration=config['audio']['crossfade_duration'],
65
+ silence_duration=config['audio']['silence_duration'],
66
+ with_silence=config['audio']['with_silence'],
67
+ normalize=config['audio']['normalize'],
68
+ normalize_target_dBFS=config['audio']['normalize_target_dBFS'],
69
+ synthetic_silence_path=config['synthetic_silence']['path']
70
+ )
71
+
72
+ # Initialize question generator
73
+ self.question_generator = QuestionGenerator(
74
+ num_options=config['mcq']['num_options'],
75
+ option_labels=config['mcq']['option_labels'],
76
+ distractor_strategy=config['mcq']['distractor_strategy']
77
+ )
78
+
79
+ # Initialize LLM question generator
80
+ self.llm_enabled = config.get('llm', {}).get('enabled', False)
81
+ self.llm_generator = LLMQuestionGenerator(
82
+ enabled=self.llm_enabled,
83
+ template_questions=self.task_config
84
+ )
85
+
86
+ # Duration settings from config
87
+ self.min_clip_duration = config['audio']['min_clip_duration']
88
+ self.max_clip_duration = config['audio']['max_clip_duration']
89
+ self.min_silence_ms = config['audio'].get('min_silence_duration', 100)
90
+ self.max_extra_silence_per_gap_ms = config['audio'].get('max_extra_silence_per_gap', 500)
91
+ self.crossfade_within_source_ms = config['audio'].get('crossfade_within_source', 50)
92
+ self.task_duration_hours = self.task_config['task_duration_size']
93
+
94
+ # Duration task specific settings
95
+ self.multiplier_longest = self.task_config.get('multiplier_longest', 1.5)
96
+ self.multiplier_shortest = self.task_config.get('multiplier_shortest', 0.75)
97
+ self.reject_if_gap_not_met = self.task_config.get('reject_if_gap_not_met', True)
98
+ self.sample_different_clips = self.task_config.get('sample_different_clips_same_class', True)
99
+ # Minimum effective duration per source (seconds) - clips shorter than this are harder to distinguish
100
+ self.min_effective_duration_per_source = self.task_config.get('min_effective_duration_per_source', 1.0)
101
+
102
+ # Set up output paths
103
+ self.output_base = Path(config['output']['base_path']) / 'duration'
104
+ self.output_base.mkdir(parents=True, exist_ok=True)
105
+ self.audio_output = self.output_base / 'audios'
106
+ self.audio_output.mkdir(parents=True, exist_ok=True)
107
+
108
+ # Statistics tracking
109
+ self.rejection_count = 0
110
+ self.success_count = 0
111
+
112
+ def _calculate_max_clips_and_sources(
113
+ self,
114
+ target_duration_s: float,
115
+ question_type: str
116
+ ) -> Tuple[int, int, float]:
117
+ """
118
+ Calculate max clips possible and choose n_sources from config that satisfies gap.
119
+
120
+ Key principle:
121
+ 1. Calculate valid range of sources that can satisfy gap constraint
122
+ 2. Filter config values to only those within valid range
123
+ 3. Pick RANDOMLY from valid config values (ensures variety)
124
+
125
+ For LONGEST:
126
+ - Target needs at least 2 clips to beat max_background by 1.5x
127
+ - max_sources = max_clips - 2 + 1 (backgrounds get 1 each)
128
+ - min_sources = 2 (need at least 1 background)
129
+
130
+ For SHORTEST:
131
+ - Target gets 1 clip
132
+ - Each background needs at least 2 clips so its total clearly exceeds the target (2x when multiplier_shortest=0.5; default is 0.75)
133
+ - max_sources = 1 + (max_clips - 1) // 2
134
+ - min_sources = 2
135
+
136
+ Args:
137
+ target_duration_s: Target total audio duration
138
+ question_type: "longest" or "shortest"
139
+
140
+ Returns:
141
+ Tuple of (max_clips, n_sources, remainder_s)
142
+ """
143
+ # Get max clips using average effective duration
144
+ max_clips, remainder_s = get_max_clip_num_to_be_joined(
145
+ target_duration_s,
146
+ self.avg_effective_duration,
147
+ self.min_silence_ms
148
+ )
149
+
150
+ # Ensure at least 2 clips
151
+ max_clips = max(2, max_clips)
152
+
153
+ # Get config values for n_sources
154
+ # If single int (e.g., 15), sample from [1, 15] like count/order tasks
155
+ # If list (e.g., [2,3,4]), sample from the list
156
+ num_sources_config = self.task_config.get('num_unique_sources', [2, 3, 4, 5])
157
+ if isinstance(num_sources_config, int):
158
+ # Single int: create range [1, num_sources_config]
159
+ num_sources_config = list(range(1, num_sources_config + 1))
160
+
161
+ if question_type == "longest":
162
+ # Target needs at least 2 clips to reliably beat background by multiplier
163
+ # (with 1.5x multiplier, 2 clips of target vs 1 clip of background usually works)
164
+ min_target_clips = 2
165
+
166
+ # Minimum sources: need at least 1 background + target = 2
167
+ min_valid_sources = 2
168
+
169
+ # Maximum sources: max_clips - min_target_clips + 1
170
+ # (subtract target's clips, add 1 for the target itself)
171
+ max_valid_sources = max_clips - min_target_clips + 1
172
+
173
+ else: # shortest
174
+ # Target gets 1 clip
175
+ # Each background needs at least 2 clips so its total clearly exceeds the single target clip (satisfies multiplier_shortest, default 0.75)
176
+ min_clips_per_background = 2
177
+
178
+ # Minimum sources: 2 (target + 1 background)
179
+ min_valid_sources = 2
180
+
181
+ # Maximum sources: how many backgrounds can we fit?
182
+ remaining_clips = max_clips - 1 # 1 for target
183
+ max_backgrounds = remaining_clips // min_clips_per_background
184
+ max_valid_sources = max_backgrounds + 1 # +1 for target
185
+
186
+ # Filter config values to only valid ones
187
+ valid_config_sources = [
188
+ n for n in num_sources_config
189
+ if min_valid_sources <= n <= max_valid_sources
190
+ ]
191
+
192
+ if not valid_config_sources:
193
+ raise ValueError(
194
+ f"Duration task: No valid num_unique_sources for {question_type} question. "
195
+ f"Config values: {num_sources_config}, Valid range: [{min_valid_sources}, {max_valid_sources}]. "
196
+ f"max_clips={max_clips}, duration={target_duration_s:.1f}s. "
197
+ f"Increase min_clip_duration or adjust num_unique_sources config."
198
+ )
199
+
200
+ # Pick RANDOMLY from valid config values (ensures variety!)
201
+ n_sources = random.choice(valid_config_sources)
202
+
203
+ # Validate final value
204
+ if n_sources < 2 or n_sources > len(self.dataset.CATEGORIES):
205
+ raise ValueError(
206
+ f"Duration task: Invalid n_sources={n_sources}. "
207
+ f"Must be in range [2, {len(self.dataset.CATEGORIES)}]"
208
+ )
209
+
210
+ self.logger.debug(
211
+ f"Max clips: {max_clips}, Question: {question_type}, "
212
+ f"Valid range: [{min_valid_sources}, {max_valid_sources}], "
213
+ f"Valid config: {valid_config_sources}, Selected: {n_sources}"
214
+ )
215
+
216
+ return max_clips, n_sources, remainder_s
217
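+ # Worked example (numbers hypothetical, assuming the helper packs ~(clip + gap)
+ # pairs): target_duration_s=60 with avg effective duration ~3s and 100ms gaps
+ # gives max_clips around 19. For "shortest": (19 - 1) // 2 = 9 backgrounds max,
+ # so max_valid_sources = 10 and n_sources is drawn from config values in [2, 10].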
+
218
+ def _calculate_slot_distribution(
219
+ self,
220
+ max_clips: int,
221
+ n_sources: int,
222
+ effective_durations: Dict[str, float],
223
+ target_category: str,
224
+ question_type: str
225
+ ) -> Tuple[Dict[str, int], bool, Dict]:
226
+ """
227
+ Calculate how many clips each source gets.
228
+
229
+ For LONGEST: target gets (max_clips - n_backgrounds), backgrounds get 1 each
230
+ For SHORTEST: target gets 1, backgrounds share (max_clips - 1)
231
+
232
+ Args:
233
+ max_clips: Maximum number of clips that fit
234
+ n_sources: Number of unique sources
235
+ effective_durations: Dict mapping category -> effective duration
236
+ target_category: The category that should be longest/shortest
237
+ question_type: "longest" or "shortest"
238
+
239
+ Returns:
240
+ Tuple of (slot_distribution, gap_satisfied, metadata)
241
+ """
242
+ categories = list(effective_durations.keys())
243
+ background_categories = [c for c in categories if c != target_category]
244
+ n_backgrounds = len(background_categories)
245
+
246
+ if question_type == "longest":
247
+ # Target gets max_clips - n_backgrounds
248
+ # Backgrounds get 1 each
249
+ target_clips = max_clips - n_backgrounds
250
+ target_clips = max(1, target_clips) # At least 1
251
+
252
+ slot_distribution = {target_category: target_clips}
253
+ for cat in background_categories:
254
+ slot_distribution[cat] = 1
255
+
256
+ # Verify gap: target_duration >= max_background × multiplier
257
+ target_duration = target_clips * effective_durations[target_category]
258
+ background_durations = [effective_durations[c] for c in background_categories]
259
+ max_background = max(background_durations) if background_durations else 0
260
+ required_target = max_background * self.multiplier_longest
261
+ gap_satisfied = target_duration >= required_target
262
+
263
+ metadata = {
264
+ 'target_clips': target_clips,
265
+ 'target_duration_s': target_duration,
266
+ 'max_background_s': max_background,
267
+ 'required_target_s': required_target,
268
+ 'multiplier': self.multiplier_longest
269
+ }
270
+
271
+ else: # shortest
272
+ # Target gets 1 clip
273
+ # Backgrounds share (max_clips - 1)
274
+ remaining_clips = max_clips - 1
275
+ clips_per_background = max(1, remaining_clips // n_backgrounds)
276
+ extra_clips = remaining_clips % n_backgrounds
277
+
278
+ slot_distribution = {target_category: 1}
279
+
280
+ for i, cat in enumerate(background_categories):
281
+ clips = clips_per_background + (1 if i < extra_clips else 0)
282
+ slot_distribution[cat] = clips
283
+
284
+ # Verify gap: target_duration <= min_background × multiplier
285
+ target_duration = effective_durations[target_category]
286
+ background_durations = [
287
+ slot_distribution[c] * effective_durations[c]
288
+ for c in background_categories
289
+ ]
290
+ min_background = min(background_durations) if background_durations else float('inf')
291
+ required_max_target = min_background * self.multiplier_shortest
292
+
293
+ # CRITICAL: Target must still be at least min_effective_duration_per_source
294
+ # Otherwise clips that are too short (e.g., 0.03s) would be used and be indistinguishable
295
+ target_too_short = target_duration < self.min_effective_duration_per_source
296
+ gap_satisfied = (target_duration <= required_max_target) and (not target_too_short)
297
+
298
+ metadata = {
299
+ 'target_clips': 1,
300
+ 'target_duration_s': target_duration,
301
+ 'min_background_s': min_background,
302
+ 'required_max_target_s': required_max_target,
303
+ 'multiplier': self.multiplier_shortest,
304
+ 'target_too_short': target_too_short
305
+ }
306
+
307
+ return slot_distribution, gap_satisfied, metadata
308
+
309
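For intuition, here is a standalone re-derivation of the "longest" branch with hypothetical per-clip durations (`multiplier` stands in for `self.multiplier_longest`):

```python
# Sketch of the "longest" slot split: target takes the surplus, backgrounds get 1 each.
effective = {"dog": 4.0, "rain": 3.0, "siren": 2.5}   # hypothetical per-clip durations (s)
target, max_clips, multiplier = "dog", 6, 1.5

backgrounds = [c for c in effective if c != target]
slots = {target: max(1, max_clips - len(backgrounds)), **{c: 1 for c in backgrounds}}

target_total = slots[target] * effective[target]          # 4 clips x 4.0s = 16.0s
max_background = max(effective[c] for c in backgrounds)   # 3.0s
print(slots, target_total >= max_background * multiplier)  # {'dog': 4, ...} True
```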
+ def _try_generate_sample(
310
+ self,
311
+ sample_id: int,
312
+ question_type: str,
313
+ max_retries: int = 5,
314
+ target_duration_seconds: float = None
315
+ ) -> Optional[Dict]:
316
+ """
317
+ Try to generate a valid duration sample with retries.
318
+
319
+ Args:
320
+ sample_id: Sample ID
321
+ question_type: "longest" or "shortest"
322
+ max_retries: Maximum retry attempts
323
+ target_duration_seconds: Pre-generated target duration
324
+
325
+ Returns:
326
+ Metadata dict if successful, None if all retries failed
327
+ """
328
+ for attempt in range(max_retries):
329
+ try:
330
+ result = self._generate_single_sample(sample_id, question_type, target_duration_seconds=target_duration_seconds)
331
+ if result is not None:
332
+ return result
333
+ except Exception as e:
334
+ self.logger.warning(f"Sample {sample_id} attempt {attempt+1} failed: {e}")
335
+
336
+ return None
337
+
338
+ def _generate_single_sample(
339
+ self,
340
+ sample_id: int,
341
+ question_type: str,
342
+ target_duration_seconds: float = None
343
+ ) -> Optional[Dict]:
344
+ """
345
+ Generate a single duration task sample.
346
+
347
+ Corrected Pipeline:
348
+ 1. Use pre-generated target duration (or generate if not provided)
349
+ 2. Calculate max_clips using get_max_clip_num_to_be_joined
350
+ 3. Based on max_clips and question_type, determine n_sources
351
+ 4. Select categories RANDOMLY (no bias toward short/long)
352
+ 5. Pick target category RANDOMLY from selected
353
+ 6. Get effective durations for all sources
354
+ 7. Calculate slot distribution based on max_clips
355
+ 8. Verify gap constraint
356
+ 9. Load audio clips and build final audio
357
+
358
+ Args:
359
+ sample_id: Sample ID number
360
+ question_type: "longest" or "shortest"
361
+ target_duration_seconds: Pre-generated target duration (from generate_sample_durations_for_task)
362
+
363
+ Returns:
364
+ Dictionary with sample metadata, or None if failed
365
+ """
366
+ # Step 1: Use pre-generated duration or generate one (backward compatibility)
367
+ if target_duration_seconds is not None:
368
+ target_duration_s = target_duration_seconds
369
+ else:
370
+ target_duration_s = generate_single_clip_duration(
371
+ self.min_clip_duration,
372
+ self.max_clip_duration
373
+ )
374
+
375
+ # Step 2 & 3: Calculate max_clips and n_sources
376
+ max_clips, n_sources, remainder_s = self._calculate_max_clips_and_sources(
377
+ target_duration_s,
378
+ question_type
379
+ )
380
+
381
+ # Step 4: Select categories RANDOMLY (using least-used for balance, but no duration preference)
382
+ all_categories = self.dataset.get_least_used_categories(n_sources)
383
+
384
+ # Step 5: Pick target category RANDOMLY from selected (no bias!)
385
+ target_category = random.choice(all_categories)
386
+ self.dataset.category_usage_counts[target_category] += 1
387
+
388
+ # Step 6: Get effective durations by sampling one file per category
389
+ # Use min_effective_duration_per_source to avoid clips that are too short to distinguish
390
+ effective_durations = {}
391
+ selected_files = {}
392
+
393
+ for category in all_categories:
394
+ filename, filepath, eff_dur = self.dataset.sample_file_from_category_with_duration(
395
+ category,
396
+ min_effective_duration=self.min_effective_duration_per_source
397
+ )
398
+ effective_durations[category] = eff_dur
399
+ selected_files[category] = {
400
+ 'filename': filename,
401
+ 'filepath': filepath,
402
+ 'effective_duration_s': eff_dur
403
+ }
404
+
405
+ # Step 7: Calculate slot distribution based on max_clips
406
+ slot_distribution, gap_satisfied, calc_metadata = self._calculate_slot_distribution(
407
+ max_clips=max_clips,
408
+ n_sources=n_sources,
409
+ effective_durations=effective_durations,
410
+ target_category=target_category,
411
+ question_type=question_type
412
+ )
413
+
414
+ # Step 8: If gap not satisfied, try adjustments
415
+ if not gap_satisfied:
416
+ # Try with different clips that have better durations
417
+ if self.sample_different_clips:
418
+ gap_satisfied = self._try_improve_gap_with_different_clips(
419
+ question_type=question_type,
420
+ target_category=target_category,
421
+ all_categories=all_categories,
422
+ max_clips=max_clips,
423
+ n_sources=n_sources,
424
+ effective_durations=effective_durations,
425
+ selected_files=selected_files,
426
+ slot_distribution=slot_distribution
427
+ )
428
+
429
+ if not gap_satisfied and self.reject_if_gap_not_met:
430
+ self.rejection_count += 1
431
+ self.logger.debug(
432
+ f"Sample {sample_id} rejected: gap not satisfied "
433
+ f"(type={question_type}, max_clips={max_clips}, sources={n_sources})"
434
+ )
435
+ return None
436
+
437
+ # Step 9: Load audio clips based on slot distribution
438
+ source_audio_lists = {}
439
+ files_used = {}
440
+
441
+ for category in all_categories:
442
+ reps = slot_distribution.get(category, 0)
443
+ if reps == 0:
444
+ continue
445
+
446
+ # Get files for this category
447
+ if self.sample_different_clips and reps > 1:
448
+ filenames, filepaths, total_dur = self.dataset.sample_files_from_category_to_reach_duration(
449
+ category,
450
+ reps * effective_durations[category],
451
+ prefer_same_file=False
452
+ )
453
+ else:
454
+ # Use same file repeated
455
+ file_info = selected_files[category]
456
+ filenames = [file_info['filename']] * reps
457
+ filepaths = [file_info['filepath']] * reps
458
+
459
+ # Load audio segments
460
+ audio_list = []
461
+ for fp in filepaths[:reps]:
462
+ audio = self.audio_processor.load_audio(fp)
463
+ audio_list.append(audio)
464
+
465
+ # If fewer clips were loaded than needed, cycle through the loaded ones
466
+ while len(audio_list) < reps:
467
+ audio_list.append(audio_list[len(audio_list) % len(filepaths)])
468
+
469
+ source_audio_lists[category] = audio_list[:reps]
470
+ files_used[category] = filenames[:reps]
471
+
472
+ # Step 10: Build final audio
473
+ final_audio, category_sequence, build_metadata = build_duration_task_audio(
474
+ source_audio_lists=source_audio_lists,
475
+ slot_distribution=slot_distribution,
476
+ effective_durations=effective_durations,
477
+ target_total_duration_s=target_duration_s,
478
+ min_silence_between_sources_ms=self.min_silence_ms,
479
+ max_extra_silence_per_gap_ms=self.max_extra_silence_per_gap_ms,
480
+ crossfade_within_source_ms=self.crossfade_within_source_ms
481
+ )
482
+
483
+ # Save audio
484
+ output_audio_path = self.audio_output / f"{sample_id}.wav"
485
+ final_audio.export(str(output_audio_path), format="wav")
486
+
487
+ # Step 11: Generate questions
488
+ correct_category = target_category
489
+ present_categories = all_categories
490
+
491
+ mcq_question = self.task_config['mcq_questions'][question_type]
492
+ mcq_data = self.question_generator.generate_category_mcq(
493
+ mcq_question,
494
+ correct_category,
495
+ present_categories,
496
+ self.dataset.CATEGORIES
497
+ )
498
+
499
+ open_text_question = self.task_config['open_text_questions'][question_type]
500
+ open_text_data = self.question_generator.generate_category_open_text(
501
+ open_text_question,
502
+ correct_category
503
+ )
504
+
505
+ # Calculate actual effective durations
506
+ actual_effective_durations = {
507
+ cat: slot_distribution[cat] * effective_durations[cat]
508
+ for cat in all_categories
509
+ if cat in slot_distribution
510
+ }
511
+
512
+ # Create metadata
513
+ metadata = {
514
+ 'id': sample_id,
515
+ 'audio_path': str(output_audio_path.relative_to(self.output_base.parent)),
516
+ 'question_type': question_type,
517
+ 'max_clips': max_clips,
518
+ 'n_unique_sources': n_sources,
519
+ 'target_category': target_category,
520
+ 'present_categories': present_categories,
521
+ 'source_order': build_metadata['source_order'],
522
+ 'slot_distribution': slot_distribution,
523
+ 'effective_durations_per_clip': effective_durations,
524
+ 'total_effective_durations': actual_effective_durations,
525
+ 'gap_satisfied': gap_satisfied,
526
+ 'multiplier_used': self.multiplier_longest if question_type == 'longest' else self.multiplier_shortest,
527
+ 'files_used': files_used,
528
+ 'target_duration_s': target_duration_s,
529
+ 'actual_duration_s': len(final_audio) / 1000.0,
530
+ 'timestamp_string': build_metadata.get('timestamp_string', ''),
531
+ 'source_timestamps': build_metadata.get('source_timestamps', []),
532
+ 'mcq_question': mcq_data['question'],
533
+ 'mcq_options': mcq_data['options'],
534
+ 'mcq_correct_answer': mcq_data['correct_answer'],
535
+ 'open_text_question': open_text_data['question'],
536
+ 'open_text_answer': open_text_data['correct_answer'],
537
+ 'calc_metadata': calc_metadata
538
+ }
539
+
540
+ self.success_count += 1
541
+ self.logger.info(
542
+ f"Generated duration sample {sample_id}: {question_type}, "
543
+ f"max_clips={max_clips}, sources={n_sources}, target={target_category}, "
544
+ f"slots={slot_distribution}, gap_satisfied={gap_satisfied}"
545
+ )
546
+
547
+ return metadata
548
+
549
+ def _try_improve_gap_with_different_clips(
550
+ self,
551
+ question_type: str,
552
+ target_category: str,
553
+ all_categories: List[str],
554
+ max_clips: int,
555
+ n_sources: int,
556
+ effective_durations: Dict[str, float],
557
+ selected_files: Dict[str, Dict],
558
+ slot_distribution: Dict[str, int]
559
+ ) -> bool:
560
+ """
561
+ Try to improve gap satisfaction by selecting different clips.
562
+
563
+ For LONGEST: try clips with longer effective duration for target
564
+ For SHORTEST: try clips with shorter effective duration for target
565
+
566
+ Args:
567
+ Various state from generate_sample
568
+
569
+ Returns:
570
+ True if gap is now satisfied
571
+ """
572
+ files = self.dataset.get_files_by_category_with_durations(target_category)
573
+
574
+ if question_type == "longest":
575
+ # Try to find a longer clip for target category
576
+ files_sorted = sorted(files, key=lambda x: x['effective_duration_s'], reverse=True)
577
+ else:
578
+ # For shortest, try shorter clip for target
579
+ files_sorted = sorted(files, key=lambda x: x['effective_duration_s'])
580
+
581
+ if files_sorted:
582
+ best = files_sorted[0]
583
+ effective_durations[target_category] = best['effective_duration_s']
584
+ selected_files[target_category] = {
585
+ 'filename': best['filename'],
586
+ 'filepath': best['filepath'],
587
+ 'effective_duration_s': best['effective_duration_s']
588
+ }
589
+
590
+ # Recalculate slot distribution
591
+ new_slots, gap_satisfied, _ = self._calculate_slot_distribution(
592
+ max_clips=max_clips,
593
+ n_sources=n_sources,
594
+ effective_durations=effective_durations,
595
+ target_category=target_category,
596
+ question_type=question_type
597
+ )
598
+
599
+ if gap_satisfied:
600
+ slot_distribution.clear()
601
+ slot_distribution.update(new_slots)
602
+
603
+ return gap_satisfied
604
+
605
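The retry strategy above boils down to "swap in the most extreme clip for the target category and re-run the gap check"; roughly, with hypothetical file records:

```python
# Sketch: pick the longest (or shortest) candidate clip for the target category.
files = [
    {"filename": "a.wav", "effective_duration_s": 2.1},
    {"filename": "b.wav", "effective_duration_s": 4.8},
    {"filename": "c.wav", "effective_duration_s": 3.3},
]
question_type = "longest"
best = sorted(files, key=lambda x: x["effective_duration_s"],
              reverse=(question_type == "longest"))[0]
print(best["filename"])  # b.wav -- then the slot split and gap check are re-run
```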
+ def generate_sample(self, sample_id: int, target_question_type: str = None, target_duration_seconds: float = None) -> Optional[Dict]:
606
+ """
607
+ Generate a single duration task sample with retries.
608
+
609
+ Args:
610
+ sample_id: Sample ID number
611
+ target_question_type: Target question type for balanced distribution
612
+ target_duration_seconds: Pre-generated target duration (from generate_sample_durations_for_task)
613
+
614
+ Returns:
615
+ Dictionary with sample metadata, or None if failed
616
+ """
617
+ question_type = target_question_type or random.choice(
618
+ self.task_config['question_types']
619
+ )
620
+
621
+ return self._try_generate_sample(sample_id, question_type, target_duration_seconds=target_duration_seconds)
622
+
623
+ def generate_dataset(self) -> tuple:
624
+ """
625
+ Generate the complete duration task dataset.
626
+
627
+ Uses generate_sample_durations_for_task() to pre-generate exact sample durations
628
+ that sum to exactly the target task duration. This guarantees:
629
+ - Exact coverage of target duration
630
+ - No estimation errors from average-based calculation
631
+
632
+ Returns:
633
+ Tuple of (mcq_csv_path, open_text_csv_path)
634
+ """
635
+ # Generate sample durations upfront (guarantees exact total duration)
636
+ sample_durations = generate_sample_durations_for_task(
637
+ self.task_duration_hours,
638
+ self.min_clip_duration,
639
+ self.max_clip_duration
640
+ )
641
+ num_samples = len(sample_durations)
642
+
643
+ self.logger.info(
644
+ f"Generating {num_samples} duration task samples "
645
+ f"(target: {self.task_duration_hours}h, exact fill)..."
646
+ )
647
+
648
+ # Create balanced question type distribution
649
+ question_types = self.task_config['question_types']
650
+ balanced_types = []
651
+ samples_per_type = num_samples // len(question_types)
652
+ remainder = num_samples % len(question_types)
653
+
654
+ for qtype in question_types:
655
+ count = samples_per_type + (1 if remainder > 0 else 0)
656
+ balanced_types.extend([qtype] * count)
657
+ remainder = max(0, remainder - 1)
658
+
659
+ random.shuffle(balanced_types)
660
+ type_dist = Counter(balanced_types)
661
+ self.logger.info(f"Balanced question type distribution: {dict(sorted(type_dist.items()))}")
662
+
663
+ all_metadata = []
664
+ sample_idx = 0
665
+ type_idx = 0
666
+
667
+ while len(all_metadata) < num_samples and type_idx < len(balanced_types) * 2:
668
+ question_type = balanced_types[type_idx % len(balanced_types)]
669
+ target_duration = sample_durations[sample_idx] if sample_idx < len(sample_durations) else None
670
+
671
+ metadata = self.generate_sample(sample_idx, question_type, target_duration_seconds=target_duration)
672
+
673
+ if metadata is not None:
674
+ all_metadata.append(metadata)
675
+ sample_idx += 1
676
+
677
+ type_idx += 1
678
+
679
+ # Log progress
680
+ if len(all_metadata) % 50 == 0:
681
+ self.logger.info(
682
+ f"Progress: {len(all_metadata)}/{num_samples} samples, "
683
+ f"{self.rejection_count} rejections"
684
+ )
685
+
686
+ self.logger.info(
687
+ f"Generation complete: {len(all_metadata)} samples, "
688
+ f"{self.rejection_count} rejections "
689
+ f"({self.rejection_count/(len(all_metadata)+self.rejection_count)*100:.1f}% rejection rate)"
690
+ )
691
+
692
+ # Save CSVs
693
+ mcq_csv_path = self.output_base / 'duration_mcq.csv'
694
+ self._save_mcq_csv(all_metadata, mcq_csv_path)
695
+
696
+ open_text_csv_path = self.output_base / 'duration_open_text.csv'
697
+ self._save_open_text_csv(all_metadata, open_text_csv_path)
698
+
699
+ metadata_csv_path = self.output_base / 'duration_metadata.csv'
700
+ self._save_metadata_csv(all_metadata, metadata_csv_path)
701
+
702
+ self.logger.info(f"Duration task dataset generation complete!")
703
+ self.logger.info(f" - MCQ CSV: {mcq_csv_path}")
704
+ self.logger.info(f" - Open-text CSV: {open_text_csv_path}")
705
+ self.logger.info(f" - Metadata CSV: {metadata_csv_path}")
706
+ self.logger.info(f" - Audio files: {self.audio_output}")
707
+
708
+ return mcq_csv_path, open_text_csv_path
709
+
710
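The balancing loop above hands the remainder out one question type at a time; a compact standalone equivalent:

```python
import random
from collections import Counter

# Sketch of the balanced question-type split used above.
num_samples, question_types = 101, ["longest", "shortest"]
per_type, remainder = divmod(num_samples, len(question_types))

balanced = []
for qtype in question_types:
    balanced.extend([qtype] * (per_type + (1 if remainder > 0 else 0)))
    remainder = max(0, remainder - 1)

random.shuffle(balanced)
print(Counter(balanced))  # Counter({'longest': 51, 'shortest': 50})
```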
+ def _save_mcq_csv(self, metadata_list: List[Dict], output_path: Path):
711
+ """Save MCQ format CSV."""
712
+ with open(output_path, 'w', newline='') as f:
713
+ writer = csv.writer(f)
714
+ writer.writerow([
715
+ 'question', 'id', 'audio_path',
716
+ 'optionA', 'optionB', 'optionC', 'optionD',
717
+ 'correct', 'question_type', 'max_clips', 'n_sources',
718
+ 'target_category', 'slot_distribution', 'effective_durations'
719
+ ])
720
+
721
+ for meta in metadata_list:
722
+ writer.writerow([
723
+ meta['mcq_question'],
724
+ meta['id'],
725
+ meta['audio_path'],
726
+ meta['mcq_options']['A'],
727
+ meta['mcq_options']['B'],
728
+ meta['mcq_options']['C'],
729
+ meta['mcq_options']['D'],
730
+ meta['mcq_correct_answer'],
731
+ meta['question_type'],
732
+ meta['max_clips'],
733
+ meta['n_unique_sources'],
734
+ meta['target_category'],
735
+ str(meta['slot_distribution']),
736
+ str(meta['total_effective_durations'])
737
+ ])
738
+
739
+ def _save_open_text_csv(self, metadata_list: List[Dict], output_path: Path):
740
+ """Save open-text format CSV."""
741
+ with open(output_path, 'w', newline='') as f:
742
+ writer = csv.writer(f)
743
+ writer.writerow([
744
+ 'question', 'id', 'audio_path', 'answer',
745
+ 'question_type', 'max_clips', 'n_sources',
746
+ 'target_category', 'effective_durations'
747
+ ])
748
+
749
+ for meta in metadata_list:
750
+ writer.writerow([
751
+ meta['open_text_question'],
752
+ meta['id'],
753
+ meta['audio_path'],
754
+ meta['open_text_answer'],
755
+ meta['question_type'],
756
+ meta['max_clips'],
757
+ meta['n_unique_sources'],
758
+ meta['target_category'],
759
+ str(meta['total_effective_durations'])
760
+ ])
761
+
762
+ def _save_metadata_csv(self, metadata_list: List[Dict], output_path: Path):
763
+ """Save detailed metadata CSV with effective durations and timestamps."""
764
+ with open(output_path, 'w', newline='') as f:
765
+ writer = csv.writer(f)
766
+ writer.writerow([
767
+ 'id', 'audio_path', 'question_type', 'max_clips', 'n_sources',
768
+ 'target_category', 'present_categories', 'source_order',
769
+ 'slot_distribution', 'effective_durations_per_clip',
770
+ 'total_effective_durations', 'gap_satisfied', 'multiplier_used',
771
+ 'target_duration_s', 'actual_duration_s', 'clip_timestamps', 'files_used'
772
+ ])
773
+
774
+ for meta in metadata_list:
775
+ writer.writerow([
776
+ meta['id'],
777
+ meta['audio_path'],
778
+ meta['question_type'],
779
+ meta['max_clips'],
780
+ meta['n_unique_sources'],
781
+ meta['target_category'],
782
+ str(meta['present_categories']),
783
+ str(meta['source_order']),
784
+ str(meta['slot_distribution']),
785
+ str(meta['effective_durations_per_clip']),
786
+ str(meta['total_effective_durations']),
787
+ meta['gap_satisfied'],
788
+ meta['multiplier_used'],
789
+ round(meta['target_duration_s'], 2),
790
+ round(meta['actual_duration_s'], 2),
791
+ meta.get('timestamp_string', ''),
792
+ str(meta['files_used'])
793
+ ])
794
+
795
+
796
+ def main(config_path: str = None):
797
+ """Main entry point for duration task generation."""
798
+ import yaml
799
+
800
+ if config_path is None:
801
+ config_path = Path(__file__).parent.parent / 'config.yaml'
802
+
803
+ with open(config_path, 'r') as f:
804
+ config = yaml.safe_load(f)
805
+
806
+ set_random_seed(config['random_seed'])
807
+
808
+ logger = setup_logger(
809
+ 'duration_task',
810
+ log_file=str(Path(config['output']['base_path']) / config['logging']['log_file']),
811
+ level=config['logging']['level'],
812
+ console_output=config['logging']['console_output']
813
+ )
814
+
815
+ generator = DurationTaskGenerator(config, logger)
816
+ generator.generate_dataset()
817
+
818
+
819
+ if __name__ == '__main__':
820
+ main()
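Both generators lean on `generate_sample_durations_for_task` returning per-sample durations that sum exactly to the task budget. A deliberately simplified stand-in for that contract (this is NOT the repo's implementation, just the shape the generators assume):

```python
import random

# Simplified stand-in: draw uniform durations until the running sum reaches
# the target, trimming the final draw so the total is exact.
def sample_durations(task_hours: float, min_s: float, max_s: float):
    target = task_hours * 3600.0
    durations, total = [], 0.0
    while total < target:
        d = min(random.uniform(min_s, max_s), target - total)
        durations.append(d)
        total += d
    return durations

ds = sample_durations(0.01, 30.0, 60.0)    # 36s of audio
print(len(ds), round(sum(ds), 6))          # sums to exactly 36.0
```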
tasks/task_order.py ADDED
@@ -0,0 +1,598 @@
1
+ """
2
+ Task 3: Order - Generate temporal ordering questions
3
+
4
+ This task joins multiple audio sources and asks questions about their temporal order
5
+ (first, last, what comes after, what comes before).
6
+ """
7
+
8
+ import csv
9
+ import random
10
+ import math
11
+ from pathlib import Path
12
+ from typing import Dict, List
13
+
14
+ import sys
15
+ sys.path.append(str(Path(__file__).parent.parent))
16
+
17
+ from utils import (
18
+ AudioProcessor, ESC50Dataset, QuestionGenerator, LLMQuestionGenerator,
19
+ setup_logger, set_random_seed, calculate_num_samples_for_task,
20
+ generate_single_clip_duration, get_max_clip_num_to_be_joined,
21
+ build_clip_sequence_with_silences, generate_sample_durations_for_task
22
+ )
23
+
24
+
25
+ class OrderTaskGenerator:
26
+ """Generator for temporal ordering task dataset."""
27
+
28
+ def __init__(self, config: Dict, logger):
29
+ """
30
+ Initialize order task generator.
31
+
32
+ Args:
33
+ config: Configuration dictionary
34
+ logger: Logger instance
35
+ """
36
+ self.config = config
37
+ self.logger = logger
38
+ self.task_config = config['tasks']['order']
39
+
40
+ # Initialize components
41
+ self.dataset = ESC50Dataset(
42
+ config['esc50']['metadata_path'],
43
+ config['esc50']['audio_path'],
44
+ config # Pass config for class subset loading
45
+ )
46
+ self.audio_processor = AudioProcessor(
47
+ crossfade_duration=config['audio']['crossfade_duration'],
48
+ silence_duration=config['audio']['silence_duration'],
49
+ with_silence=config['audio']['with_silence'],
50
+ normalize=config['audio']['normalize'],
51
+ normalize_target_dBFS=config['audio']['normalize_target_dBFS'],
52
+ synthetic_silence_path=config['synthetic_silence']['path']
53
+ )
54
+ self.question_generator = QuestionGenerator(
55
+ num_options=config['mcq']['num_options'],
56
+ option_labels=config['mcq']['option_labels'],
57
+ distractor_strategy=config['mcq']['distractor_strategy']
58
+ )
59
+
60
+ # Initialize LLM question generator
61
+ self.llm_enabled = config.get('llm', {}).get('enabled', False)
62
+ self.llm_generator = LLMQuestionGenerator(
63
+ enabled=self.llm_enabled,
64
+ template_questions=self.task_config
65
+ )
66
+
67
+ # Duration settings from config
68
+ self.min_clip_duration = config['audio']['min_clip_duration']
69
+ self.max_clip_duration = config['audio']['max_clip_duration']
70
+ # Duration of individual source clips (ESC-50 default is 5s)
71
+ self.source_clip_duration = config['audio'].get('source_clip_duration', 5.0)
72
+ self.min_silence_ms = config['audio'].get('min_silence_duration', 100)
73
+ self.max_extra_silence_per_gap_ms = config['audio'].get('max_extra_silence_per_gap', 500)
74
+ self.crossfade_ms = config['audio'].get('crossfade_duration', 0)
75
+ self.task_duration_hours = self.task_config['task_duration_size']
76
+
77
+ # Order task specific settings
78
+ self.allow_source_repetition = self.task_config.get('allow_source_repetition', False)
79
+ self.min_clips_for_second = self.task_config.get('min_clips_for_second_questions', 4)
80
+
81
+ # Set up output paths
82
+ self.output_base = Path(config['output']['base_path']) / 'order'
83
+ self.output_base.mkdir(parents=True, exist_ok=True)
84
+ self.audio_output = self.output_base / 'audios'
85
+ self.audio_output.mkdir(parents=True, exist_ok=True)
86
+
87
+ def _get_valid_question_types(self, n_clips: int) -> List[str]:
88
+ """
89
+ Get question types valid for the given number of clips.
90
+
91
+ "second" and "second_last" require at least min_clips_for_second clips.
92
+
93
+ Args:
94
+ n_clips: Number of clips in the sample
95
+
96
+ Returns:
97
+ List of valid question types
98
+ """
99
+ all_types = self.task_config['question_types']
100
+
101
+ # Filter based on n_clips
102
+ valid_types = []
103
+ for qtype in all_types:
104
+ if qtype in ['second', 'second_last']:
105
+ if n_clips >= self.min_clips_for_second:
106
+ valid_types.append(qtype)
107
+ elif qtype in ['after', 'before']:
108
+ if n_clips >= 2:
109
+ valid_types.append(qtype)
110
+ else: # first, last
111
+ valid_types.append(qtype)
112
+
113
+ return valid_types if valid_types else ['first', 'last']
114
+
115
+ def generate_sample(self, sample_id: int, target_question_type: str = None, target_duration_seconds: float = None) -> Dict:
116
+ """
117
+ Generate a single order task sample.
118
+
119
+ Pipeline: pick dataset -> pick classes -> pick audio clips -> get target duration ->
120
+ compute how many clips fit (floor division, with the remainder absorbed as
121
+ randomly inserted silences) -> concatenate the clips.
122
+
123
+ Args:
124
+ sample_id: Sample ID number
125
+ target_question_type: Target question type for balanced distribution
126
+ target_duration_seconds: Pre-generated target duration (from generate_sample_durations_for_task)
127
+
128
+ Returns:
129
+ Dictionary with sample metadata
130
+ """
131
+ # Use pre-generated duration or generate one (backward compatibility)
132
+ if target_duration_seconds is not None:
133
+ clip_duration_seconds = target_duration_seconds
134
+ else:
135
+ clip_duration_seconds = generate_single_clip_duration(
136
+ self.min_clip_duration,
137
+ self.max_clip_duration
138
+ )
139
+
140
+ # Calculate how many clips we need using the new helper
141
+ max_clips, remainder_seconds = get_max_clip_num_to_be_joined(
142
+ clip_duration_seconds,
143
+ self.source_clip_duration,
144
+ self.min_silence_ms
145
+ )
146
+
147
+ max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10)
148
+
149
+ # Silence reduction strategy: subsample from [max(2, max_clips-3), min(max_clips, max_clips_per_sample)]
150
+ # This ensures we use close to max_clips that fit, reducing excessive silence
151
+
152
+ # Calculate valid range for this sample's duration
153
+ min_clips_for_sample = max(2, max_clips - 3) # At least 2, preferably max_clips-3
154
+ max_clips_for_sample = min(max_clips, max_clips_per_sample, len(self.dataset.CATEGORIES))
155
+
156
+ # Validate range
157
+ if max_clips_for_sample < 2:
158
+ raise ValueError(
159
+ f"Sample {sample_id}: Cannot generate order task - need at least 2 clips. "
160
+ f"max_clips={max_clips}, max_clips_per_sample={max_clips_per_sample}, "
161
+ f"duration={clip_duration_seconds:.1f}s. Increase min_clip_duration."
162
+ )
163
+
164
+ if min_clips_for_sample > max_clips_for_sample:
165
+ raise ValueError(
166
+ f"Sample {sample_id}: Invalid clip range - min_clips ({min_clips_for_sample}) > max_clips ({max_clips_for_sample}). "
167
+ f"max_clips={max_clips}, max_clips_per_sample={max_clips_per_sample}, duration={clip_duration_seconds:.1f}s"
168
+ )
169
+
170
+ # Randomly select from valid range (NO balanced pool for order task)
171
+ n_clips = random.randint(min_clips_for_sample, max_clips_for_sample)
172
+
173
+ # Get valid question types for this n_clips
174
+ valid_question_types = self._get_valid_question_types(n_clips)
175
+
176
+ if not valid_question_types:
177
+ raise ValueError(
178
+ f"Sample {sample_id}: No valid question types for n_clips={n_clips}. "
179
+ f"This should not happen - check _get_valid_question_types implementation."
180
+ )
181
+
182
+ # Pre-select question type to determine answer position
183
+ if target_question_type is not None:
184
+ if target_question_type not in valid_question_types:
185
+ raise ValueError(
186
+ f"Sample {sample_id}: target_question_type='{target_question_type}' not valid for n_clips={n_clips}. "
187
+ f"Valid types: {valid_question_types}. Balanced distribution should only assign valid types."
188
+ )
189
+ question_type = target_question_type
190
+ else:
191
+ question_type = random.choice(valid_question_types)
192
+
193
+ # Determine answer position based on question type
194
+ if question_type == 'first':
195
+ answer_position = 0
196
+ elif question_type == 'last':
197
+ answer_position = n_clips - 1
198
+ elif question_type == 'second':
199
+ answer_position = 1 # 0-indexed, so position 1 is second
200
+ elif question_type == 'second_last':
201
+ answer_position = n_clips - 2 # Second to last
202
+ elif question_type == 'after':
203
+ # Answer is after a reference, so position 1 to n-1
204
+ answer_position = random.randint(1, n_clips - 1) if n_clips >= 2 else 0
205
+ else: # before
206
+ # Answer is before a reference, so position 0 to n-2
207
+ answer_position = random.randint(0, n_clips - 2) if n_clips >= 2 else 0
208
+
209
+ # Select answer category from least-used categories
210
+ answer_category = self.dataset.get_least_used_categories(1)[0]
211
+
212
+ # Sample remaining categories, ensuring balanced distribution
213
+ if n_clips <= len(self.dataset.CATEGORIES):
214
+ other_categories = self.dataset.get_least_used_categories(
215
+ n_clips - 1,
216
+ exclude=[answer_category]
217
+ )
218
+ else:
219
+ # Need more clips than unique categories - sample with some repetition
220
+ other_categories = self.dataset.get_least_used_categories(
221
+ min(n_clips - 1, len(self.dataset.CATEGORIES) - 1),
222
+ exclude=[answer_category]
223
+ )
224
+ # Add random repetitions if needed
225
+ while len(other_categories) < n_clips - 1:
226
+ other_categories.append(random.choice(self.dataset.CATEGORIES))
227
+
228
+ # Arrange categories with answer at correct position
229
+ selected_categories = []
230
+ other_idx = 0
231
+ for i in range(n_clips):
232
+ if i == answer_position:
233
+ selected_categories.append(answer_category)
234
+ else:
235
+ selected_categories.append(other_categories[other_idx])
236
+ other_idx += 1
237
+
238
+ # Track usage of answer category
239
+ self.dataset.category_usage_counts[answer_category] += 1
240
+
241
+ # Sample one file from each category and load audio
242
+ audio_segments = []
243
+ filenames_list = []
244
+
245
+ for category in selected_categories:
246
+ filename, filepath = self.dataset.sample_file_from_category(category)
247
+ audio = self.audio_processor.load_audio(filepath)
248
+ audio_segments.append(audio)
249
+ filenames_list.append(filename)
250
+
251
+ # Build final audio with guaranteed silences between clips
252
+ output_audio_path = self.audio_output / f"{sample_id}.wav"
253
+ final_audio = build_clip_sequence_with_silences(
254
+ audio_segments,
255
+ clip_duration_seconds,
256
+ min_silence_ms=self.min_silence_ms,
257
+ max_extra_silence_per_gap_ms=self.max_extra_silence_per_gap_ms,
258
+ crossfade_ms=self.crossfade_ms
259
+ )
260
+
261
+ # Save the audio
262
+ final_audio.export(str(output_audio_path), format="wav")
263
+
264
+ # Determine correct answer and generate questions based on question type
265
+ # CRITICAL BUG FIX: Verify answer_category is actually at answer_position
266
+ if selected_categories[answer_position] != answer_category:
267
+ self.logger.error(f"Sample {sample_id}: Answer mismatch! Expected {answer_category} at position {answer_position}, got {selected_categories[answer_position]}")
268
+ # Force correct by using actual category at answer_position
269
+ correct_category = selected_categories[answer_position]
270
+ else:
271
+ correct_category = answer_category
272
+
273
+ if question_type == 'first':
274
+ mcq_question = self.task_config['mcq_questions']['first']
275
+ open_text_question = self.task_config['open_text_questions']['first']
276
+
277
+ elif question_type == 'last':
278
+ mcq_question = self.task_config['mcq_questions']['last']
279
+ open_text_question = self.task_config['open_text_questions']['last']
280
+
281
+ elif question_type == 'second':
282
+ mcq_question = self.task_config['mcq_questions']['second']
283
+ open_text_question = self.task_config['open_text_questions']['second']
284
+
285
+ elif question_type == 'second_last':
286
+ mcq_question = self.task_config['mcq_questions']['second_last']
287
+ open_text_question = self.task_config['open_text_questions']['second_last']
288
+
289
+ elif question_type == 'after':
290
+ # Reference is the sound before answer_position
291
+ if answer_position > 0:
292
+ reference_category = selected_categories[answer_position - 1]
293
+ mcq_question = self.task_config['mcq_questions']['after'].format(sound1=reference_category)
294
+ open_text_question = self.task_config['open_text_questions']['after'].format(sound1=reference_category)
295
+ else:
296
+ # Fallback shouldn't happen but handle gracefully
297
+ mcq_question = self.task_config['mcq_questions']['first']
298
+ open_text_question = self.task_config['open_text_questions']['first']
299
+
300
+ else: # before
301
+ # Reference is the sound after answer_position
302
+ if answer_position < n_clips - 1:
303
+ reference_category = selected_categories[answer_position + 1]
304
+ mcq_question = self.task_config['mcq_questions']['before'].format(sound2=reference_category)
305
+ open_text_question = self.task_config['open_text_questions']['before'].format(sound2=reference_category)
306
+ else:
307
+ # Defensive fallback (unreachable for n_clips >= 2, since answer_position <= n_clips - 2)
308
+ correct_category = selected_categories[0]
309
+ mcq_question = self.task_config['mcq_questions']['first']
310
+ open_text_question = self.task_config['open_text_questions']['first']
311
+ question_type = 'first'
312
+
313
+ # Generate MCQ
314
+ mcq_data = self.question_generator.generate_category_mcq(
315
+ mcq_question,
316
+ correct_category,
317
+ selected_categories,
318
+ self.dataset.CATEGORIES
319
+ )
320
+
321
+ # Generate open-text question
322
+ open_text_data = self.question_generator.generate_category_open_text(
323
+ open_text_question,
324
+ correct_category
325
+ )
326
+
327
+ # Also generate a sequence question for open-text
328
+ sequence_question = self.task_config['open_text_questions']['sequence']
329
+ sequence_data = self.question_generator.generate_sequence_open_text(
330
+ sequence_question,
331
+ selected_categories
332
+ )
333
+
334
+ # Create metadata
335
+ metadata = {
336
+ 'id': sample_id,
337
+ 'audio_path': str(output_audio_path.relative_to(self.output_base.parent)),
338
+ 'n_clips': n_clips,
339
+ 'question_type': question_type,
340
+ 'audio_sequence': selected_categories,
341
+ 'correct_answer_category': correct_category,
342
+ 'source_files': filenames_list,
343
+ 'mcq_question': mcq_data['question'],
344
+ 'mcq_options': mcq_data['options'],
345
+ 'mcq_correct_answer': mcq_data['correct_answer'],
346
+ 'open_text_question': open_text_data['question'],
347
+ 'open_text_answer': open_text_data['correct_answer'],
348
+ 'sequence_question': sequence_data['question'],
349
+ 'sequence_answer': sequence_data['correct_answer']
350
+ }
351
+
352
+ self.logger.info(f"Generated order sample {sample_id}: {question_type}, {n_clips} clips")
353
+
354
+ return metadata
355
+
356
+ def generate_dataset(self) -> tuple:
357
+ """
358
+ Generate the complete order task dataset.
359
+
360
+ Uses generate_sample_durations_for_task() to pre-generate exact sample durations
361
+ that sum to exactly the target task duration. This guarantees:
362
+ - Exact coverage of target duration
363
+ - No estimation errors from average-based calculation
364
+
365
+ Returns:
366
+ Tuple of (mcq_csv_path, open_text_csv_path, sequence_csv_path)
367
+ """
368
+ # Generate sample durations upfront (guarantees exact total duration)
369
+ sample_durations = generate_sample_durations_for_task(
370
+ self.task_duration_hours,
371
+ self.min_clip_duration,
372
+ self.max_clip_duration
373
+ )
374
+ num_samples = len(sample_durations)
375
+
376
+ self.logger.info(f"Generating {num_samples} order task samples (target: {self.task_duration_hours}h, exact fill)...")
377
+
378
+ # Calculate effective max clips each sample can use (accounting for silence reduction)
379
+ # This matches the logic in generate_sample()
380
+ max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10)
381
+ sample_effective_max_clips = []
382
+
383
+ for duration in sample_durations:
384
+ max_clips, _ = get_max_clip_num_to_be_joined(
385
+ duration,
386
+ self.source_clip_duration,
387
+ self.min_silence_ms
388
+ )
389
+ # Apply the same constraints as generate_sample()
390
+ effective_max = min(max_clips, max_clips_per_sample, len(self.dataset.CATEGORIES))
391
+ sample_effective_max_clips.append(effective_max)
392
+
393
+ # Create capacity-aware balanced question type distribution
394
+ # Categorize question types by clip requirements
395
+ question_types = self.task_config['question_types']
396
+
397
+ # Separate into tiers based on clip requirements
398
+ basic_types = ['first', 'last', 'after', 'before'] # Need >= 2 clips
399
+ advanced_types = ['second', 'second_last'] # Need >= min_clips_for_second
400
+
401
+ # Count how many samples can support each tier (use effective max, not raw max)
402
+ samples_for_basic = sum(1 for emc in sample_effective_max_clips if emc >= 2)
403
+ samples_for_advanced = sum(1 for emc in sample_effective_max_clips if emc >= self.min_clips_for_second)
404
+
405
+ # Create list of (sample_idx, duration, effective_max_clips)
406
+ sample_info = [(i, sample_durations[i], sample_effective_max_clips[i]) for i in range(num_samples)]
407
+
408
+ # Sort by capacity (descending) - assign advanced types to high-capacity samples
409
+ sample_info.sort(key=lambda x: x[2], reverse=True)
410
+
411
+ # Calculate distribution: prefer advanced types for longer clips
412
+ samples_per_type = num_samples // len(question_types)
413
+ remainder = num_samples % len(question_types)
414
+
415
+ # Build assignment pool - advanced types first (for high-capacity samples)
416
+ assignment_pool = []
417
+ for qtype in advanced_types:
418
+ count = samples_per_type + (1 if remainder > 0 else 0)
419
+ assignment_pool.extend([qtype] * count)
420
+ remainder = max(0, remainder - 1)
421
+
422
+ for qtype in basic_types:
423
+ count = samples_per_type + (1 if remainder > 0 else 0)
424
+ assignment_pool.extend([qtype] * count)
425
+ remainder = max(0, remainder - 1)
426
+
427
+ # Assign question types based on capacity
428
+ balanced_assignments = [None] * num_samples
429
+
430
+ for idx, (sample_idx, duration, capacity) in enumerate(sample_info):
431
+ target_qtype = assignment_pool[idx]
432
+
433
+ # Validate and adjust if needed
434
+ valid_types = self._get_valid_question_types(capacity)
435
+
436
+ if target_qtype not in valid_types:
437
+ # Assign a valid alternative - prefer similar types
438
+ if target_qtype in advanced_types and any(t in valid_types for t in basic_types):
439
+ # Downgrade to basic type
440
+ target_qtype = random.choice([t for t in basic_types if t in valid_types])
441
+ else:
442
+ # Fallback to any valid type
443
+ target_qtype = random.choice(valid_types)
444
+
445
+ balanced_assignments[sample_idx] = target_qtype
446
+
447
+ # Log the actual distribution after capacity-aware assignment
448
+ from collections import Counter
449
+ type_dist = Counter(balanced_assignments)
450
+ self.logger.info(f"Balanced question type distribution (after capacity-aware assignment): {dict(sorted(type_dist.items()))}")
451
+
452
+ all_metadata = []
453
+
454
+ for i, target_duration in enumerate(sample_durations):
455
+ metadata = self.generate_sample(i, target_question_type=balanced_assignments[i], target_duration_seconds=target_duration)
456
+ all_metadata.append(metadata)
+
+ # Save MCQ CSV
457
+ mcq_csv_path = self.output_base / 'order_mcq.csv'
458
+ self._save_mcq_csv(all_metadata, mcq_csv_path)
459
+
460
+ # Save open-text CSV
461
+ open_text_csv_path = self.output_base / 'order_open_text.csv'
462
+ self._save_open_text_csv(all_metadata, open_text_csv_path)
463
+
464
+ # Save sequence CSV
465
+ sequence_csv_path = self.output_base / 'order_sequence.csv'
466
+ self._save_sequence_csv(all_metadata, sequence_csv_path)
467
+
468
+ # Save metadata CSV
469
+ metadata_csv_path = self.output_base / 'order_metadata.csv'
470
+ self._save_metadata_csv(all_metadata, metadata_csv_path)
471
+
472
+ self.logger.info(f"Order task dataset generation complete!")
473
+ self.logger.info(f" - MCQ CSV: {mcq_csv_path}")
474
+ self.logger.info(f" - Open-text CSV: {open_text_csv_path}")
475
+ self.logger.info(f" - Sequence CSV: {sequence_csv_path}")
476
+ self.logger.info(f" - Metadata CSV: {metadata_csv_path}")
477
+ self.logger.info(f" - Audio files: {self.audio_output}")
478
+
479
+ return mcq_csv_path, open_text_csv_path, sequence_csv_path
480
+
481
+ def _save_mcq_csv(self, metadata_list: List[Dict], output_path: Path):
482
+ """Save MCQ format CSV."""
483
+ with open(output_path, 'w', newline='') as f:
484
+ writer = csv.writer(f)
485
+ # Header
486
+ writer.writerow([
487
+ 'question', 'id', 'audio_path',
488
+ 'optionA', 'optionB', 'optionC', 'optionD',
489
+ 'correct', 'question_type', 'audio_sequence'
490
+ ])
491
+
492
+ # Data rows
493
+ for meta in metadata_list:
494
+ writer.writerow([
495
+ meta['mcq_question'],
496
+ meta['id'],
497
+ meta['audio_path'],
498
+ meta['mcq_options']['A'],
499
+ meta['mcq_options']['B'],
500
+ meta['mcq_options']['C'],
501
+ meta['mcq_options']['D'],
502
+ meta['mcq_correct_answer'],
503
+ meta['question_type'],
504
+ str(meta['audio_sequence'])
505
+ ])
506
+
507
+ def _save_open_text_csv(self, metadata_list: List[Dict], output_path: Path):
508
+ """Save open-text format CSV."""
509
+ with open(output_path, 'w', newline='') as f:
510
+ writer = csv.writer(f)
511
+ # Header
512
+ writer.writerow([
513
+ 'question', 'id', 'audio_path', 'answer',
514
+ 'question_type', 'audio_sequence'
515
+ ])
516
+
517
+ # Data rows
518
+ for meta in metadata_list:
519
+ writer.writerow([
520
+ meta['open_text_question'],
521
+ meta['id'],
522
+ meta['audio_path'],
523
+ meta['open_text_answer'],
524
+ meta['question_type'],
525
+ str(meta['audio_sequence'])
526
+ ])
527
+
528
+ def _save_sequence_csv(self, metadata_list: List[Dict], output_path: Path):
529
+ """Save sequence question CSV."""
530
+ with open(output_path, 'w', newline='') as f:
531
+ writer = csv.writer(f)
532
+ # Header
533
+ writer.writerow([
534
+ 'question', 'id', 'audio_path', 'answer', 'audio_sequence'
535
+ ])
536
+
537
+ # Data rows
538
+ for meta in metadata_list:
539
+ writer.writerow([
540
+ meta['sequence_question'],
541
+ meta['id'],
542
+ meta['audio_path'],
543
+ meta['sequence_answer'],
544
+ str(meta['audio_sequence'])
545
+ ])
546
+
547
+ def _save_metadata_csv(self, metadata_list: List[Dict], output_path: Path):
548
+ """Save detailed metadata CSV."""
549
+ with open(output_path, 'w', newline='') as f:
550
+ writer = csv.writer(f)
551
+ # Header
552
+ writer.writerow([
553
+ 'id', 'audio_path', 'n_clips', 'question_type',
554
+ 'audio_sequence', 'correct_answer', 'source_files'
555
+ ])
556
+
557
+ # Data rows
558
+ for meta in metadata_list:
559
+ writer.writerow([
560
+ meta['id'],
561
+ meta['audio_path'],
562
+ meta['n_clips'],
563
+ meta['question_type'],
564
+ str(meta['audio_sequence']),
565
+ meta['correct_answer_category'],
566
+ str(meta['source_files'])
567
+ ])
568
+
569
+
570
+ def main(config_path: str = None):
571
+ """Main entry point for order task generation."""
572
+ import yaml
573
+
574
+ # Load configuration
575
+ if config_path is None:
576
+ config_path = Path(__file__).parent.parent / 'config.yaml'
577
+
578
+ with open(config_path, 'r') as f:
579
+ config = yaml.safe_load(f)
580
+
581
+ # Set random seed
582
+ set_random_seed(config['random_seed'])
583
+
584
+ # Setup logger
585
+ logger = setup_logger(
586
+ 'order_task',
587
+ log_file=str(Path(config['output']['base_path']) / config['logging']['log_file']),
588
+ level=config['logging']['level'],
589
+ console_output=config['logging']['console_output']
590
+ )
591
+
592
+ # Generate dataset
593
+ generator = OrderTaskGenerator(config, logger)
594
+ generator.generate_dataset()
595
+
596
+
597
+ if __name__ == '__main__':
598
+ main()
tasks/task_volume.py ADDED
@@ -0,0 +1,732 @@
1
+ """
2
+ Task 4: Volume - Generate volume comparison questions
3
+
4
+ This task joins multiple audio sources with different volume levels
5
+ and asks questions about the loudest or softest sound.
6
+ """
7
+
8
+ import csv
9
+ import random
10
+ import math
11
+ from pathlib import Path
12
+ from typing import Dict, List, Tuple, Optional
13
+
14
+ import sys
15
+ sys.path.append(str(Path(__file__).parent.parent))
16
+
17
+ from utils import (
18
+ AudioProcessor, ESC50Dataset, QuestionGenerator, LLMQuestionGenerator,
19
+ setup_logger, set_random_seed, calculate_num_samples_for_task,
20
+ generate_single_clip_duration, get_max_clip_num_to_be_joined,
21
+ build_clip_sequence_with_silences, generate_sample_durations_for_task,
22
+ get_lufs_loudness, normalize_to_lufs
23
+ )
24
+
25
+
26
+ class VolumeTaskGenerator:
27
+ """Generator for volume comparison task dataset."""
28
+
29
+ def __init__(self, config: Dict, logger):
30
+ """
31
+ Initialize volume task generator.
32
+
33
+ Args:
34
+ config: Configuration dictionary
35
+ logger: Logger instance
36
+ """
37
+ self.config = config
38
+ self.logger = logger
39
+ self.task_config = config['tasks']['volume']
40
+
41
+ # Initialize components
42
+ self.dataset = ESC50Dataset(
43
+ config['esc50']['metadata_path'],
44
+ config['esc50']['audio_path'],
45
+ config # Pass config for class subset loading
46
+ )
47
+ self.audio_processor = AudioProcessor(
48
+ crossfade_duration=config['audio']['crossfade_duration'],
49
+ silence_duration=config['audio']['silence_duration'],
50
+ with_silence=config['audio']['with_silence'],
51
+ normalize=config['audio']['normalize'],
52
+ normalize_target_dBFS=config['audio']['normalize_target_dBFS'],
53
+ synthetic_silence_path=config['synthetic_silence']['path']
54
+ )
55
+ self.question_generator = QuestionGenerator(
56
+ num_options=config['mcq']['num_options'],
57
+ option_labels=config['mcq']['option_labels'],
58
+ distractor_strategy=config['mcq']['distractor_strategy']
59
+ )
60
+
61
+ # Initialize LLM question generator
62
+ self.llm_enabled = config.get('llm', {}).get('enabled', False)
63
+ self.llm_generator = LLMQuestionGenerator(
64
+ enabled=self.llm_enabled,
65
+ template_questions=self.task_config
66
+ )
67
+
68
+ # Duration settings from config
69
+ self.min_clip_duration = config['audio']['min_clip_duration']
70
+ self.max_clip_duration = config['audio']['max_clip_duration']
71
+ # Duration of individual source clips (ESC-50 default is 5s)
72
+ self.source_clip_duration = config['audio'].get('source_clip_duration', 5.0)
73
+ self.min_silence_ms = config['audio'].get('min_silence_duration', 100)
74
+ self.max_extra_silence_per_gap_ms = config['audio'].get('max_extra_silence_per_gap', 500)
75
+ self.crossfade_ms = config['audio'].get('crossfade_duration', 0)
76
+ self.task_duration_hours = self.task_config['task_duration_size']
77
+
78
+ # Volume task specific settings
79
+ self.normalize_to_baseline = self.task_config.get('normalize_to_baseline', True)
80
+ self.baseline_dBFS = self.task_config.get('baseline_dBFS', -20.0)
81
+ self.use_same_clip_different_volumes = self.task_config.get('use_same_clip_different_volumes', False)
82
+ self.repetitions_per_source = self.task_config.get('repetitions_per_source', [2, 3, 4])
83
+ if isinstance(self.repetitions_per_source, int):
84
+ self.repetitions_per_source = [self.repetitions_per_source]
85
+
86
+ # Volume gap multipliers (similar to duration task)
87
+ self.multiplier_max_loudness = self.task_config.get('multiplier_max_loudness', 1.5)
88
+ self.multiplier_min_loudness = self.task_config.get('multiplier_min_loudness', 0.5)
89
+ self.reject_if_gap_not_met = self.task_config.get('reject_if_gap_not_met', True)
90
+
91
+ # LUFS vs dBFS loudness measurement option
92
+ # LUFS (Loudness Units Full Scale) measures PERCEIVED loudness
93
+ # dBFS measures RMS amplitude - does NOT account for frequency sensitivity
94
+ # LUFS is recommended for comparing different sound types
95
+ self.use_lufs = self.task_config.get('use_lufs', True)
96
+ self.baseline_lufs = self.task_config.get('baseline_lufs', -23.0) # EBU R128 standard
97
+
98
+ # Set up output paths
99
+ self.output_base = Path(config['output']['base_path']) / 'volume'
100
+ self.output_base.mkdir(parents=True, exist_ok=True)
101
+ self.audio_output = self.output_base / 'audios'
102
+ self.audio_output.mkdir(parents=True, exist_ok=True)
103
+
104
+ # Create balanced sampling pool for num_clips
105
+ self.clips_count_pool = []
106
+
107
+ def _normalize_to_baseline(self, audio: "AudioSegment") -> "AudioSegment":
108
+ """
109
+ Normalize audio to the baseline loudness level.
110
+
111
+ Uses LUFS (perceived loudness) if use_lufs=True, otherwise dBFS.
112
+ This ensures all clips start from the same perceived loudness before
113
+ applying volume adjustments.
114
+
115
+ Args:
116
+ audio: Input audio segment
117
+
118
+ Returns:
119
+ Normalized audio segment
120
+ """
121
+ if not self.normalize_to_baseline:
122
+ return audio
123
+
124
+ if self.use_lufs:
125
+ # Use LUFS-based normalization (perceived loudness)
126
+ normalized = normalize_to_lufs(audio, self.baseline_lufs)
127
+ self.logger.debug(
128
+ f"Normalized to baseline LUFS: {get_lufs_loudness(audio):.2f} -> {get_lufs_loudness(normalized):.2f} LUFS"
129
+ )
130
+ return normalized
131
+ else:
132
+ # Use dBFS normalization (RMS amplitude)
133
+ change_in_dBFS = self.baseline_dBFS - audio.dBFS
134
+ normalized = audio.apply_gain(change_in_dBFS)
135
+ self.logger.debug(
136
+ f"Normalized to baseline dBFS: {audio.dBFS:.2f} -> {normalized.dBFS:.2f} dBFS"
137
+ )
138
+ return normalized
139
+
140
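Under the dBFS branch this is just a gain offset; a standalone pydub sketch (assuming pydub is installed; the -20 dBFS target mirrors the config default):

```python
from pydub.generators import Sine

# Sketch of dBFS baseline normalization: apply the gain that lands on the target.
audio = Sine(440).to_audio_segment(duration=1000)   # 1s test tone
target_dBFS = -20.0
normalized = audio.apply_gain(target_dBFS - audio.dBFS)
print(round(normalized.dBFS, 1))   # -20.0
```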
+ def _get_amplitude_loudness(self, audio: "AudioSegment") -> float:
141
+ """
142
+ Get the loudness of an audio clip.
143
+
144
+ Uses LUFS (perceived loudness) if use_lufs=True, otherwise dBFS.
145
+
146
+ Args:
147
+ audio: Input audio segment
148
+
149
+ Returns:
150
+ Loudness in LUFS or dBFS depending on configuration
151
+ """
152
+ if self.use_lufs:
153
+ return get_lufs_loudness(audio)
154
+ else:
155
+ return audio.dBFS
156
+
157
+ def _verify_loudness_gap(
158
+ self,
159
+ volume_levels: List[float],
160
+ question_type: str
161
+ ) -> Tuple[bool, int, Dict]:
162
+ """
163
+ Verify that loudness gap constraint is satisfied.
164
+
165
+ For MAX_LOUDNESS: max_volume >= second_max × multiplier_max
166
+ For MIN_LOUDNESS: min_volume <= second_min × multiplier_min
167
+
168
+ Since we work with dB (logarithmic), the gap is in dB difference:
169
+ - For max: max_dB - second_max_dB >= required_gap_dB
170
+ - For min: second_min_dB - min_dB >= required_gap_dB
171
+
172
+ The multiplier translates to dB: 1.5x linear = ~3.5dB, 2x = ~6dB
173
+
174
+ Args:
175
+ volume_levels: List of volume adjustments in dB
176
+ question_type: "max_loudness" or "min_loudness"
177
+
178
+ Returns:
179
+ Tuple of (gap_satisfied, answer_idx, metadata)
180
+ """
181
+ import math
182
+
183
+ sorted_levels = sorted(volume_levels, reverse=True) # Highest first
184
+
185
+ if question_type == "max_loudness":
186
+ max_level = sorted_levels[0]
187
+ second_max = sorted_levels[1] if len(sorted_levels) > 1 else sorted_levels[0]
188
+
189
+ # Convert multiplier to dB difference
190
+ # multiplier 1.5 means 1.5x louder in amplitude = 20*log10(1.5) ≈ 3.5 dB
191
+ required_gap_dB = 20 * math.log10(self.multiplier_max_loudness)
192
+ actual_gap_dB = max_level - second_max
193
+
194
+ gap_satisfied = actual_gap_dB >= required_gap_dB
195
+ answer_idx = volume_levels.index(max_level)
196
+
197
+ metadata = {
198
+ 'max_level_dB': max_level,
199
+ 'second_max_dB': second_max,
200
+ 'required_gap_dB': required_gap_dB,
201
+ 'actual_gap_dB': actual_gap_dB,
202
+ 'multiplier': self.multiplier_max_loudness
203
+ }
204
+
205
+ else: # min_loudness
206
+ min_level = sorted_levels[-1]
207
+ second_min = sorted_levels[-2] if len(sorted_levels) > 1 else sorted_levels[-1]
208
+
209
+ # For min, we want min to be multiplier times softer
210
+ # multiplier 0.5 means 0.5x amplitude = 20*log10(0.5) ≈ -6 dB
211
+ # So second_min - min_level should be >= 6 dB
212
+ required_gap_dB = abs(20 * math.log10(self.multiplier_min_loudness))
213
+ actual_gap_dB = second_min - min_level
214
+
215
+ gap_satisfied = actual_gap_dB >= required_gap_dB
216
+ answer_idx = volume_levels.index(min_level)
217
+
218
+ metadata = {
219
+ 'min_level_dB': min_level,
220
+ 'second_min_dB': second_min,
221
+ 'required_gap_dB': required_gap_dB,
222
+ 'actual_gap_dB': actual_gap_dB,
223
+ 'multiplier': self.multiplier_min_loudness
224
+ }
225
+
226
+ return gap_satisfied, answer_idx, metadata
227
+
228
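The multiplier-to-dB conversion used above, worked through as numbers:

```python
import math

# Amplitude-ratio multipliers expressed as dB gaps (20*log10 for amplitude).
for m in (1.5, 2.0, 0.5):
    print(m, round(20 * math.log10(m), 2))
# 1.5 3.52   -> a 1.5x louder clip must sit ~3.5 dB above the runner-up
# 2.0 6.02
# 0.5 -6.02  -> the gap check uses abs(), so 0.5x means ~6 dB below
```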
+ def generate_volume_levels(self, n_clips: int, question_type: str = None) -> List[float]:
229
+ """
230
+ Generate volume levels dynamically based on multiplier constraints.
231
+
232
+ The levels are generated to ensure proper gap for the question type:
233
+ - For max_loudness: the loudest is clearly distinguishable (gap = multiplier_max)
234
+ - For min_loudness: the softest is clearly distinguishable (gap = multiplier_min)
235
+
236
+ Args:
237
+ n_clips: Number of clips
238
+ question_type: "max_loudness" or "min_loudness" to ensure proper gap
239
+
240
+ Returns:
241
+ List of volume adjustments in dB (integers)
242
+ """
243
+ # Base spacing between adjacent volume levels (minimum audible difference)
244
+ # 6 dB = 2x amplitude, 12 dB = 4x amplitude (clearly distinguishable)
245
+ min_diff = 12 # 12 dB is a very noticeable difference (~4x amplitude; perceived loudness roughly doubles per 10 dB)
246
+
247
+ # Calculate required gap based on multiplier (round up to nearest int)
248
+ if question_type == "max_loudness":
249
+ required_gap = int(math.ceil(20 * math.log10(self.multiplier_max_loudness)))
250
+ elif question_type == "min_loudness":
251
+ required_gap = int(math.ceil(abs(20 * math.log10(self.multiplier_min_loudness))))
252
+ else:
253
+ required_gap = min_diff
254
+
255
+ # Ensure gap is at least min_diff
256
+ required_gap = max(required_gap, min_diff)
257
+
258
+ if question_type == "max_loudness":
259
+ # Generate levels where max has clear gap from others
260
+ # Max level (answer) at a high value - MUCH louder
261
+ max_level = 18 # dB adjustment = ~8x louder than baseline
262
+
263
+ # Other levels should be at least required_gap below max
264
+ # Spread them out with min_diff spacing
265
+ other_levels = []
266
+ current_level = max_level - required_gap
267
+ for i in range(n_clips - 1):
268
+ other_levels.append(current_level)
269
+ current_level -= min_diff
270
+
271
+ selected_levels = other_levels + [max_level]
272
+
273
+ elif question_type == "min_loudness":
274
+ # Generate levels where min has clear gap from others
275
+ # Min level (answer) at a low value - MUCH quieter
276
+ min_level = -24 # dB adjustment = ~1/16th of baseline volume
277
+
278
+ # Other levels should be at least required_gap above min
279
+ # Spread them out with min_diff spacing
280
+ other_levels = []
281
+ current_level = min_level + required_gap
282
+ for i in range(n_clips - 1):
283
+ other_levels.append(current_level)
284
+ current_level += min_diff
285
+
286
+ selected_levels = [min_level] + other_levels
287
+
288
+ else:
289
+ # Default: evenly spaced levels centered around 0
290
+ total_range = (n_clips - 1) * min_diff
291
+ start_level = -total_range // 2
292
+ selected_levels = [start_level + i * min_diff for i in range(n_clips)]
293
+
294
+ # Shuffle to randomize order in the audio
295
+ random.shuffle(selected_levels)
296
+
297
+ return selected_levels
298
+
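To make the construction concrete, here is what the max_loudness branch produces for a hypothetical n_clips=4 with required_gap = min_diff = 12 dB (the numbers mirror the defaults hard-coded in this method, not values read from config):

```python
# Reproduces the max_loudness branch for n_clips=4, required_gap=min_diff=12
max_level, required_gap, min_diff, n_clips = 18, 12, 12, 4
others, level = [], max_level - required_gap
for _ in range(n_clips - 1):
    others.append(level)
    level -= min_diff
print(others + [max_level])  # [6, -6, -18, 18] -> shuffled before use
```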
299
+ def generate_sample(self, sample_id: int, target_question_type: str = None, target_duration_seconds: float = None) -> Dict:
300
+ """
301
+ Generate a single volume task sample.
302
+
303
+ Pipeline:
304
+ 1. Pick dataset -> pick class -> pick audio clip
305
+ 2. NORMALIZE all clips to baseline dBFS (critical for controlled comparison)
306
+ 3. Apply different volume adjustments to each clip
307
+ 4. Concatenate clips with silences
308
+
309
+ Optionally: use same clip with different volume levels if configured.
310
+
311
+ Args:
312
+ sample_id: Sample ID number
313
+ target_question_type: Target question type for balanced distribution
314
+ target_duration_seconds: Pre-generated target duration (from generate_sample_durations_for_task)
315
+
316
+ Returns:
317
+ Dictionary with sample metadata
318
+ """
319
+ # Use pre-generated duration or generate one (backward compatibility)
320
+ if target_duration_seconds is not None:
321
+ clip_duration_seconds = target_duration_seconds
322
+ else:
323
+ clip_duration_seconds = generate_single_clip_duration(
324
+ self.min_clip_duration,
325
+ self.max_clip_duration
326
+ )
327
+
328
+ # Calculate how many clips we need using the new helper
329
+ max_clips, remainder_seconds = get_max_clip_num_to_be_joined(
330
+ clip_duration_seconds,
331
+ self.source_clip_duration,
332
+ self.min_silence_ms
333
+ )
334
+
335
+ max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10)
336
+
337
+ # Silence reduction strategy: subsample from [max(2, max_clips-3), min(max_clips, max_clips_per_sample, n_categories)]
338
+ # This ensures we use close to max_clips that fit, reducing excessive silence
339
+
340
+ # Calculate valid range for this sample's duration
341
+ min_clips_for_sample = max(2, max_clips - 3) # At least 2, preferably max_clips-3
342
+ max_clips_for_sample = min(max_clips, max_clips_per_sample, len(self.dataset.CATEGORIES))
343
+
344
+ # Validate range
345
+ if max_clips_for_sample < 2:
346
+ raise ValueError(
347
+ f"Sample {sample_id}: Cannot generate volume task - need at least 2 clips. "
348
+ f"max_clips={max_clips}, max_clips_per_sample={max_clips_per_sample}, "
349
+ f"duration={clip_duration_seconds:.1f}s. Increase min_clip_duration."
350
+ )
351
+
352
+ if min_clips_for_sample > max_clips_for_sample:
353
+ raise ValueError(
354
+ f"Sample {sample_id}: Invalid clip range - min_clips ({min_clips_for_sample}) > max_clips ({max_clips_for_sample}). "
355
+ f"max_clips={max_clips}, max_clips_per_sample={max_clips_per_sample}, duration={clip_duration_seconds:.1f}s"
356
+ )
357
+
358
+ # Randomly select from valid range (NO balanced pool for volume task)
359
+ n_clips = random.randint(min_clips_for_sample, max_clips_for_sample)
360
+ n_clips = max(2, n_clips) # Ensure at least 2 for volume comparison
361
+
362
+ # Pre-select question type to determine answer position
363
+ # Use target question type if provided, otherwise randomly select
364
+ if target_question_type is not None:
365
+ question_type = target_question_type
366
+ else:
367
+ question_type = random.choice(self.task_config['question_types'])
368
+
369
+ # Generate volume levels and verify gap constraint
370
+ max_attempts = 10
371
+ gap_satisfied = False
372
+ volume_levels = None
373
+ gap_metadata = None
374
+
375
+ for attempt in range(max_attempts):
376
+ volume_levels = self.generate_volume_levels(n_clips, question_type)
377
+ gap_satisfied, answer_idx, gap_metadata = self._verify_loudness_gap(
378
+ volume_levels, question_type
379
+ )
380
+
381
+ if gap_satisfied:
382
+ break
383
+
384
+ self.logger.debug(
385
+ f"Sample {sample_id} attempt {attempt+1}: gap not satisfied, "
386
+ f"required={gap_metadata['required_gap_dB']:.1f}dB, "
387
+ f"actual={gap_metadata['actual_gap_dB']:.1f}dB"
388
+ )
389
+
390
+ if not gap_satisfied and self.reject_if_gap_not_met:
391
+ self.logger.warning(
392
+ f"Sample {sample_id} rejected: loudness gap not satisfied after {max_attempts} attempts"
393
+ )
394
+ return None
395
+
396
+ # Determine answer position based on question type
397
+ if question_type == 'max_loudness':
398
+ answer_idx = volume_levels.index(max(volume_levels))
399
+ else: # min_loudness
400
+ answer_idx = volume_levels.index(min(volume_levels))
401
+
402
+ # Select answer category from least-used categories
403
+ answer_category = self.dataset.get_least_used_categories(1)[0]
404
+
405
+ # Determine if using same clip with different volumes
406
+ if self.use_same_clip_different_volumes:
407
+ # Use ONE source clip repeated at different volume levels
408
+ selected_categories = [answer_category] * n_clips
409
+ # Track usage
410
+ self.dataset.category_usage_counts[answer_category] += 1
411
+ correct_category = answer_category
412
+ else:
413
+ # Use different source clips (original behavior)
414
+ # Sample remaining categories, ensuring balanced distribution
415
+ if n_clips <= len(self.dataset.CATEGORIES):
416
+ other_categories = self.dataset.get_least_used_categories(
417
+ n_clips - 1,
418
+ exclude=[answer_category]
419
+ )
420
+ else:
421
+ # Need more clips than unique categories
422
+ other_categories = self.dataset.get_least_used_categories(
423
+ min(n_clips - 1, len(self.dataset.CATEGORIES) - 1),
424
+ exclude=[answer_category]
425
+ )
426
+ # Add random repetitions if needed
427
+ while len(other_categories) < n_clips - 1:
428
+ other_categories.append(random.choice(self.dataset.CATEGORIES))
429
+
430
+ # Arrange categories with answer at correct position
431
+ selected_categories = []
432
+ other_idx = 0
433
+ for i in range(n_clips):
434
+ if i == answer_idx:
435
+ selected_categories.append(answer_category)
436
+ else:
437
+ selected_categories.append(other_categories[other_idx])
438
+ other_idx += 1
439
+
440
+ # Track usage of answer category
441
+ self.dataset.category_usage_counts[answer_category] += 1
442
+
443
+ # CRITICAL BUG FIX: Verify answer_category is actually at answer_idx
444
+ if selected_categories[answer_idx] != answer_category:
445
+ self.logger.error(f"Sample {sample_id}: Answer mismatch! Expected {answer_category} at index {answer_idx}, got {selected_categories[answer_idx]}")
446
+ correct_category = selected_categories[answer_idx]
447
+ else:
448
+ correct_category = answer_category
449
+
450
+ # Sample files and process audio
451
+ audio_segments = []
452
+ filenames_list = []
453
+ original_loudness = []
454
+ final_loudness = []
455
+
456
+ if self.use_same_clip_different_volumes:
457
+ # Load one file and repeat it with different volumes
458
+ filename, filepath = self.dataset.sample_file_from_category(answer_category)
459
+ base_audio = self.audio_processor.load_audio(filepath)
460
+ original_loudness_val = self._get_amplitude_loudness(base_audio)
461
+
462
+ # Normalize to baseline first
463
+ base_audio_normalized = self._normalize_to_baseline(base_audio)
464
+
465
+ for i in range(n_clips):
466
+ # Apply volume adjustment to normalized audio
467
+ audio_adjusted = self.audio_processor.adjust_volume(
468
+ base_audio_normalized,
469
+ volume_levels[i]
470
+ )
471
+ audio_segments.append(audio_adjusted)
472
+ filenames_list.append(filename)
473
+ original_loudness.append(original_loudness_val)
474
+ final_loudness.append(self._get_amplitude_loudness(audio_adjusted))
475
+ else:
476
+ # Use different files (original behavior but with normalization)
477
+ for i, category in enumerate(selected_categories):
478
+ filename, filepath = self.dataset.sample_file_from_category(category)
479
+ audio = self.audio_processor.load_audio(filepath)
480
+
481
+ # Record original loudness
482
+ orig_loud = self._get_amplitude_loudness(audio)
483
+ original_loudness.append(orig_loud)
484
+
485
+ # STEP 1: Normalize to baseline dBFS
486
+ audio_normalized = self._normalize_to_baseline(audio)
487
+
488
+ # STEP 2: Apply volume adjustment (relative to baseline)
489
+ audio_adjusted = self.audio_processor.adjust_volume(
490
+ audio_normalized,
491
+ volume_levels[i]
492
+ )
493
+
494
+ audio_segments.append(audio_adjusted)
495
+ filenames_list.append(filename)
496
+ final_loudness.append(self._get_amplitude_loudness(audio_adjusted))
497
+
498
+ # Build final audio with guaranteed silences between clips
499
+ output_audio_path = self.audio_output / f"{sample_id}.wav"
500
+ final_audio = build_clip_sequence_with_silences(
501
+ audio_segments,
502
+ clip_duration_seconds,
503
+ min_silence_ms=self.min_silence_ms,
504
+ max_extra_silence_per_gap_ms=self.max_extra_silence_per_gap_ms,
505
+ crossfade_ms=self.crossfade_ms
506
+ )
507
+
508
+ # Save the audio
509
+ final_audio.export(str(output_audio_path), format="wav")
510
+
511
+ # Generate MCQ
512
+ mcq_question = self.task_config['mcq_questions'][question_type]
513
+ mcq_data = self.question_generator.generate_category_mcq(
514
+ mcq_question,
515
+ correct_category,
516
+ selected_categories,
517
+ self.dataset.CATEGORIES
518
+ )
519
+
520
+ # Generate open-text question
521
+ open_text_question = self.task_config['open_text_questions'][question_type]
522
+ open_text_data = self.question_generator.generate_category_open_text(
523
+ open_text_question,
524
+ correct_category
525
+ )
526
+
527
+ # Create category to volume mapping
528
+ category_volumes = {
529
+ selected_categories[i]: volume_levels[i]
530
+ for i in range(n_clips)
531
+ }
532
+
533
+ # Create metadata
534
+ metadata = {
535
+ 'id': sample_id,
536
+ 'audio_path': str(output_audio_path.relative_to(self.output_base.parent)),
537
+ 'n_clips': n_clips,
538
+ 'question_type': question_type,
539
+ 'audio_sequence': selected_categories,
540
+ 'volume_levels_db': volume_levels,
541
+ 'category_volumes': category_volumes,
542
+ 'correct_answer_category': correct_category,
543
+ 'correct_volume_db': volume_levels[answer_idx],
544
+ 'source_files': filenames_list,
545
+ 'use_same_clip': self.use_same_clip_different_volumes,
546
+ 'baseline_dBFS': self.baseline_dBFS if self.normalize_to_baseline else None,
547
+ 'original_loudness_dBFS': original_loudness,
548
+ 'final_loudness_dBFS': final_loudness,
549
+ 'gap_satisfied': gap_satisfied,
550
+ 'gap_metadata': gap_metadata,
551
+ 'mcq_question': mcq_data['question'],
552
+ 'mcq_options': mcq_data['options'],
553
+ 'mcq_correct_answer': mcq_data['correct_answer'],
554
+ 'open_text_question': open_text_data['question'],
555
+ 'open_text_answer': open_text_data['correct_answer']
556
+ }
557
+
558
+ self.logger.info(
559
+ f"Generated volume sample {sample_id}: {question_type}, {n_clips} clips, "
560
+ f"volumes={volume_levels}, gap_satisfied={gap_satisfied}, "
561
+ f"gap={gap_metadata['actual_gap_dB']:.1f}dB (required={gap_metadata['required_gap_dB']:.1f}dB)"
562
+ )
563
+
564
+ return metadata
565
+
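The normalize-then-adjust two-step in the loop above reduces to two `apply_gain` calls in pydub. A minimal sketch, assuming a placeholder input file and a -20 dBFS baseline:

```python
from pydub import AudioSegment

clip = AudioSegment.from_file("some_clip.wav", format="wav")  # placeholder path

baseline_dBFS = -20.0
clip = clip.apply_gain(baseline_dBFS - clip.dBFS)  # step 1: normalize to baseline
clip = clip.apply_gain(12)                         # step 2: per-clip volume offset
print(clip.dBFS)                                   # ~ -8 dBFS
```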
566
+ def generate_dataset(self) -> tuple:
567
+ """
568
+ Generate the complete volume task dataset.
569
+
570
+ Uses generate_sample_durations_for_task() to pre-generate exact sample durations
571
+ that sum to exactly the target task duration. This guarantees:
572
+ - Exact coverage of target duration
573
+ - No estimation errors from average-based calculation
574
+
575
+ Returns:
576
+ Tuple of (mcq_csv_path, open_text_csv_path)
577
+ """
578
+ # Generate sample durations upfront (guarantees exact total duration)
579
+ sample_durations = generate_sample_durations_for_task(
580
+ self.task_duration_hours,
581
+ self.min_clip_duration,
582
+ self.max_clip_duration
583
+ )
584
+ num_samples = len(sample_durations)
585
+
586
+ self.logger.info(f"Generating {num_samples} volume task samples (target: {self.task_duration_hours}h, exact fill)...")
587
+
588
+ # Create balanced question type distribution (NO clips balancing for volume task)
589
+ question_types = self.task_config['question_types']
590
+ balanced_question_types = []
591
+ samples_per_type = num_samples // len(question_types)
592
+ remainder = num_samples % len(question_types)
593
+
594
+ for qtype in question_types:
595
+ count = samples_per_type + (1 if remainder > 0 else 0)
596
+ balanced_question_types.extend([qtype] * count)
597
+ remainder = max(0, remainder - 1)
598
+
599
+ random.shuffle(balanced_question_types)
600
+ from collections import Counter
601
+ type_dist = Counter(balanced_question_types)
602
+ self.logger.info(f"Balanced question type distribution: {dict(sorted(type_dist.items()))}")
603
+
604
+ all_metadata = []
605
+
606
+ for i, target_duration in enumerate(sample_durations):
607
+ metadata = self.generate_sample(i, target_question_type=balanced_question_types[i], target_duration_seconds=target_duration)
608
+ if metadata is not None: # generate_sample returns None when the loudness gap is rejected
+ all_metadata.append(metadata)
609
+
+ # Save MCQ CSV
+ mcq_csv_path = self.output_base / 'volume_mcq.csv'
610
+ self._save_mcq_csv(all_metadata, mcq_csv_path)
611
+
612
+ # Save open-text CSV
613
+ open_text_csv_path = self.output_base / 'volume_open_text.csv'
614
+ self._save_open_text_csv(all_metadata, open_text_csv_path)
615
+
616
+ # Save metadata CSV
617
+ metadata_csv_path = self.output_base / 'volume_metadata.csv'
618
+ self._save_metadata_csv(all_metadata, metadata_csv_path)
619
+
620
+ self.logger.info(f"Volume task dataset generation complete!")
621
+ self.logger.info(f" - MCQ CSV: {mcq_csv_path}")
622
+ self.logger.info(f" - Open-text CSV: {open_text_csv_path}")
623
+ self.logger.info(f" - Metadata CSV: {metadata_csv_path}")
624
+ self.logger.info(f" - Audio files: {self.audio_output}")
625
+
626
+ return mcq_csv_path, open_text_csv_path
627
+
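The round-robin question-type split used above can be checked on its own. A self-contained sketch with toy numbers:

```python
import random
from collections import Counter

num_samples, question_types = 11, ["max_loudness", "min_loudness"]
per_type, remainder = divmod(num_samples, len(question_types))

balanced = []
for qtype in question_types:
    balanced.extend([qtype] * (per_type + (1 if remainder > 0 else 0)))
    remainder = max(0, remainder - 1)

random.shuffle(balanced)
print(Counter(balanced))  # Counter({'max_loudness': 6, 'min_loudness': 5})
```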
628
+ def _save_mcq_csv(self, metadata_list: List[Dict], output_path: Path):
629
+ """Save MCQ format CSV."""
630
+ with open(output_path, 'w', newline='') as f:
631
+ writer = csv.writer(f)
632
+ # Header
633
+ writer.writerow([
634
+ 'question', 'id', 'audio_path',
635
+ 'optionA', 'optionB', 'optionC', 'optionD',
636
+ 'correct', 'question_type', 'audio_sequence',
637
+ 'category_volumes'
638
+ ])
639
+
640
+ # Data rows
641
+ for meta in metadata_list:
642
+ writer.writerow([
643
+ meta['mcq_question'],
644
+ meta['id'],
645
+ meta['audio_path'],
646
+ meta['mcq_options']['A'],
647
+ meta['mcq_options']['B'],
648
+ meta['mcq_options']['C'],
649
+ meta['mcq_options']['D'],
650
+ meta['mcq_correct_answer'],
651
+ meta['question_type'],
652
+ str(meta['audio_sequence']),
653
+ str(meta['category_volumes'])
654
+ ])
655
+
656
+ def _save_open_text_csv(self, metadata_list: List[Dict], output_path: Path):
657
+ """Save open-text format CSV."""
658
+ with open(output_path, 'w', newline='') as f:
659
+ writer = csv.writer(f)
660
+ # Header
661
+ writer.writerow([
662
+ 'question', 'id', 'audio_path', 'answer',
663
+ 'question_type', 'audio_sequence', 'category_volumes'
664
+ ])
665
+
666
+ # Data rows
667
+ for meta in metadata_list:
668
+ writer.writerow([
669
+ meta['open_text_question'],
670
+ meta['id'],
671
+ meta['audio_path'],
672
+ meta['open_text_answer'],
673
+ meta['question_type'],
674
+ str(meta['audio_sequence']),
675
+ str(meta['category_volumes'])
676
+ ])
677
+
678
+ def _save_metadata_csv(self, metadata_list: List[Dict], output_path: Path):
679
+ """Save detailed metadata CSV."""
680
+ with open(output_path, 'w', newline='') as f:
681
+ writer = csv.writer(f)
682
+ # Header
683
+ writer.writerow([
684
+ 'id', 'audio_path', 'n_clips', 'question_type',
685
+ 'audio_sequence', 'volume_levels_db', 'correct_answer_category',
686
+ 'correct_volume_db', 'source_files'
687
+ ])
688
+
689
+ # Data rows
690
+ for meta in metadata_list:
691
+ writer.writerow([
692
+ meta['id'],
693
+ meta['audio_path'],
694
+ meta['n_clips'],
695
+ meta['question_type'],
696
+ str(meta['audio_sequence']),
697
+ str(meta['volume_levels_db']),
698
+ meta['correct_answer_category'],
699
+ meta['correct_volume_db'],
700
+ str(meta['source_files'])
701
+ ])
702
+
703
+
704
+ def main(config_path: str = None):
705
+ """Main entry point for volume task generation."""
706
+ import yaml
707
+
708
+ # Load configuration
709
+ if config_path is None:
710
+ config_path = Path(__file__).parent.parent / 'config.yaml'
711
+
712
+ with open(config_path, 'r') as f:
713
+ config = yaml.safe_load(f)
714
+
715
+ # Set random seed
716
+ set_random_seed(config['random_seed'])
717
+
718
+ # Setup logger
719
+ logger = setup_logger(
720
+ 'volume_task',
721
+ log_file=str(Path(config['output']['base_path']) / config['logging']['log_file']),
722
+ level=config['logging']['level'],
723
+ console_output=config['logging']['console_output']
724
+ )
725
+
726
+ # Generate dataset
727
+ generator = VolumeTaskGenerator(config, logger)
728
+ generator.generate_dataset()
729
+
730
+
731
+ if __name__ == '__main__':
732
+ main()
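Assuming the repository layout shown in this commit, the generator can be run directly or driven from another script (the config path here is illustrative):

```python
# python tasks/task_volume.py   -- resolves config.yaml one level above tasks/
from tasks.task_volume import main

main(config_path="config.yaml")  # or pass an explicit config
```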
utils/__init__.py ADDED
@@ -0,0 +1,50 @@
1
+ """
2
+ Utility module initialization.
3
+ """
4
+
5
+ from .audio_utils import (
6
+ AudioProcessor, set_random_seed,
7
+ calculate_num_samples_for_task, generate_sample_durations_for_task,
8
+ generate_single_clip_duration,
9
+ concatenate_to_target_duration,
10
+ get_max_clip_num_to_be_joined,
11
+ build_clip_sequence_with_silences,
12
+ distribute_remainder_as_silences,
13
+ repeat_clips_to_fill_duration,
14
+ build_consecutive_sources_for_count_task,
15
+ build_random_order_for_count_task,
16
+ build_count_task_audio,
17
+ calculate_duration_slot_distribution,
18
+ build_duration_task_audio,
19
+ get_lufs_loudness,
20
+ normalize_to_lufs
21
+ )
22
+ from .dataset_utils import ESC50Dataset, PreprocessedESC50Dataset
23
+ from .logger import setup_logger
24
+ from .question_utils import QuestionGenerator
25
+ from .llm_utils import LLMQuestionGenerator
26
+
27
+ __all__ = [
28
+ 'AudioProcessor',
29
+ 'ESC50Dataset',
30
+ 'PreprocessedESC50Dataset',
31
+ 'QuestionGenerator',
32
+ 'LLMQuestionGenerator',
33
+ 'setup_logger',
34
+ 'set_random_seed',
35
+ 'calculate_num_samples_for_task',
36
+ 'generate_sample_durations_for_task',
37
+ 'generate_single_clip_duration',
38
+ 'concatenate_to_target_duration',
39
+ 'get_max_clip_num_to_be_joined',
40
+ 'build_clip_sequence_with_silences',
41
+ 'distribute_remainder_as_silences',
42
+ 'repeat_clips_to_fill_duration',
43
+ 'build_consecutive_sources_for_count_task',
44
+ 'build_random_order_for_count_task',
45
+ 'build_count_task_audio',
46
+ 'calculate_duration_slot_distribution',
47
+ 'build_duration_task_audio',
48
+ 'get_lufs_loudness',
49
+ 'normalize_to_lufs'
50
+ ]
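With this init module, downstream scripts can pull everything from one place. A usage sketch:

```python
from utils import AudioProcessor, set_random_seed, setup_logger

set_random_seed(42)
logger = setup_logger("demo")
proc = AudioProcessor()
```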
utils/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (1.28 kB)
utils/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (1.16 kB)
utils/__pycache__/audio_utils.cpython-312.pyc ADDED
Binary file (48 kB)
utils/__pycache__/audio_utils.cpython-314.pyc ADDED
Binary file (45.1 kB)
utils/__pycache__/dataset_utils.cpython-312.pyc ADDED
Binary file (26.2 kB)
utils/__pycache__/llm_utils.cpython-312.pyc ADDED
Binary file (5.87 kB)
utils/__pycache__/logger.cpython-312.pyc ADDED
Binary file (2.33 kB)
utils/__pycache__/question_utils.cpython-312.pyc ADDED
Binary file (9.7 kB)
utils/audio_utils.py ADDED
@@ -0,0 +1,1388 @@
1
+ """
2
+ Audio processing utilities for temporal reasoning dataset generation.
3
+ """
4
+
5
+ import os
6
+ import random
7
+ from pathlib import Path
8
+ from typing import Dict, List, Optional, Tuple, Union
9
+
10
+ import numpy as np
11
+ from pydub import AudioSegment
12
+
13
+ try:
14
+ import pyloudnorm as pyln
15
+ PYLOUDNORM_AVAILABLE = True
16
+ except ImportError:
17
+ PYLOUDNORM_AVAILABLE = False
18
+
19
+ from .logger import setup_logger
20
+
21
+ logger = setup_logger(__name__)
22
+
23
+
24
+ def get_lufs_loudness(audio: AudioSegment) -> float:
25
+ """
26
+ Calculate integrated LUFS loudness (perceived loudness) of an audio segment.
27
+
28
+ LUFS (Loudness Units Full Scale) is the broadcast standard for measuring
29
+ perceived loudness. It accounts for human hearing sensitivity to different
30
+ frequencies using K-weighting.
31
+
32
+ Args:
33
+ audio: Input audio segment (pydub AudioSegment)
34
+
35
+ Returns:
36
+ Loudness in LUFS (negative values, typically -70 to 0)
37
+ Returns dBFS if pyloudnorm is not available (fallback)
38
+ """
39
+ if not PYLOUDNORM_AVAILABLE:
40
+ logger.warning("pyloudnorm not available, falling back to dBFS")
41
+ return audio.dBFS
42
+
43
+ # Convert pydub AudioSegment to numpy array
44
+ samples = np.array(audio.get_array_of_samples())
45
+
46
+ # Handle stereo by reshaping
47
+ if audio.channels == 2:
48
+ samples = samples.reshape((-1, 2))
49
+
50
+ # Normalize to float [-1, 1]
51
+ if audio.sample_width == 1:
52
+ samples = samples.astype(np.float64) / 128.0 - 1.0
53
+ elif audio.sample_width == 2:
54
+ samples = samples.astype(np.float64) / 32768.0
55
+ elif audio.sample_width == 4:
56
+ samples = samples.astype(np.float64) / 2147483648.0
57
+ else:
58
+ samples = samples.astype(np.float64) / 32768.0 # default to 16-bit
59
+
60
+ # Create meter with sample rate
61
+ meter = pyln.Meter(audio.frame_rate)
62
+
63
+ # Measure integrated loudness
64
+ try:
65
+ loudness = meter.integrated_loudness(samples)
66
+ # Handle -inf for silent audio
67
+ if np.isinf(loudness):
68
+ loudness = -70.0 # Return very quiet value instead of -inf
69
+ return loudness
70
+ except Exception as e:
71
+ logger.warning(f"LUFS measurement failed: {e}, falling back to dBFS")
72
+ return audio.dBFS
73
+
74
+
75
+ def normalize_to_lufs(audio: AudioSegment, target_lufs: float = -23.0) -> AudioSegment:
76
+ """
77
+ Normalize audio to a target LUFS level (perceived loudness normalization).
78
+
79
+ This is superior to dBFS normalization for comparing different sound types
80
+ because it accounts for human hearing sensitivity.
81
+
82
+ Args:
83
+ audio: Input audio segment
84
+ target_lufs: Target loudness level in LUFS (default: -23 LUFS, EBU R128 standard)
85
+
86
+ Returns:
87
+ Loudness-normalized audio segment
88
+ """
89
+ if not PYLOUDNORM_AVAILABLE:
90
+ logger.warning("pyloudnorm not available, falling back to dBFS normalization")
91
+ change_db = target_lufs - audio.dBFS
92
+ return audio.apply_gain(change_db)
93
+
94
+ current_lufs = get_lufs_loudness(audio)
95
+
96
+ # Calculate required gain change
97
+ gain_db = target_lufs - current_lufs
98
+
99
+ # Apply gain
100
+ normalized = audio.apply_gain(gain_db)
101
+
102
+ logger.debug(f"Normalized LUFS: {current_lufs:.2f} -> {get_lufs_loudness(normalized):.2f} LUFS")
103
+
104
+ return normalized
105
+
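Together these two helpers give a measure/normalize round trip. A minimal usage sketch (placeholder path; requires pyloudnorm, otherwise both fall back to dBFS):

```python
from pydub import AudioSegment

clip = AudioSegment.from_file("some_clip.wav", format="wav")  # placeholder
print(get_lufs_loudness(clip))                   # e.g. -31.2 LUFS
clip_norm = normalize_to_lufs(clip, target_lufs=-23.0)
print(get_lufs_loudness(clip_norm))              # ~ -23.0 LUFS
```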
106
+
107
+ class AudioProcessor:
108
+ """Handles audio loading, processing, and concatenation."""
109
+
110
+ def __init__(
111
+ self,
112
+ crossfade_duration: int = 500,
113
+ silence_duration: int = 1000,
114
+ with_silence: bool = True,
115
+ normalize: bool = False,
116
+ normalize_target_dBFS: float = -20.0,
117
+ synthetic_silence_path: Optional[str] = None
118
+ ):
119
+ """
120
+ Initialize the audio processor.
121
+
122
+ Args:
123
+ crossfade_duration: Duration of crossfade in milliseconds
124
+ silence_duration: Duration of silence between clips in milliseconds
125
+ with_silence: Whether to add silence between clips
126
+ normalize: Whether to normalize audio levels
127
+ normalize_target_dBFS: Target dBFS level for normalization
128
+ synthetic_silence_path: Path to synthetic silence audio files
129
+ """
130
+ self.crossfade_duration = crossfade_duration
131
+ self.silence_duration = silence_duration
132
+ self.with_silence = with_silence
133
+ self.normalize = normalize
134
+ self.normalize_target_dBFS = normalize_target_dBFS
135
+ self.synthetic_silence_path = synthetic_silence_path
136
+ self._silence_cache = {}
137
+
138
+ def load_audio(self, audio_path: str) -> AudioSegment:
139
+ """
140
+ Load an audio file.
141
+
142
+ Args:
143
+ audio_path: Path to the audio file
144
+
145
+ Returns:
146
+ Loaded audio segment
147
+ """
148
+ try:
149
+ audio = AudioSegment.from_file(audio_path, format="wav")
150
+ logger.debug(f"Loaded audio: {audio_path}, duration: {len(audio)}ms")
151
+ return audio
152
+ except Exception as e:
153
+ logger.error(f"Error loading audio {audio_path}: {e}")
154
+ raise
155
+
156
+ def normalize_audio(self, audio: AudioSegment, target_dBFS: Optional[float] = None) -> AudioSegment:
157
+ """
158
+ Normalize audio to a target dBFS level.
159
+
160
+ Args:
161
+ audio: Input audio segment
162
+ target_dBFS: Target dBFS level (uses default if None)
163
+
164
+ Returns:
165
+ Normalized audio segment
166
+ """
167
+ if target_dBFS is None:
168
+ target_dBFS = self.normalize_target_dBFS
169
+
170
+ change_in_dBFS = target_dBFS - audio.dBFS
171
+ normalized = audio.apply_gain(change_in_dBFS)
172
+ logger.debug(f"Normalized audio: {audio.dBFS:.2f} dBFS -> {normalized.dBFS:.2f} dBFS")
173
+ return normalized
174
+
175
+ def adjust_volume(self, audio: AudioSegment, volume_db: float) -> AudioSegment:
176
+ """
177
+ Adjust audio volume by a specific dB amount.
178
+
179
+ Args:
180
+ audio: Input audio segment
181
+ volume_db: Volume adjustment in dB (positive = louder, negative = quieter)
182
+
183
+ Returns:
184
+ Volume-adjusted audio segment
185
+ """
186
+ adjusted = audio.apply_gain(volume_db)
187
+ logger.debug(f"Adjusted volume by {volume_db} dB: {audio.dBFS:.2f} -> {adjusted.dBFS:.2f} dBFS")
188
+ return adjusted
189
+
190
+ def get_silence(self, duration: Optional[int] = None) -> AudioSegment:
191
+ """
192
+ Get a silence audio segment, using synthetic silence if available.
193
+
194
+ Args:
195
+ duration: Duration in milliseconds (uses default if None)
196
+
197
+ Returns:
198
+ Silence audio segment
199
+ """
200
+ if duration is None:
201
+ duration = self.silence_duration
202
+
203
+ # Check cache first
204
+ if duration in self._silence_cache:
205
+ return self._silence_cache[duration]
206
+
207
+ # Try to load synthetic silence
208
+ if self.synthetic_silence_path and os.path.exists(self.synthetic_silence_path):
209
+ silence_files = list(Path(self.synthetic_silence_path).glob("*.wav"))
210
+ if silence_files:
211
+ silence = self.load_audio(str(random.choice(silence_files)))
212
+ # Adjust duration if needed
213
+ if len(silence) < duration:
214
+ # Repeat the silence
215
+ repetitions = (duration // len(silence)) + 1
216
+ silence = silence * repetitions
217
+ silence = silence[:duration]
218
+ self._silence_cache[duration] = silence
219
+ logger.debug(f"Using synthetic silence: {duration}ms")
220
+ return silence
221
+
222
+ # Fall back to pure silence
223
+ silence = AudioSegment.silent(duration=duration)
224
+ self._silence_cache[duration] = silence
225
+ logger.debug(f"Using pure silence: {duration}ms")
226
+ return silence
227
+
228
+ def concatenate_audios(
229
+ self,
230
+ audio_list: List[AudioSegment],
231
+ normalize_each: bool = False,
232
+ volume_adjustments: Optional[List[float]] = None
233
+ ) -> AudioSegment:
234
+ """
235
+ Concatenate multiple audio segments with crossfade and optional silence.
236
+
237
+ Args:
238
+ audio_list: List of audio segments to concatenate
239
+ normalize_each: Whether to normalize each audio before concatenation
240
+ volume_adjustments: Optional list of volume adjustments (in dB) for each audio
241
+
242
+ Returns:
243
+ Concatenated audio segment
244
+ """
245
+ if not audio_list:
246
+ raise ValueError("audio_list cannot be empty")
247
+
248
+ if len(audio_list) == 1:
249
+ audio = audio_list[0]
250
+ if normalize_each and self.normalize:
251
+ audio = self.normalize_audio(audio)
252
+ if volume_adjustments and len(volume_adjustments) > 0:
253
+ audio = self.adjust_volume(audio, volume_adjustments[0])
254
+ return audio
255
+
256
+ # Process first audio
257
+ merged = audio_list[0]
258
+ if normalize_each and self.normalize:
259
+ merged = self.normalize_audio(merged)
260
+ if volume_adjustments and len(volume_adjustments) > 0:
261
+ merged = self.adjust_volume(merged, volume_adjustments[0])
262
+
263
+ # Concatenate remaining audios
264
+ for i, audio in enumerate(audio_list[1:], start=1):
265
+ # Process current audio
266
+ current = audio
267
+ if normalize_each and self.normalize:
268
+ current = self.normalize_audio(current)
269
+ if volume_adjustments and len(volume_adjustments) > i:
270
+ current = self.adjust_volume(current, volume_adjustments[i])
271
+
272
+ # Add silence if configured
273
+ if self.with_silence:
274
+ silence = self.get_silence()
275
+ # Crossfade between audio and silence for smooth transition
276
+ merged = merged.append(silence, crossfade=self.crossfade_duration)
277
+
278
+ # Append current audio WITHOUT crossfade to avoid cutting it
279
+ # The crossfade with silence already provides smooth transition
280
+ merged = merged.append(current, crossfade=0)
281
+
282
+ logger.debug(f"Concatenated {len(audio_list)} audio segments, total duration: {len(merged)}ms")
283
+ return merged
284
+
285
+ def concatenate_audio_files(
286
+ self,
287
+ audio_paths: List[str],
288
+ output_path: str,
289
+ normalize_each: bool = False,
290
+ volume_adjustments: Optional[List[float]] = None,
291
+ target_durations: Optional[List[float]] = None
292
+ ) -> Tuple[AudioSegment, dict]:
293
+ """
294
+ Load, concatenate, and save multiple audio files.
295
+
296
+ Args:
297
+ audio_paths: List of paths to audio files
298
+ output_path: Path to save the concatenated audio
299
+ normalize_each: Whether to normalize each audio before concatenation
300
+ volume_adjustments: Optional list of volume adjustments (in dB) for each audio
301
+ target_durations: Optional list of target durations (in seconds) for each clip
302
+
303
+ Returns:
304
+ Tuple of (concatenated audio segment, metadata dict)
305
+ """
306
+ # Load all audio files
307
+ audio_segments = []
308
+ for i, path in enumerate(audio_paths):
309
+ audio = self.load_audio(path)
310
+
311
+ # Adjust duration if specified
312
+ if target_durations and i < len(target_durations):
313
+ target_ms = int(target_durations[i] * 1000)
314
+ audio = trim_or_repeat_audio(audio, target_ms)
315
+ logger.debug(f"Adjusted clip {i} to {len(audio)}ms (target: {target_ms}ms)")
316
+
317
+ audio_segments.append(audio)
318
+
319
+ # Concatenate
320
+ merged = self.concatenate_audios(audio_segments, normalize_each, volume_adjustments)
321
+
322
+ # Save
323
+ output_path = Path(output_path)
324
+ output_path.parent.mkdir(parents=True, exist_ok=True)
325
+ merged.export(str(output_path), format="wav")
326
+ logger.info(f"Saved concatenated audio: {output_path}")
327
+
328
+ # Create metadata
329
+ metadata = {
330
+ "output_path": str(output_path),
331
+ "source_files": audio_paths,
332
+ "num_sources": len(audio_paths),
333
+ "total_duration_ms": len(merged),
334
+ "total_duration_s": len(merged) / 1000.0,
335
+ "individual_durations_ms": [len(a) for a in audio_segments],
336
+ "individual_durations_s": [len(a) / 1000.0 for a in audio_segments],
337
+ "target_durations_s": target_durations if target_durations else [],
338
+ "volume_adjustments_db": volume_adjustments if volume_adjustments else []
339
+ }
340
+
341
+ return merged, metadata
342
+
343
+
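A typical end-to-end use of the class, with placeholder paths and per-clip gains:

```python
proc = AudioProcessor(crossfade_duration=500, silence_duration=1000)
merged, meta = proc.concatenate_audio_files(
    ["a.wav", "b.wav", "c.wav"],       # placeholder inputs
    output_path="out/merged.wav",
    volume_adjustments=[0, -6, 6],     # dB offset per clip
)
print(meta["total_duration_s"])
```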
344
+ def generate_sample_durations_for_task(
345
+ task_duration_hours: float,
346
+ min_clip_duration: float,
347
+ max_clip_duration: float
348
+ ) -> list:
349
+ """
350
+ Generate sample durations that exactly fill the target task duration.
351
+
352
+ Algorithm:
353
+ 1. Start with remaining = total_seconds
354
+ 2. While remaining >= min_clip_duration:
355
+ - Sample d ~ Uniform(min, min(max, remaining))
356
+ - Append d to durations list
357
+ - Subtract d from remaining
358
+ 3. Return shuffled list of durations
359
+
360
+ This ensures:
361
+ - Total of all durations ≈ task_duration (within min_clip_duration tolerance)
362
+ - Each duration is uniformly sampled within valid range
363
+ - No overshoot of target duration
364
+
365
+ Args:
366
+ task_duration_hours: Total duration for the task in hours
367
+ min_clip_duration: Minimum duration per clip in seconds
368
+ max_clip_duration: Maximum duration per clip in seconds
369
+
370
+ Returns:
371
+ List of sample durations in seconds (shuffled)
372
+ """
373
+ task_duration_seconds = task_duration_hours * 3600
374
+ remaining = task_duration_seconds
375
+ durations = []
376
+
377
+ while remaining >= min_clip_duration:
378
+ # Cap max at remaining to avoid overshoot
379
+ effective_max = min(max_clip_duration, remaining)
380
+
381
+ # If remaining is less than min, we can't fit another sample
382
+ if effective_max < min_clip_duration:
383
+ break
384
+
385
+ # Sample uniformly within valid range
386
+ d = random.uniform(min_clip_duration, effective_max)
387
+ durations.append(d)
388
+ remaining -= d
389
+
390
+ # Shuffle to randomize order (durations were generated sequentially)
391
+ random.shuffle(durations)
392
+
393
+ total_duration = sum(durations)
394
+ logger.info(f"Task duration target: {task_duration_hours}h ({task_duration_seconds:.1f}s)")
395
+ logger.info(f"Generated {len(durations)} sample durations, total: {total_duration:.1f}s")
396
+ logger.info(f"Duration range: [{min(durations):.1f}s, {max(durations):.1f}s], "
397
+ f"mean: {total_duration/len(durations):.1f}s")
398
+ logger.info(f"Unused remainder: {remaining:.1f}s ({remaining/task_duration_seconds*100:.2f}%)")
399
+
400
+ return durations
401
+
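The fill loop is short enough to reproduce with toy numbers (60 s target, clips in [8, 12] s):

```python
import random

remaining, durations = 60.0, []
while remaining >= 8.0:
    d = random.uniform(8.0, min(12.0, remaining))
    durations.append(d)
    remaining -= d

print(sum(durations), remaining)  # total within 8 s of target; remainder < 8 s
```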
402
+
403
+ def calculate_num_samples_for_task(
404
+ task_duration_hours: float,
405
+ min_clip_duration: float,
406
+ max_clip_duration: float
407
+ ) -> int:
408
+ """
409
+ Calculate number of samples needed to fill the task duration.
410
+
411
+ DEPRECATED: Use generate_sample_durations_for_task() instead for exact duration filling.
412
+ This function is kept for backward compatibility but uses average-based estimation.
413
+
414
+ Args:
415
+ task_duration_hours: Total duration for the task in hours
416
+ min_clip_duration: Minimum duration per clip in seconds
417
+ max_clip_duration: Maximum duration per clip in seconds
418
+
419
+ Returns:
420
+ Number of samples to generate (estimate)
421
+ """
422
+ task_duration_seconds = task_duration_hours * 3600
423
+ avg_clip_duration = (min_clip_duration + max_clip_duration) / 2
424
+ num_samples = int(task_duration_seconds / avg_clip_duration)
425
+
426
+ logger.info(f"Task duration: {task_duration_hours}h ({task_duration_seconds}s)")
427
+ logger.info(f"Avg clip duration: {avg_clip_duration}s (min: {min_clip_duration}s, max: {max_clip_duration}s)")
428
+ logger.info(f"Calculated number of samples: {num_samples}")
429
+
430
+ return max(1, num_samples) # At least 1 sample
431
+
432
+
433
+ def generate_single_clip_duration(
434
+ min_duration: float,
435
+ max_duration: float
436
+ ) -> float:
437
+ """
438
+ Generate a random clip duration between min and max.
439
+
440
+ Args:
441
+ min_duration: Minimum duration in seconds
442
+ max_duration: Maximum duration in seconds
443
+
444
+ Returns:
445
+ Random duration in seconds
446
+ """
447
+ return random.uniform(min_duration, max_duration)
448
+
449
+
450
+ def concatenate_to_target_duration(
451
+ base_audio: AudioSegment,
452
+ target_duration_seconds: float,
453
+ crossfade_ms: int = 0
454
+ ) -> AudioSegment:
455
+ """
456
+ Concatenate a base audio clip to reach target duration.
457
+
458
+ This takes a 5-second ESC-50 clip and repeats it to create a longer clip.
459
+
460
+ Args:
461
+ base_audio: Original 5s audio segment
462
+ target_duration_seconds: Target duration in seconds
463
+ crossfade_ms: Crossfade between repetitions in milliseconds
464
+
465
+ Returns:
466
+ Audio segment of target duration
467
+ """
468
+ target_duration_ms = int(target_duration_seconds * 1000)
469
+ base_duration_ms = len(base_audio)
470
+
471
+ if target_duration_ms <= base_duration_ms:
472
+ # Just trim if target is shorter
473
+ return base_audio[:target_duration_ms]
474
+
475
+ # Calculate number of repetitions needed
476
+ num_repetitions = (target_duration_ms // base_duration_ms) + 1
477
+
478
+ # Concatenate with crossfade
479
+ result = base_audio
480
+ for i in range(1, num_repetitions):
481
+ if crossfade_ms > 0:
482
+ result = result.append(base_audio, crossfade=crossfade_ms)
483
+ else:
484
+ result = result + base_audio
485
+
486
+ # Stop if we've reached target
487
+ if len(result) >= target_duration_ms:
488
+ break
489
+
490
+ # Trim to exact duration
491
+ return result[:target_duration_ms]
492
+
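For example, a 5 s clip stretched to a 12 s target takes three repetitions and a trim (a silent segment stands in for a real clip here):

```python
from pydub import AudioSegment

base_audio = AudioSegment.silent(duration=5_000)  # stand-in for a 5 s clip
stretched = concatenate_to_target_duration(base_audio, 12.0, crossfade_ms=0)
assert len(stretched) == 12_000  # pydub lengths are in milliseconds
```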
493
+
494
+ def set_random_seed(seed: int):
495
+ """Set random seed for reproducibility."""
496
+ random.seed(seed)
497
+ np.random.seed(seed)
498
+ logger.info(f"Random seed set to: {seed}")
499
+
500
+
501
+ def get_max_clip_num_to_be_joined(
502
+ target_duration_seconds: float,
503
+ source_clip_duration_seconds: float,
504
+ min_silence_ms: int = 100
505
+ ) -> Tuple[int, float]:
506
+ """
507
+ Calculate the maximum number of source clips needed to reach target duration.
508
+
509
+ Pipeline: pick dataset -> pick class -> pick audio clip -> get duration ->
510
+ concatenate clips to reach target duration -> modulo to get num clips ->
511
+ insert silences randomly based on remainder.
512
+
513
+ Args:
514
+ target_duration_seconds: Target total duration in seconds
515
+ source_clip_duration_seconds: Duration of each source clip (e.g., 5s for ESC-50)
516
+ min_silence_ms: Minimum silence between clips in milliseconds
517
+
518
+ Returns:
519
+ Tuple of (num_clips_needed, remainder_seconds_for_silences)
520
+ - num_clips_needed: How many source clips to concatenate
521
+ - remainder_seconds_for_silences: Extra time to distribute as random silences
522
+
523
+ Example (assuming min_silence_ms=0):
524
+ target=30s, source=5s -> (6, 0.0) - exactly 6 clips, no extra silence
525
+ target=32s, source=5s -> (6, 2.0) - 6 clips + 2s distributed as silences
+ With the default min_silence_ms=100, gap silence is reserved first, so the
+ clip count can drop (e.g. target=30s, source=5s -> (5, 4.6)).
526
+ """
527
+ target_ms = target_duration_seconds * 1000
528
+ source_ms = source_clip_duration_seconds * 1000
529
+
530
+ # Account for minimum silence between each pair of clips
531
+ # If we have N clips, we have (N-1) gaps for silence
532
+ # Each gap needs at least min_silence_ms
533
+
534
+ # Start by computing raw number of clips (floor division)
535
+ num_clips = int(target_ms // source_ms)
536
+ num_clips = max(1, num_clips) # At least 1 clip
537
+
538
+ # Total audio content from clips
539
+ clips_duration_ms = num_clips * source_ms
540
+
541
+ # Minimum required silence for gaps
542
+ num_gaps = max(0, num_clips - 1)
543
+ min_total_silence_ms = num_gaps * min_silence_ms
544
+
545
+ # Check if we need to reduce clips to fit silences
546
+ while num_clips > 1 and (clips_duration_ms + min_total_silence_ms) > target_ms:
547
+ num_clips -= 1
548
+ clips_duration_ms = num_clips * source_ms
549
+ num_gaps = num_clips - 1
550
+ min_total_silence_ms = num_gaps * min_silence_ms
551
+
552
+ # Calculate remainder for extra silences
553
+ remainder_ms = target_ms - clips_duration_ms - min_total_silence_ms
554
+ remainder_seconds = max(0, remainder_ms / 1000.0)
555
+
556
+ logger.debug(
557
+ f"get_max_clip_num: target={target_duration_seconds}s, source={source_clip_duration_seconds}s "
558
+ f"-> {num_clips} clips, {remainder_seconds:.3f}s remainder for extra silences"
559
+ )
560
+
561
+ return num_clips, remainder_seconds
562
+
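Worked example with the default 100 ms gap silence (the values follow directly from the arithmetic above):

```python
n, rem = get_max_clip_num_to_be_joined(30.0, 5.0, min_silence_ms=100)
print(n, rem)  # (5, 4.6): six clips would need 30.5 s, so one is dropped
```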
563
+
564
+ def build_clip_sequence_with_silences(
565
+ audio_segments: List[AudioSegment],
566
+ target_duration_seconds: float,
567
+ min_silence_ms: int = 100,
568
+ max_extra_silence_per_gap_ms: int = 500,
569
+ crossfade_ms: int = 0
570
+ ) -> AudioSegment:
571
+ """
572
+ Build a final audio clip by concatenating segments with guaranteed silences.
573
+
574
+ Ensures:
575
+ 1. All clips are joined with at least min_silence_ms between them
576
+ 2. Any remainder duration is distributed as random extra silences in gaps
577
+ 3. Final duration matches target_duration_seconds exactly
578
+
579
+ Args:
580
+ audio_segments: List of audio segments to concatenate
581
+ target_duration_seconds: Target total duration in seconds
582
+ min_silence_ms: Minimum silence between each pair of clips (always inserted)
583
+ max_extra_silence_per_gap_ms: Maximum extra silence to add per gap
584
+ crossfade_ms: Crossfade duration in ms (applied when joining)
585
+
586
+ Returns:
587
+ Concatenated audio segment of exact target duration
588
+ """
589
+ if not audio_segments:
590
+ raise ValueError("audio_segments cannot be empty")
591
+
592
+ target_ms = int(target_duration_seconds * 1000)
593
+
594
+ if len(audio_segments) == 1:
595
+ # Single clip: just trim/repeat to target
596
+ audio = audio_segments[0]
597
+ if len(audio) >= target_ms:
598
+ return audio[:target_ms]
599
+ else:
600
+ # Repeat to reach target
601
+ return concatenate_to_target_duration(audio, target_duration_seconds, crossfade_ms)
602
+
603
+ # Calculate total audio content duration
604
+ total_audio_ms = sum(len(seg) for seg in audio_segments)
605
+ num_gaps = len(audio_segments) - 1
606
+
607
+ # Minimum silence needed
608
+ min_total_silence_ms = num_gaps * min_silence_ms
609
+
610
+ # Available time for extra silences
611
+ available_extra_ms = target_ms - total_audio_ms - min_total_silence_ms
612
+
613
+ if available_extra_ms < 0:
614
+ # Not enough room - need to trim clips
615
+ logger.warning(
616
+ f"Clips too long for target duration. Total audio: {total_audio_ms}ms, "
617
+ f"target: {target_ms}ms. Will trim final result."
618
+ )
619
+ available_extra_ms = 0
620
+
621
+ # Distribute extra silence randomly across gaps
622
+ extra_silences_ms = distribute_remainder_as_silences(
623
+ available_extra_ms,
624
+ num_gaps,
625
+ max_extra_silence_per_gap_ms
626
+ )
627
+
628
+ # Build the final audio
629
+ result = audio_segments[0]
630
+
631
+ for i, audio in enumerate(audio_segments[1:]):
632
+ # Calculate total silence for this gap
633
+ gap_silence_ms = min_silence_ms + extra_silences_ms[i]
634
+
635
+ # Add silence
636
+ silence = AudioSegment.silent(duration=gap_silence_ms)
637
+
638
+ if crossfade_ms > 0 and crossfade_ms < gap_silence_ms:
639
+ # Crossfade audio->silence for smooth transition, but NOT silence->audio
640
+ result = result.append(silence, crossfade=crossfade_ms)
641
+ result = result.append(audio, crossfade=0) # No crossfade to avoid cutting audio
642
+ else:
643
+ result = result + silence + audio
644
+
645
+ # Trim to exact target duration
646
+ if len(result) > target_ms:
647
+ result = result[:target_ms]
648
+ elif len(result) < target_ms:
649
+ # Pad with silence if slightly short
650
+ padding = AudioSegment.silent(duration=target_ms - len(result))
651
+ result = result + padding
652
+
653
+ logger.debug(
654
+ f"Built clip sequence: {len(audio_segments)} segments, "
655
+ f"final duration: {len(result)}ms (target: {target_ms}ms)"
656
+ )
657
+
658
+ return result
659
+
660
+
661
+ def distribute_remainder_as_silences(
662
+ remainder_ms: float,
663
+ num_gaps: int,
664
+ max_per_gap_ms: int = 500
665
+ ) -> List[int]:
666
+ """
667
+ Distribute remainder time as random silences across gaps.
668
+
669
+ Args:
670
+ remainder_ms: Total extra time to distribute (in ms)
671
+ num_gaps: Number of gaps between clips
672
+ max_per_gap_ms: Maximum extra silence per gap
673
+
674
+ Returns:
675
+ List of extra silence durations (in ms) for each gap
676
+ """
677
+ if num_gaps <= 0:
678
+ return []
679
+
680
+ remainder_ms = int(max(0, remainder_ms))
681
+
682
+ if remainder_ms == 0:
683
+ return [0] * num_gaps
684
+
685
+ # Generate random weights for distribution
686
+ weights = [random.random() for _ in range(num_gaps)]
687
+ total_weight = sum(weights)
688
+
689
+ if total_weight == 0:
690
+ # Fallback to uniform distribution
691
+ weights = [1.0] * num_gaps
692
+ total_weight = num_gaps
693
+
694
+ # Distribute proportionally, respecting max_per_gap
695
+ extra_silences = []
696
+ remaining = remainder_ms
697
+
698
+ for i, w in enumerate(weights):
699
+ if i == num_gaps - 1:
700
+ # Last gap gets whatever is left
701
+ extra = min(remaining, max_per_gap_ms)
702
+ else:
703
+ proportion = w / total_weight
704
+ extra = int(remainder_ms * proportion)
705
+ extra = min(extra, max_per_gap_ms, remaining)
706
+
707
+ extra_silences.append(extra)
708
+ remaining -= extra
709
+ total_weight -= w
710
+
711
+ # If there's still remainder (due to max_per_gap limits), do another pass
712
+ while remaining > 0:
713
+ for i in range(num_gaps):
714
+ if extra_silences[i] < max_per_gap_ms and remaining > 0:
715
+ add = min(remaining, max_per_gap_ms - extra_silences[i])
716
+ extra_silences[i] += add
717
+ remaining -= add
718
+ if remaining > 0:
719
+ # Can't distribute more (all gaps at max)
720
+ break
721
+
722
+ logger.debug(f"Distributed {remainder_ms}ms across {num_gaps} gaps: {extra_silences}")
723
+
724
+ return extra_silences
725
+
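The distribution is random, but its invariants are checkable. A small sketch with toy values:

```python
extras = distribute_remainder_as_silences(1500, num_gaps=4, max_per_gap_ms=500)
assert len(extras) == 4
assert all(0 <= e <= 500 for e in extras)
assert sum(extras) == 1500  # total capacity is 2000 ms, so the top-up pass fills fully
```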
726
+
727
+ def repeat_clips_to_fill_duration(
728
+ source_audios: List[AudioSegment],
729
+ source_categories: List[str],
730
+ target_duration_seconds: float,
731
+ source_clip_duration_seconds: float = 5.0,
732
+ min_silence_ms: int = 100
733
+ ) -> Tuple[List[AudioSegment], List[str], int]:
734
+ """
735
+ Repeat source clips to fill target duration, cycling through all sources.
736
+
737
+ This ensures all unique sources appear and are repeated proportionally.
738
+
739
+ Args:
740
+ source_audios: List of unique source audio segments
741
+ source_categories: List of category names corresponding to source_audios
742
+ target_duration_seconds: Target total duration
743
+ source_clip_duration_seconds: Duration of each source clip
744
+ min_silence_ms: Minimum silence between clips
745
+
746
+ Returns:
747
+ Tuple of (expanded_audio_list, expanded_categories, num_clips)
748
+ """
749
+ num_clips, remainder = get_max_clip_num_to_be_joined(
750
+ target_duration_seconds,
751
+ source_clip_duration_seconds,
752
+ min_silence_ms
753
+ )
754
+
755
+ num_sources = len(source_audios)
756
+
757
+ if num_sources == 0:
758
+ raise ValueError("source_audios cannot be empty")
759
+
760
+ # Build expanded lists by cycling through sources
761
+ expanded_audios = []
762
+ expanded_categories = []
763
+
764
+ for i in range(num_clips):
765
+ idx = i % num_sources
766
+ expanded_audios.append(source_audios[idx])
767
+ expanded_categories.append(source_categories[idx])
768
+
769
+ logger.debug(
770
+ f"Repeated {num_sources} sources to {num_clips} clips for "
771
+ f"{target_duration_seconds}s target duration"
772
+ )
773
+
774
+ return expanded_audios, expanded_categories, num_clips
775
+
776
+
777
+ def build_consecutive_sources_for_count_task(
778
+ source_audios: List[AudioSegment],
779
+ source_categories: List[str],
780
+ target_duration_seconds: float,
781
+ source_clip_duration_seconds: float = 5.0,
782
+ min_silence_between_sources_ms: int = 100,
783
+ max_extra_silence_per_gap_ms: int = 500,
784
+ crossfade_within_source_ms: int = 50
785
+ ) -> Tuple[AudioSegment, List[str], dict]:
786
+ """
787
+ Build audio for COUNT task with consecutive same-class clips.
788
+
789
+ For count task, same-class clips must be consecutive (AAA BBB CCC) so they
790
+ are perceived as ONE sound source. Silences are only inserted BETWEEN
791
+ different classes, not within same-class repetitions.
792
+
793
+ Pipeline: pick classes -> for each class concatenate clips consecutively ->
794
+ insert silences only between different classes -> distribute remainder
795
+
796
+ Args:
797
+ source_audios: List of unique source audio segments (one per class)
798
+ source_categories: List of category names
799
+ target_duration_seconds: Target total duration
800
+ source_clip_duration_seconds: Duration of each source clip
801
+ min_silence_between_sources_ms: Minimum silence between different sources
802
+ max_extra_silence_per_gap_ms: Max extra silence per gap for remainder distribution
803
+ crossfade_within_source_ms: Small crossfade within same-source repetitions
804
+
805
+ Returns:
806
+ Tuple of (final_audio, category_sequence, metadata_dict)
807
+ """
808
+ target_ms = int(target_duration_seconds * 1000)
809
+ source_ms = int(source_clip_duration_seconds * 1000)
810
+ num_sources = len(source_audios)
811
+
812
+ if num_sources == 0:
813
+ raise ValueError("source_audios cannot be empty")
814
+
815
+ # Calculate total clips needed
816
+ num_clips, remainder_seconds = get_max_clip_num_to_be_joined(
817
+ target_duration_seconds,
818
+ source_clip_duration_seconds,
819
+ min_silence_between_sources_ms
820
+ )
821
+
822
+ # Safety check: if more sources than clips can fit, warn
823
+ if num_sources > num_clips:
824
+ logger.warning(
825
+ f"More sources ({num_sources}) than clips that fit ({num_clips}). "
826
+ f"Each source needs at least 1 clip, so output may exceed target duration. "
827
+ f"Consider capping n_unique_audios <= max_clips in task_count.py"
828
+ )
829
+ # Each source gets exactly 1 rep if there are more sources than clips
830
+ num_clips = num_sources # This will exceed target but ensures each source is included
831
+
832
+ # Distribute clips across sources as evenly as possible
833
+ # Each source gets at least 1 clip since num_sources <= num_clips
834
+ base_reps = num_clips // num_sources
835
+ extra_reps = num_clips % num_sources
836
+
837
+ repetitions_per_source = []
838
+ for i in range(num_sources):
839
+ reps = base_reps + (1 if i < extra_reps else 0)
840
+ repetitions_per_source.append(reps)
841
+
842
+ # Shuffle repetition assignment to add variety
843
+ random.shuffle(repetitions_per_source)
844
+
845
+ # Build each source's audio block (consecutive clips of same class)
846
+ source_blocks = []
847
+ category_sequence = []
848
+
849
+ for i, (audio, category, reps) in enumerate(zip(source_audios, source_categories, repetitions_per_source)):
850
+ if reps == 0:
851
+ continue
852
+
853
+ # Concatenate same-source clips with minimal/no gap (just small crossfade)
854
+ block = audio
855
+ for _ in range(reps - 1):
856
+ if crossfade_within_source_ms > 0:
857
+ block = block.append(audio, crossfade=crossfade_within_source_ms)
858
+ else:
859
+ block = block + audio
860
+
861
+ source_blocks.append(block)
862
+ category_sequence.append(category)
863
+
864
+ # Now we have N source blocks, need to join them with silences
865
+ # Number of gaps = num_source_blocks - 1
866
+ num_gaps = len(source_blocks) - 1
867
+
868
+ if num_gaps <= 0:
869
+ # Only one source block
870
+ final_audio = source_blocks[0]
871
+ else:
872
+ # Calculate total audio duration from blocks
873
+ total_blocks_ms = sum(len(block) for block in source_blocks)
874
+ min_total_silence_ms = num_gaps * min_silence_between_sources_ms
875
+
876
+ # Available for extra silences
877
+ available_extra_ms = target_ms - total_blocks_ms - min_total_silence_ms
878
+ available_extra_ms = max(0, available_extra_ms)
879
+
880
+ # Distribute extra silence across gaps
881
+ extra_silences = distribute_remainder_as_silences(
882
+ available_extra_ms,
883
+ num_gaps,
884
+ max_extra_silence_per_gap_ms
885
+ )
886
+
887
+ # Build final audio with silences between source blocks
888
+ final_audio = source_blocks[0]
889
+ for i, block in enumerate(source_blocks[1:]):
890
+ gap_silence_ms = min_silence_between_sources_ms + extra_silences[i]
891
+ silence = AudioSegment.silent(duration=gap_silence_ms)
892
+ final_audio = final_audio + silence + block
893
+
894
+ # Trim or pad to exact target duration
895
+ if len(final_audio) > target_ms:
896
+ final_audio = final_audio[:target_ms]
897
+ elif len(final_audio) < target_ms:
898
+ padding = AudioSegment.silent(duration=target_ms - len(final_audio))
899
+ final_audio = final_audio + padding
900
+
901
+ # Create metadata
902
+ metadata = {
903
+ 'num_unique_sources': num_sources,
904
+ 'total_clips': num_clips,
905
+ 'ordering_mode': 'consecutive',
906
+ 'repetitions_per_source': dict(zip(source_categories, repetitions_per_source)),
907
+ 'target_duration_ms': target_ms,
908
+ 'actual_duration_ms': len(final_audio),
909
+ 'num_gaps_between_sources': num_gaps
910
+ }
911
+
912
+ logger.debug(
913
+ f"Count task (consecutive): {num_sources} sources, {num_clips} total clips, "
914
+ f"reps={repetitions_per_source}, duration={len(final_audio)}ms"
915
+ )
916
+
917
+ return final_audio, category_sequence, metadata
918
+
919
+
920
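The even split above is worth seeing in isolation: integer division gives every source a base count, and the remainder is spread one clip at a time over the first `extra_reps` sources (the subsequent shuffle randomizes which sources get the extras). A minimal sketch in pure Python:

```python
# Even split of num_clips over num_sources, mirroring the logic above.
num_clips, num_sources = 8, 3
base_reps, extra_reps = divmod(num_clips, num_sources)
reps = [base_reps + (1 if i < extra_reps else 0) for i in range(num_sources)]
assert sum(reps) == num_clips
print(reps)  # [3, 3, 2]
```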
+ def build_random_order_for_count_task(
+     source_audios: List[AudioSegment],
+     source_categories: List[str],
+     target_duration_seconds: float,
+     source_clip_duration_seconds: float = 5.0,
+     min_silence_ms: int = 100,
+     max_extra_silence_per_gap_ms: int = 500
+ ) -> Tuple[AudioSegment, List[str], dict]:
+     """
+     Build audio for COUNT task with RANDOM ordering of clips.
+
+     Clips from different sources are shuffled randomly (A B A C B A C...).
+     This tests whether the model can recognize recurring sounds as the same source.
+     Silences are inserted between ALL clips (same or different source).
+
+     Pipeline:
+         1. Calculate total clips needed
+         2. Distribute clips across sources
+         3. Create expanded list with all clip instances
+         4. Shuffle randomly
+         5. Insert silences between ALL clips
+         6. Distribute remainder as extra random silences
+
+     Args:
+         source_audios: List of unique source audio segments (one per class)
+         source_categories: List of category names
+         target_duration_seconds: Target total duration
+         source_clip_duration_seconds: Duration of each source clip
+         min_silence_ms: Minimum silence between ALL clips
+         max_extra_silence_per_gap_ms: Max extra silence per gap
+
+     Returns:
+         Tuple of (final_audio, clip_sequence, metadata_dict)
+     """
+     target_ms = int(target_duration_seconds * 1000)
+     source_ms = int(source_clip_duration_seconds * 1000)
+     num_sources = len(source_audios)
+
+     if num_sources == 0:
+         raise ValueError("source_audios cannot be empty")
+
+     # Calculate total clips needed
+     num_clips, remainder_seconds = get_max_clip_num_to_be_joined(
+         target_duration_seconds,
+         source_clip_duration_seconds,
+         min_silence_ms
+     )
+
+     # Safety check: if more sources than clips can fit, warn and cap sources
+     if num_sources > num_clips:
+         logger.warning(
+             f"More sources ({num_sources}) than clips that fit ({num_clips}). "
+             f"Each source needs at least 1 clip, so output may exceed target duration. "
+             f"Consider capping n_unique_audios <= max_clips in task_count.py"
+         )
+         # Each source gets exactly 1 rep if there are more sources than clips
+         num_clips = num_sources  # This will exceed target but ensures each source is included
+
+     # Distribute clips across sources as evenly as possible
+     base_reps = num_clips // num_sources  # At least 1 since num_sources <= num_clips (after cap)
+     extra_reps = num_clips % num_sources
+
+     repetitions_per_source = []
+     for i in range(num_sources):
+         reps = base_reps + (1 if i < extra_reps else 0)
+         repetitions_per_source.append(reps)
+
+     # Build expanded list of (audio, category) pairs
+     expanded_clips = []
+     for audio, category, reps in zip(source_audios, source_categories, repetitions_per_source):
+         for _ in range(reps):
+             expanded_clips.append((audio, category))
+
+     # Shuffle the clips randomly
+     random.shuffle(expanded_clips)
+
+     # Extract shuffled audios and categories
+     shuffled_audios = [clip[0] for clip in expanded_clips]
+     clip_sequence = [clip[1] for clip in expanded_clips]
+
+     # Build final audio with silences between ALL clips
+     final_audio = build_clip_sequence_with_silences(
+         shuffled_audios,
+         target_duration_seconds,
+         min_silence_ms=min_silence_ms,
+         max_extra_silence_per_gap_ms=max_extra_silence_per_gap_ms,
+         crossfade_ms=0  # No crossfade for random ordering
+     )
+
+     # Create metadata
+     metadata = {
+         'num_unique_sources': num_sources,
+         'total_clips': len(expanded_clips),
+         'ordering_mode': 'random',
+         'repetitions_per_source': dict(zip(source_categories, repetitions_per_source)),
+         'clip_sequence': clip_sequence,
+         'target_duration_ms': target_ms,
+         'actual_duration_ms': len(final_audio),
+         'num_gaps': len(expanded_clips) - 1
+     }
+
+     logger.debug(
+         f"Count task (random): {num_sources} sources, {len(expanded_clips)} clips, "
+         f"sequence={clip_sequence[:5]}..., duration={len(final_audio)}ms"
+     )
+
+     return final_audio, clip_sequence, metadata
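A usage sketch for the random-ordering builder, using synthetic tones in place of ESC-50 clips. The import path, tone frequencies, and category names are illustrative assumptions, and the call relies on the helpers (`get_max_clip_num_to_be_joined`, `build_clip_sequence_with_silences`) defined earlier in this module:

```python
from pydub.generators import Sine

from utils.audio_utils import build_random_order_for_count_task  # assumed import path

# Three 1-second tones stand in for three source clips.
sources = [Sine(freq).to_audio_segment(duration=1000) for freq in (220, 440, 880)]
categories = ['dog', 'rooster', 'siren']

audio, sequence, meta = build_random_order_for_count_task(
    sources, categories,
    target_duration_seconds=15.0,
    source_clip_duration_seconds=1.0,
)
print(meta['total_clips'], sequence[:5])  # e.g. 13 ['siren', 'dog', 'dog', ...]
```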
+ def build_count_task_audio(
+     source_audios: List[AudioSegment],
+     source_categories: List[str],
+     target_duration_seconds: float,
+     ordering_mode: str = "random",
+     source_clip_duration_seconds: float = 5.0,
+     min_silence_ms: int = 100,
+     max_extra_silence_per_gap_ms: int = 500,
+     crossfade_within_source_ms: int = 50
+ ) -> Tuple[AudioSegment, List[str], dict]:
+     """
+     Build audio for COUNT task with configurable ordering mode.
+
+     Args:
+         source_audios: List of unique source audio segments (one per class)
+         source_categories: List of category names
+         target_duration_seconds: Target total duration
+         ordering_mode: "random" or "consecutive"
+             - "random": Clips shuffled (A B A C B A C) - tests sound recognition
+             - "consecutive": Same-source grouped (AAA BBB CCC) - easier
+         source_clip_duration_seconds: Duration of each source clip
+         min_silence_ms: Minimum silence between clips
+         max_extra_silence_per_gap_ms: Max extra silence per gap
+         crossfade_within_source_ms: Crossfade for consecutive mode only
+
+     Returns:
+         Tuple of (final_audio, clip_sequence, metadata_dict)
+     """
+     if ordering_mode == "consecutive":
+         return build_consecutive_sources_for_count_task(
+             source_audios,
+             source_categories,
+             target_duration_seconds,
+             source_clip_duration_seconds,
+             min_silence_ms,
+             max_extra_silence_per_gap_ms,
+             crossfade_within_source_ms
+         )
+     else:  # random (default)
+         return build_random_order_for_count_task(
+             source_audios,
+             source_categories,
+             target_duration_seconds,
+             source_clip_duration_seconds,
+             min_silence_ms,
+             max_extra_silence_per_gap_ms
+         )
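The dispatcher makes difficulty a one-flag switch; continuing the sketch above (same assumed `sources` and `categories`):

```python
# "consecutive" groups repeats (AAA BBB CCC); "random" interleaves them.
easy_audio, easy_seq, _ = build_count_task_audio(
    sources, categories, 15.0,
    ordering_mode="consecutive", source_clip_duration_seconds=1.0,
)
hard_audio, hard_seq, _ = build_count_task_audio(
    sources, categories, 15.0,
    ordering_mode="random", source_clip_duration_seconds=1.0,
)
```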
+ # =============================================================================
+ # DURATION TASK FUNCTIONS
+ # =============================================================================
+
+ def calculate_duration_slot_distribution(
+     target_total_duration_s: float,
+     effective_durations: Dict[str, float],
+     target_category: str,
+     question_type: str,
+     multiplier_longest: float = 1.5,
+     multiplier_shortest: float = 0.5,
+     min_silence_between_sources_ms: int = 100
+ ) -> Tuple[Dict[str, int], bool, Dict]:
+     """
+     Calculate how many repetitions each source gets for the duration task.
+
+     For LONGEST: target gets max repetitions, backgrounds get 1 each.
+     For SHORTEST: target gets 1, backgrounds share the remaining duration.
+
+     Args:
+         target_total_duration_s: Target total audio duration
+         effective_durations: Dict mapping category -> effective duration in seconds
+         target_category: The category that should be longest/shortest
+         question_type: "longest" or "shortest"
+         multiplier_longest: target >= max_background * this
+         multiplier_shortest: target <= min_background * this
+         min_silence_between_sources_ms: Minimum silence between different sources
+
+     Returns:
+         Tuple of (slot_distribution, gap_satisfied, metadata)
+         slot_distribution: Dict mapping category -> number of repetitions
+         gap_satisfied: Whether the duration gap constraint is met
+         metadata: Additional info about the calculation
+     """
+     categories = list(effective_durations.keys())
+     n_sources = len(categories)
+
+     if n_sources < 2:
+         # Single source - always satisfies the constraint
+         reps = max(1, int(target_total_duration_s / effective_durations[target_category]))
+         return {target_category: reps}, True, {'note': 'single_source'}
+
+     # Total silence between sources
+     total_silence_s = (n_sources - 1) * min_silence_between_sources_ms / 1000.0
+     available_for_audio_s = target_total_duration_s - total_silence_s
+
+     background_categories = [c for c in categories if c != target_category]
+
+     if question_type == "longest":
+         # Backgrounds get 1 rep each
+         background_duration_s = sum(effective_durations[c] for c in background_categories)
+
+         # Remaining for target
+         remaining_for_target_s = available_for_audio_s - background_duration_s
+         target_duration_per_rep = effective_durations[target_category]
+
+         # Calculate reps for target
+         target_reps = max(1, int(remaining_for_target_s / target_duration_per_rep))
+         actual_target_duration = target_reps * target_duration_per_rep
+
+         # Verify gap
+         max_background_duration = max(effective_durations[c] for c in background_categories)
+         required_target_duration = max_background_duration * multiplier_longest
+         gap_satisfied = actual_target_duration >= required_target_duration
+
+         slot_distribution = {c: 1 for c in background_categories}
+         slot_distribution[target_category] = target_reps
+
+         metadata = {
+             'available_for_audio_s': available_for_audio_s,
+             'background_duration_s': background_duration_s,
+             'remaining_for_target_s': remaining_for_target_s,
+             'target_reps': target_reps,
+             'actual_target_duration_s': actual_target_duration,
+             'max_background_duration_s': max_background_duration,
+             'required_target_duration_s': required_target_duration,
+             'multiplier_used': multiplier_longest
+         }
+
+     else:  # shortest
+         # Target gets 1 rep
+         target_duration_s = effective_durations[target_category]
+
+         # Remaining for backgrounds
+         remaining_for_backgrounds_s = available_for_audio_s - target_duration_s
+
+         # Distribute remaining to backgrounds as evenly as possible
+         # while ensuring each background is longer than target * 1/multiplier
+         slot_distribution = {target_category: 1}
+
+         # Calculate minimum required duration for each background
+         min_background_required = target_duration_s / multiplier_shortest
+
+         background_reps = {}
+         for cat in background_categories:
+             eff_dur = effective_durations[cat]
+             # How many reps are needed to exceed min_background_required?
+             min_reps = max(1, int(min_background_required / eff_dur) + 1)
+             background_reps[cat] = min_reps
+
+         # Check if we have room for all backgrounds
+         total_background_needed = sum(
+             background_reps[c] * effective_durations[c]
+             for c in background_categories
+         )
+
+         if total_background_needed <= remaining_for_backgrounds_s:
+             # Distribute extra reps
+             extra_available = remaining_for_backgrounds_s - total_background_needed
+
+             # Add extra reps to backgrounds proportionally
+             while extra_available > 0:
+                 added_any = False
+                 for cat in background_categories:
+                     eff_dur = effective_durations[cat]
+                     if extra_available >= eff_dur:
+                         background_reps[cat] += 1
+                         extra_available -= eff_dur
+                         added_any = True
+                 if not added_any:
+                     break
+
+             slot_distribution.update(background_reps)
+             gap_satisfied = True
+         else:
+             # Not enough room - use minimum reps anyway
+             slot_distribution.update(background_reps)
+             gap_satisfied = False
+
+         # Calculate actual durations
+         actual_durations = {
+             cat: slot_distribution[cat] * effective_durations[cat]
+             for cat in categories
+         }
+         min_background_actual = min(
+             actual_durations[c] for c in background_categories
+         )
+
+         # Re-verify gap
+         gap_satisfied = actual_durations[target_category] <= min_background_actual * multiplier_shortest
+
+         metadata = {
+             'available_for_audio_s': available_for_audio_s,
+             'target_duration_s': target_duration_s,
+             'remaining_for_backgrounds_s': remaining_for_backgrounds_s,
+             'min_background_required_s': min_background_required,
+             'actual_durations_s': actual_durations,
+             'min_background_actual_s': min_background_actual,
+             'multiplier_used': multiplier_shortest
+         }
+
+     return slot_distribution, gap_satisfied, metadata
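A worked example of the "longest" branch may help; the durations below are illustrative, not taken from the dataset, and the import path is an assumption:

```python
from utils.audio_utils import calculate_duration_slot_distribution  # assumed path

effective = {'dog': 2.0, 'rain': 3.0, 'siren': 2.5}
dist, ok, info = calculate_duration_slot_distribution(
    target_total_duration_s=30.0,
    effective_durations=effective,
    target_category='dog',
    question_type='longest',
)
# Silence budget: 2 gaps * 0.1s = 0.2s, leaving 29.8s for audio.
# Backgrounds take 1 rep each (3.0 + 2.5 = 5.5s); 'dog' then gets
# int(24.3 / 2.0) = 12 reps = 24.0s >= 1.5 * 3.0s, so the gap holds.
print(dist, ok)  # {'rain': 1, 'siren': 1, 'dog': 12} True
```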
+ def build_duration_task_audio(
+     source_audio_lists: Dict[str, List[AudioSegment]],
+     slot_distribution: Dict[str, int],
+     effective_durations: Dict[str, float],
+     target_total_duration_s: float,
+     min_silence_between_sources_ms: int = 100,
+     max_extra_silence_per_gap_ms: int = 500,
+     crossfade_within_source_ms: int = 50
+ ) -> Tuple[AudioSegment, List[str], Dict]:
+     """
+     Build audio for DURATION task with consecutive ordering per source.
+
+     Structure: [SourceA × n] + silence + [SourceB × m] + silence + ...
+     Order of sources is randomized to avoid patterns.
+
+     Args:
+         source_audio_lists: Dict mapping category -> list of audio segments
+         slot_distribution: Dict mapping category -> number of repetitions
+         effective_durations: Dict mapping category -> effective duration per clip
+         target_total_duration_s: Target total duration
+         min_silence_between_sources_ms: Min silence between different sources
+         max_extra_silence_per_gap_ms: Max extra silence per gap
+         crossfade_within_source_ms: Crossfade between same-source repetitions
+
+     Returns:
+         Tuple of (final_audio, category_sequence, metadata)
+     """
+     categories = list(slot_distribution.keys())
+
+     # Randomize source order
+     random.shuffle(categories)
+
+     # Build audio blocks for each source
+     source_blocks = []
+     category_sequence = []
+     actual_durations = {}
+     block_durations_ms = []  # Track duration of each block for timestamp calculation
+
+     for category in categories:
+         reps = slot_distribution[category]
+         audio_list = source_audio_lists[category]
+
+         if reps == 0:
+             continue
+
+         # Build block for this source
+         block = audio_list[0]
+         for i in range(1, reps):
+             # Use same clip or cycle through available clips
+             next_clip = audio_list[i % len(audio_list)]
+
+             # Crossfade within same source
+             if crossfade_within_source_ms > 0:
+                 if len(block) > crossfade_within_source_ms and len(next_clip) > crossfade_within_source_ms:
+                     block = block.append(next_clip, crossfade=crossfade_within_source_ms)
+                 else:
+                     block = block + next_clip
+             else:
+                 block = block + next_clip
+
+         source_blocks.append((category, block))
+         block_durations_ms.append(len(block))
+         category_sequence.extend([category] * reps)
+         actual_durations[category] = len(block) / 1000.0
+
+     # Calculate total audio duration and available extra silence
+     total_audio_ms = sum(len(block) for _, block in source_blocks)
+     num_gaps = len(source_blocks) - 1
+     min_total_silence_ms = num_gaps * min_silence_between_sources_ms
+
+     target_ms = int(target_total_duration_s * 1000)
+     available_extra_ms = target_ms - total_audio_ms - min_total_silence_ms
+
+     # Distribute extra silence
+     if available_extra_ms > 0 and num_gaps > 0:
+         extra_silences = distribute_remainder_as_silences(
+             available_extra_ms,
+             num_gaps,
+             max_extra_silence_per_gap_ms
+         )
+     else:
+         extra_silences = [0] * max(num_gaps, 1)
+
+     # Concatenate with silences and track timestamps
+     source_timestamps = []  # List of (category, start_ms, end_ms)
+     current_position_ms = 0
+
+     if len(source_blocks) == 1:
+         cat, block = source_blocks[0]
+         final_audio = block
+         source_timestamps.append((cat, 0, len(block)))
+     else:
+         cat, block = source_blocks[0]
+         final_audio = block
+         source_timestamps.append((cat, 0, len(block)))
+         current_position_ms = len(block)
+
+         for i, (cat, block) in enumerate(source_blocks[1:]):
+             gap_silence_ms = min_silence_between_sources_ms + extra_silences[i]
+             silence = AudioSegment.silent(duration=gap_silence_ms)
+
+             # Prefer crossfading from audio -> silence for a smooth transition,
+             # but avoid crossfading silence -> audio (it cuts the start of the next clip).
+             # Conditions for a safe crossfade:
+             #   - crossfade length must be less than the gap silence
+             #   - both segments must be longer than the crossfade
+             crossfade_ms = min(500, gap_silence_ms)
+             if 0 < crossfade_ms < gap_silence_ms and len(final_audio) > crossfade_ms and len(block) > crossfade_ms:
+                 final_audio = final_audio.append(silence, crossfade=crossfade_ms)
+                 # Append next block without crossfade to avoid trimming its start
+                 final_audio = final_audio.append(block, crossfade=0)
+                 # The crossfade overlaps audio and silence, shortening the gap by
+                 # crossfade_ms, so account for that when tracking the block start
+                 start_ms = current_position_ms + gap_silence_ms - crossfade_ms
+                 end_ms = start_ms + len(block)
+                 source_timestamps.append((cat, start_ms, end_ms))
+                 current_position_ms = end_ms
+             else:
+                 # Fall back to simple concatenation
+                 final_audio = final_audio + silence + block
+                 start_ms = current_position_ms + gap_silence_ms
+                 end_ms = start_ms + len(block)
+                 source_timestamps.append((cat, start_ms, end_ms))
+                 current_position_ms = end_ms
+
+     # Adjust to target duration
+     if len(final_audio) > target_ms:
+         final_audio = final_audio[:target_ms]
+     elif len(final_audio) < target_ms:
+         padding = AudioSegment.silent(duration=target_ms - len(final_audio))
+         final_audio = final_audio + padding
+
+     # Build timestamp string: "category1 start-end, category2 start-end, ..."
+     timestamp_parts = []
+     for cat, start_ms, end_ms in source_timestamps:
+         start_s = round(start_ms / 1000.0, 2)
+         end_s = round(end_ms / 1000.0, 2)
+         duration_s = round((end_ms - start_ms) / 1000.0, 2)
+         timestamp_parts.append(f"{cat} {start_s}s-{end_s}s ({duration_s}s)")
+     timestamp_string = ", ".join(timestamp_parts)
+
+     metadata = {
+         'source_order': [cat for cat, _ in source_blocks],
+         'slot_distribution': slot_distribution,
+         'actual_durations_s': actual_durations,
+         'total_audio_ms': total_audio_ms,
+         'num_gaps': num_gaps,
+         'final_duration_ms': len(final_audio),
+         'source_timestamps': source_timestamps,  # List of (category, start_ms, end_ms)
+         'timestamp_string': timestamp_string  # Human-readable format
+     }
+
+     logger.debug(
+         f"Duration task audio: {len(source_blocks)} sources, "
+         f"order={metadata['source_order']}, duration={len(final_audio)}ms"
+     )
+
+     return final_audio, category_sequence, metadata
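A sketch tying the two duration helpers together, again with synthetic tones; the import paths, clip values, and expected output are assumptions:

```python
from pydub.generators import Sine

from utils.audio_utils import build_duration_task_audio  # assumed path

effective = {'dog': 2.0, 'rain': 3.0}
clips = {cat: [Sine(330).to_audio_segment(duration=int(dur * 1000))]
         for cat, dur in effective.items()}

audio, seq, meta = build_duration_task_audio(
    clips,
    slot_distribution={'dog': 5, 'rain': 1},
    effective_durations=effective,
    target_total_duration_s=15.0,
)
# Source order is randomized; the string looks like, e.g.,
# "dog 0.0s-9.8s (9.8s), rain 9.9s-12.9s (3.0s)"
print(meta['timestamp_string'])
```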
utils/dataset_utils.py ADDED
@@ -0,0 +1,536 @@
+ """
+ ESC-50 dataset utilities for loading and sampling audio data.
+ """
+
+ import csv
+ import json
+ import random
+ from pathlib import Path
+ from typing import Dict, List, Optional, Tuple
+
+ import pandas as pd
+
+ from .logger import setup_logger
+
+ logger = setup_logger(__name__)
+
+
+ def load_or_create_class_subset(config: dict, all_categories: List[str]) -> List[str]:
+     """
+     Load persisted class subset or create a new one.
+
+     Args:
+         config: Configuration dictionary with dataset.use_class_subset, etc.
+         all_categories: List of all available categories
+
+     Returns:
+         List of category names to use (either subset or all)
+     """
+     dataset_config = config.get('dataset', {})
+     use_subset = dataset_config.get('use_class_subset', False)
+
+     if not use_subset:
+         logger.info(f"Using all {len(all_categories)} classes")
+         return all_categories
+
+     num_classes = dataset_config.get('num_classes_subset', len(all_categories))
+     persist_path = Path(dataset_config.get('subset_persist_path', 'class_subset.json'))
+     subset_seed = dataset_config.get('subset_seed', 42)
+
+     # Try to load existing subset
+     if persist_path.exists():
+         try:
+             with open(persist_path, 'r') as f:
+                 data = json.load(f)
+             subset = data.get('classes', [])
+
+             # Validate subset
+             if len(subset) == num_classes and all(c in all_categories for c in subset):
+                 logger.info(f"Loaded persisted class subset from {persist_path}: {len(subset)} classes")
+                 return subset
+             else:
+                 logger.warning("Invalid persisted subset, regenerating...")
+         except Exception as e:
+             logger.warning(f"Failed to load persisted subset: {e}, regenerating...")
+
+     # Create new subset
+     random.seed(subset_seed)
+     subset = random.sample(all_categories, min(num_classes, len(all_categories)))
+     subset.sort()  # Sort for consistency
+
+     # Persist subset
+     persist_path.parent.mkdir(parents=True, exist_ok=True)
+     with open(persist_path, 'w') as f:
+         json.dump({
+             'classes': subset,
+             'num_classes': len(subset),
+             'seed': subset_seed,
+             'total_available': len(all_categories)
+         }, f, indent=2)
+
+     logger.info(f"Created and persisted new class subset: {len(subset)} classes to {persist_path}")
+     return subset
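A minimal sketch of the subset round-trip (the config keys mirror the pipeline's YAML config; the values here are assumptions):

```python
from utils.dataset_utils import ESC50Dataset, load_or_create_class_subset  # assumed path

cfg = {'dataset': {
    'use_class_subset': True,
    'num_classes_subset': 10,
    'subset_persist_path': 'class_subset.json',
    'subset_seed': 42,
}}
subset = load_or_create_class_subset(cfg, ESC50Dataset.ALL_CATEGORIES)
print(len(subset))  # 10; the identical list is returned on every later run
```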
+ class ESC50Dataset:
+     """Handler for ESC-50 dataset."""
+
+     # All 50 ESC-50 sound categories
+     ALL_CATEGORIES = [
+         'dog', 'chirping_birds', 'vacuum_cleaner', 'thunderstorm', 'door_wood_knock',
+         'can_opening', 'crow', 'clapping', 'fireworks', 'chainsaw', 'airplane',
+         'mouse_click', 'pouring_water', 'train', 'sheep', 'water_drops', 'church_bells',
+         'clock_alarm', 'keyboard_typing', 'wind', 'footsteps', 'frog', 'cow',
+         'brushing_teeth', 'car_horn', 'crackling_fire', 'helicopter', 'drinking_sipping',
+         'rain', 'insects', 'laughing', 'hen', 'engine', 'breathing', 'crying_baby',
+         'hand_saw', 'coughing', 'glass_breaking', 'snoring', 'toilet_flush', 'pig',
+         'washing_machine', 'clock_tick', 'sneezing', 'rooster', 'sea_waves', 'siren',
+         'cat', 'door_wood_creaks', 'crickets'
+     ]
+
+     def __init__(self, metadata_path: str, audio_path: str, config: Optional[dict] = None):
+         """
+         Initialize ESC-50 dataset handler.
+
+         Args:
+             metadata_path: Path to esc50.csv metadata file
+             audio_path: Path to audio directory
+             config: Optional configuration dict with dataset.use_class_subset settings
+         """
+         self.metadata_path = Path(metadata_path)
+         self.audio_path = Path(audio_path)
+         self.config = config or {}
+         self.df = None
+         self.category_to_target = {}
+         self.target_to_category = {}
+
+         # Load class subset if configured
+         self.CATEGORIES = load_or_create_class_subset(self.config, self.ALL_CATEGORIES)
+         self.category_usage_counts = {cat: 0 for cat in self.CATEGORIES}
+
+         self.load_metadata()
+
+     def load_metadata(self):
+         """Load ESC-50 metadata CSV."""
+         try:
+             self.df = pd.read_csv(self.metadata_path)
+             logger.info(f"Loaded ESC-50 metadata: {len(self.df)} files")
+
+             # Create category mappings
+             for target, category in zip(self.df['target'], self.df['category']):
+                 self.category_to_target[category] = target
+                 self.target_to_category[target] = category
+
+             logger.info(f"Found {len(self.category_to_target)} unique categories")
+         except Exception as e:
+             logger.error(f"Error loading metadata: {e}")
+             raise
+
+     def get_files_by_category(self, category: str) -> List[str]:
+         """
+         Get all audio files for a specific category.
+
+         Args:
+             category: Sound category name
+
+         Returns:
+             List of filenames for the category
+         """
+         if category not in self.category_to_target:
+             raise ValueError(f"Unknown category: {category}")
+
+         target = self.category_to_target[category]
+         files = self.df[self.df['target'] == target]['filename'].tolist()
+         return files
+
+     def get_files_by_target(self, target: int) -> List[str]:
+         """
+         Get all audio files for a specific target ID.
+
+         Args:
+             target: Target class ID (0-49)
+
+         Returns:
+             List of filenames for the target
+         """
+         files = self.df[self.df['target'] == target]['filename'].tolist()
+         return files
+
+     def sample_categories(self, n: int, exclude: Optional[List[str]] = None) -> List[str]:
+         """
+         Sample n unique random categories from the active subset.
+
+         Args:
+             n: Number of categories to sample
+             exclude: Optional list of categories to exclude
+
+         Returns:
+             List of sampled category names
+         """
+         available = [c for c in self.CATEGORIES if c not in (exclude or [])]
+         if n > len(available):
+             raise ValueError(f"Cannot sample {n} categories from subset, only {len(available)} available (subset size: {len(self.CATEGORIES)})")
+         return random.sample(available, n)
+
+     def sample_targets(self, n: int, exclude: Optional[List[int]] = None) -> List[int]:
+         """
+         Sample n unique random targets from the active subset.
+
+         Args:
+             n: Number of targets to sample
+             exclude: Optional list of targets to exclude
+
+         Returns:
+             List of sampled target IDs corresponding to categories in the subset
+         """
+         # Get targets corresponding to categories in the subset
+         available_targets = [self.category_to_target[cat] for cat in self.CATEGORIES]
+         available = [t for t in available_targets if t not in (exclude or [])]
+         if n > len(available):
+             raise ValueError(f"Cannot sample {n} targets from subset, only {len(available)} available (subset size: {len(self.CATEGORIES)})")
+         return random.sample(available, n)
+
+     def sample_file_from_category(self, category: str) -> Tuple[str, str]:
+         """
+         Sample a random audio file from a category.
+
+         Args:
+             category: Sound category name
+
+         Returns:
+             Tuple of (filename, full_path)
+         """
+         files = self.get_files_by_category(category)
+         filename = random.choice(files)
+         full_path = str(self.audio_path / filename)
+         return filename, full_path
+
+     def sample_file_from_target(self, target: int) -> Tuple[str, str, str]:
+         """
+         Sample a random audio file from a target.
+
+         Args:
+             target: Target class ID
+
+         Returns:
+             Tuple of (filename, category, full_path)
+         """
+         files = self.get_files_by_target(target)
+         filename = random.choice(files)
+         category = self.target_to_category[target]
+         full_path = str(self.audio_path / filename)
+         return filename, category, full_path
+
+     def get_category_from_filename(self, filename: str) -> str:
+         """Get category name from filename."""
+         row = self.df[self.df['filename'] == filename]
+         if len(row) == 0:
+             raise ValueError(f"Unknown filename: {filename}")
+         return row.iloc[0]['category']
+
+     def get_file_path(self, filename: str) -> str:
+         """Get full path for a filename."""
+         return str(self.audio_path / filename)
+
+     def sample_categories_balanced(self, n: int, exclude: Optional[List[str]] = None,
+                                    answer_category: Optional[str] = None) -> List[str]:
+         """
+         Sample n unique categories with balanced usage tracking.
+
+         This method ensures that over many samples, all categories appear
+         roughly equally as answers by preferentially sampling underused categories.
+
+         Args:
+             n: Number of categories to sample
+             exclude: Optional list of categories to exclude
+             answer_category: If provided, ensures this category is included and tracks it
+
+         Returns:
+             List of sampled category names with answer_category first if provided
+         """
+         available = [c for c in self.CATEGORIES if c not in (exclude or [])]
+         if n > len(available):
+             raise ValueError(f"Cannot sample {n} categories, only {len(available)} available")
+
+         if answer_category:
+             # Track answer category usage
+             self.category_usage_counts[answer_category] += 1
+
+             # Remove answer category from available and sample the rest
+             available = [c for c in available if c != answer_category]
+             other_categories = random.sample(available, n - 1)
+             return [answer_category] + other_categories
+         else:
+             # Sample without specific answer category
+             return random.sample(available, n)
+
+     def get_least_used_categories(self, n: int, exclude: Optional[List[str]] = None) -> List[str]:
+         """
+         Get n categories that have been used least as answers.
+
+         Args:
+             n: Number of categories to get
+             exclude: Optional list of categories to exclude
+
+         Returns:
+             List of least-used category names
+         """
+         available = [c for c in self.CATEGORIES if c not in (exclude or [])]
+         if n > len(available):
+             raise ValueError(f"Cannot get {n} categories, only {len(available)} available")
+
+         # Sort by usage count (ascending) and take n least used
+         sorted_categories = sorted(available, key=lambda c: self.category_usage_counts[c])
+
+         # Among least used, get all with same minimum count
+         min_count = self.category_usage_counts[sorted_categories[0]]
+         candidates = [c for c in sorted_categories if self.category_usage_counts[c] == min_count]
+
+         if len(candidates) >= n:
+             # Randomly sample from least used
+             return random.sample(candidates, n)
+         else:
+             # Take all minimum and fill with next tier
+             result = candidates.copy()
+             remaining = n - len(result)
+             next_tier = [c for c in sorted_categories if c not in candidates][:remaining]
+             result.extend(next_tier)
+             return result
+
+     def get_category_usage_stats(self) -> Dict[str, int]:
+         """Get current category usage statistics."""
+         return self.category_usage_counts.copy()
+
+     def reset_category_usage(self):
+         """Reset category usage tracking."""
+         self.category_usage_counts = {cat: 0 for cat in self.CATEGORIES}
+         logger.info("Reset category usage tracking")
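The two balancing helpers are meant to be used together: pick the least-used class as the answer, then let the balanced sampler fill in distractors. A sketch, assuming the standard ESC-50 layout on disk (the paths are assumptions):

```python
from utils.dataset_utils import ESC50Dataset  # assumed path

ds = ESC50Dataset('ESC-50-master/meta/esc50.csv', 'ESC-50-master/audio')  # assumed paths
for _ in range(100):
    answer = ds.get_least_used_categories(1)[0]
    cats = ds.sample_categories_balanced(4, answer_category=answer)
    # cats[0] is the tracked answer; cats[1:] are distractors.

counts = ds.get_category_usage_stats().values()
print(max(counts) - min(counts))  # stays <= 1 with this pattern
```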
+ class PreprocessedESC50Dataset(ESC50Dataset):
+     """
+     Handler for preprocessed ESC-50 dataset with effective durations.
+
+     Extends ESC50Dataset to use trimmed audio files and effective duration
+     metadata from amplitude-based preprocessing.
+     """
+
+     def __init__(
+         self,
+         metadata_path: str,
+         audio_path: str,
+         preprocessed_path: str,
+         config: Optional[dict] = None
+     ):
+         """
+         Initialize preprocessed ESC-50 dataset handler.
+
+         Args:
+             metadata_path: Path to original esc50.csv metadata file
+             audio_path: Path to original audio directory (fallback)
+             preprocessed_path: Path to preprocessed data directory
+             config: Optional configuration dict with dataset.use_class_subset settings
+         """
+         super().__init__(metadata_path, audio_path, config)
+
+         self.preprocessed_path = Path(preprocessed_path)
+         self.trimmed_audio_path = self.preprocessed_path / "trimmed_audio"
+         self.effective_durations_path = self.preprocessed_path / "effective_durations.csv"
+
+         # Load effective durations
+         self.effective_df = None
+         self.load_effective_durations()
+
+     def load_effective_durations(self):
+         """Load effective durations from preprocessed CSV."""
+         try:
+             self.effective_df = pd.read_csv(self.effective_durations_path)
+             logger.info(f"Loaded effective durations for {len(self.effective_df)} clips")
+
+             # Create quick lookup dictionaries
+             self.filename_to_effective = dict(
+                 zip(self.effective_df['filename'], self.effective_df['effective_duration_s'])
+             )
+             self.filename_to_category = dict(
+                 zip(self.effective_df['filename'], self.effective_df['category'])
+             )
+
+             # Category-level statistics
+             self.category_effective_stats = self.effective_df.groupby('category').agg({
+                 'effective_duration_s': ['mean', 'std', 'min', 'max', 'count']
+             }).round(4)
+             self.category_effective_stats.columns = ['mean', 'std', 'min', 'max', 'count']
+
+             logger.info("Created effective duration lookup tables")
+
+         except Exception as e:
+             logger.error(f"Error loading effective durations: {e}")
+             raise
+
+     def get_effective_duration(self, filename: str) -> float:
+         """
+         Get effective duration for a specific file.
+
+         Args:
+             filename: Audio filename
+
+         Returns:
+             Effective duration in seconds
+         """
+         if filename not in self.filename_to_effective:
+             logger.warning(f"No effective duration for {filename}, using default 5.0s")
+             return 5.0
+         return self.filename_to_effective[filename]
+
+     def get_category_effective_stats(self, category: str) -> Dict:
+         """
+         Get effective duration statistics for a category.
+
+         Args:
+             category: Category name
+
+         Returns:
+             Dict with mean, std, min, max, count
+         """
+         if category not in self.category_effective_stats.index:
+             return {'mean': 5.0, 'std': 0.0, 'min': 5.0, 'max': 5.0, 'count': 0}
+
+         stats = self.category_effective_stats.loc[category]
+         return {
+             'mean': stats['mean'],
+             'std': stats['std'],
+             'min': stats['min'],
+             'max': stats['max'],
+             'count': int(stats['count'])
+         }
+
+     def get_files_by_category_with_durations(self, category: str) -> List[Dict]:
+         """
+         Get all files for a category with their effective durations.
+
+         Args:
+             category: Category name
+
+         Returns:
+             List of dicts with filename, effective_duration_s, filepath
+         """
+         cat_df = self.effective_df[self.effective_df['category'] == category]
+
+         results = []
+         for _, row in cat_df.iterrows():
+             results.append({
+                 'filename': row['filename'],
+                 'effective_duration_s': row['effective_duration_s'],
+                 'filepath': str(self.trimmed_audio_path / row['filename']),
+                 'raw_duration_s': row['raw_duration_s'],
+                 'peak_amplitude_db': row['peak_amplitude_db']
+             })
+
+         return results
+
+     def sample_file_from_category_with_duration(
+         self,
+         category: str,
+         min_effective_duration: Optional[float] = None,
+         max_effective_duration: Optional[float] = None
+     ) -> Tuple[str, str, float]:
+         """
+         Sample a file from category with optional duration constraints.
+
+         Args:
+             category: Category name
+             min_effective_duration: Minimum effective duration (optional)
+             max_effective_duration: Maximum effective duration (optional)
+
+         Returns:
+             Tuple of (filename, filepath, effective_duration_s)
+         """
+         files = self.get_files_by_category_with_durations(category)
+
+         # Filter by duration if constraints provided
+         if min_effective_duration is not None:
+             files = [f for f in files if f['effective_duration_s'] >= min_effective_duration]
+         if max_effective_duration is not None:
+             files = [f for f in files if f['effective_duration_s'] <= max_effective_duration]
+
+         if not files:
+             # Fallback to any file from category
+             logger.warning(f"No files match duration constraints for {category}, using any file")
+             files = self.get_files_by_category_with_durations(category)
+
+         selected = random.choice(files)
+         return selected['filename'], selected['filepath'], selected['effective_duration_s']
+
+     def sample_files_from_category_to_reach_duration(
+         self,
+         category: str,
+         target_duration_s: float,
+         prefer_same_file: bool = True
+     ) -> Tuple[List[str], List[str], float]:
+         """
+         Sample files from a category to reach a target total effective duration.
+
+         Args:
+             category: Category name
+             target_duration_s: Target total effective duration
+             prefer_same_file: If True, try repeating same file first
+
+         Returns:
+             Tuple of (filenames_list, filepaths_list, actual_total_duration_s)
+         """
+         files = self.get_files_by_category_with_durations(category)
+
+         if not files:
+             raise ValueError(f"No files found for category: {category}")
+
+         selected_filenames = []
+         selected_filepaths = []
+         total_duration = 0.0
+
+         if prefer_same_file:
+             # Sort by effective duration descending (prefer longer clips)
+             files_sorted = sorted(files, key=lambda x: x['effective_duration_s'], reverse=True)
+             selected_file = files_sorted[0]
+
+             # Calculate how many repetitions are needed
+             reps_needed = max(1, int(target_duration_s / selected_file['effective_duration_s']) + 1)
+
+             for _ in range(reps_needed):
+                 selected_filenames.append(selected_file['filename'])
+                 selected_filepaths.append(selected_file['filepath'])
+                 total_duration += selected_file['effective_duration_s']
+
+                 if total_duration >= target_duration_s:
+                     break
+         else:
+             # Use different files
+             random.shuffle(files)
+             file_idx = 0
+
+             while total_duration < target_duration_s:
+                 selected_file = files[file_idx % len(files)]
+                 selected_filenames.append(selected_file['filename'])
+                 selected_filepaths.append(selected_file['filepath'])
+                 total_duration += selected_file['effective_duration_s']
+                 file_idx += 1
+
+                 # Safety limit
+                 if file_idx > 100:
+                     logger.warning(f"Hit safety limit when sampling files for {category}")
+                     break
+
+         return selected_filenames, selected_filepaths, total_duration
+
+     def get_categories_sorted_by_effective_duration(self, ascending: bool = True) -> List[str]:
+         """
+         Get categories sorted by their mean effective duration.
+
+         Args:
+             ascending: If True, shortest first; if False, longest first
+
+         Returns:
+             List of category names sorted by mean effective duration
+         """
+         sorted_stats = self.category_effective_stats.sort_values('mean', ascending=ascending)
+         return sorted_stats.index.tolist()
+
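A usage sketch of the preprocessed handler; all paths are assumptions, and it requires the preprocessing step to have produced `effective_durations.csv` and the trimmed clips:

```python
from utils.dataset_utils import PreprocessedESC50Dataset  # assumed path

ds = PreprocessedESC50Dataset(
    metadata_path='ESC-50-master/meta/esc50.csv',  # assumed
    audio_path='ESC-50-master/audio',              # assumed
    preprocessed_path='preprocessed_esc50',        # assumed
)
# One clip with at least 2s of effective (post-trim) sound:
fn, path, dur = ds.sample_file_from_category_with_duration('dog', min_effective_duration=2.0)
# Enough repeats of one clip to cover ~12s of effective audio:
names, paths, total = ds.sample_files_from_category_to_reach_duration('dog', 12.0)
print(dur, total, ds.get_categories_sorted_by_effective_duration()[:3])
```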
utils/llm_utils.py ADDED
@@ -0,0 +1,144 @@
+ """
2
+ LLM-based question generation utilities.
3
+
4
+ Supports multiple LLM providers for generating natural, lexically consistent questions.
5
+ """
6
+
7
+ import os
8
+ import random
9
+ from typing import Dict, List, Optional, Tuple
10
+ import json
11
+
12
+ from .logger import setup_logger
13
+
14
+ logger = setup_logger(__name__)
15
+
16
+
17
+ class LLMQuestionGenerator:
18
+ """Generate questions using local Llama 3.1 8B Instruct LLM."""
19
+
20
+ def __init__(
21
+ self,
22
+ enabled: bool = False,
23
+ template_questions: Optional[Dict] = None
24
+ ):
25
+ """
26
+ Initialize LLM question generator.
27
+
28
+ Args:
29
+ enabled: Whether LLM generation is enabled
30
+ template_questions: Template questions for fallback
31
+ """
32
+ self.enabled = enabled
33
+ self.template_questions = template_questions or {}
34
+
35
+ if not self.enabled:
36
+ logger.info("LLM generation disabled, using templates")
37
+ return
38
+
39
+ # TODO: Initialize local Llama 3.1 8B model connection
40
+ # This will be implemented based on your local LLM setup
41
+ logger.info("LLM generation enabled (local Llama 3.1 8B)")
42
+ logger.warning("Local LLM integration not yet implemented, falling back to templates")
43
+
44
+
45
+ def generate_count_questions(
46
+ self,
47
+ correct_count: int,
48
+ categories_present: List[str],
49
+ generate_both: bool = True
50
+ ) -> Dict:
51
+ """
52
+ Generate count task questions.
53
+
54
+ Args:
55
+ correct_count: Correct number of unique sounds
56
+ categories_present: List of sound categories in the audio
57
+ generate_both: Whether to generate both MCQ and open-text
58
+
59
+ Returns:
60
+ Dictionary with mcq_question and/or open_text_question
61
+ """
62
+ # TODO: Implement LLM generation when enabled
63
+ # For now, always use templates
64
+ return self._generate_count_template(correct_count)
65
+
66
+ def generate_category_questions(
67
+ self,
68
+ task_type: str,
69
+ correct_category: str,
70
+ categories_present: List[str],
71
+ context: Optional[Dict] = None
72
+ ) -> Dict:
73
+ """
74
+ Generate questions where the answer is a sound category.
75
+
76
+ Args:
77
+ task_type: Type of task (duration, order, volume)
78
+ correct_category: Correct answer category
79
+ categories_present: All categories in the audio
80
+ context: Additional context (e.g., question_type, reference_sound)
81
+
82
+ Returns:
83
+ Dictionary with mcq_question and open_text_question
84
+ """
85
+ # TODO: Implement LLM generation when enabled
86
+ # For now, always use templates
87
+ return self._generate_category_template(task_type, correct_category, context)
88
+
89
+ def _generate_count_template(self, correct_count: int) -> Dict:
90
+ """Generate count questions from templates."""
91
+ mcq_templates = self.template_questions.get("count", {}).get("mcq", [
92
+ "What is the number of distinct sound sources in the audio file?",
93
+ "How many different types of sounds can be identified in this recording?"
94
+ ])
95
+ open_templates = self.template_questions.get("count", {}).get("open_text", [
96
+ "How many distinct sound sources are present in the audio?",
97
+ "Count the number of unique sounds in this recording."
98
+ ])
99
+
100
+ return {
101
+ "mcq_question": random.choice(mcq_templates),
102
+ "open_text_question": random.choice(open_templates)
103
+ }
104
+
105
+ def _generate_category_template(
106
+ self,
107
+ task_type: str,
108
+ correct_category: str,
109
+ context: Optional[Dict]
110
+ ) -> Dict:
111
+ """Generate category questions from templates."""
112
+ context = context or {}
113
+
114
+ if task_type == "duration":
115
+ q_type = context.get("question_type", "shortest")
116
+ mcq_q = f"Which of the following sounds is heard for the {q_type} duration?"
117
+ open_q = f"Which sound is heard for the {q_type} duration in the audio?"
118
+
119
+ elif task_type == "order":
120
+ q_subtype = context.get("question_subtype", "first")
121
+ if q_subtype == "first":
122
+ mcq_q = "Which sound appears first in the audio clip?"
123
+ open_q = "What is the first sound you hear in the audio?"
124
+ elif q_subtype == "last":
125
+ mcq_q = "Which sound appears last in the audio clip?"
126
+ open_q = "What is the last sound you hear in the audio?"
127
+ elif q_subtype == "after":
128
+ ref = context.get("reference_sound", "")
129
+ mcq_q = f"Which sound comes after {ref}?"
130
+ open_q = f"What sound comes after {ref}?"
131
+ else:
132
+ ref = context.get("reference_sound", "")
133
+ mcq_q = f"Which sound comes before {ref}?"
134
+ open_q = f"What sound comes before {ref}?"
135
+
136
+ else: # volume
137
+ q_type = context.get("question_type", "loudest")
138
+ mcq_q = f"Which sound is the {q_type} in the audio?"
139
+ open_q = f"Identify the {q_type} sound in the audio clip."
140
+
141
+ return {
142
+ "mcq_question": mcq_q,
143
+ "open_text_question": open_q
144
+ }
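With the local LLM still stubbed out, the generator is effectively a template picker; a sketch of the fallback path (import path assumed):

```python
from utils.llm_utils import LLMQuestionGenerator  # assumed path

gen = LLMQuestionGenerator(enabled=False)  # templates only
qs = gen.generate_category_questions(
    task_type='order',
    correct_category='dog',
    categories_present=['dog', 'rain', 'siren'],
    context={'question_subtype': 'after', 'reference_sound': 'rain'},
)
print(qs['mcq_question'])        # Which sound comes after rain?
print(qs['open_text_question'])  # What sound comes after rain?
```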