malay-36 committed on
Commit fec9168 · verified · 1 Parent(s): 1140d11

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +19 -0
  2. DOCS.md +1296 -0
  3. README.md +112 -0
  4. config.yaml +348 -0
  5. llm_answer_generator.py +268 -0
  6. main.py +272 -0
  7. preprocess_esc50.py +714 -0
  8. requirements.txt +6 -0
  9. run_llm_answers_all.sh +28 -0
  10. run_pipeline.sh +166 -0
  11. synthetic_silences/silent_1.wav +3 -0
  12. synthetic_silences/silent_10.wav +3 -0
  13. synthetic_silences/silent_11.wav +3 -0
  14. synthetic_silences/silent_12.wav +3 -0
  15. synthetic_silences/silent_13.wav +3 -0
  16. synthetic_silences/silent_14.wav +3 -0
  17. synthetic_silences/silent_15.wav +3 -0
  18. synthetic_silences/silent_16.wav +3 -0
  19. synthetic_silences/silent_17.wav +3 -0
  20. synthetic_silences/silent_18.wav +3 -0
  21. synthetic_silences/silent_19.wav +0 -0
  22. synthetic_silences/silent_2.wav +3 -0
  23. synthetic_silences/silent_20.wav +3 -0
  24. synthetic_silences/silent_3.wav +3 -0
  25. synthetic_silences/silent_4.wav +3 -0
  26. synthetic_silences/silent_5.wav +3 -0
  27. synthetic_silences/silent_6.wav +3 -0
  28. synthetic_silences/silent_7.wav +3 -0
  29. synthetic_silences/silent_8.wav +3 -0
  30. synthetic_silences/silent_9.wav +3 -0
  31. tasks/__pycache__/task_count.cpython-312.pyc +0 -0
  32. tasks/__pycache__/task_duration.cpython-312.pyc +0 -0
  33. tasks/__pycache__/task_order.cpython-312.pyc +0 -0
  34. tasks/__pycache__/task_volume.cpython-312.pyc +0 -0
  35. tasks/task_count.py +472 -0
  36. tasks/task_duration.py +820 -0
  37. tasks/task_order.py +598 -0
  38. tasks/task_volume.py +732 -0
  39. utils/__init__.py +50 -0
  40. utils/__pycache__/__init__.cpython-312.pyc +0 -0
  41. utils/__pycache__/__init__.cpython-314.pyc +0 -0
  42. utils/__pycache__/audio_utils.cpython-312.pyc +0 -0
  43. utils/__pycache__/audio_utils.cpython-314.pyc +0 -0
  44. utils/__pycache__/dataset_utils.cpython-312.pyc +0 -0
  45. utils/__pycache__/llm_utils.cpython-312.pyc +0 -0
  46. utils/__pycache__/logger.cpython-312.pyc +0 -0
  47. utils/__pycache__/question_utils.cpython-312.pyc +0 -0
  48. utils/audio_utils.py +1388 -0
  49. utils/dataset_utils.py +536 -0
  50. utils/llm_utils.py +144 -0
.gitattributes CHANGED
@@ -33,3 +33,22 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ synthetic_silences/silent_1.wav filter=lfs diff=lfs merge=lfs -text
37
+ synthetic_silences/silent_10.wav filter=lfs diff=lfs merge=lfs -text
38
+ synthetic_silences/silent_11.wav filter=lfs diff=lfs merge=lfs -text
39
+ synthetic_silences/silent_12.wav filter=lfs diff=lfs merge=lfs -text
40
+ synthetic_silences/silent_13.wav filter=lfs diff=lfs merge=lfs -text
41
+ synthetic_silences/silent_14.wav filter=lfs diff=lfs merge=lfs -text
42
+ synthetic_silences/silent_15.wav filter=lfs diff=lfs merge=lfs -text
43
+ synthetic_silences/silent_16.wav filter=lfs diff=lfs merge=lfs -text
44
+ synthetic_silences/silent_17.wav filter=lfs diff=lfs merge=lfs -text
45
+ synthetic_silences/silent_18.wav filter=lfs diff=lfs merge=lfs -text
46
+ synthetic_silences/silent_2.wav filter=lfs diff=lfs merge=lfs -text
47
+ synthetic_silences/silent_20.wav filter=lfs diff=lfs merge=lfs -text
48
+ synthetic_silences/silent_3.wav filter=lfs diff=lfs merge=lfs -text
49
+ synthetic_silences/silent_4.wav filter=lfs diff=lfs merge=lfs -text
50
+ synthetic_silences/silent_5.wav filter=lfs diff=lfs merge=lfs -text
51
+ synthetic_silences/silent_6.wav filter=lfs diff=lfs merge=lfs -text
52
+ synthetic_silences/silent_7.wav filter=lfs diff=lfs merge=lfs -text
53
+ synthetic_silences/silent_8.wav filter=lfs diff=lfs merge=lfs -text
54
+ synthetic_silences/silent_9.wav filter=lfs diff=lfs merge=lfs -text
DOCS.md ADDED
@@ -0,0 +1,1296 @@
1
+ # TREA 2.0 - Technical Documentation
2
+
3
+ Comprehensive technical documentation for the TREA 2.0 audio dataset generation pipeline. This document covers the complete implementation including algorithms, mathematical formulations, configuration parameters, preprocessing details, and capacity-aware balancing mechanisms.
4
+
5
+ **For Quick Start Guide**: See [README.md](README.md)
6
+
7
+ ---
8
+
9
+ ## Table of Contents
10
+
11
+ 1. [Pipeline Overview](#pipeline-overview)
12
+ 2. [How Sample Durations Are Generated](#how-sample-durations-are-generated)
13
+ 3. [Configuration Reference](#configuration-reference)
14
+ 4. [ESC-50 Preprocessing](#esc-50-preprocessing-duration-task-only)
15
+ 5. [Audio Utilities](#audio-utilities)
16
+ 6. [Task: COUNT](#task-count)
17
+ 7. [Task: DURATION](#task-duration)
18
+ 8. [Task: ORDER](#task-order)
19
+ 9. [Task: VOLUME](#task-volume)
20
+ 10. [Deterministic Balancing Mechanisms](#deterministic-balancing-mechanisms)
21
+ 11. [Rejection Logic and Retry Mechanisms](#rejection-logic-and-retry-mechanisms)
22
+ 12. [Command-Line Arguments](#command-line-arguments)
23
+ 13. [Summary](#summary)
24
+
25
+ ---
26
+
27
+ ## Pipeline Overview
28
+
29
+ ### Architecture
30
+
31
+ The pipeline generates four types of audio-based question-answering samples:
32
+
33
+ | Task | Question Type | Example Question |
34
+ |------|---------------|------------------|
35
+ | **COUNT** | Counting unique sounds | "How many unique sounds do you hear?" |
36
+ | **DURATION** | Temporal comparison | "Which sound plays for the longest duration?" |
37
+ | **ORDER** | Temporal ordering | "Which sound plays first/last/after X?" |
38
+ | **VOLUME** | Loudness comparison | "Which sound is the loudest/softest?" |
39
+
40
+ ### Directory Structure
41
+
42
+ ```
43
+ pipeline/
44
+ ├── main.py # Entry point - orchestrates all tasks
45
+ ├── config.yaml # All configuration parameters
46
+ ├── tasks/
47
+ │ ├── task_count.py # CountTaskGenerator class
48
+ │ ├── task_duration.py # DurationTaskGenerator class
49
+ │ ├── task_order.py # OrderTaskGenerator class
50
+ │ └── task_volume.py # VolumeTaskGenerator class
51
+ ├── utils/
52
+ │ ├── __init__.py # Exports all utilities
53
+ │ ├── audio_utils.py # Audio processing functions
54
+ │ ├── dataset_utils.py # ESC50Dataset, PreprocessedESC50Dataset
55
+ │ ├── question_utils.py # QuestionGenerator
56
+ │ ├── llm_utils.py # LLMQuestionGenerator
57
+ │ └── logger.py # setup_logger
58
+ └── output/ # Generated outputs
59
+ ```
60
+
61
+ ### Data Flow
62
+
63
+ ```
64
+ ESC-50 Dataset (2000 clips, 50 categories, 5s each)
65
+
66
+ [DURATION TASK ONLY] Preprocessing Script (preprocess_esc50.py)
67
+ ├── Detects sound regions using adaptive noise-floor thresholding
68
+ ├── Trims leading/trailing silence (keeps internal structure)
69
+ ├── Calculates effective durations
70
+
71
+ ESC-50_preprocessed/
72
+ ├── effective_durations.csv (metadata with effective durations)
73
+ └── trimmed_audio/*.wav (edge-trimmed clips)
74
+
75
+ Pipeline (task-specific generation with balancing)
76
+ ├── COUNT: Uses raw ESC-50 clips
77
+ ├── DURATION: Uses preprocessed clips with effective durations
78
+ ├── ORDER: Uses raw ESC-50 clips
79
+ └── VOLUME: Uses raw ESC-50 clips (normalized then volume-adjusted)
80
+
81
+ output/{task}/
82
+ ├── audios/*.wav (generated audio samples)
83
+ ├── {task}_mcq.csv (multiple choice questions)
84
+ ├── {task}_open_text.csv (open-ended questions)
85
+ └── {task}_metadata.csv (detailed metadata)
86
+ ```
87
+
88
+ ### Entry Point: `main.py`
89
+
90
+ The main orchestration happens via individual task runner functions:
91
+
92
+ ```python
93
+ def run_count_task(config: dict, logger):
94
+ generator = CountTaskGenerator(config, logger)
95
+ generator.dataset.reset_category_usage()
96
+ generator.generate_dataset()
97
+
98
+ def run_duration_task(config: dict, logger):
99
+ generator = DurationTaskGenerator(config, logger)
100
+ generator.dataset.reset_category_usage()
101
+ generator.generate_dataset()
102
+
103
+ def run_order_task(config: dict, logger):
104
+ generator = OrderTaskGenerator(config, logger)
105
+ generator.dataset.reset_category_usage()
106
+ generator.generate_dataset()
107
+
108
+ def run_volume_task(config: dict, logger):
109
+ generator = VolumeTaskGenerator(config, logger)
110
+ generator.dataset.reset_category_usage()
111
+ generator.generate_dataset()
112
+ ```
113
+
114
+ ---
115
+
116
+ ## How Sample Durations Are Generated
117
+
118
+ **IMPORTANT**: Sample durations are generated upfront to **exactly fill the target task duration** (up to a remainder smaller than one minimum clip length).
119
+
120
+ ### The Algorithm
121
+
122
+ Located in `utils/audio_utils.py`:
123
+
124
+ ```python
125
+ def generate_sample_durations_for_task(
126
+     task_duration_hours: float,
127
+     min_clip_duration: float,
128
+     max_clip_duration: float
129
+ ) -> list:
130
+     """
131
+     Generate sample durations that exactly fill the target task duration.
132
+     """
133
+     task_duration_seconds = task_duration_hours * 3600
134
+     remaining = task_duration_seconds
135
+     durations = []
136
+
137
+     while remaining >= min_clip_duration:
138
+         # Cap max at remaining to avoid overshoot
139
+         effective_max = min(max_clip_duration, remaining)
140
+
141
+         # If remaining is less than min, we can't fit another sample
142
+         if effective_max < min_clip_duration:
143
+             break
144
+
145
+         # Sample uniformly within valid range
146
+         d = random.uniform(min_clip_duration, effective_max)
147
+         durations.append(d)
148
+         remaining -= d
149
+
150
+     # Shuffle to randomize order
151
+     random.shuffle(durations)
152
+
153
+     return durations
154
+ ```
155
+
156
+ 1. Start with `remaining = total_seconds`
157
+ 2. While `remaining >= min_clip_duration`:
158
+ - Sample `d ~ Uniform(min, min(max, remaining))`
159
+ - Append `d` to durations list
160
+ - Subtract `d` from remaining
161
+ 3. Shuffle and return
162
+
163
+ ### Mathematical Properties
164
+
165
+ **Guarantee**: $\sum_{i=1}^{N} d_i \leq T$ and $T - \sum d_i < d_{\min}$
166
+
167
+ Where:
168
+ - $T$ = total task duration
169
+ - $d_i$ = duration of sample $i$
170
+ - $d_{\min}$ = minimum clip duration
171
+ - $N$ = number of samples generated (variable, not fixed!)
172
+
173
+ **Each duration**: $d_i \sim \text{Uniform}(d_{\min}, \min(d_{\max}, \text{remaining}_i))$
174
+
175
+ ### Example
176
+
177
+ With `task_duration_size = 1.0` hours (3600s), `min = 20s`, `max = 60s`:
178
+
179
+ ```
180
+ remaining=3600 → d₁=45.2s → remaining=3554.8
181
+ remaining=3554.8 → d₂=28.7s → remaining=3526.1
182
+ remaining=3526.1 → d₃=52.1s → remaining=3474.0
183
+ ...
184
+ remaining=35.2 → d₈₉ ~ Uniform(20, 35.2), e.g. d₈₉=27.9s → remaining=7.3 (< 20s, loop exits)
185
+ ```
186
+
187
+ Result: 89 samples totaling ≈3600s (instead of a fixed estimate of 90); the unfilled remainder is always below min_clip_duration (20s)
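+
+ A quick sanity check of these properties (a minimal sketch; the seed and printout are illustrative, not part of the pipeline):
+
+ ```python
+ import random
+
+ random.seed(42)  # the pipeline seeds all randomness (random_seed: 42)
+ durations = generate_sample_durations_for_task(
+     task_duration_hours=1.0,
+     min_clip_duration=20.0,
+     max_clip_duration=60.0,
+ )
+ total = sum(durations)
+ assert total <= 3600.0 and 3600.0 - total < 20.0  # the guarantee above
+ print(f"{len(durations)} samples, {total:.1f}s total")
+ ```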
188
+
189
+ ### Where It's Called
190
+
191
+ Each task's `generate_dataset()` method uses this:
192
+
193
+ ```python
194
+ def generate_dataset(self) -> tuple:
195
+     # Generate all durations upfront
196
+     sample_durations = generate_sample_durations_for_task(
197
+         self.task_duration_hours,
198
+         self.min_clip_duration,
199
+         self.max_clip_duration
200
+     )
201
+     num_samples = len(sample_durations)
202
+
203
+     self.logger.info(f"Generating {num_samples} samples...")
204
+
205
+     # Each sample uses its pre-assigned duration
206
+     for i, target_duration in enumerate(sample_durations):
207
+         metadata = self.generate_sample(i, target_duration=target_duration, ...)
208
+ ```
210
+
211
+ ---
212
+
213
+ ## Configuration Reference
214
+
215
+ All parameters are defined in `config.yaml`.
216
+
217
+ ### Dataset Class Subset Configuration
218
+
219
+ ```yaml
220
+ dataset:
221
+ use_class_subset: false # Enable to use only a subset of ESC-50 classes
222
+ num_classes_subset: 40 # Number of classes for train/val/test (e.g., 40 of 50)
223
+ subset_persist_path: "output/class_subset.json" # Path to save/load class subset
224
+ subset_seed: 42 # Random seed for subset selection (persisted)
225
+ ```
226
+
227
+ **Purpose**: Create in-distribution (ID) splits using a subset of classes, then optionally test on out-of-distribution (OOD) using all classes.
228
+
229
+ **Workflow**:
230
+ 1. Set `use_class_subset: true` and `num_classes_subset: 40`
231
+ 2. Run pipeline - 40 classes randomly selected and saved to `class_subset.json`
232
+ 3. Generate train/val/test splits - all use same 40 classes
233
+ 4. For OOD test: Set `use_class_subset: false`, use different output path
234
+
235
+ ### Global Audio Parameters
236
+
237
+ ```yaml
238
+ audio:
239
+ min_clip_duration: 20.0 # Minimum generated clip duration (seconds)
240
+ max_clip_duration: 60.0 # Maximum generated clip duration (seconds)
241
+ source_clip_duration: 5.0 # ESC-50 clip length (seconds)
242
+
243
+ # Silence and crossfade parameters (applied to ALL tasks)
244
+ min_silence_duration: 100 # Minimum silence ALWAYS between clips (ms)
245
+ max_extra_silence_per_gap: 500 # Max extra silence per gap when distributing remainder (ms)
246
+ crossfade_duration: 500 # Crossfade between audio-silence transitions (ms) for smooth joins
247
+ crossfade_within_source: 50 # Small crossfade within same-source repetitions (ms) for COUNT task
248
+ with_silence: true # Enable silence insertion between clips
249
+
250
+ normalize: false
251
+ normalize_target_dBFS: -20.0
252
+ ```
253
+
254
+ ### Task-Specific Parameters
255
+
256
+ #### COUNT Task
257
+ ```yaml
258
+ count:
259
+ enabled: true
260
+ task_duration_size: 2.0 # Hours of total audio to generate
261
+ max_clips_per_sample: 10 # Maximum unique sounds per sample (1 to 10)
262
+ ordering_mode: "random" # "random" (shuffled clips) or "consecutive" (grouped by source)
263
+
264
+ # CAPACITY-AWARE ANSWER BALANCING:
265
+ # - Creates balanced distribution of answers from 1 to max_clips_per_sample
266
+ # - Sorts samples by capacity (max_clips each can fit)
267
+ # - Assigns higher targets to high-capacity samples
268
+ # - Clamps targets to what actually fits (reduces excessive silence)
269
+ ```
270
+
271
+ #### DURATION Task
272
+ ```yaml
273
+ duration:
274
+ enabled: true
275
+ task_duration_size: 2.0
276
+ preprocessed_data_path: "/home/debarpanb1/TREA_2.0/ESC-50_preprocessed"
277
+ question_types: ["shortest", "longest"]
278
+ num_unique_sources: 10 # Can be int or list (e.g., [2,3,4,5])
279
+ ordering_methods: ["consecutive"] # Only consecutive for duration task
280
+
281
+ # Preprocessing parameters (adaptive noise-floor thresholding)
282
+ threshold_strategy: "noise_floor" # Adaptive per-clip (recommended)
283
+ noise_floor_percentile: 2.0 # Use 2nd percentile as noise floor
284
+ noise_floor_delta_db: 5.0 # Threshold = noise_floor + 5dB
285
+ min_sound_duration_ms: 25 # Filter transient spikes
286
+
287
+ # Gap multipliers
288
+ multiplier_longest: 1.5 # Target must be ≥ 1.5x max background
289
+ multiplier_shortest: 0.75 # Target must be ≤ 0.75x min background (changed from 0.5)
290
+ min_effective_duration_per_source: 1.0 # Minimum duration per source (seconds)
291
+
292
+ reject_if_gap_not_met: true
293
+ sample_different_clips_same_class: true
294
+ ```
295
+
296
+ #### ORDER Task
297
+ ```yaml
298
+ order:
299
+ enabled: true
300
+ task_duration_size: 2.0
301
+ max_clips_per_sample: 10 # Cap for maximum clips to join
302
+ question_types: ["first", "last", "second", "second_last", "after", "before"]
303
+ min_clips_for_second_questions: 3 # "second" and "second_last" require ≥3 clips
304
+ allow_source_repetition: false # Each clip from unique source
305
+
306
+ # CAPACITY-AWARE QUESTION TYPE BALANCING:
307
+ # - Each question type appears equally across samples
308
+ # - Advanced types (second, second_last) assigned to high-capacity samples
309
+ # - Basic types (first, last, after, before) for lower-capacity samples
310
+ # - NO n_clips balancing: randomly samples from [max(2, max_clips-3), max_clips_per_sample]
311
+ ```
312
+
313
+ #### VOLUME Task
314
+ ```yaml
315
+ volume:
316
+ enabled: true
317
+ task_duration_size: 2.0
318
+ max_clips_per_sample: 10 # Cap for maximum clips with different volumes
319
+ question_types: ["max_loudness", "min_loudness"]
320
+
321
+ # Normalization (CRITICAL for controlled volume comparison)
322
+ normalize_to_baseline: true
323
+ baseline_dBFS: -20.0 # All clips normalized to this level first
324
+ use_lufs: false # DISABLED - LUFS makes everything same perceived loudness!
325
+ baseline_lufs: -23.0 # EBU R128 standard (not used when use_lufs=false)
326
+
327
+ # Volume gap constraints (multipliers)
328
+ multiplier_max_loudness: 4.0 # Max must be ≥ 4x second-loudest (~12 dB)
329
+ multiplier_min_loudness: 0.25 # Min must be ≤ 0.25x second-softest (~12 dB)
330
+ reject_if_gap_not_met: true
331
+
332
+ # Source clip options
333
+ use_same_clip_different_volumes: false # Use different clips (not same clip repeated)
334
+ repetitions_per_source: [2, 3, 4] # If same clip used, how many repetitions
335
+
336
+ # QUESTION TYPE BALANCING: Each question type appears equally across samples
337
+ # NO n_clips balancing: randomly samples from [max(2, max_clips-3), max_clips_per_sample]
338
+ ```
339
+
340
+ ---
341
+
342
+ ## ESC-50 Preprocessing (Duration Task Only)
343
+
344
+ **File**: `preprocess_esc50.py`
345
+ **Purpose**: Preprocess ESC-50 clips for duration task by detecting actual sound regions and trimming silence.
346
+
347
+ ### Why Preprocessing?
348
+
349
+ The DURATION task compares sound durations. Raw ESC-50 clips have variable amounts of leading/trailing silence, which would make duration comparisons ambiguous. Preprocessing:
350
+
351
+ 1. **Detects actual sound regions** using adaptive amplitude thresholding
352
+ 2. **Trims leading and trailing silence** (preserves internal structure)
353
+ 3. **Calculates effective duration** (sum of all sound regions)
354
+ 4. **Generates metadata CSV** with per-clip durations
355
+
356
+ ### Preprocessing Pipeline
357
+
358
+ ```
359
+ Raw ESC-50 clip (5s with silence)
360
+
361
+ 1. Load audio and convert to amplitude array
362
+ 2. Compute RMS envelope (frame-by-frame energy)
363
+ 3. Convert RMS to dB values
364
+ 4. Apply adaptive threshold strategy
365
+ 5. Detect contiguous sound regions
366
+ 6. Trim edges (only if silence >= 100ms)
367
+ 7. Calculate effective duration
368
+ 8. Save trimmed audio + metadata
369
+ ```
370
+
371
+ ### Adaptive Noise-Floor Thresholding
372
+
373
+ The preprocessing uses an **adaptive per-clip threshold** strategy:
374
+
375
+ ```python
376
+ # Strategy: 'noise_floor' (adaptive, recommended)
377
+ noise_floor_db = np.percentile(db_values, noise_floor_percentile) # e.g., 2nd percentile
378
+ absolute_threshold = noise_floor_db + noise_floor_delta_db # e.g., +5 dB above noise floor
379
+ ```
380
+
381
+ **Key Parameters** (from `config.yaml`):
382
+ ```yaml
383
+ duration:
384
+ threshold_strategy: "noise_floor" # Adaptive per-clip (recommended)
385
+ noise_floor_percentile: 2.0 # Use 2nd percentile as noise floor estimate
386
+ noise_floor_delta_db: 5.0 # Threshold = noise_floor + 5 dB
387
+ min_sound_duration_ms: 25 # Filter out transient spikes < 25ms
388
+ ```
389
+
390
+ **Why Adaptive?**
391
+ - Each clip has different background noise levels
392
+ - Fixed threshold (e.g., -40 dB) works poorly across diverse sounds
393
+ - Adaptive threshold adjusts per-clip based on its own noise floor
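+
+ A minimal NumPy sketch of steps 2-5 of the preprocessing pipeline (the function name, the 10 ms frame size, and the region bookkeeping are illustrative assumptions; only the threshold rule comes from the snippet above):
+
+ ```python
+ import numpy as np
+
+ def detect_sound_regions_sketch(samples, sr, frame_ms=10,
+                                 noise_floor_percentile=2.0,
+                                 noise_floor_delta_db=5.0,
+                                 min_sound_duration_ms=25):
+     # Frame-wise RMS envelope, converted to dB
+     frame = max(1, int(sr * frame_ms / 1000))
+     n = len(samples) // frame
+     rms = np.sqrt(np.mean(samples[:n * frame].reshape(n, frame) ** 2, axis=1))
+     db = 20 * np.log10(np.maximum(rms, 1e-10))
+     # Adaptive threshold: noise floor + delta
+     threshold = np.percentile(db, noise_floor_percentile) + noise_floor_delta_db
+     active = db > threshold
+     # Contiguous active runs -> (start_ms, end_ms) regions
+     regions, start = [], None
+     for i, a in enumerate(active):
+         if a and start is None:
+             start = i
+         elif not a and start is not None:
+             regions.append((start * frame_ms, i * frame_ms))
+             start = None
+     if start is not None:
+         regions.append((start * frame_ms, len(active) * frame_ms))
+     # Drop transient spikes shorter than min_sound_duration_ms
+     return [(s, e) for s, e in regions if e - s >= min_sound_duration_ms]
+ ```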
394
+
395
+ **Alternative** (legacy):
396
+ ```yaml
397
+ threshold_strategy: "peak_relative" # threshold = peak_dB - 20 dB (fixed offset)
398
+ amplitude_threshold_db: -20.0
399
+ ```
400
+
401
+ ### Edge Trimming Strategy
402
+
403
+ **ADAPTIVE EDGE-ONLY TRIMMING** - preserves natural periodicity:
404
+
405
+ ```python
406
+ def extract_sound_with_edges_trimmed(audio, regions, min_silence_to_trim_ms=100, buffer_ratio=0.1):
407
+ """
408
+ Trim ONLY leftmost and rightmost silence IF significant.
409
+ Preserves ALL internal structure (perfect for periodic sounds).
410
+ """
411
+ leading_silence_ms = regions[0][0] # Time before first sound
412
+ trailing_silence_ms = len(audio) - regions[-1][1] # Time after last sound
413
+
414
+ # Only trim if silence >= 100ms
415
+ if leading_silence_ms >= min_silence_to_trim_ms:
416
+ buffer_ms = max(200, int(leading_silence_ms * 0.1)) # Keep 10% as buffer
417
+ trim_start_ms = max(0, regions[0][0] - buffer_ms)
418
+ else:
419
+ trim_start_ms = 0 # Keep from start
420
+
421
+ # Similar for trailing silence
422
+ ...
423
+
424
+ return audio[trim_start_ms:trim_end_ms]
425
+ ```
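+
+ Worked example: 400 ms of leading silence → buffer = max(200, 40) = 200 ms, so trimming starts at 400 - 200 = 200 ms. A clip with only 80 ms of leading silence (< 100 ms) is left untouched.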
426
+
427
+ **Why Edge-Only?**
428
+ - Clock ticks, footsteps, typing have periodic silence between sounds
429
+ - Removing internal silences destroys natural rhythm
430
+ - Edge trimming removes irrelevant silence while preserving periodicity
431
+
432
+ ### Output Files
433
+
434
+ ```
435
+ ESC-50_preprocessed/
436
+ ├── effective_durations.csv
437
+ │ ├── filename
438
+ │ ├── category
439
+ │ ├── raw_duration_s (original 5.0s)
440
+ │ ├── final_duration_s (after edge trimming)
441
+ │ ├── effective_duration_s (sum of sound regions)
442
+ │ ├── num_sound_regions
443
+ │ ├── peak_amplitude_db
444
+ │ ├── avg_rms_db
445
+ │ └── threshold_strategy, noise_floor_percentile, noise_floor_delta_db
446
+ └── trimmed_audio/
447
+ ├── 1-100032-A-0.wav (edge-trimmed clips)
448
+ └── ...
449
+ ```
450
+
451
+ ### Running Preprocessing
452
+
453
+ ```bash
454
+ # Using config defaults
455
+ python preprocess_esc50.py --config config.yaml
456
+
457
+ # Override parameters
458
+ python preprocess_esc50.py --config config.yaml \
459
+ --threshold-strategy noise_floor \
460
+ --noise-floor-percentile 2.0 \
461
+ --noise-floor-delta-db 5.0 \
462
+ --min-sound-ms 25
463
+
464
+ # Don't save trimmed audio (only CSV)
465
+ python preprocess_esc50.py --config config.yaml --no-trimmed-audio
466
+ ```
467
+
468
+ ### Preprocessing Statistics Example
469
+
470
+ ```
471
+ ESC-50 Preprocessing Summary
472
+ ============================================================
473
+ Total clips processed: 2000
474
+ Successfully processed: 2000
475
+
476
+ Raw duration statistics:
477
+ Mean: 5.000s Std: 0.000s Min: 5.000s Max: 5.000s
478
+
479
+ Final duration statistics (edges trimmed):
480
+ Mean: 4.723s Std: 0.412s Min: 2.134s Max: 5.000s
481
+
482
+ Effective duration statistics (sum of sound regions):
483
+ Mean: 3.856s Std: 0.823s Min: 0.542s Max: 4.982s
484
+
485
+ Comparison:
486
+ Avg effective: 3.856s
487
+ Avg final: 4.723s
488
+ Difference: 0.867s (internal silences preserved)
489
+
490
+ Average edge trimming reduction: 5.5%
491
+ ```
492
+
493
+ ### How Duration Task Uses Preprocessed Data
494
+
495
+ The `DurationTaskGenerator` loads preprocessed data:
496
+
497
+ ```python
498
+ self.preprocessed_dataset = PreprocessedESC50Dataset(
499
+ metadata_csv=config['tasks']['duration']['preprocessed_data_path'] + '/effective_durations.csv',
500
+ audio_dir=config['tasks']['duration']['preprocessed_data_path'] + '/trimmed_audio'
501
+ )
502
+
503
+ # Calculate average effective duration for slot distribution
504
+ effective_durations = self.preprocessed_dataset.metadata_df['effective_duration_s']
505
+ self.avg_effective_duration = effective_durations.mean() # ~3.856s
506
+ ```
507
+
508
+ ---
509
+
510
+ ## Audio Utilities
511
+
512
+ Located in `utils/audio_utils.py`.
513
+
514
+ ### `generate_single_clip_duration(min_duration, max_duration) → float`
515
+
516
+ **Purpose**: Generate a random target clip duration using UNIFORM sampling.
517
+
518
+ **Implementation**:
519
+ ```python
520
+ def generate_single_clip_duration(min_duration: float, max_duration: float) -> float:
521
+ return random.uniform(min_duration, max_duration)
522
+ ```
523
+
524
+ **Mathematical Formulation**:
525
+ $$d \sim \text{Uniform}(d_{\min}, d_{\max})$$
526
+
527
+ With default values (20s, 60s):
528
+ - Mean: $\mu = \frac{20 + 60}{2} = 40$ seconds
529
+ - Standard Deviation: $\sigma = \frac{60 - 20}{\sqrt{12}} \approx 11.5$ seconds
530
+
531
+ ---
532
+
533
+ ### `get_max_clip_num_to_be_joined(target_duration_s, source_duration_s, min_silence_ms) → Tuple[int, float]`
534
+
535
+ **Purpose**: Calculate maximum number of source clips that can fit in target duration.
536
+
537
+ **Returns**: Tuple of (max_clips, remainder_seconds)
538
+
539
+ **Implementation** (conceptual):
540
+ ```python
541
+ def get_max_clip_num_to_be_joined(target_s, source_s, min_silence_ms):
542
+     silence_s = min_silence_ms / 1000.0
543
+     # Each clip + silence except last
544
+     effective_unit = source_s + silence_s
545
+     max_clips = int((target_s + silence_s) / effective_unit)
546
+     remainder = target_s - (max_clips * source_s + (max_clips - 1) * silence_s)
547
+     return max_clips, remainder
548
+ ```
549
+
550
+ **Mathematical Formula**:
551
+ $$N_{\max} = \left\lfloor \frac{T + g}{S + g} \right\rfloor$$
552
+
553
+ Where:
554
+ - $T$ = target duration (seconds)
555
+ - $S$ = source clip duration (5.0s for ESC-50)
556
+ - $g$ = minimum silence gap (seconds)
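+
+ Worked example: with $T = 45$s, $S = 5$s, $g = 0.1$s: $N_{\max} = \lfloor 45.1 / 5.1 \rfloor = 8$, with remainder $45 - (8 \times 5 + 7 \times 0.1) = 4.3$s.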
557
+
558
+ ---
559
+
560
+ ### `build_count_task_audio(source_audios, source_categories, target_duration, ...)`
561
+
562
+ **Purpose**: Build the final audio for COUNT task.
563
+
564
+ **Parameters**:
565
+ - `source_audios`: List of AudioSegment objects (one per category)
566
+ - `source_categories`: List of category names
567
+ - `target_duration`: Target total duration in seconds
568
+ - `ordering_mode`: "random" or "consecutive"
569
+ - `source_clip_duration_seconds`: Duration of each source clip
570
+ - `min_silence_ms`, `max_extra_silence_per_gap_ms`: Silence parameters
571
+
572
+ **Returns**: Tuple of (final_audio, clip_sequence, build_metadata)
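+
+ A hypothetical call using the parameters listed above (the heading elides the exact signature, so the keyword names are assumptions drawn from the parameter list):
+
+ ```python
+ # dog_clip, rain_clip, siren_clip: pydub AudioSegments loaded per category
+ final_audio, clip_sequence, build_meta = build_count_task_audio(
+     source_audios=[dog_clip, rain_clip, siren_clip],
+     source_categories=["dog", "rain", "siren"],
+     target_duration=40.0,
+     ordering_mode="random",
+     source_clip_duration_seconds=5.0,
+     min_silence_ms=100,
+     max_extra_silence_per_gap_ms=500,
+ )
+ ```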
573
+
574
+ ---
575
+
576
+ ### `build_duration_task_audio(...)`
577
+
578
+ **Purpose**: Build audio for DURATION task with slot distribution.
579
+
580
+ ---
581
+
582
+ ### `build_clip_sequence_with_silences(clips, target_duration_s, min_silence_ms, max_extra_silence_per_gap_ms, crossfade_ms)`
583
+
584
+ **Purpose**: Concatenate clips with random silence gaps and smooth crossfades.
585
+
586
+ **Algorithm**:
587
+ 1. Calculate total audio content duration
588
+ 2. Calculate minimum required silence: `(n_clips - 1) × min_silence_ms`
589
+ 3. Calculate available extra time: `target_duration - total_audio - min_silence`
590
+ 4. Distribute extra time randomly across gaps (up to `max_extra_silence_per_gap_ms` per gap)
591
+ 5. Build sequence with crossfades:
592
+ - Audio → Silence: crossfade for smooth transition
593
+ - Silence → Audio: No crossfade (preserves audio start)
594
+
595
+ **Crossfade Benefits**:
596
+ - Smooth transitions between audio and silence
597
+ - Reduces clicks/pops at audio boundaries
598
+ - Preserves natural sound attack (no crossfade at audio start)
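+
+ A minimal pydub sketch of steps 1-5 (the function name, the per-gap budget handling, and the crossfade capping are assumptions for illustration, not the pipeline's actual implementation):
+
+ ```python
+ import random
+ from pydub import AudioSegment
+
+ def build_clip_sequence_sketch(clips, target_duration_s, min_silence_ms=100,
+                                max_extra_silence_per_gap_ms=500, crossfade_ms=500):
+     # Steps 1-3: audio content, minimum silence, leftover budget (all in ms)
+     total_audio_ms = sum(len(c) for c in clips)
+     n_gaps = len(clips) - 1
+     budget_ms = max(0, int(target_duration_s * 1000)
+                     - total_audio_ms - n_gaps * min_silence_ms)
+     # Step 4: distribute extra silence randomly, capped per gap
+     gaps = []
+     for _ in range(n_gaps):
+         extra = random.randint(0, min(max_extra_silence_per_gap_ms, budget_ms))
+         budget_ms -= extra
+         gaps.append(min_silence_ms + extra)
+     # Step 5: crossfade into silence, hard-join out of it
+     out = clips[0]
+     for clip, gap_ms in zip(clips[1:], gaps):
+         silence = AudioSegment.silent(duration=gap_ms)
+         fade = min(crossfade_ms, gap_ms, len(out))
+         out = out.append(silence, crossfade=fade)  # audio → silence: smooth
+         out = out + clip                           # silence → audio: keep attack
+     return out
+ ```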
599
+
600
+ ---
601
+
602
+ ## Task: COUNT
603
+
604
+ **File**: `tasks/task_count.py`
605
+ **Class**: `CountTaskGenerator`
606
+
607
+ ### Complete Flow
608
+
609
+ ```
610
+ CountTaskGenerator.__init__(config, logger)
611
+
612
+ Initialize:
613
+ - ESC50Dataset (loads metadata, tracks category usage)
614
+ - AudioProcessor
615
+ - QuestionGenerator
616
+ - LLMQuestionGenerator (if enabled)
617
+
618
+ generate_dataset()
619
+
620
+ 1. num_samples = calculate_num_samples_for_task(task_duration_hours, min, max)
621
+ 2. Create balanced_answers list from num_clips_per_sample
622
+ 3. Shuffle balanced_answers
623
+ 4. For each sample:
624
+ generate_sample(sample_id, target_unique_count=balanced_answers[i])
625
+ 5. Save CSVs
626
+ ```
627
+
628
+ ### Key Method: `generate_sample(sample_id, target_unique_count)`
629
+
630
+ **Pipeline**:
631
+ 1. Generate random target duration: `clip_duration_seconds = generate_single_clip_duration(min, max)`
632
+ 2. Calculate max clips: `max_clips, remainder = get_max_clip_num_to_be_joined(...)`
633
+ 3. Cap `n_unique_audios` at min(target_unique_count, max_clips, 50)
634
+ 4. Select categories: `selected_categories = dataset.get_least_used_categories(n_unique_audios)`
635
+ 5. Track usage: Increment `category_usage_counts` for each selected category
636
+ 6. Sample one file per category: `dataset.sample_file_from_category(category)`
637
+ 7. Load source audios
638
+ 8. Build final audio: `build_count_task_audio(source_audios, categories, target_duration, ordering_mode, ...)`
639
+ 9. Export audio file
640
+ 10. Generate MCQ and open-text questions
641
+ 11. Return metadata dict
642
+
643
+ ### Balanced Answer Distribution (Updated with max_clips_per_sample)
644
+
645
+ ```python
646
+ # In generate_dataset()
647
+ max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10) # Single number: 10
648
+ possible_answers = list(range(1, max_clips_per_sample + 1)) # [1, 2, 3, ..., 10]
649
+
650
+ samples_per_answer = num_samples // len(possible_answers)
651
+ remainder = num_samples % len(possible_answers)
652
+
653
+ balanced_answers = []
654
+ for answer in possible_answers:
655
+ count = samples_per_answer + (1 if remainder > 0 else 0)
656
+ balanced_answers.extend([answer] * count)
657
+ remainder = max(0, remainder - 1)
658
+
659
+ random.shuffle(balanced_answers)
660
+ ```
661
+
662
+ **For 90 samples, max_clips_per_sample=10**: Each answer (1-10) appears exactly 9 times.
663
+
664
+ ### Silence Reduction Strategy (NEW)
665
+
666
+ Each sample's target answer is capped at what actually fits in the duration:
667
+
668
+ ```python
669
+ # In generate_sample()
670
+ max_clips, _ = get_max_clip_num_to_be_joined(clip_duration_seconds, source_clip_duration, min_silence_ms)
671
+
672
+ if target_unique_count is not None:
673
+ # Cap target at what actually fits (reduces silence)
674
+ n_unique_audios = min(target_unique_count, max_clips, len(CATEGORIES))
675
+ ```
676
+
677
+ **Example**:
678
+ - Target answer from balanced pool: **8 unique sounds**
679
+ - Duration allows: **max_clips = 7**
680
+ - Actual n_unique_audios: **min(8, 7) = 7** ✓ (uses max possible, reduces silence)
681
+
682
+ **Why?** Prevents excessive silence when target exceeds what fits in duration.
683
+
684
+ ---
685
+
686
+ ## Task: DURATION
687
+
688
+ **File**: `tasks/task_duration.py`
689
+ **Class**: `DurationTaskGenerator`
690
+
691
+ ### Complete Flow
692
+
693
+ ```
694
+ DurationTaskGenerator.__init__(config, logger)
695
+
696
+ Initialize:
697
+ - PreprocessedESC50Dataset (uses effective_durations.csv)
698
+ - Calculate avg_effective_duration from preprocessed data
699
+ - AudioProcessor, QuestionGenerator
700
+ - Load multiplier_longest, multiplier_shortest from config
701
+
702
+ generate_dataset()
703
+
704
+ 1. num_samples = calculate_num_samples_for_task(...)
705
+ 2. Create balanced question types: ["longest"] * 45 + ["shortest"] * 45
706
+ 3. Shuffle balanced_types
707
+ 4. While len(samples) < num_samples:
708
+ generate_sample(sample_idx, question_type=balanced_types[idx])
709
+ If returns None → increment rejection_count, continue
710
+ 5. Save CSVs
711
+ ```
712
+
713
+ ### Key Methods
714
+
715
+ #### `_calculate_max_clips_and_sources(target_duration_s, question_type)`
716
+
717
+ **Purpose**: Determine valid number of sources based on question type and duration.
718
+
719
+ **For LONGEST**:
720
+ - Target needs ≥2 clips to beat backgrounds by 1.5x
721
+ - `min_valid_sources = 2`
722
+ - `max_valid_sources = max_clips - 2 + 1` (i.e., `max_clips - 1`: the target takes at least 2 clips and each background takes 1)
723
+
724
+ **For SHORTEST**:
725
+ - Target gets 1 clip
726
+ - Each background needs ≥2 clips to be 2x target
727
+ - `max_valid_sources = 1 + (max_clips - 1) // 2`
728
+
729
+ ```python
730
+ # Filter config values to valid range, then pick RANDOMLY
731
+ valid_config_sources = [n for n in num_sources_config if min_valid <= n <= max_valid]
732
+ n_sources = random.choice(valid_config_sources)
733
+ ```
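+
+ Worked example: with `max_clips = 12`, LONGEST allows `n_sources` from 2 to 11, while SHORTEST allows at most `1 + (12 - 1) // 2 = 6` sources.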
734
+
735
+ #### `_try_generate_sample(sample_id, question_type)`
736
+
737
+ **Full Algorithm**:
738
+ 1. Generate target duration: `generate_single_clip_duration(min, max)`
739
+ 2. Calculate max_clips and n_sources: `_calculate_max_clips_and_sources(...)`
740
+ 3. Select target category (least used)
741
+ 4. Select background categories (from remaining least used)
742
+ 5. Calculate slot distribution based on question_type
743
+ 6. For each category, select source files and generate clip durations
744
+ 7. Load and trim clips
745
+ 8. Calculate total effective duration per category
746
+ 9. Verify gap constraint
747
+ 10. If gap not satisfied, try `_try_improve_slot_distribution()`
748
+ 11. If still not satisfied, return None (triggers retry)
749
+ 12. Build audio and generate questions
750
+ 13. Return metadata
751
+
752
+ #### `_try_improve_slot_distribution(slot_distribution, durations, question_type, max_clips)`
753
+
754
+ **Purpose**: Redistribute slots to satisfy gap constraint.
755
+
756
+ ---
757
+
758
+ ## Task: ORDER
759
+
760
+ **File**: `tasks/task_order.py`
761
+ **Class**: `OrderTaskGenerator`
762
+
763
+ ### Complete Flow
764
+
765
+ ```
766
+ OrderTaskGenerator.__init__(config, logger)
767
+
768
+ Initialize ESC50Dataset, AudioProcessor, QuestionGenerator
769
+
770
+ generate_dataset()
771
+
772
+ 1. Generate sample durations upfront (exact fill)
773
+ 2. num_samples = len(sample_durations)
774
+ 3. Create balanced question_types distribution
775
+ 4. For each sample:
776
+ generate_sample(sample_id, target_question_type=balanced_types[i])
777
+ → n_clips randomly selected from [max(2, max_clips-3), min(max_clips, max_clips_per_sample)]
778
+ 5. Save CSVs
779
+ ```
780
+
781
+ ### Key Method: `_get_valid_question_types(n_clips)`
782
+
783
+ Filters question types based on clip count:
784
+ - `second`, `second_last`: require `n_clips >= min_clips_for_second_questions` (default: 3)
785
+ - `after`, `before`: require `n_clips >= 2`
786
+ - `first`, `last`: always valid
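+
+ A sketch of the filtering these rules imply (the body is a reconstruction, not the actual source):
+
+ ```python
+ def get_valid_question_types_sketch(n_clips, min_clips_for_second=3):
+     valid = ['first', 'last']            # always valid
+     if n_clips >= 2:
+         valid += ['after', 'before']
+     if n_clips >= min_clips_for_second:  # config: min_clips_for_second_questions
+         valid += ['second', 'second_last']
+     return valid
+ ```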
787
+
788
+ ### Key Method: `generate_sample(sample_id, target_question_type, target_duration_seconds)`
789
+
790
+ **Algorithm**:
791
+ 1. Use pre-generated `target_duration_seconds` (from sample_durations)
792
+ 2. Calculate max_clips from duration: `get_max_clip_num_to_be_joined(...)`
793
+ 3. **Silence reduction - randomly select n_clips**:
794
+ ```python
795
+ min_clips = max(2, max_clips - 3)
796
+ max_clips_allowed = min(max_clips, max_clips_per_sample, len(CATEGORIES))
797
+ if min_clips > max_clips_allowed: # Handle edge case
798
+ min_clips = max_clips_allowed
799
+ n_clips = random.randint(min_clips, max_clips_allowed)
800
+ ```
801
+ 4. Get valid question types for n_clips
802
+ 5. Select answer position based on question type:
803
+ - `first` → position 0
804
+ - `last` → position n_clips - 1
805
+ - `second` → position 1
806
+ - `second_last` → position n_clips - 2
807
+ - `after` → random position 1 to n-1
808
+ - `before` → random position 0 to n-2
809
+ 6. Select categories using least-used balancing (answer first, then others)
810
+ 7. Build audio with `build_clip_sequence_with_silences` (includes crossfade)
811
+ 8. Generate questions including sequence question
812
+ 9. Return metadata
813
+
814
+ **Silence Reduction**: Target n_clips is capped at `max_clips` to avoid excessive silence.
815
+
816
+ ---
817
+
818
+ ## Task: VOLUME
819
+
820
+ **File**: `tasks/task_volume.py`
821
+ **Class**: `VolumeTaskGenerator`
822
+
823
+ ### Complete Flow
824
+
825
+ ```
826
+ VolumeTaskGenerator.__init__(config, logger)
827
+
828
+ Initialize ESC50Dataset, AudioProcessor, QuestionGenerator
829
+ Load multiplier_max_loudness, multiplier_min_loudness, baseline normalization settings
830
+
831
+ generate_dataset()
832
+
833
+ 1. Generate sample durations upfront (exact fill)
834
+ 2. num_samples = len(sample_durations)
835
+ 3. Create balanced clips_count_pool from 2 to max_clips_per_sample
836
+ 4. Create balanced question_types: ["max_loudness"] * N/2 + ["min_loudness"] * N/2
837
+ 5. Shuffle both pools
838
+ 6. Store clips_count_pool as instance variable
839
+ 7. For each sample:
840
+ generate_sample(sample_id, target_question_type=balanced_types[i])
841
+ → Uses clips_count_pool.pop(0) internally, capped at max_clips_that_fit
842
+ → Normalizes clips to baseline, applies volume adjustments
843
+ → Verifies gap constraints (up to 10 attempts)
844
+ 8. Save CSVs
845
+ ```
846
+
847
+ ### Key Methods
848
+
849
+ #### `_normalize_to_baseline(audio)`
850
+
851
+ ```python
852
+ def _normalize_to_baseline(self, audio):
853
+     if not self.normalize_to_baseline:
854
+         return audio
855
+     change_in_dBFS = self.baseline_dBFS - audio.dBFS
856
+     return audio.apply_gain(change_in_dBFS)
857
+ ```
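+
+ For example, a clip measuring -27.4 dBFS receives `apply_gain(+7.4)` to land on the -20 dBFS baseline.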
858
+
859
+ #### `_verify_loudness_gap(volume_levels, question_type)`
860
+
861
+ **For MAX_LOUDNESS**:
862
+ ```python
863
+ required_gap_dB = 20 * math.log10(self.multiplier_max_loudness)  # ≈ 12.04 dB for multiplier 4.0
864
+ actual_gap_dB = max_level - second_max
865
+ gap_satisfied = actual_gap_dB >= required_gap_dB
866
+ ```
867
+
868
+ **For MIN_LOUDNESS**:
869
+ ```python
870
+ required_gap_dB = abs(20 * math.log10(self.multiplier_min_loudness))  # ≈ 12.04 dB for multiplier 0.25
871
+ actual_gap_dB = second_min - min_level
872
+ gap_satisfied = actual_gap_dB >= required_gap_dB
873
+ ```
874
+
875
+ #### Volume Level Generation
876
+
877
+ Volume levels are generated to satisfy gap constraints:
878
+ - For `max_loudness`: target gets +gap_dB above baseline, backgrounds at/below baseline
879
+ - For `min_loudness`: target gets -gap_dB below baseline, backgrounds at/above baseline
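+
+ A minimal sketch of this scheme (the function name, the background band, and the answer placement are illustrative assumptions):
+
+ ```python
+ import math
+ import random
+
+ def generate_volume_levels_sketch(n_clips, question_type,
+                                   multiplier=4.0, answer_idx=0):
+     gap_db = 20 * math.log10(multiplier)  # ≈ 12.04 dB for multiplier 4.0
+     # Backgrounds sit in a narrow band around the baseline (0 dB adjustment)
+     backgrounds = [random.uniform(-3.0, 0.0) for _ in range(n_clips - 1)]
+     if question_type == 'max_loudness':
+         target = max(backgrounds) + gap_db  # clearly loudest
+     else:  # 'min_loudness'
+         target = min(backgrounds) - gap_db  # clearly softest
+     return backgrounds[:answer_idx] + [target] + backgrounds[answer_idx:]
+ ```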
880
+
881
+ ---
882
+
883
+ ## Deterministic Balancing Mechanisms
884
+
885
+ ### Overview
886
+
887
+ The pipeline ensures balanced distributions across multiple dimensions with **capacity-aware assignment**.
888
+
889
+ ### 1. Capacity-Aware Answer Balancing (COUNT Task)
890
+
891
+ Each possible answer (1-10) appears equally often, but **higher targets are assigned to samples with higher capacity**.
892
+
893
+ ```python
894
+ # Calculate capacity for each sample
895
+ for duration in sample_durations:
896
+ max_clips, _ = get_max_clip_num_to_be_joined(duration, source_clip_duration, min_silence_ms)
897
+ max_for_sample = min(max_clips, max_clips_per_sample, len(CATEGORIES))
898
+ sample_max_clips.append(max_for_sample)
899
+
900
+ # Create balanced pool
901
+ possible_answers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
902
+ samples_per_answer = num_samples // len(possible_answers)
903
+ remainder = num_samples % len(possible_answers)
904
+
905
+ assignment_pool = []
906
+ for answer in possible_answers:
907
+ count = samples_per_answer + (1 if remainder > 0 else 0)
908
+ assignment_pool.extend([answer] * count)
909
+ remainder = max(0, remainder - 1)
910
+
911
+ # Sort samples by capacity (descending)
912
+ sample_info.sort(key=lambda x: x[2], reverse=True)
913
+
914
+ # Sort pool descending - assign high targets first
915
+ assignment_pool.sort(reverse=True)
916
+
917
+ # Assign targets, clamped to capacity
918
+ for idx, (sample_idx, duration, capacity) in enumerate(sample_info):
919
+ target = min(assignment_pool[idx], capacity)
920
+ balanced_assignments[sample_idx] = target
921
+ ```
922
+
923
+ **Guarantee**: Each answer value appears equally, and high targets go to samples that can fit them.
924
+
925
+ ### 2. Capacity-Aware Question Type Balancing (ORDER Task)
926
+
927
+ ORDER task uses **capacity-aware balancing** - advanced question types assigned to high-capacity samples.
928
+
929
+ ```python
930
+ # Separate question types by requirements
931
+ basic_types = ['first', 'last', 'after', 'before'] # Need >= 2 clips
932
+ advanced_types = ['second', 'second_last'] # Need >= min_clips_for_second (e.g., 3)
933
+
934
+ # Sort samples by capacity (descending)
935
+ sample_info.sort(key=lambda x: x[2], reverse=True)
936
+
937
+ # Build assignment pool - advanced types first
938
+ samples_per_type = num_samples // len(question_types)
939
+ remainder = num_samples % len(question_types)
940
+
941
+ assignment_pool = []
942
+ # Add advanced types first (for high-capacity samples)
943
+ for qtype in advanced_types:
944
+ count = samples_per_type + (1 if remainder > 0 else 0)
945
+ assignment_pool.extend([qtype] * count)
946
+ remainder = max(0, remainder - 1)
947
+
948
+ # Then basic types
949
+ for qtype in basic_types:
950
+ count = samples_per_type + (1 if remainder > 0 else 0)
951
+ assignment_pool.extend([qtype] * count)
952
+ remainder = max(0, remainder - 1)
953
+
954
+ # Assign with validation
955
+ for idx, (sample_idx, duration, capacity) in enumerate(sample_info):
956
+ target_qtype = assignment_pool[idx]
957
+ valid_types = _get_valid_question_types(capacity)
958
+
959
+ if target_qtype not in valid_types:
960
+ # Downgrade to valid type
961
+ target_qtype = random.choice(valid_types)
962
+
963
+ balanced_assignments[sample_idx] = target_qtype
964
+ ```
965
+
966
+ ### 3. Simple Question Type Balancing (DURATION, VOLUME Tasks)
967
+
968
+ ```python
969
+ # DURATION: 2 types → N/2 each
970
+ # VOLUME: 2 types → N/2 each
971
+
972
+ samples_per_type = num_samples // len(question_types)
973
+ remainder = num_samples % len(question_types)
974
+
975
+ balanced_types = []
976
+ for qtype in question_types:
977
+ count = samples_per_type + (1 if remainder > 0 else 0)
978
+ balanced_types.extend([qtype] * count)
979
+ remainder = max(0, remainder - 1)
980
+
981
+ random.shuffle(balanced_types)
982
+ ```
983
+
984
+ ### 4. Category Usage Balancing
985
+
986
+ All 50 ESC-50 categories are used equally via least-used selection:
987
+
988
+ ```python
989
+ def get_least_used_categories(self, n: int, exclude: List[str] = None) -> List[str]:
990
+ # Sort categories by usage count
991
+ sorted_cats = sorted(
992
+ self.category_usage_counts.items(),
993
+ key=lambda x: (x[1], x[0]) # Sort by count, then alphabetically for ties
994
+ )
995
+ # Filter excluded and return first n
996
+ available = [cat for cat, _ in sorted_cats if cat not in (exclude or [])]
997
+ return available[:n]
998
+ ```
999
+
1000
+ Each task calls `reset_category_usage()` at the start to ensure independent balancing.
1001
+
1002
+ ### 5. N_Clips Selection Strategy
1003
+
1004
+ **COUNT Task**: Uses capacity-aware answer balancing (see #1 above)
1005
+
1006
+ **ORDER and VOLUME Tasks**: Use **silence reduction strategy** (NOT balanced):
1007
+ ```python
1008
+ # Randomly sample n_clips from valid range to minimize silence
1009
+ min_clips = max(2, max_clips - 3)
1010
+ max_clips_allowed = min(max_clips, max_clips_per_sample, len(CATEGORIES))
1011
+
1012
+ if min_clips > max_clips_allowed:
1013
+ min_clips = max_clips_allowed # Handle edge case
1014
+
1015
+ n_clips = random.randint(min_clips, max_clips_allowed)
1016
+ ```
1017
+
1018
+ This maximizes clip usage within the allowed range, minimizing excessive silence.
1019
+
1020
+ ---
1021
+
1022
+ ## Rejection Logic and Retry Mechanisms
1023
+
1024
+ ### When Samples Are Rejected
1025
+
1026
+ Rejections occur only in tasks with gap constraints:
1027
+
1028
+ 1. **DURATION Task**: Gap constraint not satisfied
1029
+ - LONGEST: target_duration < max_background × 1.5
1030
+ - SHORTEST: target_duration > min_background × 0.75
1031
+
1032
+ 2. **VOLUME Task**: Gap constraint not satisfied
1033
+ - MAX_LOUDNESS: actual_gap_dB < required_gap_dB (≈12 dB for multiplier 4.0)
1034
+ - MIN_LOUDNESS: actual_gap_dB < required_gap_dB (≈12 dB for multiplier 0.25)
1035
+
1036
+ ### DURATION Task Retry Logic
1037
+
1038
+ ```python
1039
+ def generate_dataset(self):
1040
+ all_metadata = []
1041
+ sample_idx = 0
1042
+ type_idx = 0
1043
+
1044
+ while len(all_metadata) < num_samples and type_idx < len(balanced_types) * 2:
1045
+ question_type = balanced_types[type_idx % len(balanced_types)]
1046
+
1047
+ metadata = self.generate_sample(sample_idx, question_type)
1048
+
1049
+ if metadata is not None:
1050
+ all_metadata.append(metadata)
1051
+ sample_idx += 1
1052
+ # If None, sample was rejected - just move to next
1053
+
1054
+ type_idx += 1
1055
+ ```
1056
+
1057
+ ### Rejection Rate Calculation
1058
+
1059
+ $$\text{Rejection Rate} = \frac{\text{rejections}}{\text{rejections} + \text{successes}} \times 100\%$$
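+
+ For example, 15 rejections alongside 90 accepted samples gives $\frac{15}{15 + 90} \times 100\% \approx 14.3\%$.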
1060
+
1061
+ ---
1062
+
1063
+ ## Complete Task Creation Explanation
1064
+
1065
+ ### How Each Task Is Generated (Step-by-Step)
1066
+
1067
+ #### COUNT TASK - "How many unique sounds?"
1068
+
1069
+ **Goal**: Create audio with N unique sound sources, ask how many distinct sounds exist.
1070
+
1071
+ **Process**:
1072
+ 1. **Preprocessing**: None (uses raw ESC-50 clips)
1073
+ 2. **Duration Generation**: `target_duration ~ Uniform(20s, 60s)` per sample
1074
+ 3. **Calculate Max Clips**: `max_clips = get_max_clip_num_to_be_joined(target_duration, 5s, 100ms)`
1075
+ - Example: 45s duration → ~8 clips of 5s each with 100ms silence between
1076
+ 4. **Balanced Answer Selection**: Pre-generated pool of answers [1,2,3,...,10] balanced equally
1077
+ - Target answer (e.g., 5 unique sounds) selected from pool
1078
+ 5. **Silence Reduction**: Cap target at `min(target_answer, max_clips)`
1079
+ - If target=8 but max_clips=6 → use 6 (prevents excessive silence)
1080
+ 6. **Category Selection**: Pick N least-used categories from ESC-50 (balancing)
1081
+ 7. **Audio Construction**:
1082
+ - Load one file per category
1083
+ - Calculate repetitions needed: `total_clips = max_clips`
1084
+ - Distribute repetitions across N sources
1085
+ - **Ordering mode**:
1086
+ - `random`: Shuffle clips (A B A C B...) - harder, tests recognition
1087
+ - `consecutive`: Group same-source (AAA BBB CCC) - easier
1088
+ 8. **Silence Insertion**:
1089
+ - Minimum 100ms silence between EVERY clip
1090
+ - Extra silence (up to 500ms per gap) distributed from remainder
1091
+ - **Crossfade**: 50ms within same-source, 500ms at audio-silence boundaries
1092
+ 9. **Question Generation**: MCQ + open-text asking "How many unique sounds?"
1093
+ 10. **Export**: Save audio WAV + metadata
1094
+
1095
+ **Example**:
1096
+ - Target duration: 40s
1097
+ - Max clips that fit: 7 clips (7×5s + 6×0.1s = 35.6s)
1098
+ - Target answer: 3 unique sounds
1099
+ - Actual: 3 unique sounds (7 total clips: 3+2+2 repetitions)
1100
+ - Ordering: Random shuffle → [A B A C B A C]
1101
+ - Result: Audio with 3 distinct sounds, some repeated, with silences and crossfades
1102
+
1103
+ #### DURATION TASK - "Which sound is longest/shortest?"
1104
+
1105
+ **Goal**: Create audio where one sound has clearly longest/shortest duration compared to others.
1106
+
1107
+ **Process**:
1108
+ 1. **Preprocessing** (preprocess_esc50.py - REQUIRED):
1109
+ - Load raw ESC-50 clips
1110
+ - Detect sound regions using adaptive noise-floor thresholding
1111
+ - Trim leading/trailing silence (preserve internal structure)
1112
+ - Calculate effective duration per clip
1113
+ - Save trimmed audio + effective_durations.csv
1114
+ 2. **Duration Generation**: `target_duration ~ Uniform(20s, 60s)` per sample
1115
+ 3. **Calculate Max Clips**: Based on average effective duration (~3.86s)
1116
+ 4. **Determine N Sources**: Based on question type and max_clips
1117
+ - **LONGEST**: Target needs ≥2 clips, backgrounds get 1 each → `n_sources ≤ max_clips - 1`
1118
+ - **SHORTEST**: Target gets 1 clip, backgrounds need ≥2 each → `n_sources ≤ 1 + (max_clips-1)//2`
1119
+ 5. **Category Selection**: Pick target + backgrounds from least-used categories
1120
+ 6. **Slot Distribution**: Allocate clips to each source
1121
+ - LONGEST: Give most clips to target, 1 to each background
1122
+ - SHORTEST: Give 1 to target, multiple to each background
1123
+ 7. **Clip Selection**: For each source, select clips from preprocessed dataset
1124
+ 8. **Gap Verification**:
1125
+ - LONGEST: `target_duration ≥ max_background × 1.5` ✓
1126
+ - SHORTEST: `target_duration ≤ min_background × 0.75` ✓
1127
+ - If gap not satisfied: Try redistributing slots, or reject sample
1128
+ 9. **Audio Construction**:
1129
+ - Load trimmed clips
1130
+ - Concatenate with consecutive ordering (preserve periodicity)
1131
+ - Insert silences with crossfades
1132
+ 10. **Question Generation**: "Which sound is longest/shortest?"
1133
+ 11. **Export**: Audio + metadata
1134
+
1135
+ **Example**:
1136
+ - Question type: LONGEST
1137
+ - Target duration: 50s, max_clips: 12
1138
+ - N sources: 4 (target + 3 backgrounds)
1139
+ - Slot distribution: Target=6 clips (6×3.8s=22.8s), Backgrounds=2 clips each (2×3.8s=7.6s)
1140
+ - Gap check: 22.8s ≥ 7.6s × 1.5 = 11.4s ✓
1141
+ - Result: Target sound clearly longest
1142
+
1143
+ #### ORDER TASK - "Which sound is first/last/after X?"
1144
+
1145
+ **Goal**: Create ordered sequence of sounds, ask about temporal relationships.
1146
+
1147
+ **Process**:
1148
+ 1. **Preprocessing**: None (uses raw ESC-50)
1149
+ 2. **Duration Generation**: Pre-generated durations to exactly fill task duration
1150
+ 3. **Calculate Max Clips**: `get_max_clip_num_to_be_joined(target_duration, 5s, 100ms)`
1151
+ 4. **N_Clips Selection (silence reduction, not balanced)**: n_clips sampled randomly from the valid range
1152
+ - `n_clips = random.randint(max(2, max_clips - 3), min(max_clips, max_clips_per_sample))`
1153
+ - Keeps n_clips close to what the duration allows, minimizing excessive silence
1154
+ 5. **Question Type Selection**: From balanced pool (first, last, second, after, before, second_last)
1155
+ 6. **Answer Position Determination**: Based on question type
1156
+ - `first` → position 0
1157
+ - `last` → position n_clips-1
1158
+ - `second` → position 1
1159
+ - `second_last` → position n_clips-2
1160
+ - `after`/`before` → random valid position
1161
+ 7. **Category Selection**: Answer category at determined position, others from least-used
1162
+ 8. **Audio Construction**:
1163
+ - Load one clip per position
1164
+ - Build sequence with silences (min 100ms + random extra up to 500ms per gap)
1165
+ - **Crossfade**: 500ms at audio-silence boundaries for smooth transitions
1166
+ 9. **Question Generation**:
1167
+ - MCQ: "Which sound is first?" with 4 options
1168
+ - Open-text: "What is the first sound?" + full sequence
1169
+ 10. **Export**: Audio + metadata
1170
+
1171
+ **Example**:
1172
+ - Target n_clips: 4, max_clips: 8 → use 4 ✓
1173
+ - Question: "Which sound is second?"
1174
+ - Answer position: 1 (0-indexed)
1175
+ - Sequence: [dog, cat, bird, rain] → Answer: cat
1176
+ - Audio: 4 clips in order with silences and crossfades
1177
+
1178
+ #### VOLUME TASK - "Which sound is loudest/softest?"
1179
+
1180
+ **Goal**: Create audio with clips at different volume levels, ask about loudness comparison.
1181
+
1182
+ **Process**:
1183
+ 1. **Preprocessing**: None (uses raw ESC-50)
1184
+ 2. **Duration Generation**: Pre-generated durations
1185
+ 3. **Calculate Max Clips**: `get_max_clip_num_to_be_joined(...)`
1186
+ 4. **Balanced N_Clips Selection**: From pool [2,3,...,10], capped at max_clips
1187
+ 5. **Question Type Selection**: "max_loudness" or "min_loudness" (balanced 50/50)
1188
+ 6. **Volume Level Generation**: Create n_clips volume adjustments (in dB)
1189
+ - Ensure gap constraint (multiplier 4.0 for max, 0.25 for min)
1190
+ - Example: [+12dB, 0dB, -6dB] → max at +12dB has ≥12dB gap from second
1191
+ 7. **Gap Verification** (up to 10 attempts):
1192
+ - MAX: `max_level - second_max ≥ 20×log10(4.0) ≈ 12dB`
1193
+ - MIN: `second_min - min_level ≥ 20×log10(4.0) ≈ 12dB`
1194
+ - If not satisfied: Regenerate levels or reject
1195
+ 8. **Category Selection**: Answer at determined position, others from least-used
1196
+ 9. **Audio Construction**:
1197
+ - Load clips
1198
+ - **CRITICAL: Normalize all to baseline (-20 dBFS)** → ensures controlled comparison
1199
+ - Apply volume adjustments to normalized clips
1200
+ - Concatenate with silences and crossfades
1201
+ 10. **Question Generation**: "Which sound has maximum/minimum loudness?"
1202
+ 11. **Export**: Audio + metadata with volume levels
1203
+
1204
+ **Example**:
1205
+ - Target n_clips: 3, max_clips: 6 → use 3 ✓
1206
+ - Question: "max_loudness"
1207
+ - Volume levels: [+12dB, 0dB, -6dB]
1208
+ - Gap check: 12 - 0 = 12dB ≥ 12dB ✓
1209
+ - Process: Normalize all clips to -20dBFS, then adjust to [-8dBFS, -20dBFS, -26dBFS]
1210
+ - Result: First sound clearly loudest
1211
+
1212
+ ### Key Innovations
1213
+
1214
+ 1. **Crossfade Everywhere**: Smooth transitions at audio-silence boundaries (500ms), small crossfade within same-source repetitions (50ms)
1215
+ 2. **Adaptive Preprocessing**: Noise-floor thresholding adapts per-clip (duration task)
1216
+ 3. **Silence Reduction**: ORDER/VOLUME tasks sample n_clips from [max_clips-3, max_clips_per_sample] to minimize silence
1217
+ 4. **Balanced Distribution**:
1218
+ - **COUNT**: Balances answers (1 to max_clips_per_sample) + question types
1219
+ - **ORDER/VOLUME**: Balances question types only (n_clips uses silence reduction)
1220
+ 5. **Category Balancing**: Least-used selection ensures all 50 ESC-50 categories used evenly
1221
+ 6. **Gap Constraints**: Mathematical guarantees for duration/volume comparisons
1222
+ 7. **Exact Duration Filling**: Pre-generate sample durations to exactly fill task duration (no wasted time)
1223
+
1224
+ ---
1225
+
1226
+ ## Command-Line Arguments
1227
+
1228
+ ### Main Pipeline (`main.py`)
1229
+
1230
+ ```bash
1231
+ python main.py [OPTIONS]
1232
+
1233
+ Options:
1234
+ --config, -c PATH Path to config YAML (default: config.yaml)
1235
+ --tasks, -t TASKS Specific tasks to run (choices: count, duration, order, volume)
1236
+ --output, -o PATH Custom output directory (overrides config)
1237
+
1238
+ Examples:
1239
+ # Run all enabled tasks with default config
1240
+ python main.py
1241
+
1242
+ # Run specific tasks only
1243
+ python main.py --tasks count order
1244
+
1245
+ # Use custom config and output
1246
+ python main.py --config my_config.yaml --output ./my_dataset
1247
+ ```
1248
+
1249
+ ### Preprocessing Script (`preprocess_esc50.py`)
1250
+
1251
+ ```bash
1252
+ python preprocess_esc50.py [OPTIONS]
1253
+
1254
+ Options:
1255
+ --config PATH Path to config YAML (default: config.yaml)
1256
+ --threshold-strategy STRATEGY "noise_floor" or "peak_relative"
1257
+ --threshold-db FLOAT Threshold in dB (for peak_relative)
1258
+ --noise-floor-percentile FLOAT Percentile for noise floor estimation
1259
+ --noise-floor-delta-db FLOAT Delta above noise floor in dB
1260
+ --min-sound-ms INT Minimum sound duration in ms
1261
+ --no-trimmed-audio Skip saving trimmed audio files
1262
+ --output-dir PATH Custom output directory
1263
+
1264
+ Examples:
1265
+ # Use config defaults
1266
+ python preprocess_esc50.py --config config.yaml
1267
+
1268
+ # Override threshold parameters
1269
+ python preprocess_esc50.py --config config.yaml \
1270
+ --threshold-strategy noise_floor \
1271
+ --noise-floor-percentile 2.0 \
1272
+ --noise-floor-delta-db 5.0 \
1273
+ --min-sound-ms 25
1274
+
1275
+ # Generate metadata only (no trimmed audio)
1276
+ python preprocess_esc50.py --config config.yaml --no-trimmed-audio
1277
+ ```
1278
+
1279
+ ---
1280
+
1281
+ ## Summary
1282
+
1283
+ The TREA 2.0 pipeline generates balanced, constraint-satisfying audio QA samples through:
1284
+
1285
+ 1. **Preprocessing** (Duration only): Adaptive noise-floor thresholding + edge trimming
1286
+ 2. **Exact Duration Filling**: Pre-generate sample durations to sum exactly to task duration (see the sketch after this list)
1287
+ 3. **Capacity-Aware Balancing**:
1288
+ - **COUNT**: High answer targets → high-capacity samples
1289
+ - **ORDER**: Advanced question types → high-capacity samples
1290
+ 4. **Silence Reduction**: ORDER/VOLUME randomly sample n_clips from [max_clips-3, max_clips_per_sample]
1291
+ 5. **Crossfade Transitions**: Smooth audio-silence boundaries (500ms) + within-source (50ms)
1292
+ 6. **Category Balancing**: Least-used selection ensures even ESC-50 category distribution
1293
+ 7. **Gap Constraints**: Mathematical guarantees (1.5x for longest, 0.75x for shortest, 4.0x/0.25x for volume)
1294
+ 8. **Retry Mechanisms**: Failed samples rejected, pipeline continues until target count reached
1295
+
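+ Item 2 amounts to drawing clip durations until the time budget is consumed exactly. A sketch of the assumed logic (not the pipeline's exact code):
+
+ ```python
+ import random
+
+ def fill_durations(total_s, min_s=20.0, max_s=60.0, seed=42):
+     """Draw sample durations that sum exactly to total_s (assumed logic)."""
+     rng = random.Random(seed)
+     durations, remaining = [], total_s
+     while remaining > 0:
+         if remaining <= max_s:
+             d = remaining  # consume the budget exactly
+         else:
+             # keep the leftover large enough to host another sample
+             d = rng.uniform(min_s, min(max_s, remaining - min_s))
+         durations.append(d)
+         remaining -= d
+     return durations
+
+ ds = fill_durations(2.0 * 3600)   # 2-hour task budget
+ print(len(ds), sum(ds))           # sums to 7200.0 (up to float rounding)
+ ```
+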
1296
+ All randomness is seeded (`random_seed: 42`) for reproducibility.
README.md ADDED
@@ -0,0 +1,112 @@
1
+ # TREA 2.0 Pipeline
2
+
3
+ Audio question-answering dataset generator using ESC-50. Creates four task types: COUNT, DURATION, ORDER, and VOLUME.
4
+
5
+ ## Quick Start
6
+
7
+ ```bash
8
+ # 1. Install dependencies
9
+ pip install -r requirements.txt
10
+
11
+ # 2. Preprocess ESC-50 (required for DURATION task only)
12
+ python preprocess_esc50.py --config config.yaml
13
+
14
+ # 3. Generate datasets
15
+ python main.py --config config.yaml
16
+ ```
17
+
18
+ ## Configuration
19
+
20
+ Edit `config.yaml` to set:
21
+ - **Task duration**: `task_duration_size` (hours) per task
22
+ - **Clip duration range**: `min_clip_duration` to `max_clip_duration` (seconds)
23
+ - **ESC-50 paths**: Point to your ESC-50 dataset location
24
+ - **Enable/disable tasks**: Set `enabled: true/false` for each task
25
+
26
+ ## Key Files
27
+
28
+ - **`config.yaml`** - All configuration parameters
29
+ - **`main.py`** - Pipeline entry point (runs all tasks)
30
+ - **`preprocess_esc50.py`** - Preprocess ESC-50 for duration task
31
+ - **`tasks/task_*.py`** - Individual task generators
32
+
33
+ ## Tasks
34
+
35
+ | Task | Question | Example |
36
+ |------|----------|---------|
37
+ | **COUNT** | "How many unique sounds?" | Audio with 5 distinct sound types |
38
+ | **DURATION** | "Which sound is longest/shortest?" | Compare sound durations |
39
+ | **ORDER** | "Which sound is first/last/after X?" | Temporal sequence questions |
40
+ | **VOLUME** | "Which sound is loudest/softest?" | Loudness comparison |
41
+
42
+ ## Output Structure
43
+
44
+ ```
45
+ output/{task}/
46
+ ├── audios/*.wav # Generated audio files
47
+ ├── {task}_mcq.csv # Multiple choice questions
48
+ ├── {task}_open_text.csv # Open-ended questions
49
+ └── {task}_metadata.csv # Detailed metadata
50
+ ```
51
+
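+ The CSVs load directly with pandas. The column names below follow what `llm_answer_generator.py` expects for the MCQ format (`id`, `audio_path`, `question`, `optionA`-`optionD`, `correct`); the path is illustrative:
+
+ ```python
+ import pandas as pd
+
+ # Peek at a generated MCQ split (illustrative path)
+ df = pd.read_csv("output/count/count_mcq.csv")
+ print(df[["question", "correct"]].head())
+ ```
+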
52
+ ## Shell scripts (quick)
53
+
54
+ Use the provided shell helpers for simple runs.
55
+
56
+ Run full pipeline (uses `python main.py` under the hood):
57
+
58
+ ```bash
59
+ # Make executable and run (from pipeline/)
60
+ ./run_pipeline.sh
61
+
62
+ # With custom config, tasks, and output
63
+ ./run_pipeline.sh --config my_config.yaml --tasks count,order --output ./my_dataset
64
+ ```
65
+
66
+ Run the LLM answer generation across splits (uses `llm_answer_generator.py`):
67
+
68
+ ```bash
69
+ # Processes open_text CSVs across splits/tasks defined in the script
70
+ ./run_llm_answers_all.sh
71
+
72
+ # Or run per-file with the helper script directly
73
+ python llm_answer_generator.py --input /path/to/count_open_text.csv --mode open_text --task count
74
+ ```
75
+
76
+
77
+ ## Advanced Usage
78
+
79
+ ```bash
80
+ # Run specific tasks only
81
+ python main.py --tasks count order
82
+
83
+ # Use custom config
84
+ python main.py --config my_config.yaml
85
+
86
+ # Custom output directory
87
+ python main.py --output /path/to/output
88
+
89
+ # Preprocess with custom parameters
90
+ python preprocess_esc50.py --config config.yaml \
91
+ --threshold-strategy noise_floor \
92
+ --noise-floor-percentile 2.0 \
93
+ --noise-floor-delta-db 5.0
94
+ ```
95
+
96
+ ## Documentation
97
+
98
+ See **`DOCS.md`** for complete technical documentation including:
99
+ - Mathematical formulations
100
+ - Detailed algorithm explanations
101
+ - Configuration parameter reference
102
+ - Preprocessing pipeline details
103
+ - Balancing mechanisms
104
+
105
+ ## Requirements
106
+
107
+ - Python 3.8+
108
+ - pydub
109
+ - numpy
110
+ - pandas
111
+ - tqdm
112
+ - pyyaml
config.yaml ADDED
@@ -0,0 +1,348 @@
1
+ # Temporal Reasoning Audio Dataset Pipeline Configuration
2
+ ##uniform distribution for clip duration
3
+ ##not mixing datasets
4
+
5
+ ##count
6
+ ##pick dataset -> pick class -> pick audio clip -> get duration -> concatenate clips to reach target duration -> modulo to get num clips -> inserting silences randomly based on remainder
7
+
8
+ ##duration
9
+ ##amplitude based filtering -> normalize -> threshold based selection
10
+ ##gap between audio clips - x2/1.5 the shorter one -> add as param
11
+ ##different clips of the same class can be concatenated to reach target duration
12
+ ##consecutive ordering only
13
+ ##based on n unique sources and total clips we can have -> shortest and longest duration calculation
14
+
15
+ ##reject datapoint if same target audio clip cannot be repeated to maintain the gap - arg
16
+ ##sample different clip from the same class -> check if different clips can be used to fill the gap - arg
17
+
18
+ ##amplitude filtered durations in metadata csv
19
+
20
+ ##get_max_clip_num_to_be_joined()
21
+ ##pick dataset -> pick class -> pick audio clip -> get duration -> concatenate clips to reach target duration -> modulo to get num clips -> inserting silences randomly based on remainder
22
+
23
+ ##ensure_silence_between_clips()
24
+ ##silence should always be there between two clips
25
+
26
+ ##order
27
+ ##repeat target clips
28
+ ##second and second last - modify question types
29
+
30
+ ##volume
31
+ ##amplitude-based average loudness per audio clip -> repetitions of the same clip (argument) -> different volume levels based on dB levels
32
+
33
+ ##add crossfade
34
+
35
+ ##trimming - threshold separately for each audio clip - normalize to 0-1 - get threshold -> trim -> concatenate
36
+ ##leftmost and rightmost silence trimming
37
+ ##buffer for trimming - cutting early and cutting a bit late to avoid cutting important parts
38
+ ##periodicity effect
39
+
40
+ ##volume - trim and get average loudness -> normalize -> adjust volume levels
41
+
42
+ ##number of clips per samples to avoid silence
43
+
44
+
45
+ # ESC-50 Dataset paths (each clip is 5 seconds)
46
+ esc50:
47
+ audio_path: "/path/to/ESC-50_github/audio"
48
+ metadata_path: "/path/to/ESC-50_github/meta/esc50.csv"
49
+
50
+ # Synthetic silence audio for concatenation
51
+ synthetic_silence:
52
+ path: "/path/to/synthetic_silences"
53
+
54
+ # Output configuration
55
+ output:
56
+ base_path: "/path/to/pipeline/test_ood"
57
+ # Dataset class-subset configuration
58
+ # Use this to create datasets (train/val/test) from a persistent subset
59
+ # of classes (e.g. use 40 of 50 classes for in-distribution splits and
60
+ # optionally create an OOD test set using all 50 classes).
61
+ dataset:
62
+ use_class_subset: false # if false, use all available classes
63
+ num_classes_subset: 40 # number of classes to use for train/val/test
64
+ subset_persist_path: "/path/to/class_subset.json"
65
+ subset_seed: 42 # RNG seed when sampling the subset (persisted)
66
+
67
+ # Audio generation parameters
68
+ audio:
69
+ # Duration range for each GENERATED clip (in seconds)
70
+ # Original ESC-50 clips are 5s and will be concatenated to create clips in this range
71
+ min_clip_duration: 20.0 # Minimum duration for each generated clip
72
+ max_clip_duration: 60.0 # Maximum duration for each generated clip
73
+
74
+ # Crossfade and silence
75
+ crossfade_duration: 500 # Crossfade between audio and silence (milliseconds) for smooth transitions
76
+ silence_duration: 1000 # Default silence between clips (milliseconds)
77
+ min_silence_duration: 100 # Minimum silence ALWAYS inserted between clips (milliseconds)
78
+ max_extra_silence_per_gap: 500 # Maximum extra silence per gap when distributing remainder
79
+ crossfade_within_source: 50 # Small crossfade within same-source repetitions (count task)
80
+ with_silence: true # Add silence between clips
81
+ # Duration (seconds) of individual source clips (ESC-50 are 5s by default).
82
+ # Used to compute how many source clips are concatenated to reach a target
83
+ # generated clip duration. Change only if your source clips differ.
84
+ source_clip_duration: 5.0
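+   # Example: a 20 s target generated clip uses 20 / 5 = 4 source clips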
85
+
86
+ # Audio normalization
87
+ normalize: false
88
+ normalize_target_dBFS: -20.0
89
+
90
+ # Random seed for reproducibility
91
+ random_seed: 42
92
+
93
+ # LLM for question generation (local Llama 3.1 8B)
94
+ llm:
95
+ enabled: false # Set to true to use LLM for question generation
96
+
97
+ # Task-specific configurations
98
+ tasks:
99
+ count:
100
+ enabled: true
101
+ # Total duration for ALL samples in this task combined (in hours)
102
+ # Pipeline will calculate number of samples based on min/max clip durations
103
+ task_duration_size: 2.0 # hours
104
+
105
+ # Maximum unique sound sources per sample (single number)
106
+ # Actual number will be subsampled from max(1, max_clips-3) to min(max_clips, max_clips_per_sample)
107
+ max_clips_per_sample: 10
108
+
109
+ # Ordering mode for repeated clips of same source:
110
+ # "random": Clips are shuffled randomly (A B A C B A C...) - tests recognition of recurring sounds
111
+ # "consecutive": Same-source clips grouped together (AAA BBB CCC) - easier, just count blocks
112
+ ordering_mode: "random"
113
+
114
+ # Question templates for MCQ
115
+ mcq_questions:
116
+ - "What is the number of distinct sound sources in the audio file?"
117
+ - "How many different types of sounds can be identified in this recording?"
118
+ - "How many unique types of sound are present in this audio?"
119
+ - "Identify the count of different sound sources in this clip."
120
+ - "What is the total number of unique sounds heard in this audio?"
121
+ - "How many distinct sound categories are there in this audio file?"
122
+ - "Determine the number of unique sound sources in this recording."
123
+ - "How many separate sound sources are included in the audio?"
124
+ - "What is the total number of unique sound types in this audio?"
125
+ - "How many different sound sources can be heard in this clip?"
126
+ # Question templates for open-text
127
+ open_text_questions:
128
+ - "How many distinct sound sources are present in the audio?"
129
+ - "Count the number of unique sounds in this recording."
130
+ - "What is the total count of different sound categories heard?"
131
+ - "Identify and count all unique sound types in the clip."
132
+
133
+ duration:
134
+ enabled: true
135
+ # Total duration for ALL samples in this task combined (in hours)
136
+ task_duration_size: 2.0 # hours
137
+
138
+ # Number of unique sound sources per sample (can be single int or list)
139
+ # Single int (e.g., 15): randomly samples from 1 to 15 (like count/order tasks)
140
+ # List (e.g., [2,3,4]): randomly picks from the list
141
+ # The script will automatically generate repetition patterns to create
142
+ # shortest/longest variations based on the target clip duration
143
+ num_unique_sources: 10
144
+
145
+ # Ordering: only keep "consecutive" so repeated segments of the same
146
+ # source remain grouped together, ensuring that multiple consecutive
147
+ # clips of the same audio yield the longest duration unambiguously.
148
+ ordering_methods: ["consecutive"]
149
+
150
+ # =====================================================
151
+ # Amplitude-based filtering parameters (preprocessing)
152
+ # =====================================================
153
+ # RELATIVE dB threshold below peak to consider as silence
154
+ # For each clip: silence_threshold = clip_peak_dB + amplitude_threshold_db
155
+ # Example: If clip peak is -5 dB and threshold is -20, silence threshold = -25 dB
156
+ # Based on ESC-50 analysis: -20 dB gives ~60% effective duration (good balance)
157
+ # More aggressive (removes more silence): -15 dB
158
+ # More conservative (keeps more sound): -25 dB
159
+ amplitude_threshold_db: -20.0
160
+
161
+ # Minimum duration of sound region to keep (milliseconds)
162
+ # Filters out very short transient noise spikes
163
+ # ESC-50 is curated, so 20-30ms is sufficient
164
+ min_sound_duration_ms: 25
165
+
166
+ # =====================================================
167
+ # Adaptive threshold strategy
168
+ # =====================================================
169
+ # "peak_relative": threshold = peak_dB + amplitude_threshold_db (fixed offset from peak)
170
+ # - Simple but not adaptive to actual noise levels
171
+ # "noise_floor": threshold = percentile(dB, N) + delta_dB (RECOMMENDED)
172
+ # - Fully adaptive per-clip based on its own noise floor
173
+ # - Each clip analyzed independently - no fixed dB values needed
174
+ # - Better for diverse audio with varying noise levels
175
+ threshold_strategy: "noise_floor"
176
+
177
+ # Noise floor estimation percentile (used when threshold_strategy = noise_floor)
178
+ # Lower percentile = more conservative estimate of background noise
179
+ # 2 = use 2nd percentile of dB values as noise floor estimate (better for sparse sounds)
180
+ noise_floor_percentile: 2.0
181
+
182
+ # Delta above noise floor (dB) to set as threshold
183
+ # This is relative to EACH clip's own noise floor, not a fixed dB value
184
+ # A delta of 5-8 dB above the clip's own noise floor works well for most ESC-50 clips
185
+ # Higher = more conservative (keeps more), Lower = more aggressive (removes more)
186
+ noise_floor_delta_db: 5.0
187
+
188
+ # Path to preprocessed ESC-50 data (effective durations + trimmed audio)
189
+ preprocessed_data_path: "/path/to/ESC-50_preprocessed"
190
+
191
+ # =====================================================
192
+ # Duration gap multipliers
193
+ # =====================================================
194
+ # For LONGEST questions: target_effective >= max_background × multiplier_longest
195
+ multiplier_longest: 1.5
196
+ # For SHORTEST questions: target_effective <= min_background × multiplier_shortest
197
+ # Using 0.75 instead of 0.5 for smaller gaps (easier to distinguish)
198
+ multiplier_shortest: 0.75
199
+
200
+ # Minimum effective duration per source (seconds)
201
+ # Clips with less than this duration are harder to distinguish
202
+ min_effective_duration_per_source: 1.0
203
+
204
+ # =====================================================
205
+ # Fallback/rejection options
206
+ # =====================================================
207
+ # Reject sample if duration gap cannot be satisfied
208
+ reject_if_gap_not_met: true
209
+ # Try different clips from same class if one clip isn't enough
210
+ sample_different_clips_same_class: true
211
+
212
+ # Question types
213
+ question_types: ["shortest", "longest"]
214
+ # MCQ questions
215
+ mcq_questions:
216
+ shortest: "Which of the following sounds is heard for the shortest duration?"
217
+ longest: "Which of the following sounds is heard for the longest duration?"
218
+ # Open-text questions
219
+ open_text_questions:
220
+ shortest: "Which sound is heard for the shortest duration in the audio?"
221
+ longest: "Which sound is heard for the longest duration in the audio?"
222
+
223
+ order:
224
+ enabled: true
225
+ # Total duration for ALL samples in this task combined (in hours)
226
+ task_duration_size: 2.0 # hours
227
+
228
+ # Maximum clips to join per sample (minimum 2 for ordering)
229
+ # Actual number will be subsampled from max(2, max_clips-3) to min(max_clips, max_clips_per_sample)
230
+ max_clips_per_sample: 10
231
+
232
+ # Whether to allow repeating clips from the same source category
233
+ # If true: sequence could be [dog, dog, cat, bird] (same clip repeated)
234
+ # If false: sequence is always unique sources
235
+ allow_source_repetition: false
236
+
237
+ # Minimum clips needed for "second" and "second_last" questions
238
+ # With 3 clips, "second" and "second_last" both refer to the middle clip
239
+ # (position 1); set this to 4 to guarantee they target different positions
240
+ min_clips_for_second_questions: 3
241
+
242
+ # Question types: "first", "last", "after", "before", "second", "second_last"
243
+ # "second" and "second_last" only generated when n_clips >= min_clips_for_second_questions
244
+ question_types: ["first", "last", "after", "before", "second", "second_last"]
245
+
246
+ # MCQ question templates
247
+ mcq_questions:
248
+ first: "Which sound appears first in the audio clip?"
249
+ last: "Which sound appears last in the audio clip?"
250
+ after: "Which sound comes after {sound1}?"
251
+ before: "Which sound comes before {sound2}?"
252
+ second: "Which sound appears second in the audio clip?"
253
+ second_last: "Which sound appears second to last in the audio clip?"
254
+ # Open-text question templates
255
+ open_text_questions:
256
+ first: "What is the first sound you hear in the audio?"
257
+ last: "What is the last sound you hear in the audio?"
258
+ after: "What sound comes after {sound1}?"
259
+ before: "What sound comes before {sound2}?"
260
+ second: "What is the second sound you hear in the audio?"
261
+ second_last: "What sound is second to last in the audio?"
262
+ sequence: "List the sounds in the order they appear in the audio."
263
+
264
+ volume:
265
+ enabled: true
266
+ # Total duration for ALL samples in this task combined (in hours)
267
+ task_duration_size: 2.0 # hours
268
+
269
+ # Maximum clips with different volumes per sample
270
+ # Actual number will be subsampled from max(2, max_clips-3) to min(max_clips, max_clips_per_sample)
271
+ max_clips_per_sample: 10
272
+
273
+ # =====================================================
274
+ # Normalization settings (CRITICAL for volume comparison)
275
+ # =====================================================
276
+ # All clips are FIRST normalized to baseline, THEN volume adjusted
277
+ # This ensures volume differences are controlled and comparable
278
+ normalize_to_baseline: true
279
+ baseline_dBFS: -20.0 # Normalize all clips to this level first (used if use_lufs=false)
280
+
281
+ # =====================================================
282
+ # LUFS (Perceived Loudness) Settings
283
+ # =====================================================
284
+ # LUFS (Loudness Units Full Scale) measures PERCEIVED loudness
285
+ # Unlike dBFS which only measures RMS amplitude, LUFS accounts for
286
+ # human hearing sensitivity to different frequencies (K-weighting)
287
+ #
288
+ # IMPORTANT: For volume comparison task, we DISABLE LUFS normalization!
289
+ # LUFS makes everything the same perceived loudness, defeating the purpose.
290
+ # Instead, we normalize to a baseline dBFS then apply LARGE volume adjustments.
291
+ use_lufs: false # DISABLED for audible volume differences
292
+ baseline_lufs: -23.0 # EBU R128 standard (not used when use_lufs=false)
293
+
294
+ # =====================================================
295
+ # Volume gap multipliers (similar to duration task)
296
+ # =====================================================
297
+ # For MAX_LOUDNESS questions: target_loudness >= second_loudest × multiplier_max
298
+ # Multiplier 2.5 = ~8dB difference = clearly audible
299
+ # Multiplier 4.0 = ~12dB difference = very obvious (4x amplitude, roughly 2x perceived loudness)
300
+ multiplier_max_loudness: 4.0
301
+
302
+ # For MIN_LOUDNESS questions: target_loudness <= second_softest × multiplier_min
303
+ # Multiplier 0.25 = ~12dB quieter = clearly distinguishable
304
+ multiplier_min_loudness: 0.25
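+   # Gap math: gap_dB = 20 * log10(multiplier) -> 4.0 gives ~+12 dB, 0.25 gives ~-12 dB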
305
+
306
+ # Reject sample if loudness gap cannot be satisfied
307
+ reject_if_gap_not_met: true
308
+
309
+ # =====================================================
310
+ # Source clip options
311
+ # =====================================================
312
+ # If true: same clip can be repeated at different volumes
313
+ # If false: always use different source clips (default behavior)
314
+ use_same_clip_different_volumes: false
315
+
316
+ # If use_same_clip_different_volumes is true, how many repetitions per source?
317
+ # Can be a single int or list for variety
318
+ repetitions_per_source: [2, 3, 4]
319
+
320
+ # Question types: "max_loudness", "min_loudness"
321
+ question_types: ["max_loudness", "min_loudness"]
322
+
323
+ # MCQ questions
324
+ mcq_questions:
325
+ max_loudness: "Which sound has the maximum loudness in the audio?"
326
+ min_loudness: "Which sound has the minimum loudness in the audio?"
327
+ # Open-text questions
328
+ open_text_questions:
329
+ max_loudness: "Identify the sound with maximum loudness in the audio clip."
330
+ min_loudness: "Identify the sound with minimum loudness in the audio clip."
331
+ order_volume: "List the sounds in order from maximum to minimum loudness."
332
+
333
+ # MCQ options configuration
334
+ mcq:
335
+ num_options: 4
336
+ option_labels: ["A", "B", "C", "D"]
337
+ # Strategy for generating distractor options
338
+ # "present_only": only use sounds present in audio
339
+ # "mixed": mix of present and absent sounds
340
+ # "balanced": balanced distribution
341
+ distractor_strategy: "balanced"
342
+
343
+ # Logging configuration
344
+ logging:
345
+ level: "INFO" # DEBUG, INFO, WARNING, ERROR, CRITICAL
346
+ log_file: "pipeline.log"
347
+ console_output: true
348
+
llm_answer_generator.py ADDED
@@ -0,0 +1,268 @@
1
+ import pandas as pd
2
+ import argparse
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM
4
+ import torch
5
+ import random
6
+
7
+ # Convert MCQ CSV to NL answers using a text-only LLM (meta-llama/Llama-3.1-8B-Instruct)
8
+ # Adds: (1) stronger LLM-driven variability for duration/volume in open_text mode via system prompt
9
+ # (2) --one_word_ratio (default 0.2) to skip forward pass for a fraction of rows,
10
+ # outputting the normalized (underscore-removed) answer only.
11
+
12
+
13
+ def convert_to_natural_phrase(val):
14
+ """Convert underscore-separated tokens to natural phrases."""
15
+ if isinstance(val, str) and "_" in val:
16
+ val = val.replace("_", " ")
17
+ return val
18
+
19
+
20
+ def generate_answer(tokenizer, model, question, correct_value, device, mode="mcq"):
21
+ """Generate a natural language answer using a text-only LLM.
22
+
23
+ mode: "mcq" (default) uses the original MCQ-oriented prompt.
24
+ "open_text" uses a direct rewrite prompt for provided question/answer pairs.
25
+ """
26
+ correct_value = convert_to_natural_phrase(correct_value)
27
+
28
+ if mode == "open_text":
29
+ system_preamble = (
30
+ "You convert (Question, short Answer) into EXACTLY ONE natural English sentence that answers the Question.\n\n"
31
+ "HARD RULES:\n"
32
+ "- Output exactly ONE sentence. No newlines, no bullet points, no labels, no quotes.\n"
33
+ "- Use ONLY the provided Answer content as the factual answer; do not add any new facts.\n"
34
+ "- Be concise and direct.\n"
35
+ "- Do NOT include any numbers unless the question is a COUNT question.\n"
36
+ "- Vary phrasing strongly across items; avoid repeating the same structure.\n\n"
37
+ "VARIABILITY REQUIREMENT (IMPORTANT):\n"
38
+ "- For all questions, you MUST vary sentence structure.\n"
39
+ "- Randomly choose ONE of these patterns each time:\n"
40
+ " (A) Start with the sound name (Answer) -> then the relation.\n"
41
+ " (B) Start with the relation -> then the sound name (Answer).\n"
42
+ " (C) Use an 'it`s...' style clause after the Answer.\n"
43
+ " (D) Use a short, natural rephrase with different verbs (e.g., lasts, continues, stands out, comes through).\n"
44
+ "- Do not always use 'The sound with the ... is ...' — that pattern should be rare.\n\n"
45
+ "TASK HANDLING (infer from the Question):\n"
46
+ "- COUNT questions (how many / count / number):\n"
47
+ " * If Answer is numeric, write it EITHER as digits (e.g., 10) OR as a word (e.g., ten). Do NOT include both.\n"
48
+ "- DURATION questions (longest/shortest):\n"
49
+ " * Clearly state longest vs shortest, and use the Answer as the sound name. Do not include any numbers.\n"
50
+ "- VOLUME questions (minimum/maximum loudness, quietest/loudest):\n"
51
+ " * Match minimum vs maximum loudness and use the Answer as the sound name. No dB values.\n"
52
+ "- ORDER questions (first/second/before/after/second-to-last):\n"
53
+ " * Match the requested relation and use the Answer as the sound name.\n\n"
54
+ "Return only the sentence."
55
+ )
56
+
57
+ user_prompt = (
58
+ f"Question: {question}\n"
59
+ f"Answer: {correct_value}\n"
60
+ "Rewrite the answer as a single, natural sentence that directly answers the question."
61
+ )
62
+ else:
63
+ system_preamble = (
64
+ "You are a helpful assistant that converts multiple-choice QA pairs into natural language answers.\n"
65
+ "CRITICAL RULES:\n"
66
+ "1. Write as a human would naturally speak - vary sentence structure and avoid repetitive patterns\n"
67
+ "2. Keep responses concise but natural and affirmative avoiding words like 'might/may' or 'could' - one clear sentence\n"
68
+ "3. Do not mention 'among the options/among the following' even if the question mentions it. This natural language statement is supposed to be a direct answer.\n"
69
+ "4. Do NOT invent sounds.\n"
70
+ "5. Do not reason to answer the question, you're just supposed to provide the correct mcq answer as a natural language answer in a single sentence.\n"
71
+ "Return only the natural language answer, nothing else."
72
+ )
73
+ user_prompt = (
74
+ f"Now, given the question: '{question}' and the correct answer: '{correct_value}', "
75
+ f"write one natural-language answer as you would expect from a human."
76
+ )
77
+
78
+ # Chat format
79
+ messages = [
80
+ {"role": "system", "content": system_preamble},
81
+ {"role": "user", "content": user_prompt},
82
+ ]
83
+ inputs = tokenizer.apply_chat_template(
84
+ messages,
85
+ tokenize=True,
86
+ add_generation_prompt=True,
87
+ return_tensors="pt",
88
+ ).to(device)
89
+
90
+ input_length = inputs.shape[1]
91
+
92
+ with torch.no_grad():
93
+ output = model.generate(
94
+ inputs,
95
+ max_new_tokens=64,
96
+ do_sample=True,
97
+ temperature=0.8,
98
+ top_p=0.9,
99
+ repetition_penalty=1.05,
100
+ no_repeat_ngram_size=3,
101
+ pad_token_id=tokenizer.eos_token_id,
102
+ eos_token_id=tokenizer.eos_token_id,
103
+ )
104
+
105
+ generated_ids = output[0, input_length:]
106
+ response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
107
+ print(f"Model response: {response}")
108
+ return response
109
+
110
+
111
+ def detect_csv_format(df):
112
+ """
113
+ Detect CSV layout and return column mappings.
114
+ Supports:
115
+ - original MCQ format
116
+ - perturbed MCQ format
117
+ - open-text format (question/answer present)
118
+ """
119
+ columns = df.columns.tolist()
120
+
121
+ if "correct" in columns and "id" in columns and "audio_path" in columns:
122
+ # Original format (count.csv)
123
+ return {
124
+ "id_col": "id",
125
+ "audio_path_col": "audio_path",
126
+ "answer_col": "correct",
127
+ "question_col": "question",
128
+ "format_type": "original",
129
+ }
130
+ if "answer" in columns and "idx" in columns and "new_audio_path" in columns:
131
+ # Perturbed format (count_perturbed.csv)
132
+ return {
133
+ "id_col": "idx",
134
+ "audio_path_col": "new_audio_path",
135
+ "answer_col": "answer",
136
+ "question_col": "question",
137
+ "format_type": "perturbed",
138
+ }
139
+ if "answer" in columns and "question" in columns:
140
+ # Open-text format
141
+ return {
142
+ "id_col": "id" if "id" in columns else None,
143
+ "audio_path_col": "audio_path" if "audio_path" in columns else None,
144
+ "answer_col": "answer",
145
+ "question_col": "question",
146
+ "format_type": "open_text",
147
+ }
148
+
149
+ raise ValueError(f"Unknown CSV format. Columns found: {columns}")
150
+
151
+
152
+ def main():
153
+ parser = argparse.ArgumentParser(
154
+ description="Convert CSV to NL answers (MCQ or open-text) using meta-llama/Llama-3.1-8B-Instruct"
155
+ )
156
+ parser.add_argument("--input", required=True, help="Input CSV file")
157
+ parser.add_argument("--output", required=False, help="Output CSV file (defaults to input for in-place append)")
158
+ parser.add_argument(
159
+ "--mode",
160
+ required=True,
161
+ choices=["mcq", "open_text"],
162
+ help="Conversion mode: mcq -> convert MCQ correct option to natural answer; open_text -> rewrite provided short answer to a natural sentence",
163
+ )
164
+ parser.add_argument(
165
+ "--task",
166
+ required=True,
167
+ choices=["count", "duration", "order", "volume"],
168
+ help="Task type this CSV belongs to (used for bookkeeping/logging)",
169
+ )
170
+
171
+ # NEW: one-word skipping
172
+ parser.add_argument(
173
+ "--one_word_ratio",
174
+ type=float,
175
+ default=0.2,
176
+ help="Fraction of samples to output as just the normalized one-word/phrase answer (no LLM forward pass). Default 0.2",
177
+ )
178
+ parser.add_argument(
179
+ "--seed",
180
+ type=int,
181
+ default=123,
182
+ help="Random seed for reproducible one_word sampling.",
183
+ )
184
+
185
+ args = parser.parse_args()
186
+ random.seed(args.seed)
187
+
188
+ print("Loading meta-llama/Llama-3.1-8B-Instruct tokenizer and model...")
189
+ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct", use_fast=False)
190
+ model = AutoModelForCausalLM.from_pretrained(
191
+ "meta-llama/Llama-3.1-8B-Instruct",
192
+ torch_dtype="auto",
193
+ device_map="auto",
194
+ )
195
+ model.eval()
196
+
197
+ df = pd.read_csv(args.input)
198
+
199
+ # Detect CSV format and get column mappings
200
+ format_info = detect_csv_format(df)
201
+ print(f"Detected CSV format: {format_info['format_type']}")
202
+
203
+ # Validate requested mode against detected CSV format
204
+ if args.mode == "mcq" and format_info["format_type"] == "open_text":
205
+ raise ValueError(
206
+ "Requested mode=mcq but input appears to be open_text format. Use --mode open_text or supply an MCQ CSV."
207
+ )
208
+ if args.mode == "open_text" and format_info["format_type"] != "open_text":
209
+ raise ValueError(
210
+ "Requested mode=open_text but input does not appear to be open_text format. Use --mode mcq or supply an open_text CSV."
211
+ )
212
+
213
+ output_path = args.output if args.output else args.input
214
+
215
+ nl_rows = []
216
+ device = model.device
217
+
218
+ for i, row in df.iterrows():
219
+ question = row[format_info["question_col"]]
220
+
221
+ # Resolve correct_value from CSV format
222
+ if format_info["format_type"] == "open_text":
223
+ correct_value = row[format_info["answer_col"]]
224
+ else:
225
+ correct_letter = row[format_info["answer_col"]]
226
+ option_map = {"A": "optionA", "B": "optionB", "C": "optionC", "D": "optionD"}
227
+ correct_value = row[option_map[correct_letter]]
228
+
229
+ # Normalize underscores BEFORE deciding one_word skip
230
+ correct_value = convert_to_natural_phrase(correct_value)
231
+
232
+ print(f"[{i+1}/{len(df)}] Q: {question} | Ans: {correct_value}")
233
+
234
+ # With probability one_word_ratio (default 0.2): output the normalized one-word/phrase answer, skipping the LLM forward pass
235
+ if random.random() < args.one_word_ratio:
236
+ nl_answer = correct_value
237
+ print(f"Skipped LLM (one_word_ratio). Output: {nl_answer}")
238
+ else:
239
+ nl_answer = generate_answer(
240
+ tokenizer,
241
+ model,
242
+ question,
243
+ correct_value,
244
+ device,
245
+ mode=("open_text" if format_info["format_type"] == "open_text" else "mcq"),
246
+ )
247
+
248
+ nl_rows.append(
249
+ {
250
+ "question": question,
251
+ "id": row[format_info["id_col"]] if format_info.get("id_col") and format_info["id_col"] in row else None,
252
+ "audio_path": row[format_info["audio_path_col"]]
253
+ if format_info.get("audio_path_col")
254
+ else None,
255
+ "original_answer": correct_value,
256
+ "open_text_answer": nl_answer,
257
+ }
258
+ )
259
+
260
+ # Merge back as new column to the original CSV to preserve all fields
261
+ nl_df = pd.DataFrame(nl_rows)
262
+ df["open_text_answer"] = nl_df["open_text_answer"]
263
+ df.to_csv(output_path, index=False)
264
+ print(f"Appended natural language answers to {output_path}")
265
+
266
+
267
+ if __name__ == "__main__":
268
+ main()
main.py ADDED
@@ -0,0 +1,272 @@
1
+ """
2
+ Main pipeline runner for temporal reasoning audio dataset generation.
3
+
4
+ This script orchestrates the generation of all task datasets.
5
+ """
6
+
7
+ import argparse
8
+ import sys
9
+ import yaml
10
+ from pathlib import Path
11
+ from typing import List, Optional
12
+
13
+ # Add project root to path
14
+ sys.path.append(str(Path(__file__).parent))
15
+
16
+ from utils import setup_logger, set_random_seed
17
+ from tasks.task_count import CountTaskGenerator
18
+ from tasks.task_duration import DurationTaskGenerator
19
+ from tasks.task_order import OrderTaskGenerator
20
+ from tasks.task_volume import VolumeTaskGenerator
21
+
22
+
23
+ def load_config(config_path: str) -> dict:
24
+ """Load configuration from YAML file."""
25
+ with open(config_path, 'r') as f:
26
+ config = yaml.safe_load(f)
27
+ return config
28
+
29
+
30
+ def run_count_task(config: dict, logger):
31
+ """Run the count task generation."""
32
+ if not config['tasks']['count']['enabled']:
33
+ logger.info("Count task is disabled, skipping...")
34
+ return
35
+
36
+ logger.info("=" * 80)
37
+ logger.info("STARTING COUNT TASK GENERATION")
38
+ logger.info("=" * 80)
39
+
40
+ generator = CountTaskGenerator(config, logger)
41
+ generator.dataset.reset_category_usage() # Reset counter for this task
42
+ generator.generate_dataset()
43
+
44
+ # Log category usage statistics
45
+ usage_stats = generator.dataset.get_category_usage_stats()
46
+ sorted_stats = sorted(usage_stats.items(), key=lambda x: x[1], reverse=True)
47
+ logger.info("Category usage statistics (as answers):")
48
+ logger.info(f" Min usage: {sorted_stats[-1][1]} (category: {sorted_stats[-1][0]})")
49
+ logger.info(f" Max usage: {sorted_stats[0][1]} (category: {sorted_stats[0][0]})")
50
+ logger.info(f" Mean usage: {sum(usage_stats.values()) / len(usage_stats):.2f}")
51
+
52
+ logger.info("Count task completed successfully!")
53
+
54
+
55
+ def run_duration_task(config: dict, logger):
56
+ """Run the duration task generation."""
57
+ if not config['tasks']['duration']['enabled']:
58
+ logger.info("Duration task is disabled, skipping...")
59
+ return
60
+
61
+ logger.info("=" * 80)
62
+ logger.info("STARTING DURATION TASK GENERATION")
63
+ logger.info("=" * 80)
64
+
65
+ generator = DurationTaskGenerator(config, logger)
66
+ generator.dataset.reset_category_usage() # Reset counter for this task
67
+ generator.generate_dataset()
68
+
69
+ # Log category usage statistics
70
+ usage_stats = generator.dataset.get_category_usage_stats()
71
+ sorted_stats = sorted(usage_stats.items(), key=lambda x: x[1], reverse=True)
72
+ logger.info("Category usage statistics (as longest/shortest answers):")
73
+ logger.info(f" Min usage: {sorted_stats[-1][1]} (category: {sorted_stats[-1][0]})")
74
+ logger.info(f" Max usage: {sorted_stats[0][1]} (category: {sorted_stats[0][0]})")
75
+ logger.info(f" Mean usage: {sum(usage_stats.values()) / len(usage_stats):.2f}")
76
+
77
+ logger.info("Duration task completed successfully!")
78
+
79
+
80
+ def run_order_task(config: dict, logger):
81
+ """Run the order task generation."""
82
+ if not config['tasks']['order']['enabled']:
83
+ logger.info("Order task is disabled, skipping...")
84
+ return
85
+
86
+ logger.info("=" * 80)
87
+ logger.info("STARTING ORDER TASK GENERATION")
88
+ logger.info("=" * 80)
89
+
90
+ generator = OrderTaskGenerator(config, logger)
91
+ generator.dataset.reset_category_usage() # Reset counter for this task
92
+ generator.generate_dataset()
93
+
94
+ # Log category usage statistics
95
+ usage_stats = generator.dataset.get_category_usage_stats()
96
+ sorted_stats = sorted(usage_stats.items(), key=lambda x: x[1], reverse=True)
97
+ logger.info("Category usage statistics (as first/last/after/before answers):")
98
+ logger.info(f" Min usage: {sorted_stats[-1][1]} (category: {sorted_stats[-1][0]})")
99
+ logger.info(f" Max usage: {sorted_stats[0][1]} (category: {sorted_stats[0][0]})")
100
+ logger.info(f" Mean usage: {sum(usage_stats.values()) / len(usage_stats):.2f}")
101
+
102
+ logger.info("Order task completed successfully!")
103
+
104
+
105
+ def run_volume_task(config: dict, logger):
106
+ """Run the volume task generation."""
107
+ if not config['tasks']['volume']['enabled']:
108
+ logger.info("Volume task is disabled, skipping...")
109
+ return
110
+
111
+ logger.info("=" * 80)
112
+ logger.info("STARTING VOLUME TASK GENERATION")
113
+ logger.info("=" * 80)
114
+
115
+ generator = VolumeTaskGenerator(config, logger)
116
+ generator.dataset.reset_category_usage() # Reset counter for this task
117
+ generator.generate_dataset()
118
+
119
+ # Log category usage statistics
120
+ usage_stats = generator.dataset.get_category_usage_stats()
121
+ sorted_stats = sorted(usage_stats.items(), key=lambda x: x[1], reverse=True)
122
+ logger.info("Category usage statistics (as loudest/softest answers):")
123
+ logger.info(f" Min usage: {sorted_stats[-1][1]} (category: {sorted_stats[-1][0]})")
124
+ logger.info(f" Max usage: {sorted_stats[0][1]} (category: {sorted_stats[0][0]})")
125
+ logger.info(f" Mean usage: {sum(usage_stats.values()) / len(usage_stats):.2f}")
126
+
127
+ logger.info("Volume task completed successfully!")
128
+
129
+
130
+ def run_pipeline(
131
+ config_path: str,
132
+ tasks: Optional[List[str]] = None,
133
+ output_path: Optional[str] = None
134
+ ):
135
+ """
136
+ Run the complete dataset generation pipeline.
137
+
138
+ Args:
139
+ config_path: Path to configuration YAML file
140
+ tasks: Optional list of specific tasks to run (default: all enabled tasks)
141
+ output_path: Optional custom output path (overrides config)
142
+ """
143
+ # Load configuration
144
+ config = load_config(config_path)
145
+
146
+ # Override output path if provided
147
+ if output_path:
148
+ config['output']['base_path'] = output_path
149
+
150
+ # Create output directory
151
+ output_base = Path(config['output']['base_path'])
152
+ output_base.mkdir(parents=True, exist_ok=True)
153
+
154
+ # Set random seed
155
+ set_random_seed(config['random_seed'])
156
+
157
+ # Setup main logger
158
+ logger = setup_logger(
159
+ 'pipeline',
160
+ log_file=str(output_base / config['logging']['log_file']),
161
+ level=config['logging']['level'],
162
+ console_output=config['logging']['console_output']
163
+ )
164
+
165
+ logger.info("=" * 80)
166
+ logger.info("TEMPORAL REASONING AUDIO DATASET GENERATION PIPELINE")
167
+ logger.info("=" * 80)
168
+ logger.info(f"Configuration: {config_path}")
169
+ logger.info(f"Output directory: {output_base}")
170
+ logger.info(f"Random seed: {config['random_seed']}")
171
+ logger.info(f"ESC-50 audio path: {config['esc50']['audio_path']}")
172
+ logger.info(f"ESC-50 metadata path: {config['esc50']['metadata_path']}")
173
+
174
+ # Determine which tasks to run
175
+ task_map = {
176
+ 'count': run_count_task,
177
+ 'duration': run_duration_task,
178
+ 'order': run_order_task,
179
+ 'volume': run_volume_task
180
+ }
181
+
182
+ if tasks:
183
+ tasks_to_run = {k: v for k, v in task_map.items() if k in tasks}
184
+ logger.info(f"Running specific tasks: {', '.join(tasks)}")
185
+ else:
186
+ tasks_to_run = task_map
187
+ logger.info("Running all enabled tasks")
188
+
189
+ # Run tasks
190
+ for task_name, task_func in tasks_to_run.items():
191
+ try:
192
+ task_func(config, logger)
193
+ except Exception as e:
194
+ logger.error(f"Error running {task_name} task: {e}", exc_info=True)
195
+ raise
196
+
197
+ logger.info("=" * 80)
198
+ logger.info("PIPELINE COMPLETED SUCCESSFULLY!")
199
+ logger.info("=" * 80)
200
+ logger.info(f"All outputs saved to: {output_base}")
201
+
202
+
203
+ def main():
204
+ """Main entry point with argument parsing."""
205
+ parser = argparse.ArgumentParser(
206
+ description="Temporal Reasoning Audio Dataset Generation Pipeline",
207
+ formatter_class=argparse.RawDescriptionHelpFormatter,
208
+ epilog="""
209
+ Examples:
210
+ # Run all tasks with default config
211
+ python main.py
212
+
213
+ # Run with custom config
214
+ python main.py --config my_config.yaml
215
+
216
+ # Run specific tasks only
217
+ python main.py --tasks count duration
218
+
219
+ # Use custom output directory
220
+ python main.py --output /path/to/output
221
+
222
+ # Combine options
223
+ python main.py --config custom.yaml --tasks count order --output ./my_dataset
224
+ """
225
+ )
226
+
227
+ parser.add_argument(
228
+ '--config', '-c',
229
+ type=str,
230
+ default='config.yaml',
231
+ help='Path to configuration YAML file (default: config.yaml)'
232
+ )
233
+
234
+ parser.add_argument(
235
+ '--tasks', '-t',
236
+ nargs='+',
237
+ choices=['count', 'duration', 'order', 'volume'],
238
+ help='Specific tasks to run (default: all enabled tasks)'
239
+ )
240
+
241
+ parser.add_argument(
242
+ '--output', '-o',
243
+ type=str,
244
+ help='Custom output directory (overrides config)'
245
+ )
246
+
247
+ args = parser.parse_args()
248
+
249
+ # Check if config file exists
250
+ config_path = Path(args.config)
251
+ if not config_path.exists():
252
+ # Try relative to script directory
253
+ script_dir = Path(__file__).parent
254
+ config_path = script_dir / args.config
255
+ if not config_path.exists():
256
+ print(f"Error: Config file not found: {args.config}")
257
+ sys.exit(1)
258
+
259
+ # Run pipeline
260
+ try:
261
+ run_pipeline(
262
+ config_path=str(config_path),
263
+ tasks=args.tasks,
264
+ output_path=args.output
265
+ )
266
+ except Exception as e:
267
+ print(f"Pipeline failed with error: {e}")
268
+ sys.exit(1)
269
+
270
+
271
+ if __name__ == '__main__':
272
+ main()
preprocess_esc50.py ADDED
@@ -0,0 +1,714 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ ESC-50 Preprocessing Script for Duration Task
4
+
5
+ This script processes all ESC-50 audio clips to:
6
+ 1. Apply amplitude-based filtering to detect actual sound regions
7
+ 2. Calculate effective duration (portion containing actual sound)
8
+ 3. Save trimmed audio files (with silence removed)
9
+ 4. Generate a CSV with all metadata including effective durations
10
+
11
+ Usage:
12
+ python preprocess_esc50.py --config config.yaml
13
+ python preprocess_esc50.py --config config.yaml --threshold-db -40 --min-sound-ms 50
14
+ """
15
+
16
+ import argparse
17
+ import os
18
+ import sys
19
+ from pathlib import Path
20
+ from typing import Dict, List, Optional, Tuple
21
+
22
+ import numpy as np
23
+ import pandas as pd
24
+ from pydub import AudioSegment
25
+ from tqdm import tqdm
26
+
27
+ # Add parent directory to path for imports
28
+ sys.path.insert(0, str(Path(__file__).parent))
29
+
30
+ from utils.logger import setup_logger
31
+
32
+ logger = setup_logger(__name__)
33
+
34
+
35
+ def get_amplitude_array(audio: AudioSegment) -> np.ndarray:
36
+ """
37
+ Convert AudioSegment to numpy array of amplitudes.
38
+
39
+ Args:
40
+ audio: Input audio segment
41
+
42
+ Returns:
43
+ Numpy array of amplitude values (normalized to -1 to 1)
44
+ """
45
+ samples = np.array(audio.get_array_of_samples())
46
+
47
+ # Handle stereo by averaging channels
48
+ if audio.channels == 2:
49
+ samples = samples.reshape((-1, 2)).mean(axis=1)
50
+
51
+ # Normalize to -1 to 1 range
52
+ max_val = float(2 ** (audio.sample_width * 8 - 1))
53
+ samples = samples / max_val
54
+
55
+ return samples
56
+
57
+
58
+ def compute_rms_envelope(samples: np.ndarray, frame_size_ms: int, hop_size_ms: int,
59
+ sample_rate: int) -> Tuple[np.ndarray, np.ndarray]:
60
+ """
61
+ Compute RMS envelope of audio signal.
62
+
63
+ Args:
64
+ samples: Audio samples as numpy array
65
+ frame_size_ms: Frame size in milliseconds
66
+ hop_size_ms: Hop size in milliseconds
67
+ sample_rate: Audio sample rate
68
+
69
+ Returns:
70
+ Tuple of (rms_values, time_stamps_ms)
71
+ """
72
+ frame_size = int(sample_rate * frame_size_ms / 1000)
73
+ hop_size = int(sample_rate * hop_size_ms / 1000)
74
+
75
+ rms_values = []
76
+ time_stamps = []
77
+
78
+ for i in range(0, len(samples) - frame_size + 1, hop_size):
79
+ frame = samples[i:i + frame_size]
80
+ rms = np.sqrt(np.mean(frame ** 2))
81
+ rms_values.append(rms)
82
+ time_stamps.append(i / sample_rate * 1000) # Convert to ms
83
+
84
+ return np.array(rms_values), np.array(time_stamps)
85
+
86
+
87
+ def rms_to_db(rms: np.ndarray, reference: float = 1.0) -> np.ndarray:
88
+ """
89
+ Convert RMS values to decibels.
90
+
91
+ Args:
92
+ rms: RMS values
93
+ reference: Reference value (default 1.0 for normalized audio)
94
+
95
+ Returns:
96
+ dB values
97
+ """
98
+ # Avoid log(0) by using a small epsilon
99
+ epsilon = 1e-10
100
+ return 20 * np.log10(np.maximum(rms, epsilon) / reference)
101
+
102
+
103
+ def detect_sound_regions(
104
+ audio: AudioSegment,
105
+ threshold_db: float = -40.0,
106
+ min_sound_duration_ms: int = 50,
107
+ frame_size_ms: int = 20,
108
+ hop_size_ms: int = 10,
109
+ merge_gap_ms: int = 100,
110
+ threshold_strategy: str = 'noise_floor',
111
+ noise_floor_percentile: float = 10.0,
112
+ noise_floor_delta_db: float = 15.0
113
+ ) -> List[Tuple[int, int]]:
114
+ """
115
+ Detect regions in audio that contain actual sound (above threshold).
116
+
117
+ Supports two threshold strategies:
118
+ - 'peak_relative': threshold = peak_db + threshold_db (old behavior)
119
+ - 'noise_floor': threshold = percentile(db_values, p) + delta_db (adaptive per-clip)
120
+
121
+ The 'noise_floor' strategy is recommended as it adapts to each clip's
122
+ actual background noise level rather than using a fixed offset from peak.
123
+
124
+ Args:
125
+ audio: Input audio segment
126
+ threshold_db: dB threshold below peak (used if strategy='peak_relative')
127
+ min_sound_duration_ms: Minimum duration of sound region to keep
128
+ frame_size_ms: Frame size for RMS computation
129
+ hop_size_ms: Hop size for RMS computation
130
+ merge_gap_ms: Merge regions separated by less than this gap
131
+ threshold_strategy: 'peak_relative' or 'noise_floor'
132
+ noise_floor_percentile: Percentile for noise floor estimation (default 10)
133
+ noise_floor_delta_db: dB above noise floor to set threshold (default 15)
134
+
135
+ Returns:
136
+ List of (start_ms, end_ms) tuples for sound regions
137
+ """
138
+ samples = get_amplitude_array(audio)
139
+ sample_rate = audio.frame_rate
140
+
141
+ # Compute RMS envelope
142
+ rms_values, time_stamps = compute_rms_envelope(
143
+ samples, frame_size_ms, hop_size_ms, sample_rate
144
+ )
145
+
146
+ if len(rms_values) == 0:
147
+ return []
148
+
149
+ # Convert to dB
150
+ db_values = rms_to_db(rms_values)
151
+
152
+ # Compute threshold based on strategy
153
+ peak_db = np.max(db_values)
154
+
155
+ if threshold_strategy == 'noise_floor':
156
+ # ADAPTIVE: Use noise floor (low percentile) + delta
157
+ # This adapts to each clip's actual background noise level
158
+ noise_floor_db = np.percentile(db_values, noise_floor_percentile)
159
+ absolute_threshold = noise_floor_db + noise_floor_delta_db
160
+
161
+ # Safeguard: don't exceed peak (would detect nothing)
162
+ # Leave at least 1 dB below peak
163
+ absolute_threshold = min(absolute_threshold, peak_db - 1.0)
164
+
165
+ logger.debug(
166
+ f"Noise-floor threshold: floor={noise_floor_db:.1f}dB (p{noise_floor_percentile}), "
167
+ f"delta={noise_floor_delta_db}dB, threshold={absolute_threshold:.1f}dB, peak={peak_db:.1f}dB"
168
+ )
169
+ else:
170
+ # OLD: peak-relative threshold
171
+ absolute_threshold = peak_db + threshold_db # threshold_db is negative
172
+ logger.debug(
173
+ f"Peak-relative threshold: peak={peak_db:.1f}dB, offset={threshold_db}dB, "
174
+ f"threshold={absolute_threshold:.1f}dB"
175
+ )
176
+
177
+ # Find frames above threshold
178
+ above_threshold = db_values > absolute_threshold
179
+
180
+ # Find contiguous regions
181
+ regions = []
182
+ in_region = False
183
+ region_start = 0
184
+
185
+ for is_above, time_ms in zip(above_threshold, time_stamps):
186
+ if is_above and not in_region:
187
+ # Start of new region
188
+ in_region = True
189
+ region_start = time_ms
190
+ elif not is_above and in_region:
191
+ # End of region
192
+ in_region = False
193
+ region_end = time_ms
194
+ if region_end - region_start >= min_sound_duration_ms:
195
+ regions.append((int(region_start), int(region_end)))
196
+
197
+ # Handle case where audio ends while still in a region
198
+ if in_region:
199
+ region_end = time_stamps[-1] + hop_size_ms
200
+ if region_end - region_start >= min_sound_duration_ms:
201
+ regions.append((int(region_start), int(region_end)))
202
+
203
+ # Merge regions that are close together
204
+ if len(regions) > 1:
205
+ merged_regions = [regions[0]]
206
+ for start, end in regions[1:]:
207
+ prev_start, prev_end = merged_regions[-1]
208
+ if start - prev_end <= merge_gap_ms:
209
+ # Merge with previous region
210
+ merged_regions[-1] = (prev_start, end)
211
+ else:
212
+ merged_regions.append((start, end))
213
+ regions = merged_regions
214
+
215
+ return regions
216
+
217
+
218
+ def get_sound_regions(
219
+ audio: AudioSegment,
220
+ threshold_db: float = -40.0,
221
+ min_sound_duration_ms: int = 50,
222
+ threshold_strategy: str = 'noise_floor',
223
+ noise_floor_percentile: float = 10.0,
224
+ noise_floor_delta_db: float = 15.0
225
+ ) -> List[Tuple[int, int]]:
226
+ """
227
+ Detect sound regions in audio using adaptive threshold.
228
+
229
+ Args:
230
+ audio: Input audio segment
231
+ threshold_db: dB threshold below peak (used if strategy='peak_relative')
232
+ min_sound_duration_ms: Minimum duration of sound region to keep
233
+ threshold_strategy: 'peak_relative' or 'noise_floor'
234
+ noise_floor_percentile: Percentile for noise floor estimation
235
+ noise_floor_delta_db: dB above noise floor to set threshold
236
+
237
+ Returns:
238
+ List of (start_ms, end_ms) tuples for sound regions
239
+ """
240
+ return detect_sound_regions(
241
+ audio,
242
+ threshold_db=threshold_db,
243
+ min_sound_duration_ms=min_sound_duration_ms,
244
+ threshold_strategy=threshold_strategy,
245
+ noise_floor_percentile=noise_floor_percentile,
246
+ noise_floor_delta_db=noise_floor_delta_db
247
+ )
248
+
249
+
250
+ def extract_sound_with_edges_trimmed(
251
+ audio: AudioSegment,
252
+ regions: List[Tuple[int, int]],
253
+ min_silence_to_trim_ms: int = 100,
254
+ buffer_ratio: float = 0.1
255
+ ) -> AudioSegment:
256
+ """
257
+ Extract audio with ONLY leftmost and rightmost silence removed IF present.
258
+
259
+ Trimming is ADAPTIVE:
260
+ - Only trims if edge silence >= min_silence_to_trim_ms
261
+ - Keeps a small percentage (buffer_ratio) of the silence to preserve transients
262
+ - Buffer size adapts to actual silence duration (not fixed)
263
+
264
+ Preserves all internal structure and silence between sounds.
265
+ Perfect for periodic sounds (clock ticks, footsteps, typing).
266
+
267
+ Args:
268
+ audio: Input audio segment
269
+ regions: List of (start_ms, end_ms) tuples for sound regions
270
+ min_silence_to_trim_ms: Only trim if edge silence is at least this long (default 100ms)
271
+ buffer_ratio: Keep this fraction of the silence as buffer (default 0.1 = 10%)
272
+ Example: 500ms silence -> keep max(200ms, 10% of 500ms) = 200ms buffer
273
+
274
+ Returns:
275
+ Audio segment with edges trimmed (or original if no significant silence)
276
+ """
277
+ if not regions:
278
+ # No sound detected - return original
279
+ return audio
280
+
281
+ # Find the overall sound boundaries (first sound start, last sound end)
282
+ first_sound_start_ms = regions[0][0]
283
+ last_sound_end_ms = regions[-1][1]
284
+ audio_duration_ms = len(audio)
285
+
286
+ # Calculate actual silence durations at edges
287
+ leading_silence_ms = first_sound_start_ms
288
+ trailing_silence_ms = audio_duration_ms - last_sound_end_ms
289
+
290
+ # Adaptive trimming: only trim if there's significant silence
291
+ # Keep a small percentage as buffer to avoid cutting transients
292
+ if leading_silence_ms >= min_silence_to_trim_ms:
293
+ buffer_ms = max(200, int(leading_silence_ms * buffer_ratio)) # At least 200ms buffer
294
+ trim_start_ms = max(0, first_sound_start_ms - buffer_ms)
295
+ else:
296
+ # Not enough silence to trim - keep from start
297
+ trim_start_ms = 0
298
+
299
+ if trailing_silence_ms >= min_silence_to_trim_ms:
300
+ buffer_ms = max(200, int(trailing_silence_ms * buffer_ratio))
301
+ trim_end_ms = min(audio_duration_ms, last_sound_end_ms + buffer_ms)
302
+ else:
303
+ # Not enough silence to trim - keep to end
304
+ trim_end_ms = audio_duration_ms
305
+
306
+ # Extract the edge-trimmed portion (internal structure preserved)
307
+ trimmed_audio = audio[trim_start_ms:trim_end_ms]
308
+
309
+ logger.debug(
310
+ f"Edge trim: {audio_duration_ms}ms -> {len(trimmed_audio)}ms "
311
+ f"(leading_silence={leading_silence_ms}ms, trailing_silence={trailing_silence_ms}ms, "
312
+ f"trim_start={trim_start_ms}ms, trim_end={trim_end_ms}ms)"
313
+ )
314
+
315
+ return trimmed_audio
316
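+ # Worked example of the adaptive trim (numbers hypothetical): a 5000ms clip
+ # whose first region starts at 800ms and last region ends at 4600ms:
+ #   leading_silence_ms  = 800 -> buffer = max(200, 80) = 200 -> trim_start = 600
+ #   trailing_silence_ms = 400 -> buffer = max(200, 40) = 200 -> trim_end  = 4800
+ #   result: audio[600:4800]; all internal gaps are preserved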
+
317
+
318
+ def extract_all_sound_regions(
319
+ audio: AudioSegment,
320
+ regions: List[Tuple[int, int]],
321
+ crossfade_ms: int = 10,
322
+ padding_ms: int = 20
323
+ ) -> AudioSegment:
324
+ """
325
+ Extract ALL sound portions and join them, removing ALL silence.
326
+
327
+ WARNING: This destroys natural periodicity! Use extract_sound_with_edges_trimmed() instead
328
+ for most use cases. This function is kept for backward compatibility.
329
+
330
+ Args:
331
+ audio: Input audio segment
332
+ regions: List of (start_ms, end_ms) tuples for sound regions
333
+ crossfade_ms: Crossfade duration when joining regions
334
+ padding_ms: Padding around each region to avoid cutting transients
335
+
336
+ Returns:
337
+ Audio segment containing only sound portions (internal silence removed)
338
+ """
339
+ if not regions:
340
+ return audio
341
+
342
+ # Extract each region
343
+ extracted_parts = []
344
+ for start_ms, end_ms in regions:
345
+ # Add padding to avoid cutting off transients
346
+ padded_start = max(0, start_ms - padding_ms)
347
+ padded_end = min(len(audio), end_ms + padding_ms)
348
+ part = audio[padded_start:padded_end]
349
+ extracted_parts.append(part)
350
+
351
+ # Concatenate with crossfade
352
+ if len(extracted_parts) == 1:
353
+ return extracted_parts[0]
354
+
355
+ result = extracted_parts[0]
356
+ for part in extracted_parts[1:]:
357
+ if len(result) > crossfade_ms and len(part) > crossfade_ms:
358
+ result = result.append(part, crossfade=crossfade_ms)
359
+ else:
360
+ result = result + part
361
+
362
+ return result
363
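+ # Illustrative (values hypothetical): regions [(0, 1000), (3000, 4000)] on a
+ # 5000ms clip yield two padded parts (1020ms and 1040ms) joined with a 10ms
+ # crossfade -> ~2050ms, with the 2s internal gap removed (hence the warning).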
+
364
+
365
+ def process_esc50_dataset(
366
+ audio_dir: str,
367
+ metadata_path: str,
368
+ output_dir: str,
369
+ threshold_db: float = -40.0,
370
+ min_sound_duration_ms: int = 50,
371
+ save_trimmed_audio: bool = True,
372
+ threshold_strategy: str = 'noise_floor',
373
+ noise_floor_percentile: float = 10.0,
374
+ noise_floor_delta_db: float = 15.0
375
+ ) -> pd.DataFrame:
376
+ """
377
+ Process entire ESC-50 dataset and compute effective durations.
378
+
379
+ Uses ADAPTIVE EDGE-ONLY trimming to preserve natural periodicity of sounds.
380
+ Only leading and trailing silence is removed IF significant (>=100ms).
381
+ Trimming is adaptive: keeps a small percentage of silence as buffer for transients.
382
+ All internal structure is preserved.
383
+
384
+ Supports two threshold strategies for adaptive per-clip thresholding:
385
+ - 'peak_relative': threshold = peak_db + threshold_db (fixed offset from peak)
386
+ - 'noise_floor': threshold = percentile(db, p) + delta (adapts to noise floor)
387
+
388
+ Args:
389
+ audio_dir: Path to ESC-50 audio directory
390
+ metadata_path: Path to ESC-50 metadata CSV
391
+ output_dir: Output directory for processed files
392
+ threshold_db: dB threshold for silence detection (peak_relative mode)
393
+ min_sound_duration_ms: Minimum sound duration to keep
394
+ save_trimmed_audio: Whether to save trimmed audio files
395
+ threshold_strategy: 'peak_relative' or 'noise_floor' (recommended)
396
+ noise_floor_percentile: Percentile for noise floor estimation (default 10)
397
+ noise_floor_delta_db: dB above noise floor to set threshold (default 15)
398
+
399
+ Returns:
400
+ DataFrame with processed metadata
401
+ """
402
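+ # Threshold semantics, restating the two strategies above (the frame-level
+ # details live in detect_sound_regions in utils/audio_utils.py):
+ #   peak_relative: cutoff = peak_db + threshold_db            # threshold_db is negative
+ #   noise_floor:   cutoff = percentile(db, 10.0) + 15.0       # defaults shown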
+ # Load original metadata
403
+ original_metadata = pd.read_csv(metadata_path)
404
+ logger.info(f"Loaded metadata for {len(original_metadata)} clips")
405
+
406
+ # Create output directories
407
+ output_path = Path(output_dir)
408
+ output_path.mkdir(parents=True, exist_ok=True)
409
+
410
+ if save_trimmed_audio:
411
+ trimmed_audio_dir = output_path / "trimmed_audio"
412
+ trimmed_audio_dir.mkdir(exist_ok=True)
413
+
414
+ # Process each audio file
415
+ results = []
416
+
417
+ for _, row in tqdm(original_metadata.iterrows(), total=len(original_metadata),
418
+ desc="Processing ESC-50 clips"):
419
+ filename = row['filename']
420
+ category = row['category']
421
+ audio_path = Path(audio_dir) / filename
422
+
423
+ try:
424
+ # Load audio
425
+ audio = AudioSegment.from_file(str(audio_path), format="wav")
426
+ raw_duration_s = len(audio) / 1000.0
427
+
428
+ # Detect sound regions (using adaptive threshold)
429
+ regions = get_sound_regions(
430
+ audio,
431
+ threshold_db=threshold_db,
432
+ min_sound_duration_ms=min_sound_duration_ms,
433
+ threshold_strategy=threshold_strategy,
434
+ noise_floor_percentile=noise_floor_percentile,
435
+ noise_floor_delta_db=noise_floor_delta_db
436
+ )
437
+
438
+ # Trim edges only (leftmost and rightmost silence)
439
+ # Adaptive trimming: only trims if silence >= 100ms, keeps 10% as buffer
440
+ trimmed_audio = extract_sound_with_edges_trimmed(audio, regions)
441
+ final_duration_s = len(trimmed_audio) / 1000.0
442
+
443
+ # Calculate peak amplitude and RMS from trimmed audio
444
+ samples = get_amplitude_array(trimmed_audio)
445
+ peak_amplitude = np.max(np.abs(samples))
446
+ peak_amplitude_db = 20 * np.log10(peak_amplitude + 1e-10)
447
+ rms = np.sqrt(np.mean(samples ** 2))
448
+ avg_rms_db = 20 * np.log10(rms + 1e-10)
449
+
450
+ # Calculate effective duration (sum of sound regions)
451
+ effective_duration_s = sum(end - start for start, end in regions) / 1000.0 if regions else final_duration_s
452
+
453
+ # Save trimmed audio
454
+ trimmed_filename = None
455
+ if save_trimmed_audio:
456
+ trimmed_filename = filename
457
+ trimmed_path = trimmed_audio_dir / trimmed_filename
458
+ trimmed_audio.export(str(trimmed_path), format="wav")
459
+
460
+ # Store results
461
+ results.append({
462
+ 'filename': filename,
463
+ 'category': category,
464
+ 'fold': row['fold'],
465
+ 'target': row['target'],
466
+ 'esc10': row['esc10'],
467
+ 'raw_duration_s': round(raw_duration_s, 4),
468
+ 'final_duration_s': round(final_duration_s, 4),
469
+ 'effective_duration_s': round(effective_duration_s, 4),
470
+ 'num_sound_regions': len(regions),
471
+ 'peak_amplitude_db': round(peak_amplitude_db, 2),
472
+ 'avg_rms_db': round(avg_rms_db, 2),
473
+ 'trimmed_filename': trimmed_filename,  # already None when not saving
474
+ 'threshold_strategy': threshold_strategy,
475
+ 'threshold_db_used': threshold_db if threshold_strategy == 'peak_relative' else None,
476
+ 'noise_floor_percentile': noise_floor_percentile if threshold_strategy == 'noise_floor' else None,
477
+ 'noise_floor_delta_db': noise_floor_delta_db if threshold_strategy == 'noise_floor' else None,
478
+ 'min_sound_duration_ms_used': min_sound_duration_ms
479
+ })
480
+
481
+ except Exception as e:
482
+ logger.error(f"Error processing {filename}: {e}")
483
+ results.append({
484
+ 'filename': filename,
485
+ 'category': category,
486
+ 'fold': row['fold'],
487
+ 'target': row['target'],
488
+ 'esc10': row['esc10'],
489
+ 'raw_duration_s': None,
490
+ 'final_duration_s': None,
491
+ 'effective_duration_s': None,
492
+ 'num_sound_regions': 0,
493
+ 'peak_amplitude_db': None,
494
+ 'avg_rms_db': None,
495
+ 'trimmed_filename': None,
496
+ 'threshold_strategy': threshold_strategy,
497
+ 'threshold_db_used': threshold_db if threshold_strategy == 'peak_relative' else None,
498
+ 'noise_floor_percentile': noise_floor_percentile if threshold_strategy == 'noise_floor' else None,
499
+ 'noise_floor_delta_db': noise_floor_delta_db if threshold_strategy == 'noise_floor' else None,
500
+ 'min_sound_duration_ms_used': min_sound_duration_ms,
501
+ 'error': str(e)
502
+ })
503
+
504
+ # Create DataFrame
505
+ results_df = pd.DataFrame(results)
506
+
507
+ # Save CSV
508
+ csv_path = output_path / "effective_durations.csv"
509
+ results_df.to_csv(csv_path, index=False)
510
+ logger.info(f"Saved effective durations to {csv_path}")
511
+
512
+ # Print summary statistics
513
+ print_summary_statistics(results_df)
514
+
515
+ return results_df
516
+
517
+
518
+ def print_summary_statistics(df: pd.DataFrame):
519
+ """Print summary statistics of the processed dataset."""
520
+ print("\n" + "=" * 60)
521
+ print("ESC-50 Preprocessing Summary")
522
+ print("=" * 60)
523
+
524
+ # Filter out errors
525
+ valid_df = df[df['effective_duration_s'].notna()]
526
+
527
+ print(f"\nTotal clips processed: {len(df)}")
528
+ print(f"Successfully processed: {len(valid_df)}")
529
+ print(f"Errors: {len(df) - len(valid_df)}")
530
+
531
+ print(f"\nRaw duration statistics:")
532
+ print(f" Mean: {valid_df['raw_duration_s'].mean():.3f}s")
533
+ print(f" Std: {valid_df['raw_duration_s'].std():.3f}s")
534
+ print(f" Min: {valid_df['raw_duration_s'].min():.3f}s")
535
+ print(f" Max: {valid_df['raw_duration_s'].max():.3f}s")
536
+
537
+ print(f"\nFinal duration statistics (edges trimmed, internal structure preserved):")
538
+ print(f" Mean: {valid_df['final_duration_s'].mean():.3f}s")
539
+ print(f" Std: {valid_df['final_duration_s'].std():.3f}s")
540
+ print(f" Min: {valid_df['final_duration_s'].min():.3f}s")
541
+ print(f" Max: {valid_df['final_duration_s'].max():.3f}s")
542
+
543
+ print(f"\nEffective duration statistics (sum of sound regions only):")
544
+ print(f" Mean: {valid_df['effective_duration_s'].mean():.3f}s")
545
+ print(f" Std: {valid_df['effective_duration_s'].std():.3f}s")
546
+ print(f" Min: {valid_df['effective_duration_s'].min():.3f}s")
547
+ print(f" Max: {valid_df['effective_duration_s'].max():.3f}s")
548
+
549
+ # Compare effective vs final
550
+ print(f"\nComparison (final includes internal silences):")
551
+ print(f" Avg effective: {valid_df['effective_duration_s'].mean():.3f}s")
552
+ print(f" Avg final: {valid_df['final_duration_s'].mean():.3f}s")
553
+ print(f" Difference: {valid_df['final_duration_s'].mean() - valid_df['effective_duration_s'].mean():.3f}s (internal silences)")
554
+
555
+ # Duration reduction
556
+ reduction = (1 - valid_df['final_duration_s'].mean() / valid_df['raw_duration_s'].mean()) * 100
557
+ print(f"\nAverage edge trimming reduction: {reduction:.1f}%")
558
+
559
+ # Per-category statistics
560
+ print("\nEffective duration by category (top 10 longest):")
561
+ category_stats = valid_df.groupby('category')['effective_duration_s'].agg(['mean', 'std', 'min', 'max'])
562
+ category_stats = category_stats.sort_values('mean', ascending=False)
563
+ print(category_stats.head(10).to_string())
564
+
565
+ print("\nEffective duration by category (top 10 shortest):")
566
+ print(category_stats.tail(10).to_string())
567
+
568
+ print("\n" + "=" * 60)
569
+
570
+
571
+ def load_config(config_path: str) -> dict:
572
+ """Load configuration from YAML file."""
573
+ import yaml
574
+ with open(config_path, 'r') as f:
575
+ return yaml.safe_load(f)
576
+
577
+
578
+ def main():
579
+ parser = argparse.ArgumentParser(
580
+ description="Preprocess ESC-50 dataset for duration task"
581
+ )
582
+ parser.add_argument(
583
+ '--config', '-c',
584
+ type=str,
585
+ default='config.yaml',
586
+ help='Path to configuration file'
587
+ )
588
+ parser.add_argument(
589
+ '--threshold-db',
590
+ type=float,
591
+ default=None,
592
+ help='dB threshold below peak for silence detection (default: -40)'
593
+ )
594
+ parser.add_argument(
595
+ '--min-sound-ms',
596
+ type=int,
597
+ default=None,
598
+ help='Minimum sound duration in ms to keep (default: 50)'
599
+ )
600
+ parser.add_argument(
601
+ '--output-dir',
602
+ type=str,
603
+ default=None,
604
+ help='Output directory (default: from config or ESC-50_preprocessed)'
605
+ )
606
+ parser.add_argument(
607
+ '--no-trimmed-audio',
608
+ action='store_true',
609
+ help='Do not save trimmed audio files (only save CSV)'
610
+ )
611
+ parser.add_argument(
612
+ '--threshold-strategy',
613
+ type=str,
614
+ choices=['peak_relative', 'noise_floor'],
615
+ default=None,
616
+ help='Threshold strategy: peak_relative (old) or noise_floor (adaptive, recommended)'
617
+ )
618
+ parser.add_argument(
619
+ '--noise-floor-percentile',
620
+ type=float,
621
+ default=None,
622
+ help='Percentile for noise floor estimation (default: 10)'
623
+ )
624
+ parser.add_argument(
625
+ '--noise-floor-delta-db',
626
+ type=float,
627
+ default=None,
628
+ help='dB above noise floor to set threshold (default: 15)'
629
+ )
630
+
631
+ args = parser.parse_args()
632
+
633
+ # Load config
634
+ config = load_config(args.config)
635
+
636
+ # Get ESC-50 paths from config
637
+ esc50_config = config.get('esc50', {})
638
+ audio_dir = esc50_config.get('audio_path', '/home/debarpanb1/ESC-50_github/audio')
639
+ metadata_path = esc50_config.get('metadata_path', '/home/debarpanb1/ESC-50_github/meta/esc50.csv')
640
+
641
+ # Get duration task config for preprocessing parameters
642
+ duration_config = config.get('tasks', {}).get('duration', {})
643
+
644
+ # Determine threshold and min sound duration
645
+ threshold_db = args.threshold_db
646
+ if threshold_db is None:
647
+ threshold_db = duration_config.get('amplitude_threshold_db', -40.0)
648
+
649
+ min_sound_ms = args.min_sound_ms
650
+ if min_sound_ms is None:
651
+ min_sound_ms = duration_config.get('min_sound_duration_ms', 50)
652
+
653
+ # Determine output directory
654
+ output_dir = args.output_dir
655
+ if output_dir is None:
656
+ output_dir = duration_config.get(
657
+ 'preprocessed_data_path',
658
+ '/home/debarpanb1/TREA_2.0/ESC-50_preprocessed'
659
+ )
660
+
661
+ # Determine threshold strategy (noise_floor is recommended/default)
662
+ threshold_strategy = args.threshold_strategy
663
+ if threshold_strategy is None:
664
+ threshold_strategy = duration_config.get('threshold_strategy', 'noise_floor')
665
+
666
+ # Determine noise floor percentile
667
+ noise_floor_percentile = args.noise_floor_percentile
668
+ if noise_floor_percentile is None:
669
+ noise_floor_percentile = duration_config.get('noise_floor_percentile', 10.0)
670
+
671
+ # Determine noise floor delta dB
672
+ noise_floor_delta_db = args.noise_floor_delta_db
673
+ if noise_floor_delta_db is None:
674
+ noise_floor_delta_db = duration_config.get('noise_floor_delta_db', 15.0)
675
+
676
+ # Log configuration
677
+ logger.info("=" * 60)
678
+ logger.info("ESC-50 Preprocessing Configuration")
679
+ logger.info("=" * 60)
680
+ logger.info(f"Audio directory: {audio_dir}")
681
+ logger.info(f"Metadata path: {metadata_path}")
682
+ logger.info(f"Output directory: {output_dir}")
683
+ logger.info(f"Threshold strategy: {threshold_strategy}")
684
+ if threshold_strategy == 'peak_relative':
685
+ logger.info(f" Peak-relative threshold dB: {threshold_db}")
686
+ else:
687
+ logger.info(f" Noise floor percentile: {noise_floor_percentile}")
688
+ logger.info(f" Noise floor delta dB: {noise_floor_delta_db}")
689
+ logger.info(f"Min sound duration (ms): {min_sound_ms}")
690
+ logger.info(f"Adaptive edge trimming: only if silence >= 100ms, keep 10% buffer")
691
+ logger.info(f"Save trimmed audio: {not args.no_trimmed_audio}")
692
+ logger.info("=" * 60)
693
+
694
+ # Process dataset
695
+ results_df = process_esc50_dataset(
696
+ audio_dir=audio_dir,
697
+ metadata_path=metadata_path,
698
+ output_dir=output_dir,
699
+ threshold_db=threshold_db,
700
+ min_sound_duration_ms=min_sound_ms,
701
+ save_trimmed_audio=not args.no_trimmed_audio,
702
+ threshold_strategy=threshold_strategy,
703
+ noise_floor_percentile=noise_floor_percentile,
704
+ noise_floor_delta_db=noise_floor_delta_db
705
+ )
706
+
707
+ logger.info(f"\nPreprocessing complete!")
708
+ logger.info(f"Results saved to: {output_dir}")
709
+
710
+ return results_df
711
+
712
+
713
+ if __name__ == "__main__":
714
+ main()
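
Illustrative programmatic use of the preprocessor (a sketch; the paths are
hypothetical and would normally come from config.yaml):

    from preprocess_esc50 import process_esc50_dataset

    df = process_esc50_dataset(
        audio_dir="ESC-50/audio",
        metadata_path="ESC-50/meta/esc50.csv",
        output_dir="ESC-50_preprocessed",
        threshold_strategy="noise_floor",   # adaptive per-clip threshold
    )
    print(df[["filename", "effective_duration_s"]].head())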
requirements.txt ADDED
@@ -0,0 +1,6 @@
1
+ pyyaml
2
+ pandas
3
+ pydub
4
+ numpy
5
+ pyloudnorm
6
+
run_llm_answers_all.sh ADDED
@@ -0,0 +1,28 @@
1
+ #!/usr/bin/env bash
2
+ # Run llm_answer_generator.py across dataset folders and tasks
3
+ # Processes the open_text CSVs for tasks: count, duration, order, volume
4
+
5
+ set -euo pipefail
6
+ export CUDA_VISIBLE_DEVICES=7
7
+ PY_SCRIPT="$(dirname "$0")/llm_answer_generator.py"
8
+ BASE_DIR="$(dirname "$0")"
9
+
10
+ DATA_SPLITS=(train validation test_large test_ood)
11
+ TASKS=(count duration order volume)
12
+
13
+ echo "Running LLM answer generation script across splits: ${DATA_SPLITS[*]} and tasks: ${TASKS[*]}"
14
+
15
+ for split in "${DATA_SPLITS[@]}"; do
16
+ for task in "${TASKS[@]}"; do
17
+ # open_text file
18
+ ot_csv="${BASE_DIR}/dataset_v2/${split}/${task}/${task}_open_text.csv"
19
+ if [ -f "${ot_csv}" ]; then
20
+ echo "[OPEN_TEXT] Processing ${ot_csv}"
21
+ python "${PY_SCRIPT}" --input "${ot_csv}" --mode open_text --task "${task}"
22
+ else
23
+ echo "[OPEN_TEXT] Not found: ${ot_csv}"
24
+ fi
25
+ done
26
+ done
27
+
28
+ echo "All tasks processed."
run_pipeline.sh ADDED
@@ -0,0 +1,166 @@
1
+ #!/bin/bash
2
+
3
+ ################################################################################
4
+ # Temporal Reasoning Audio Dataset Generation Pipeline
5
+ #
6
+ # This script orchestrates the entire dataset creation process for all tasks.
7
+ ################################################################################
8
+
9
+ set -e # Exit on error
10
+
11
+ # Default configuration
12
+ CONFIG_FILE="config.yaml"
13
+ OUTPUT_DIR=""
14
+ TASKS=""
15
+ PYTHON_CMD="python"
16
+
17
+ # Colors for output
18
+ RED='\033[0;31m'
19
+ GREEN='\033[0;32m'
20
+ YELLOW='\033[1;33m'
21
+ BLUE='\033[0;34m'
22
+ NC='\033[0m' # No Color
23
+
24
+ # Function to print colored messages
25
+ print_info() {
26
+ echo -e "${BLUE}[INFO]${NC} $1"
27
+ }
28
+
29
+ print_success() {
30
+ echo -e "${GREEN}[SUCCESS]${NC} $1"
31
+ }
32
+
33
+ print_warning() {
34
+ echo -e "${YELLOW}[WARNING]${NC} $1"
35
+ }
36
+
37
+ print_error() {
38
+ echo -e "${RED}[ERROR]${NC} $1"
39
+ }
40
+
41
+ # Function to print usage
42
+ usage() {
43
+ cat << EOF
44
+ Usage: $0 [OPTIONS]
45
+
46
+ Temporal Reasoning Audio Dataset Generation Pipeline
47
+
48
+ OPTIONS:
49
+ -c, --config FILE Configuration file (default: config.yaml)
50
+ -o, --output DIR Output directory (overrides config)
51
+ -t, --tasks TASKS Specific tasks to run: count,duration,order,volume
52
+ (default: all enabled tasks)
53
+ -p, --python CMD Python command to use (default: python)
54
+ -h, --help Display this help message
55
+
56
+ EXAMPLES:
57
+ # Run all tasks with default config
58
+ $0
59
+
60
+ # Run with custom config
61
+ $0 --config my_config.yaml
62
+
63
+ # Run specific tasks only
64
+ $0 --tasks count,duration
65
+
66
+ # Use custom output directory
67
+ $0 --output /path/to/output
68
+
69
+ # Combine options
70
+ $0 --config custom.yaml --tasks count,order --output ./my_dataset
71
+
72
+ EOF
73
+ }
74
+
75
+ # Parse command line arguments
76
+ while [[ $# -gt 0 ]]; do
77
+ case $1 in
78
+ -c|--config)
79
+ CONFIG_FILE="$2"
80
+ shift 2
81
+ ;;
82
+ -o|--output)
83
+ OUTPUT_DIR="$2"
84
+ shift 2
85
+ ;;
86
+ -t|--tasks)
87
+ TASKS="$2"
88
+ shift 2
89
+ ;;
90
+ -p|--python)
91
+ PYTHON_CMD="$2"
92
+ shift 2
93
+ ;;
94
+ -h|--help)
95
+ usage
96
+ exit 0
97
+ ;;
98
+ *)
99
+ print_error "Unknown option: $1"
100
+ usage
101
+ exit 1
102
+ ;;
103
+ esac
104
+ done
105
+
106
+ # Get script directory
107
+ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
108
+
109
+ # Check if config file exists
110
+ if [ ! -f "$SCRIPT_DIR/$CONFIG_FILE" ]; then
111
+ print_error "Config file not found: $CONFIG_FILE"
112
+ exit 1
113
+ fi
114
+
115
+ # Print header
116
+ echo ""
117
+ echo "================================================================================"
118
+ echo " TEMPORAL REASONING AUDIO DATASET GENERATION PIPELINE"
119
+ echo "================================================================================"
120
+ echo ""
121
+ print_info "Configuration: $CONFIG_FILE"
122
+ print_info "Python command: $PYTHON_CMD"
123
+ [ -n "$OUTPUT_DIR" ] && print_info "Output directory: $OUTPUT_DIR"
124
+ [ -n "$TASKS" ] && print_info "Tasks to run: $TASKS"
125
+ echo ""
126
+
127
+ # Check Python dependencies
128
+ print_info "Checking Python dependencies..."
129
+ # Guard with `if !` so the error message is reachable under `set -e`
130
+ if ! $PYTHON_CMD -c "import yaml, pandas, pydub" 2>/dev/null; then
131
+ print_error "Missing required Python packages. Please install:"
132
+ echo " pip install pyyaml pandas pydub"
133
+ exit 1
134
+ fi
135
+ print_success "Dependencies OK"
136
+ echo ""
137
+
138
+ # Build Python command arguments
139
+ PYTHON_ARGS="$SCRIPT_DIR/main.py --config $SCRIPT_DIR/$CONFIG_FILE"
140
+ [ -n "$OUTPUT_DIR" ] && PYTHON_ARGS="$PYTHON_ARGS --output $OUTPUT_DIR"
141
+ if [ -n "$TASKS" ]; then
142
+ # Convert comma-separated to space-separated for Python argparse
143
+ TASKS_SPACE=$(echo $TASKS | tr ',' ' ')
144
+ PYTHON_ARGS="$PYTHON_ARGS --tasks $TASKS_SPACE"
145
+ fi
146
+
147
+ # Run the pipeline
148
+ print_info "Starting pipeline..."
149
+ echo ""
150
+
151
+ # Run inside the `if` itself so the failure branch survives `set -e`
152
+
153
+ if $PYTHON_CMD $PYTHON_ARGS; then
154
+ echo ""
155
+ echo "================================================================================"
156
+ print_success "PIPELINE COMPLETED SUCCESSFULLY!"
157
+ echo "================================================================================"
158
+ echo ""
159
+ else
160
+ echo ""
161
+ echo "================================================================================"
162
+ print_error "PIPELINE FAILED!"
163
+ echo "================================================================================"
164
+ echo ""
165
+ exit 1
166
+ fi
synthetic_silences/silent_1.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed8ddf138c2c59409bb4f1dbbf3fc910b486752b0c389dbb5dac6a4e68b8cbe5
3
+ size 263052
synthetic_silences/silent_10.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35fab767d2262eb552485542c6e593a5d84b7080862c577b23c11385176c7767
3
+ size 274840
synthetic_silences/silent_11.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b01397619b480a22261daa7b018b59b5fd1baf1e3d4ed81161908def25112f17
3
+ size 324418
synthetic_silences/silent_12.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2da9f4814fd0c6d50aa68696079c8d0ee880ed37d583e88a20481fd88c54e612
3
+ size 310108
synthetic_silences/silent_13.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5999933e975cd5846ac152bf888a954ea9243fa2218429998d96ceffac54a7e0
3
+ size 121474
synthetic_silences/silent_14.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42c8b935bd521534635cc4fea040023dbf420084b51d1e3529953d5d1593df48
3
+ size 209182
synthetic_silences/silent_15.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00529829944fd650a368d6fe65e25a7f3d25d8d4ba932712b35dfa5608380c3e
3
+ size 160682
synthetic_silences/silent_16.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:469eb34930878ba69a3994da5c2160314ce0c8bf0157d83f4ad349052a0c197b
3
+ size 112534
synthetic_silences/silent_17.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d788262618a55e51d12b0c2220ced172c0edf9072569ab010d48adc01607215
3
+ size 165986
synthetic_silences/silent_18.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83b6ef068680eacd83ac3d0b2f282fb37e2f4f018b03e89ab9a129aeac27a054
3
+ size 257330
synthetic_silences/silent_19.wav ADDED
Binary file (96.9 kB). View file
 
synthetic_silences/silent_2.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01bbeb6e0c14200b30be0eb57484450ba5807954333fede2e4c59d32a7042eaf
3
+ size 310850
synthetic_silences/silent_20.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cda1e6a66b8cca7fc408f90cb6b8e8c13294fc33e8735a23dd72f1d36f9a991b
3
+ size 140232
synthetic_silences/silent_3.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d49fadd497b9af43be5afbb08070a6317000f15edc8924bf3c11b3fcbb140616
3
+ size 227846
synthetic_silences/silent_4.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:148c743ea43d3528a53395f579d4d337512de9d1fb3c5d5b66e55f3a5e9c4d0c
3
+ size 337068
synthetic_silences/silent_5.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51976ce15c0272f14125acaa5529a88d6f085ce153ef64bdc662586e97cb5678
3
+ size 205426
synthetic_silences/silent_6.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dda249ab269984ae15d0a78b582455c053b1cddafb78c792cafbcbf3f682a087
3
+ size 329056
synthetic_silences/silent_7.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27156909b1191624cff0b0478477f8c40e47581bbb0be24a84e9113bf88f36a1
3
+ size 146876
synthetic_silences/silent_8.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:815a88cc01def086ca4dc23c41359eea297ec39c179b114e6e608d27bd2d9a39
3
+ size 216452
synthetic_silences/silent_9.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f88a22998b27a0e18e1801aa63ef4b83315c243762478c9cf149db4338ebafdb
3
+ size 307884
tasks/__pycache__/task_count.cpython-312.pyc ADDED
Binary file (19.7 kB). View file
 
tasks/__pycache__/task_duration.cpython-312.pyc ADDED
Binary file (30.9 kB). View file
 
tasks/__pycache__/task_order.cpython-312.pyc ADDED
Binary file (23.7 kB). View file
 
tasks/__pycache__/task_volume.cpython-312.pyc ADDED
Binary file (27.7 kB). View file
 
tasks/task_count.py ADDED
@@ -0,0 +1,472 @@
1
+ """
2
+ Task 1: Count - Generate counting questions
3
+
4
+ This task joins multiple audio sources and asks questions about counting
5
+ the number of unique sound sources in the audio.
6
+ """
7
+
8
+ import csv
9
+ import random
10
+ from pathlib import Path
11
+ from typing import Dict, List
12
+
13
+ import sys
14
+ sys.path.append(str(Path(__file__).parent.parent))
15
+
16
+ from utils import (
17
+ AudioProcessor, ESC50Dataset, QuestionGenerator, LLMQuestionGenerator,
18
+ setup_logger, set_random_seed, generate_sample_durations_for_task,
19
+ generate_single_clip_duration, build_count_task_audio,
20
+ get_max_clip_num_to_be_joined
21
+ )
22
+
23
+
24
+ class CountTaskGenerator:
25
+ """Generator for counting task dataset."""
26
+
27
+ def __init__(self, config: Dict, logger):
28
+ """
29
+ Initialize count task generator.
30
+
31
+ Args:
32
+ config: Configuration dictionary
33
+ logger: Logger instance
34
+ """
35
+ self.config = config
36
+ self.logger = logger
37
+ self.task_config = config['tasks']['count']
38
+
39
+ # Initialize components
40
+ self.dataset = ESC50Dataset(
41
+ config['esc50']['metadata_path'],
42
+ config['esc50']['audio_path'],
43
+ config # Pass config for class subset loading
44
+ )
45
+ self.audio_processor = AudioProcessor(
46
+ crossfade_duration=config['audio']['crossfade_duration'],
47
+ silence_duration=config['audio']['silence_duration'],
48
+ with_silence=config['audio']['with_silence'],
49
+ normalize=config['audio']['normalize'],
50
+ normalize_target_dBFS=config['audio']['normalize_target_dBFS'],
51
+ synthetic_silence_path=config['synthetic_silence']['path']
52
+ )
53
+ self.question_generator = QuestionGenerator(
54
+ num_options=config['mcq']['num_options'],
55
+ option_labels=config['mcq']['option_labels'],
56
+ distractor_strategy=config['mcq']['distractor_strategy']
57
+ )
58
+
59
+ # Initialize LLM question generator
60
+ self.llm_enabled = config.get('llm', {}).get('enabled', False)
61
+ self.llm_generator = LLMQuestionGenerator(
62
+ enabled=self.llm_enabled,
63
+ template_questions=self.task_config
64
+ )
65
+ if self.llm_enabled:
66
+ logger.info("LLM question generation enabled (local Llama 3.1 8B)")
67
+ else:
68
+ logger.info("Using template-based question generation")
69
+
70
+ # Duration settings from config
71
+ self.min_clip_duration = config['audio']['min_clip_duration']
72
+ self.max_clip_duration = config['audio']['max_clip_duration']
73
+ self.source_clip_duration = config['audio'].get('source_clip_duration', 5.0)
74
+ self.min_silence_ms = config['audio'].get('min_silence_duration', 100)
75
+ self.max_extra_silence_per_gap_ms = config['audio'].get('max_extra_silence_per_gap', 500)
76
+ # Small crossfade within same-source repetitions (for consecutive mode)
77
+ self.crossfade_within_source_ms = config['audio'].get('crossfade_within_source', 50)
78
+ self.task_duration_hours = self.task_config['task_duration_size']
79
+
80
+ # Ordering mode: "random" or "consecutive"
81
+ # random: Clips shuffled (A B A C B A C) - tests sound recognition
82
+ # consecutive: Same-source grouped (AAA BBB CCC) - easier
83
+ self.ordering_mode = self.task_config.get('ordering_mode', 'random')
84
+ logger.info(f"Count task ordering mode: {self.ordering_mode}")
85
+
86
+ # Set up output paths
87
+ self.output_base = Path(config['output']['base_path']) / 'count'
88
+ self.output_base.mkdir(parents=True, exist_ok=True)
89
+ self.audio_output = self.output_base / 'audios'
90
+ self.audio_output.mkdir(parents=True, exist_ok=True)
91
+
92
+ def create_sampling_list(self, parent_list: List, n_sampling: int) -> List:
93
+ """
94
+ Sample elements from parent list with replacement.
95
+
96
+ Args:
97
+ parent_list: List to sample from
98
+ n_sampling: Number of samples
99
+
100
+ Returns:
101
+ List of sampled elements
102
+ """
103
+ return [random.choice(parent_list) for _ in range(n_sampling)]
104
+
105
+ def generate_sample(self, sample_id: int, target_unique_count: int = None, target_duration_seconds: float = None) -> Dict:
106
+ """
107
+ Generate a single count task sample.
108
+
109
+ Pipeline for COUNT task:
110
+ 1. Use pre-generated target duration (or generate if not provided)
111
+ 2. Calculate max clips that can fit
112
+ 3. Pick N unique classes (N <= max_clips, since each source needs at least 1 clip)
113
+ 4. For each class, sample one audio clip
114
+ 5. Calculate repetitions to fill target duration
115
+ 6. Based on ordering_mode:
116
+ - "random": Shuffle clips (A B A C B A C) - tests recognition
117
+ - "consecutive": Group same-class (AAA BBB CCC) - easier
118
+ 7. Insert silences between clips
119
+ 8. Distribute remainder as random extra silences
120
+
121
+ Args:
122
+ sample_id: Sample ID number
123
+ target_unique_count: Target number of unique sounds (for balanced distribution)
124
+ target_duration_seconds: Pre-generated target duration (from generate_sample_durations_for_task)
125
+
126
+ Returns:
127
+ Dictionary with sample metadata
128
+ """
129
+ # Use pre-generated duration or generate one (backward compatibility)
130
+ if target_duration_seconds is not None:
131
+ clip_duration_seconds = target_duration_seconds
132
+ else:
133
+ clip_duration_seconds = generate_single_clip_duration(
134
+ self.min_clip_duration,
135
+ self.max_clip_duration
136
+ )
137
+
138
+ # Calculate max clips that can fit in target duration
139
+ max_clips, remainder_seconds = get_max_clip_num_to_be_joined(
140
+ clip_duration_seconds,
141
+ self.source_clip_duration,
142
+ self.min_silence_ms
143
+ )
144
+
145
+ # Ensure at least 1 clip
146
+ max_clips = max(1, max_clips)
147
+
148
+ max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10)
149
+
150
+ # Calculate valid range: n_unique_audios can be 1 to max_clips_per_sample
151
+ # but cannot exceed what physically fits or available categories
152
+ max_unique_for_sample = min(max_clips, max_clips_per_sample, len(self.dataset.CATEGORIES))
153
+
154
+ if max_unique_for_sample < 1:
155
+ raise ValueError(
156
+ f"Sample {sample_id}: Cannot generate sample - max_unique_for_sample={max_unique_for_sample}. "
157
+ f"max_clips={max_clips}, max_clips_per_sample={max_clips_per_sample}, "
158
+ f"available_categories={len(self.dataset.CATEGORIES)}, duration={clip_duration_seconds:.1f}s. "
159
+ f"Increase min_clip_duration or reduce max_clips_per_sample."
160
+ )
161
+
162
+ # Determine n_unique_audios - use target from balanced distribution or random
163
+ if target_unique_count is not None:
164
+ # Clamp target to what this specific sample duration can fit
165
+ # Short samples can't fit all possible answers, so we clamp down
166
+ n_unique_audios = min(target_unique_count, max_unique_for_sample)
167
+
168
+ if n_unique_audios != target_unique_count:
169
+ self.logger.debug(
170
+ f"Sample {sample_id}: Clamped target from {target_unique_count} to {n_unique_audios} "
171
+ f"(duration={clip_duration_seconds:.1f}s can only fit {max_clips} clips)"
172
+ )
173
+ else:
174
+ # No target specified - randomly select from valid range
175
+ n_unique_audios = random.randint(1, max_unique_for_sample)
176
+
177
+ self.logger.debug(
178
+ f"Sample {sample_id}: target={clip_duration_seconds:.1f}s, max_clips={max_clips}, "
179
+ f"n_unique_audios={n_unique_audios}"
180
+ )
181
+
182
+ # Sample unique categories - use least-used categories for balanced distribution
183
+ selected_categories = self.dataset.get_least_used_categories(n_unique_audios)
184
+
185
+ # Track usage of all selected categories
186
+ for cat in selected_categories:
187
+ self.dataset.category_usage_counts[cat] += 1
188
+
189
+ # Sample one file from each unique category
190
+ source_files = []
191
+ source_paths = []
192
+ source_categories = []
193
+
194
+ for category in selected_categories:
195
+ filename, filepath = self.dataset.sample_file_from_category(category)
196
+ source_files.append(filename)
197
+ source_paths.append(filepath)
198
+ source_categories.append(category)
199
+
200
+ # Load unique source audios
201
+ source_audios = []
202
+ for file_path in source_paths:
203
+ audio = self.audio_processor.load_audio(file_path)
204
+ source_audios.append(audio)
205
+
206
+ # Build audio using configured ordering mode
207
+ final_audio, clip_sequence, build_metadata = build_count_task_audio(
208
+ source_audios,
209
+ source_categories,
210
+ clip_duration_seconds,
211
+ ordering_mode=self.ordering_mode,
212
+ source_clip_duration_seconds=self.source_clip_duration,
213
+ min_silence_ms=self.min_silence_ms,
214
+ max_extra_silence_per_gap_ms=self.max_extra_silence_per_gap_ms,
215
+ crossfade_within_source_ms=self.crossfade_within_source_ms
216
+ )
217
+
218
+ # Save the audio
219
+ output_audio_path = self.audio_output / f"{sample_id}.wav"
220
+ final_audio.export(str(output_audio_path), format="wav")
221
+
222
+ # Generate questions (using LLM if enabled)
223
+ if self.llm_enabled and self.llm_generator:
224
+ llm_questions = self.llm_generator.generate_count_questions(
225
+ correct_count=n_unique_audios,
226
+ categories_present=list(set(clip_sequence))
227
+ )
228
+ mcq_question_text = llm_questions.get('mcq_question')
229
+ open_text_question_text = llm_questions.get('open_text_question')
230
+ else:
231
+ mcq_question_text = random.choice(self.task_config['mcq_questions'])
232
+ open_text_question_text = random.choice(self.task_config['open_text_questions'])
233
+
234
+ # Generate MCQ with options
235
+ mcq_data = self.question_generator.generate_count_mcq(
236
+ mcq_question_text,
237
+ n_unique_audios,
238
+ self.dataset.CATEGORIES
239
+ )
240
+
241
+ # Generate open-text answer
242
+ open_text_data = self.question_generator.generate_count_open_text(
243
+ open_text_question_text,
244
+ n_unique_audios
245
+ )
246
+
247
+ # Create metadata
248
+ metadata = {
249
+ 'id': sample_id,
250
+ 'audio_path': str(output_audio_path.relative_to(self.output_base.parent)),
251
+ 'n_unique_sounds': n_unique_audios,
252
+ 'total_clips': build_metadata['total_clips'],
253
+ 'repetitions_per_source': build_metadata['repetitions_per_source'],
254
+ 'ordering_mode': self.ordering_mode,
255
+ 'source_files': source_files,
256
+ 'source_categories': source_categories,
257
+ 'clip_sequence': clip_sequence,
258
+ 'unique_categories': sorted(list(set(source_categories))),
259
+ 'target_duration_seconds': clip_duration_seconds,
260
+ 'actual_duration_seconds': len(final_audio) / 1000.0,
261
+ 'mcq_question': mcq_data['question'],
262
+ 'mcq_options': mcq_data['options'],
263
+ 'mcq_correct_answer': mcq_data['correct_answer'],
264
+ 'open_text_question': open_text_data['question'],
265
+ 'open_text_answer': open_text_data['correct_answer'],
266
+ 'llm_generated': self.llm_enabled
267
+ }
268
+
269
+ self.logger.info(
270
+ f"Generated count sample {sample_id}: {n_unique_audios} unique sounds, "
271
+ f"{build_metadata['total_clips']} clips, {len(final_audio)/1000:.1f}s"
272
+ )
273
+
274
+ return metadata
275
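+ # Example metadata shape (values hypothetical): with ordering_mode="random"
+ # and 3 unique sources, clip_sequence might be
+ # ['dog', 'rain', 'dog', 'siren', 'rain', 'dog'] with n_unique_sounds == 3.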
+
276
+ def generate_dataset(self) -> tuple:
277
+ """
278
+ Generate the complete count task dataset.
279
+
280
+ Returns:
281
+ Tuple of (mcq_csv_path, open_text_csv_path)
282
+ """
283
+ # Generate sample durations upfront to exactly fill target duration
284
+ sample_durations = generate_sample_durations_for_task(
285
+ self.task_duration_hours,
286
+ self.min_clip_duration,
287
+ self.max_clip_duration
288
+ )
289
+ num_samples = len(sample_durations)
290
+ self.logger.info(f"Generating {num_samples} count task samples (target: {self.task_duration_hours}h, actual: {sum(sample_durations)/3600:.2f}h)...")
291
+
292
+ # Calculate max clips each sample can fit based on duration
293
+ max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10)
294
+ sample_max_clips = []
295
+ for duration in sample_durations:
296
+ max_clips, _ = get_max_clip_num_to_be_joined(
297
+ duration,
298
+ self.source_clip_duration,
299
+ self.min_silence_ms
300
+ )
301
+ # Limit to config max and available categories
302
+ max_for_sample = min(max_clips, max_clips_per_sample, len(self.dataset.CATEGORIES))
303
+ sample_max_clips.append(max_for_sample)
304
+
305
+ # Create balanced distribution by assigning targets based on sample capacity
306
+ # Sort samples by capacity to assign higher targets to samples that can fit them
307
+ possible_answers = list(range(1, max_clips_per_sample + 1))
308
+ samples_per_answer = num_samples // len(possible_answers)
309
+ remainder = num_samples % len(possible_answers)
310
+
311
+ # Create list of (sample_idx, duration, max_clips_capacity)
312
+ sample_info = [(i, sample_durations[i], sample_max_clips[i]) for i in range(num_samples)]
313
+
314
+ # Sort by capacity (descending) - assign high targets to high-capacity samples
315
+ sample_info.sort(key=lambda x: x[2], reverse=True)
316
+
317
+ # Assign targets: distribute each answer count across samples
318
+ balanced_assignments = [None] * num_samples
319
+ assignment_pool = []
320
+
321
+ for answer in possible_answers:
322
+ count = samples_per_answer + (1 if remainder > 0 else 0)
323
+ assignment_pool.extend([answer] * count)
324
+ remainder = max(0, remainder - 1)
325
+
326
+ # Sort the pool descending so high targets are assigned first (to high-capacity samples)
327
+ assignment_pool.sort(reverse=True)
328
+
329
+ for idx, (sample_idx, duration, capacity) in enumerate(sample_info):
330
+ # Assign target, clamped to sample's capacity
331
+ target = min(assignment_pool[idx], capacity)
332
+ balanced_assignments[sample_idx] = target
333
+
334
+ # Log the actual distribution after capacity clamping
335
+ from collections import Counter
336
+ distribution = Counter(balanced_assignments)
337
+ self.logger.info(f"Balanced answer distribution (after capacity-aware assignment): {dict(sorted(distribution.items()))}")
338
+
339
+ all_metadata = []
340
+
341
+ for i in range(num_samples):
342
+ metadata = self.generate_sample(
343
+ i,
344
+ target_unique_count=balanced_assignments[i],
345
+ target_duration_seconds=sample_durations[i]
346
+ )
347
+ all_metadata.append(metadata)
348
+
349
+ # Save MCQ CSV
350
+ mcq_csv_path = self.output_base / 'count_mcq.csv'
351
+ self._save_mcq_csv(all_metadata, mcq_csv_path)
352
+
353
+ # Save open-text CSV
354
+ open_text_csv_path = self.output_base / 'count_open_text.csv'
355
+ self._save_open_text_csv(all_metadata, open_text_csv_path)
356
+
357
+ # Save metadata CSV
358
+ metadata_csv_path = self.output_base / 'count_metadata.csv'
359
+ self._save_metadata_csv(all_metadata, metadata_csv_path)
360
+
361
+ self.logger.info(f"Count task dataset generation complete!")
362
+ self.logger.info(f" - MCQ CSV: {mcq_csv_path}")
363
+ self.logger.info(f" - Open-text CSV: {open_text_csv_path}")
364
+ self.logger.info(f" - Metadata CSV: {metadata_csv_path}")
365
+ self.logger.info(f" - Audio files: {self.audio_output}")
366
+
367
+ return mcq_csv_path, open_text_csv_path
368
+
369
+ def _save_mcq_csv(self, metadata_list: List[Dict], output_path: Path):
370
+ """Save MCQ format CSV."""
371
+ with open(output_path, 'w', newline='') as f:
372
+ writer = csv.writer(f)
373
+ # Header
374
+ writer.writerow([
375
+ 'question', 'id', 'audio_path',
376
+ 'optionA', 'optionB', 'optionC', 'optionD',
377
+ 'correct', 'source_wavs', 'source_categories'
378
+ ])
379
+
380
+ # Data rows
381
+ for meta in metadata_list:
382
+ writer.writerow([
383
+ meta['mcq_question'],
384
+ meta['id'],
385
+ meta['audio_path'],
386
+ meta['mcq_options']['A'],
387
+ meta['mcq_options']['B'],
388
+ meta['mcq_options']['C'],
389
+ meta['mcq_options']['D'],
390
+ meta['mcq_correct_answer'],
391
+ str(meta['source_files']),
392
+ str(meta['unique_categories'])
393
+ ])
394
+
395
+ def _save_open_text_csv(self, metadata_list: List[Dict], output_path: Path):
396
+ """Save open-text format CSV."""
397
+ with open(output_path, 'w', newline='') as f:
398
+ writer = csv.writer(f)
399
+ # Header
400
+ writer.writerow([
401
+ 'question', 'id', 'audio_path', 'answer',
402
+ 'source_wavs', 'source_categories'
403
+ ])
404
+
405
+ # Data rows
406
+ for meta in metadata_list:
407
+ writer.writerow([
408
+ meta['open_text_question'],
409
+ meta['id'],
410
+ meta['audio_path'],
411
+ meta['open_text_answer'],
412
+ str(meta['source_files']),
413
+ str(meta['unique_categories'])
414
+ ])
415
+
416
+ def _save_metadata_csv(self, metadata_list: List[Dict], output_path: Path):
417
+ """Save detailed metadata CSV."""
418
+ with open(output_path, 'w', newline='') as f:
419
+ writer = csv.writer(f)
420
+ # Header
421
+ writer.writerow([
422
+ 'id', 'audio_path', 'total_clips', 'n_unique_sounds',
423
+ 'source_files', 'source_categories', 'unique_categories',
424
+ 'ordering_mode', 'target_duration_s', 'actual_duration_s', 'llm_generated'
425
+ ])
426
+
427
+ # Data rows
428
+ for meta in metadata_list:
429
+ writer.writerow([
430
+ meta['id'],
431
+ meta['audio_path'],
432
+ meta['total_clips'],
433
+ meta['n_unique_sounds'],
434
+ str(meta['source_files']),
435
+ str(meta['source_categories']),
436
+ str(meta['unique_categories']),
437
+ meta.get('ordering_mode', 'random'),
438
+ meta.get('target_duration_seconds', 0),
439
+ meta.get('actual_duration_seconds', 0),
440
+ meta.get('llm_generated', False)
441
+ ])
442
+
443
+
444
+ def main(config_path: str = None):
445
+ """Main entry point for count task generation."""
446
+ import yaml
447
+
448
+ # Load configuration
449
+ if config_path is None:
450
+ config_path = Path(__file__).parent.parent / 'config.yaml'
451
+
452
+ with open(config_path, 'r') as f:
453
+ config = yaml.safe_load(f)
454
+
455
+ # Set random seed
456
+ set_random_seed(config['random_seed'])
457
+
458
+ # Setup logger
459
+ logger = setup_logger(
460
+ 'count_task',
461
+ log_file=str(Path(config['output']['base_path']) / config['logging']['log_file']),
462
+ level=config['logging']['level'],
463
+ console_output=config['logging']['console_output']
464
+ )
465
+
466
+ # Generate dataset
467
+ generator = CountTaskGenerator(config, logger)
468
+ generator.generate_dataset()
469
+
470
+
471
+ if __name__ == '__main__':
472
+ main()
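
A minimal standalone sketch of the capacity-aware balanced assignment used in
generate_dataset above (the helper name and the toy capacities are hypothetical):

    from collections import Counter

    def assign_balanced_targets(capacities, max_answer):
        """Spread answers 1..max_answer evenly; give large targets to large samples."""
        n = len(capacities)
        base, rem = divmod(n, max_answer)
        pool = []
        for answer in range(1, max_answer + 1):
            pool.extend([answer] * (base + (1 if answer <= rem else 0)))
        pool.sort(reverse=True)                              # hand out high targets first
        order = sorted(range(n), key=lambda i: capacities[i], reverse=True)
        targets = [0] * n
        for rank, idx in enumerate(order):
            targets[idx] = min(pool[rank], capacities[idx])  # clamp to capacity
        return targets

    # e.g. Counter({1: 2, 2: 1, 3: 1, 4: 1, 5: 1}) for these toy capacities
    print(Counter(assign_balanced_targets([3, 8, 10, 2, 6, 9], 5)))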
tasks/task_duration.py ADDED
@@ -0,0 +1,820 @@
1
+ """
2
+ Task 2: Duration - Generate duration comparison questions
3
+
4
+ This task creates audio samples where sources have different effective durations
5
+ and asks questions about which sound is heard for the longest or shortest time.
6
+
7
+ Key features:
8
+ - Uses amplitude-filtered (preprocessed) audio clips with known effective durations
9
+ - First calculates max clips from total duration, then distributes slots
10
+ - Strategically distributes repetitions to ensure clear longest/shortest answers
11
+ - Consecutive ordering within sources, random order between sources
12
+ - Gap multipliers ensure unambiguous answers (e.g., longest is 1.5x longer than next)
13
+ - NO category preference - random selection to avoid bias
14
+ """
15
+
16
+ import csv
17
+ import random
18
+ import math
19
+ from pathlib import Path
20
+ from typing import Dict, List, Tuple, Optional
21
+ from collections import Counter
22
+
23
+ import sys
24
+ sys.path.append(str(Path(__file__).parent.parent))
25
+
26
+ from utils import (
27
+ AudioProcessor, PreprocessedESC50Dataset, QuestionGenerator, LLMQuestionGenerator,
28
+ setup_logger, set_random_seed, calculate_num_samples_for_task,
29
+ generate_single_clip_duration, get_max_clip_num_to_be_joined,
30
+ build_duration_task_audio, distribute_remainder_as_silences,
31
+ generate_sample_durations_for_task
32
+ )
33
+
34
+
35
+ class DurationTaskGenerator:
36
+ """Generator for duration comparison task dataset using preprocessed ESC-50."""
37
+
38
+ def __init__(self, config: Dict, logger):
39
+ """
40
+ Initialize duration task generator.
41
+
42
+ Args:
43
+ config: Configuration dictionary
44
+ logger: Logger instance
45
+ """
46
+ self.config = config
47
+ self.logger = logger
48
+ self.task_config = config['tasks']['duration']
49
+
50
+ # Initialize preprocessed dataset (with effective durations)
51
+ self.dataset = PreprocessedESC50Dataset(
52
+ metadata_path=config['esc50']['metadata_path'],
53
+ audio_path=config['esc50']['audio_path'],
54
+ preprocessed_path=self.task_config['preprocessed_data_path'],
55
+ config=config # Pass config for class subset loading
56
+ )
57
+
58
+ # Calculate average effective duration from preprocessed data
59
+ self.avg_effective_duration = self.dataset.effective_df['effective_duration_s'].mean()
60
+ self.logger.info(f"Average effective duration: {self.avg_effective_duration:.2f}s")
61
+
62
+ # Initialize audio processor
63
+ self.audio_processor = AudioProcessor(
64
+ crossfade_duration=config['audio']['crossfade_duration'],
65
+ silence_duration=config['audio']['silence_duration'],
66
+ with_silence=config['audio']['with_silence'],
67
+ normalize=config['audio']['normalize'],
68
+ normalize_target_dBFS=config['audio']['normalize_target_dBFS'],
69
+ synthetic_silence_path=config['synthetic_silence']['path']
70
+ )
71
+
72
+ # Initialize question generator
73
+ self.question_generator = QuestionGenerator(
74
+ num_options=config['mcq']['num_options'],
75
+ option_labels=config['mcq']['option_labels'],
76
+ distractor_strategy=config['mcq']['distractor_strategy']
77
+ )
78
+
79
+ # Initialize LLM question generator
80
+ self.llm_enabled = config.get('llm', {}).get('enabled', False)
81
+ self.llm_generator = LLMQuestionGenerator(
82
+ enabled=self.llm_enabled,
83
+ template_questions=self.task_config
84
+ )
85
+
86
+ # Duration settings from config
87
+ self.min_clip_duration = config['audio']['min_clip_duration']
88
+ self.max_clip_duration = config['audio']['max_clip_duration']
89
+ self.min_silence_ms = config['audio'].get('min_silence_duration', 100)
90
+ self.max_extra_silence_per_gap_ms = config['audio'].get('max_extra_silence_per_gap', 500)
91
+ self.crossfade_within_source_ms = config['audio'].get('crossfade_within_source', 50)
92
+ self.task_duration_hours = self.task_config['task_duration_size']
93
+
94
+ # Duration task specific settings
95
+ self.multiplier_longest = self.task_config.get('multiplier_longest', 1.5)
96
+ self.multiplier_shortest = self.task_config.get('multiplier_shortest', 0.75)
97
+ self.reject_if_gap_not_met = self.task_config.get('reject_if_gap_not_met', True)
98
+ self.sample_different_clips = self.task_config.get('sample_different_clips_same_class', True)
99
+ # Minimum effective duration per source (seconds) - clips shorter than this are harder to distinguish
100
+ self.min_effective_duration_per_source = self.task_config.get('min_effective_duration_per_source', 1.0)
101
+
102
+ # Set up output paths
103
+ self.output_base = Path(config['output']['base_path']) / 'duration'
104
+ self.output_base.mkdir(parents=True, exist_ok=True)
105
+ self.audio_output = self.output_base / 'audios'
106
+ self.audio_output.mkdir(parents=True, exist_ok=True)
107
+
108
+ # Statistics tracking
109
+ self.rejection_count = 0
110
+ self.success_count = 0
111
+
112
+ def _calculate_max_clips_and_sources(
113
+ self,
114
+ target_duration_s: float,
115
+ question_type: str
116
+ ) -> Tuple[int, int, float]:
117
+ """
118
+ Calculate max clips possible and choose n_sources from config that satisfies gap.
119
+
120
+ Key principle:
121
+ 1. Calculate valid range of sources that can satisfy gap constraint
122
+ 2. Filter config values to only those within valid range
123
+ 3. Pick RANDOMLY from valid config values (ensures variety)
124
+
125
+ For LONGEST:
126
+ - Target needs at least 2 clips to beat max_background by 1.5x
127
+ - max_sources = max_clips - 2 + 1 (backgrounds get 1 each)
128
+ - min_sources = 2 (need at least 1 background)
129
+
130
+ For SHORTEST:
131
+ - Target gets 1 clip
132
+ - Each background needs at least 2 clips so its total clearly exceeds the target (2x when multiplier_shortest=0.5; default is 0.75)
133
+ - max_sources = 1 + (max_clips - 1) // 2
134
+ - min_sources = 2
135
+
136
+ Args:
137
+ target_duration_s: Target total audio duration
138
+ question_type: "longest" or "shortest"
139
+
140
+ Returns:
141
+ Tuple of (max_clips, n_sources, remainder_s)
142
+ """
143
+ # Get max clips using average effective duration
144
+ max_clips, remainder_s = get_max_clip_num_to_be_joined(
145
+ target_duration_s,
146
+ self.avg_effective_duration,
147
+ self.min_silence_ms
148
+ )
149
+
150
+ # Ensure at least 2 clips
151
+ max_clips = max(2, max_clips)
152
+
153
+ # Get config values for n_sources
154
+ # If single int (e.g., 15), sample from [1, 15] like count/order tasks
155
+ # If list (e.g., [2,3,4]), sample from the list
156
+ num_sources_config = self.task_config.get('num_unique_sources', [2, 3, 4, 5])
157
+ if isinstance(num_sources_config, int):
158
+ # Single int: create range [1, num_sources_config]
159
+ num_sources_config = list(range(1, num_sources_config + 1))
160
+
161
+ if question_type == "longest":
162
+ # Target needs at least 2 clips to reliably beat background by multiplier
163
+ # (with 1.5x multiplier, 2 clips of target vs 1 clip of background usually works)
164
+ min_target_clips = 2
165
+
166
+ # Minimum sources: need at least 1 background + target = 2
167
+ min_valid_sources = 2
168
+
169
+ # Maximum sources: max_clips - min_target_clips + 1
170
+ # (subtract target's clips, add 1 for the target itself)
171
+ max_valid_sources = max_clips - min_target_clips + 1
172
+
173
+ else: # shortest
174
+ # Target gets 1 clip
175
+ # Each background needs at least 2 clips so its total clearly exceeds the single target clip (satisfies multiplier_shortest, default 0.75)
176
+ min_clips_per_background = 2
177
+
178
+ # Minimum sources: 2 (target + 1 background)
179
+ min_valid_sources = 2
180
+
181
+ # Maximum sources: how many backgrounds can we fit?
182
+ remaining_clips = max_clips - 1 # 1 for target
183
+ max_backgrounds = remaining_clips // min_clips_per_background
184
+ max_valid_sources = max_backgrounds + 1 # +1 for target
185
+
186
+ # Filter config values to only valid ones
187
+ valid_config_sources = [
188
+ n for n in num_sources_config
189
+ if min_valid_sources <= n <= max_valid_sources
190
+ ]
191
+
192
+ if not valid_config_sources:
193
+ raise ValueError(
194
+ f"Duration task: No valid num_unique_sources for {question_type} question. "
195
+ f"Config values: {num_sources_config}, Valid range: [{min_valid_sources}, {max_valid_sources}]. "
196
+ f"max_clips={max_clips}, duration={target_duration_s:.1f}s. "
197
+ f"Increase min_clip_duration or adjust num_unique_sources config."
198
+ )
199
+
200
+ # Pick RANDOMLY from valid config values (ensures variety!)
201
+ n_sources = random.choice(valid_config_sources)
202
+
203
+ # Validate final value
204
+ if n_sources < 2 or n_sources > len(self.dataset.CATEGORIES):
205
+ raise ValueError(
206
+ f"Duration task: Invalid n_sources={n_sources}. "
207
+ f"Must be in range [2, {len(self.dataset.CATEGORIES)}]"
208
+ )
209
+
210
+ self.logger.debug(
211
+ f"Max clips: {max_clips}, Question: {question_type}, "
212
+ f"Valid range: [{min_valid_sources}, {max_valid_sources}], "
213
+ f"Valid config: {valid_config_sources}, Selected: {n_sources}"
214
+ )
215
+
216
+ return max_clips, n_sources, remainder_s
217
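+ # Worked example (numbers hypothetical, assuming the helper packs ~(clip + gap)
+ # pairs): target_duration_s=60 with avg effective duration ~3s and 100ms gaps
+ # gives max_clips around 19. For "shortest": (19 - 1) // 2 = 9 backgrounds max,
+ # so max_valid_sources = 10 and n_sources is drawn from config values in [2, 10].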
+
218
+ def _calculate_slot_distribution(
219
+ self,
220
+ max_clips: int,
221
+ n_sources: int,
222
+ effective_durations: Dict[str, float],
223
+ target_category: str,
224
+ question_type: str
225
+ ) -> Tuple[Dict[str, int], bool, Dict]:
226
+ """
227
+ Calculate how many clips each source gets.
228
+
229
+ For LONGEST: target gets (max_clips - n_backgrounds), backgrounds get 1 each
230
+ For SHORTEST: target gets 1, backgrounds share (max_clips - 1)
231
+
232
+ Args:
233
+ max_clips: Maximum number of clips that fit
234
+ n_sources: Number of unique sources
235
+ effective_durations: Dict mapping category -> effective duration
236
+ target_category: The category that should be longest/shortest
237
+ question_type: "longest" or "shortest"
238
+
239
+ Returns:
240
+ Tuple of (slot_distribution, gap_satisfied, metadata)
241
+ """
242
+ categories = list(effective_durations.keys())
243
+ background_categories = [c for c in categories if c != target_category]
244
+ n_backgrounds = len(background_categories)
245
+
246
+ if question_type == "longest":
247
+ # Target gets max_clips - n_backgrounds
248
+ # Backgrounds get 1 each
249
+ target_clips = max_clips - n_backgrounds
250
+ target_clips = max(1, target_clips) # At least 1
251
+
252
+ slot_distribution = {target_category: target_clips}
253
+ for cat in background_categories:
254
+ slot_distribution[cat] = 1
255
+
256
+ # Verify gap: target_duration >= max_background × multiplier
257
+ target_duration = target_clips * effective_durations[target_category]
258
+ background_durations = [effective_durations[c] for c in background_categories]
259
+ max_background = max(background_durations) if background_durations else 0
260
+ required_target = max_background * self.multiplier_longest
261
+ gap_satisfied = target_duration >= required_target
262
+
263
+ metadata = {
264
+ 'target_clips': target_clips,
265
+ 'target_duration_s': target_duration,
266
+ 'max_background_s': max_background,
267
+ 'required_target_s': required_target,
268
+ 'multiplier': self.multiplier_longest
269
+ }
270
+
271
+ else: # shortest
272
+ # Target gets 1 clip
273
+ # Backgrounds share (max_clips - 1)
274
+ remaining_clips = max_clips - 1
275
+ clips_per_background = max(1, remaining_clips // n_backgrounds)
276
+ extra_clips = remaining_clips % n_backgrounds
277
+
278
+ slot_distribution = {target_category: 1}
279
+
280
+ for i, cat in enumerate(background_categories):
281
+ clips = clips_per_background + (1 if i < extra_clips else 0)
282
+ slot_distribution[cat] = clips
283
+
284
+ # Verify gap: target_duration <= min_background × multiplier
285
+ target_duration = effective_durations[target_category]
286
+ background_durations = [
287
+ slot_distribution[c] * effective_durations[c]
288
+ for c in background_categories
289
+ ]
290
+ min_background = min(background_durations) if background_durations else float('inf')
291
+ required_max_target = min_background * self.multiplier_shortest
292
+
293
+ # CRITICAL: Target must still be at least min_effective_duration_per_source
294
+ # Otherwise clips that are too short (e.g., 0.03s) would be used and be indistinguishable
295
+ target_too_short = target_duration < self.min_effective_duration_per_source
296
+ gap_satisfied = (target_duration <= required_max_target) and (not target_too_short)
297
+
298
+ metadata = {
299
+ 'target_clips': 1,
300
+ 'target_duration_s': target_duration,
301
+ 'min_background_s': min_background,
302
+ 'required_max_target_s': required_max_target,
303
+ 'multiplier': self.multiplier_shortest,
304
+ 'target_too_short': target_too_short
305
+ }
306
+
307
+ return slot_distribution, gap_satisfied, metadata
308
+
309
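For intuition, here is a standalone re-derivation of the "longest" branch with hypothetical per-clip durations (`multiplier` stands in for `self.multiplier_longest`):

```python
# Sketch of the "longest" slot split: target takes the surplus, backgrounds get 1 each.
effective = {"dog": 4.0, "rain": 3.0, "siren": 2.5}   # hypothetical per-clip durations (s)
target, max_clips, multiplier = "dog", 6, 1.5

backgrounds = [c for c in effective if c != target]
slots = {target: max(1, max_clips - len(backgrounds)), **{c: 1 for c in backgrounds}}

target_total = slots[target] * effective[target]          # 4 clips x 4.0s = 16.0s
max_background = max(effective[c] for c in backgrounds)   # 3.0s
print(slots, target_total >= max_background * multiplier)  # {'dog': 4, ...} True
```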
+ def _try_generate_sample(
310
+ self,
311
+ sample_id: int,
312
+ question_type: str,
313
+ max_retries: int = 5,
314
+ target_duration_seconds: float = None
315
+ ) -> Optional[Dict]:
316
+ """
317
+ Try to generate a valid duration sample with retries.
318
+
319
+ Args:
320
+ sample_id: Sample ID
321
+ question_type: "longest" or "shortest"
322
+ max_retries: Maximum retry attempts
323
+ target_duration_seconds: Pre-generated target duration
324
+
325
+ Returns:
326
+ Metadata dict if successful, None if all retries failed
327
+ """
328
+ for attempt in range(max_retries):
329
+ try:
330
+ result = self._generate_single_sample(sample_id, question_type, target_duration_seconds=target_duration_seconds)
331
+ if result is not None:
332
+ return result
333
+ except Exception as e:
334
+ self.logger.warning(f"Sample {sample_id} attempt {attempt+1} failed: {e}")
335
+
336
+ return None
337
+
338
+ def _generate_single_sample(
339
+ self,
340
+ sample_id: int,
341
+ question_type: str,
342
+ target_duration_seconds: float = None
343
+ ) -> Optional[Dict]:
344
+ """
345
+ Generate a single duration task sample.
346
+
347
+ Corrected Pipeline:
348
+ 1. Use pre-generated target duration (or generate if not provided)
349
+ 2. Calculate max_clips using get_max_clip_num_to_be_joined
350
+ 3. Based on max_clips and question_type, determine n_sources
351
+ 4. Select categories RANDOMLY (no bias toward short/long)
352
+ 5. Pick target category RANDOMLY from selected
353
+ 6. Get effective durations for all sources
354
+ 7. Calculate slot distribution based on max_clips
355
+ 8. Verify gap constraint
356
+ 9. Load audio clips and build final audio
357
+
358
+ Args:
359
+ sample_id: Sample ID number
360
+ question_type: "longest" or "shortest"
361
+ target_duration_seconds: Pre-generated target duration (from generate_sample_durations_for_task)
362
+
363
+ Returns:
364
+ Dictionary with sample metadata, or None if failed
365
+ """
366
+ # Step 1: Use pre-generated duration or generate one (backward compatibility)
367
+ if target_duration_seconds is not None:
368
+ target_duration_s = target_duration_seconds
369
+ else:
370
+ target_duration_s = generate_single_clip_duration(
371
+ self.min_clip_duration,
372
+ self.max_clip_duration
373
+ )
374
+
375
+ # Step 2 & 3: Calculate max_clips and n_sources
376
+ max_clips, n_sources, remainder_s = self._calculate_max_clips_and_sources(
377
+ target_duration_s,
378
+ question_type
379
+ )
380
+
381
+ # Step 4: Select categories RANDOMLY (using least-used for balance, but no duration preference)
382
+ all_categories = self.dataset.get_least_used_categories(n_sources)
383
+
384
+ # Step 5: Pick target category RANDOMLY from selected (no bias!)
385
+ target_category = random.choice(all_categories)
386
+ self.dataset.category_usage_counts[target_category] += 1
387
+
388
+ # Step 6: Get effective durations by sampling one file per category
389
+ # Use min_effective_duration_per_source to avoid clips that are too short to distinguish
390
+ effective_durations = {}
391
+ selected_files = {}
392
+
393
+ for category in all_categories:
394
+ filename, filepath, eff_dur = self.dataset.sample_file_from_category_with_duration(
395
+ category,
396
+ min_effective_duration=self.min_effective_duration_per_source
397
+ )
398
+ effective_durations[category] = eff_dur
399
+ selected_files[category] = {
400
+ 'filename': filename,
401
+ 'filepath': filepath,
402
+ 'effective_duration_s': eff_dur
403
+ }
404
+
405
+ # Step 7: Calculate slot distribution based on max_clips
406
+ slot_distribution, gap_satisfied, calc_metadata = self._calculate_slot_distribution(
407
+ max_clips=max_clips,
408
+ n_sources=n_sources,
409
+ effective_durations=effective_durations,
410
+ target_category=target_category,
411
+ question_type=question_type
412
+ )
413
+
414
+ # Step 8: If gap not satisfied, try adjustments
415
+ if not gap_satisfied:
416
+ # Try with different clips that have better durations
417
+ if self.sample_different_clips:
418
+ gap_satisfied = self._try_improve_gap_with_different_clips(
419
+ question_type=question_type,
420
+ target_category=target_category,
421
+ all_categories=all_categories,
422
+ max_clips=max_clips,
423
+ n_sources=n_sources,
424
+ effective_durations=effective_durations,
425
+ selected_files=selected_files,
426
+ slot_distribution=slot_distribution
427
+ )
428
+
429
+ if not gap_satisfied and self.reject_if_gap_not_met:
430
+ self.rejection_count += 1
431
+ self.logger.debug(
432
+ f"Sample {sample_id} rejected: gap not satisfied "
433
+ f"(type={question_type}, max_clips={max_clips}, sources={n_sources})"
434
+ )
435
+ return None
436
+
437
+ # Step 9: Load audio clips based on slot distribution
438
+ source_audio_lists = {}
439
+ files_used = {}
440
+
441
+ for category in all_categories:
442
+ reps = slot_distribution.get(category, 0)
443
+ if reps == 0:
444
+ continue
445
+
446
+ # Get files for this category
447
+ if self.sample_different_clips and reps > 1:
448
+ filenames, filepaths, total_dur = self.dataset.sample_files_from_category_to_reach_duration(
449
+ category,
450
+ reps * effective_durations[category],
451
+ prefer_same_file=False
452
+ )
453
+ else:
454
+ # Use same file repeated
455
+ file_info = selected_files[category]
456
+ filenames = [file_info['filename']] * reps
457
+ filepaths = [file_info['filepath']] * reps
458
+
459
+ # Load audio segments
460
+ audio_list = []
461
+ for fp in filepaths[:reps]:
462
+ audio = self.audio_processor.load_audio(fp)
463
+ audio_list.append(audio)
464
+
465
+ # If fewer clips were loaded than needed, cycle through the loaded ones
466
+ while len(audio_list) < reps:
467
+ audio_list.append(audio_list[len(audio_list) % len(filepaths)])
468
+
469
+ source_audio_lists[category] = audio_list[:reps]
470
+ files_used[category] = filenames[:reps]
471
+
472
+ # Step 10: Build final audio
473
+ final_audio, category_sequence, build_metadata = build_duration_task_audio(
474
+ source_audio_lists=source_audio_lists,
475
+ slot_distribution=slot_distribution,
476
+ effective_durations=effective_durations,
477
+ target_total_duration_s=target_duration_s,
478
+ min_silence_between_sources_ms=self.min_silence_ms,
479
+ max_extra_silence_per_gap_ms=self.max_extra_silence_per_gap_ms,
480
+ crossfade_within_source_ms=self.crossfade_within_source_ms
481
+ )
482
+
483
+ # Save audio
484
+ output_audio_path = self.audio_output / f"{sample_id}.wav"
485
+ final_audio.export(str(output_audio_path), format="wav")
486
+
487
+ # Step 11: Generate questions
488
+ correct_category = target_category
489
+ present_categories = all_categories
490
+
491
+ mcq_question = self.task_config['mcq_questions'][question_type]
492
+ mcq_data = self.question_generator.generate_category_mcq(
493
+ mcq_question,
494
+ correct_category,
495
+ present_categories,
496
+ self.dataset.CATEGORIES
497
+ )
498
+
499
+ open_text_question = self.task_config['open_text_questions'][question_type]
500
+ open_text_data = self.question_generator.generate_category_open_text(
501
+ open_text_question,
502
+ correct_category
503
+ )
504
+
505
+ # Calculate actual effective durations
506
+ actual_effective_durations = {
507
+ cat: slot_distribution[cat] * effective_durations[cat]
508
+ for cat in all_categories
509
+ if cat in slot_distribution
510
+ }
511
+
512
+ # Create metadata
513
+ metadata = {
514
+ 'id': sample_id,
515
+ 'audio_path': str(output_audio_path.relative_to(self.output_base.parent)),
516
+ 'question_type': question_type,
517
+ 'max_clips': max_clips,
518
+ 'n_unique_sources': n_sources,
519
+ 'target_category': target_category,
520
+ 'present_categories': present_categories,
521
+ 'source_order': build_metadata['source_order'],
522
+ 'slot_distribution': slot_distribution,
523
+ 'effective_durations_per_clip': effective_durations,
524
+ 'total_effective_durations': actual_effective_durations,
525
+ 'gap_satisfied': gap_satisfied,
526
+ 'multiplier_used': self.multiplier_longest if question_type == 'longest' else self.multiplier_shortest,
527
+ 'files_used': files_used,
528
+ 'target_duration_s': target_duration_s,
529
+ 'actual_duration_s': len(final_audio) / 1000.0,
530
+ 'timestamp_string': build_metadata.get('timestamp_string', ''),
531
+ 'source_timestamps': build_metadata.get('source_timestamps', []),
532
+ 'mcq_question': mcq_data['question'],
533
+ 'mcq_options': mcq_data['options'],
534
+ 'mcq_correct_answer': mcq_data['correct_answer'],
535
+ 'open_text_question': open_text_data['question'],
536
+ 'open_text_answer': open_text_data['correct_answer'],
537
+ 'calc_metadata': calc_metadata
538
+ }
539
+
540
+ self.success_count += 1
541
+ self.logger.info(
542
+ f"Generated duration sample {sample_id}: {question_type}, "
543
+ f"max_clips={max_clips}, sources={n_sources}, target={target_category}, "
544
+ f"slots={slot_distribution}, gap_satisfied={gap_satisfied}"
545
+ )
546
+
547
+ return metadata
548
+
549
+ def _try_improve_gap_with_different_clips(
550
+ self,
551
+ question_type: str,
552
+ target_category: str,
553
+ all_categories: List[str],
554
+ max_clips: int,
555
+ n_sources: int,
556
+ effective_durations: Dict[str, float],
557
+ selected_files: Dict[str, Dict],
558
+ slot_distribution: Dict[str, int]
559
+ ) -> bool:
560
+ """
561
+ Try to improve gap satisfaction by selecting different clips.
562
+
563
+ For LONGEST: try clips with longer effective duration for target
564
+ For SHORTEST: try clips with shorter effective duration for target
565
+
566
+ Args:
567
+ Various state from generate_sample
568
+
569
+ Returns:
570
+ True if gap is now satisfied
571
+ """
572
+ files = self.dataset.get_files_by_category_with_durations(target_category)
573
+
574
+ if question_type == "longest":
575
+ # Try to find a longer clip for target category
576
+ files_sorted = sorted(files, key=lambda x: x['effective_duration_s'], reverse=True)
577
+ else:
578
+ # For shortest, try shorter clip for target
579
+ files_sorted = sorted(files, key=lambda x: x['effective_duration_s'])
580
+
581
+ if files_sorted:
582
+ best = files_sorted[0]
583
+ effective_durations[target_category] = best['effective_duration_s']
584
+ selected_files[target_category] = {
585
+ 'filename': best['filename'],
586
+ 'filepath': best['filepath'],
587
+ 'effective_duration_s': best['effective_duration_s']
588
+ }
589
+
590
+ # Recalculate slot distribution
591
+ new_slots, gap_satisfied, _ = self._calculate_slot_distribution(
592
+ max_clips=max_clips,
593
+ n_sources=n_sources,
594
+ effective_durations=effective_durations,
595
+ target_category=target_category,
596
+ question_type=question_type
597
+ )
598
+
599
+ if gap_satisfied:
600
+ slot_distribution.clear()
601
+ slot_distribution.update(new_slots)
602
+
603
+ return gap_satisfied
604
+
605
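The retry strategy above boils down to "swap in the most extreme clip for the target category and re-run the gap check"; roughly, with hypothetical file records:

```python
# Sketch: pick the longest (or shortest) candidate clip for the target category.
files = [
    {"filename": "a.wav", "effective_duration_s": 2.1},
    {"filename": "b.wav", "effective_duration_s": 4.8},
    {"filename": "c.wav", "effective_duration_s": 3.3},
]
question_type = "longest"
best = sorted(files, key=lambda x: x["effective_duration_s"],
              reverse=(question_type == "longest"))[0]
print(best["filename"])  # b.wav -- then the slot split and gap check are re-run
```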
+ def generate_sample(self, sample_id: int, target_question_type: str = None, target_duration_seconds: float = None) -> Optional[Dict]:
606
+ """
607
+ Generate a single duration task sample with retries.
608
+
609
+ Args:
610
+ sample_id: Sample ID number
611
+ target_question_type: Target question type for balanced distribution
612
+ target_duration_seconds: Pre-generated target duration (from generate_sample_durations_for_task)
613
+
614
+ Returns:
615
+ Dictionary with sample metadata, or None if failed
616
+ """
617
+ question_type = target_question_type or random.choice(
618
+ self.task_config['question_types']
619
+ )
620
+
621
+ return self._try_generate_sample(sample_id, question_type, target_duration_seconds=target_duration_seconds)
622
+
623
+ def generate_dataset(self) -> tuple:
624
+ """
625
+ Generate the complete duration task dataset.
626
+
627
+ Uses generate_sample_durations_for_task() to pre-generate exact sample durations
628
+ that sum to exactly the target task duration. This guarantees:
629
+ - Exact coverage of target duration
630
+ - No estimation errors from average-based calculation
631
+
632
+ Returns:
633
+ Tuple of (mcq_csv_path, open_text_csv_path)
634
+ """
635
+ # Generate sample durations upfront (guarantees exact total duration)
636
+ sample_durations = generate_sample_durations_for_task(
637
+ self.task_duration_hours,
638
+ self.min_clip_duration,
639
+ self.max_clip_duration
640
+ )
641
+ num_samples = len(sample_durations)
642
+
643
+ self.logger.info(
644
+ f"Generating {num_samples} duration task samples "
645
+ f"(target: {self.task_duration_hours}h, exact fill)..."
646
+ )
647
+
648
+ # Create balanced question type distribution
649
+ question_types = self.task_config['question_types']
650
+ balanced_types = []
651
+ samples_per_type = num_samples // len(question_types)
652
+ remainder = num_samples % len(question_types)
653
+
654
+ for qtype in question_types:
655
+ count = samples_per_type + (1 if remainder > 0 else 0)
656
+ balanced_types.extend([qtype] * count)
657
+ remainder = max(0, remainder - 1)
658
+
659
+ random.shuffle(balanced_types)
660
+ type_dist = Counter(balanced_types)
661
+ self.logger.info(f"Balanced question type distribution: {dict(sorted(type_dist.items()))}")
662
+
663
+ all_metadata = []
664
+ sample_idx = 0
665
+ type_idx = 0
666
+
667
+ while len(all_metadata) < num_samples and type_idx < len(balanced_types) * 2:
668
+ question_type = balanced_types[type_idx % len(balanced_types)]
669
+ target_duration = sample_durations[sample_idx] if sample_idx < len(sample_durations) else None
670
+
671
+ metadata = self.generate_sample(sample_idx, question_type, target_duration_seconds=target_duration)
672
+
673
+ if metadata is not None:
674
+ all_metadata.append(metadata)
675
+ sample_idx += 1
676
+
677
+ type_idx += 1
678
+
679
+ # Log progress
680
+ if len(all_metadata) % 50 == 0:
681
+ self.logger.info(
682
+ f"Progress: {len(all_metadata)}/{num_samples} samples, "
683
+ f"{self.rejection_count} rejections"
684
+ )
685
+
686
+ self.logger.info(
687
+ f"Generation complete: {len(all_metadata)} samples, "
688
+ f"{self.rejection_count} rejections "
689
+ f"({self.rejection_count/(len(all_metadata)+self.rejection_count)*100:.1f}% rejection rate)"
690
+ )
691
+
692
+ # Save CSVs
693
+ mcq_csv_path = self.output_base / 'duration_mcq.csv'
694
+ self._save_mcq_csv(all_metadata, mcq_csv_path)
695
+
696
+ open_text_csv_path = self.output_base / 'duration_open_text.csv'
697
+ self._save_open_text_csv(all_metadata, open_text_csv_path)
698
+
699
+ metadata_csv_path = self.output_base / 'duration_metadata.csv'
700
+ self._save_metadata_csv(all_metadata, metadata_csv_path)
701
+
702
+ self.logger.info(f"Duration task dataset generation complete!")
703
+ self.logger.info(f" - MCQ CSV: {mcq_csv_path}")
704
+ self.logger.info(f" - Open-text CSV: {open_text_csv_path}")
705
+ self.logger.info(f" - Metadata CSV: {metadata_csv_path}")
706
+ self.logger.info(f" - Audio files: {self.audio_output}")
707
+
708
+ return mcq_csv_path, open_text_csv_path
709
+
710
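The balancing loop above hands the remainder out one question type at a time; a compact standalone equivalent:

```python
import random
from collections import Counter

# Sketch of the balanced question-type split used above.
num_samples, question_types = 101, ["longest", "shortest"]
per_type, remainder = divmod(num_samples, len(question_types))

balanced = []
for qtype in question_types:
    balanced.extend([qtype] * (per_type + (1 if remainder > 0 else 0)))
    remainder = max(0, remainder - 1)

random.shuffle(balanced)
print(Counter(balanced))  # Counter({'longest': 51, 'shortest': 50})
```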
+ def _save_mcq_csv(self, metadata_list: List[Dict], output_path: Path):
711
+ """Save MCQ format CSV."""
712
+ with open(output_path, 'w', newline='') as f:
713
+ writer = csv.writer(f)
714
+ writer.writerow([
715
+ 'question', 'id', 'audio_path',
716
+ 'optionA', 'optionB', 'optionC', 'optionD',
717
+ 'correct', 'question_type', 'max_clips', 'n_sources',
718
+ 'target_category', 'slot_distribution', 'effective_durations'
719
+ ])
720
+
721
+ for meta in metadata_list:
722
+ writer.writerow([
723
+ meta['mcq_question'],
724
+ meta['id'],
725
+ meta['audio_path'],
726
+ meta['mcq_options']['A'],
727
+ meta['mcq_options']['B'],
728
+ meta['mcq_options']['C'],
729
+ meta['mcq_options']['D'],
730
+ meta['mcq_correct_answer'],
731
+ meta['question_type'],
732
+ meta['max_clips'],
733
+ meta['n_unique_sources'],
734
+ meta['target_category'],
735
+ str(meta['slot_distribution']),
736
+ str(meta['total_effective_durations'])
737
+ ])
738
+
739
+ def _save_open_text_csv(self, metadata_list: List[Dict], output_path: Path):
740
+ """Save open-text format CSV."""
741
+ with open(output_path, 'w', newline='') as f:
742
+ writer = csv.writer(f)
743
+ writer.writerow([
744
+ 'question', 'id', 'audio_path', 'answer',
745
+ 'question_type', 'max_clips', 'n_sources',
746
+ 'target_category', 'effective_durations'
747
+ ])
748
+
749
+ for meta in metadata_list:
750
+ writer.writerow([
751
+ meta['open_text_question'],
752
+ meta['id'],
753
+ meta['audio_path'],
754
+ meta['open_text_answer'],
755
+ meta['question_type'],
756
+ meta['max_clips'],
757
+ meta['n_unique_sources'],
758
+ meta['target_category'],
759
+ str(meta['total_effective_durations'])
760
+ ])
761
+
762
+ def _save_metadata_csv(self, metadata_list: List[Dict], output_path: Path):
763
+ """Save detailed metadata CSV with effective durations and timestamps."""
764
+ with open(output_path, 'w', newline='') as f:
765
+ writer = csv.writer(f)
766
+ writer.writerow([
767
+ 'id', 'audio_path', 'question_type', 'max_clips', 'n_sources',
768
+ 'target_category', 'present_categories', 'source_order',
769
+ 'slot_distribution', 'effective_durations_per_clip',
770
+ 'total_effective_durations', 'gap_satisfied', 'multiplier_used',
771
+ 'target_duration_s', 'actual_duration_s', 'clip_timestamps', 'files_used'
772
+ ])
773
+
774
+ for meta in metadata_list:
775
+ writer.writerow([
776
+ meta['id'],
777
+ meta['audio_path'],
778
+ meta['question_type'],
779
+ meta['max_clips'],
780
+ meta['n_unique_sources'],
781
+ meta['target_category'],
782
+ str(meta['present_categories']),
783
+ str(meta['source_order']),
784
+ str(meta['slot_distribution']),
785
+ str(meta['effective_durations_per_clip']),
786
+ str(meta['total_effective_durations']),
787
+ meta['gap_satisfied'],
788
+ meta['multiplier_used'],
789
+ round(meta['target_duration_s'], 2),
790
+ round(meta['actual_duration_s'], 2),
791
+ meta.get('timestamp_string', ''),
792
+ str(meta['files_used'])
793
+ ])
794
+
795
+
796
+ def main(config_path: str = None):
797
+ """Main entry point for duration task generation."""
798
+ import yaml
799
+
800
+ if config_path is None:
801
+ config_path = Path(__file__).parent.parent / 'config.yaml'
802
+
803
+ with open(config_path, 'r') as f:
804
+ config = yaml.safe_load(f)
805
+
806
+ set_random_seed(config['random_seed'])
807
+
808
+ logger = setup_logger(
809
+ 'duration_task',
810
+ log_file=str(Path(config['output']['base_path']) / config['logging']['log_file']),
811
+ level=config['logging']['level'],
812
+ console_output=config['logging']['console_output']
813
+ )
814
+
815
+ generator = DurationTaskGenerator(config, logger)
816
+ generator.generate_dataset()
817
+
818
+
819
+ if __name__ == '__main__':
820
+ main()
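Both generators lean on `generate_sample_durations_for_task` returning per-sample durations that sum exactly to the task budget. A deliberately simplified stand-in for that contract (this is NOT the repo's implementation, just the shape the generators assume):

```python
import random

# Simplified stand-in: draw uniform durations until the running sum reaches
# the target, trimming the final draw so the total is exact.
def sample_durations(task_hours: float, min_s: float, max_s: float):
    target = task_hours * 3600.0
    durations, total = [], 0.0
    while total < target:
        d = min(random.uniform(min_s, max_s), target - total)
        durations.append(d)
        total += d
    return durations

ds = sample_durations(0.01, 30.0, 60.0)    # 36s of audio
print(len(ds), round(sum(ds), 6))          # sums to exactly 36.0
```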
tasks/task_order.py ADDED
@@ -0,0 +1,598 @@
1
+ """
2
+ Task 3: Order - Generate temporal ordering questions
3
+
4
+ This task joins multiple audio sources and asks questions about their temporal order
5
+ (first, last, what comes after, what comes before).
6
+ """
7
+
8
+ import csv
9
+ import random
10
+ import math
11
+ from pathlib import Path
12
+ from typing import Dict, List
13
+
14
+ import sys
15
+ sys.path.append(str(Path(__file__).parent.parent))
16
+
17
+ from utils import (
18
+ AudioProcessor, ESC50Dataset, QuestionGenerator, LLMQuestionGenerator,
19
+ setup_logger, set_random_seed, calculate_num_samples_for_task,
20
+ generate_single_clip_duration, get_max_clip_num_to_be_joined,
21
+ build_clip_sequence_with_silences, generate_sample_durations_for_task
22
+ )
23
+
24
+
25
+ class OrderTaskGenerator:
26
+ """Generator for temporal ordering task dataset."""
27
+
28
+ def __init__(self, config: Dict, logger):
29
+ """
30
+ Initialize order task generator.
31
+
32
+ Args:
33
+ config: Configuration dictionary
34
+ logger: Logger instance
35
+ """
36
+ self.config = config
37
+ self.logger = logger
38
+ self.task_config = config['tasks']['order']
39
+
40
+ # Initialize components
41
+ self.dataset = ESC50Dataset(
42
+ config['esc50']['metadata_path'],
43
+ config['esc50']['audio_path'],
44
+ config # Pass config for class subset loading
45
+ )
46
+ self.audio_processor = AudioProcessor(
47
+ crossfade_duration=config['audio']['crossfade_duration'],
48
+ silence_duration=config['audio']['silence_duration'],
49
+ with_silence=config['audio']['with_silence'],
50
+ normalize=config['audio']['normalize'],
51
+ normalize_target_dBFS=config['audio']['normalize_target_dBFS'],
52
+ synthetic_silence_path=config['synthetic_silence']['path']
53
+ )
54
+ self.question_generator = QuestionGenerator(
55
+ num_options=config['mcq']['num_options'],
56
+ option_labels=config['mcq']['option_labels'],
57
+ distractor_strategy=config['mcq']['distractor_strategy']
58
+ )
59
+
60
+ # Initialize LLM question generator
61
+ self.llm_enabled = config.get('llm', {}).get('enabled', False)
62
+ self.llm_generator = LLMQuestionGenerator(
63
+ enabled=self.llm_enabled,
64
+ template_questions=self.task_config
65
+ )
66
+
67
+ # Duration settings from config
68
+ self.min_clip_duration = config['audio']['min_clip_duration']
69
+ self.max_clip_duration = config['audio']['max_clip_duration']
70
+ # Duration of individual source clips (ESC-50 default is 5s)
71
+ self.source_clip_duration = config['audio'].get('source_clip_duration', 5.0)
72
+ self.min_silence_ms = config['audio'].get('min_silence_duration', 100)
73
+ self.max_extra_silence_per_gap_ms = config['audio'].get('max_extra_silence_per_gap', 500)
74
+ self.crossfade_ms = config['audio'].get('crossfade_duration', 0)
75
+ self.task_duration_hours = self.task_config['task_duration_size']
76
+
77
+ # Order task specific settings
78
+ self.allow_source_repetition = self.task_config.get('allow_source_repetition', False)
79
+ self.min_clips_for_second = self.task_config.get('min_clips_for_second_questions', 4)
80
+
81
+ # Set up output paths
82
+ self.output_base = Path(config['output']['base_path']) / 'order'
83
+ self.output_base.mkdir(parents=True, exist_ok=True)
84
+ self.audio_output = self.output_base / 'audios'
85
+ self.audio_output.mkdir(parents=True, exist_ok=True)
86
+
87
+ def _get_valid_question_types(self, n_clips: int) -> List[str]:
88
+ """
89
+ Get question types valid for the given number of clips.
90
+
91
+ "second" and "second_last" require at least min_clips_for_second clips.
92
+
93
+ Args:
94
+ n_clips: Number of clips in the sample
95
+
96
+ Returns:
97
+ List of valid question types
98
+ """
99
+ all_types = self.task_config['question_types']
100
+
101
+ # Filter based on n_clips
102
+ valid_types = []
103
+ for qtype in all_types:
104
+ if qtype in ['second', 'second_last']:
105
+ if n_clips >= self.min_clips_for_second:
106
+ valid_types.append(qtype)
107
+ elif qtype in ['after', 'before']:
108
+ if n_clips >= 2:
109
+ valid_types.append(qtype)
110
+ else: # first, last
111
+ valid_types.append(qtype)
112
+
113
+ return valid_types if valid_types else ['first', 'last']
114
+
115
+ def generate_sample(self, sample_id: int, target_question_type: str = None, target_duration_seconds: float = None) -> Dict:
116
+ """
117
+ Generate a single order task sample.
118
+
119
+ Pipeline: pick dataset -> pick classes -> pick audio clips -> get target duration ->
120
+ compute how many clips fit (floor division, with the remainder absorbed as
121
+ randomly inserted silences) -> concatenate the clips.
122
+
123
+ Args:
124
+ sample_id: Sample ID number
125
+ target_question_type: Target question type for balanced distribution
126
+ target_duration_seconds: Pre-generated target duration (from generate_sample_durations_for_task)
127
+
128
+ Returns:
129
+ Dictionary with sample metadata
130
+ """
131
+ # Use pre-generated duration or generate one (backward compatibility)
132
+ if target_duration_seconds is not None:
133
+ clip_duration_seconds = target_duration_seconds
134
+ else:
135
+ clip_duration_seconds = generate_single_clip_duration(
136
+ self.min_clip_duration,
137
+ self.max_clip_duration
138
+ )
139
+
140
+ # Calculate how many clips we need using the new helper
141
+ max_clips, remainder_seconds = get_max_clip_num_to_be_joined(
142
+ clip_duration_seconds,
143
+ self.source_clip_duration,
144
+ self.min_silence_ms
145
+ )
146
+
147
+ max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10)
148
+
149
+ # Silence reduction strategy: subsample from [max(2, max_clips-3), min(max_clips, max_clips_per_sample)]
150
+ # This ensures we use close to max_clips that fit, reducing excessive silence
151
+
152
+ # Calculate valid range for this sample's duration
153
+ min_clips_for_sample = max(2, max_clips - 3) # At least 2, preferably max_clips-3
154
+ max_clips_for_sample = min(max_clips, max_clips_per_sample, len(self.dataset.CATEGORIES))
155
+
156
+ # Validate range
157
+ if max_clips_for_sample < 2:
158
+ raise ValueError(
159
+ f"Sample {sample_id}: Cannot generate order task - need at least 2 clips. "
160
+ f"max_clips={max_clips}, max_clips_per_sample={max_clips_per_sample}, "
161
+ f"duration={clip_duration_seconds:.1f}s. Increase min_clip_duration."
162
+ )
163
+
164
+ if min_clips_for_sample > max_clips_for_sample:
165
+ raise ValueError(
166
+ f"Sample {sample_id}: Invalid clip range - min_clips ({min_clips_for_sample}) > max_clips ({max_clips_for_sample}). "
167
+ f"max_clips={max_clips}, max_clips_per_sample={max_clips_per_sample}, duration={clip_duration_seconds:.1f}s"
168
+ )
169
+
170
+ # Randomly select from valid range (NO balanced pool for order task)
171
+ n_clips = random.randint(min_clips_for_sample, max_clips_for_sample)
172
+
173
+ # Get valid question types for this n_clips
174
+ valid_question_types = self._get_valid_question_types(n_clips)
175
+
176
+ if not valid_question_types:
177
+ raise ValueError(
178
+ f"Sample {sample_id}: No valid question types for n_clips={n_clips}. "
179
+ f"This should not happen - check _get_valid_question_types implementation."
180
+ )
181
+
182
+ # Pre-select question type to determine answer position
183
+ if target_question_type is not None:
184
+ if target_question_type not in valid_question_types:
185
+ raise ValueError(
186
+ f"Sample {sample_id}: target_question_type='{target_question_type}' not valid for n_clips={n_clips}. "
187
+ f"Valid types: {valid_question_types}. Balanced distribution should only assign valid types."
188
+ )
189
+ question_type = target_question_type
190
+ else:
191
+ question_type = random.choice(valid_question_types)
192
+
193
+ # Determine answer position based on question type
194
+ if question_type == 'first':
195
+ answer_position = 0
196
+ elif question_type == 'last':
197
+ answer_position = n_clips - 1
198
+ elif question_type == 'second':
199
+ answer_position = 1 # 0-indexed, so position 1 is second
200
+ elif question_type == 'second_last':
201
+ answer_position = n_clips - 2 # Second to last
202
+ elif question_type == 'after':
203
+ # Answer is after a reference, so position 1 to n-1
204
+ answer_position = random.randint(1, n_clips - 1) if n_clips >= 2 else 0
205
+ else: # before
206
+ # Answer is before a reference, so position 0 to n-2
207
+ answer_position = random.randint(0, n_clips - 2) if n_clips >= 2 else 0
208
+
209
+ # Select answer category from least-used categories
210
+ answer_category = self.dataset.get_least_used_categories(1)[0]
211
+
212
+ # Sample remaining categories, ensuring balanced distribution
213
+ if n_clips <= len(self.dataset.CATEGORIES):
214
+ other_categories = self.dataset.get_least_used_categories(
215
+ n_clips - 1,
216
+ exclude=[answer_category]
217
+ )
218
+ else:
219
+ # Need more clips than unique categories - sample with some repetition
220
+ other_categories = self.dataset.get_least_used_categories(
221
+ min(n_clips - 1, len(self.dataset.CATEGORIES) - 1),
222
+ exclude=[answer_category]
223
+ )
224
+ # Add random repetitions if needed
225
+ while len(other_categories) < n_clips - 1:
226
+ other_categories.append(random.choice(self.dataset.CATEGORIES))
227
+
228
+ # Arrange categories with answer at correct position
229
+ selected_categories = []
230
+ other_idx = 0
231
+ for i in range(n_clips):
232
+ if i == answer_position:
233
+ selected_categories.append(answer_category)
234
+ else:
235
+ selected_categories.append(other_categories[other_idx])
236
+ other_idx += 1
237
+
238
+ # Track usage of answer category
239
+ self.dataset.category_usage_counts[answer_category] += 1
240
+
241
+ # Sample one file from each category and load audio
242
+ audio_segments = []
243
+ filenames_list = []
244
+
245
+ for category in selected_categories:
246
+ filename, filepath = self.dataset.sample_file_from_category(category)
247
+ audio = self.audio_processor.load_audio(filepath)
248
+ audio_segments.append(audio)
249
+ filenames_list.append(filename)
250
+
251
+ # Build final audio with guaranteed silences between clips
252
+ output_audio_path = self.audio_output / f"{sample_id}.wav"
253
+ final_audio = build_clip_sequence_with_silences(
254
+ audio_segments,
255
+ clip_duration_seconds,
256
+ min_silence_ms=self.min_silence_ms,
257
+ max_extra_silence_per_gap_ms=self.max_extra_silence_per_gap_ms,
258
+ crossfade_ms=self.crossfade_ms
259
+ )
260
+
261
+ # Save the audio
262
+ final_audio.export(str(output_audio_path), format="wav")
263
+
264
+ # Determine correct answer and generate questions based on question type
265
+ # CRITICAL BUG FIX: Verify answer_category is actually at answer_position
266
+ if selected_categories[answer_position] != answer_category:
267
+ self.logger.error(f"Sample {sample_id}: Answer mismatch! Expected {answer_category} at position {answer_position}, got {selected_categories[answer_position]}")
268
+ # Force correct by using actual category at answer_position
269
+ correct_category = selected_categories[answer_position]
270
+ else:
271
+ correct_category = answer_category
272
+
273
+ if question_type == 'first':
274
+ mcq_question = self.task_config['mcq_questions']['first']
275
+ open_text_question = self.task_config['open_text_questions']['first']
276
+
277
+ elif question_type == 'last':
278
+ mcq_question = self.task_config['mcq_questions']['last']
279
+ open_text_question = self.task_config['open_text_questions']['last']
280
+
281
+ elif question_type == 'second':
282
+ mcq_question = self.task_config['mcq_questions']['second']
283
+ open_text_question = self.task_config['open_text_questions']['second']
284
+
285
+ elif question_type == 'second_last':
286
+ mcq_question = self.task_config['mcq_questions']['second_last']
287
+ open_text_question = self.task_config['open_text_questions']['second_last']
288
+
289
+ elif question_type == 'after':
290
+ # Reference is the sound before answer_position
291
+ if answer_position > 0:
292
+ reference_category = selected_categories[answer_position - 1]
293
+ mcq_question = self.task_config['mcq_questions']['after'].format(sound1=reference_category)
294
+ open_text_question = self.task_config['open_text_questions']['after'].format(sound1=reference_category)
295
+ else:
296
+ # Fallback shouldn't happen but handle gracefully
297
+ mcq_question = self.task_config['mcq_questions']['first']
298
+ open_text_question = self.task_config['open_text_questions']['first']
299
+
300
+ else: # before
301
+ # Reference is the sound after answer_position
302
+ if answer_position < n_clips - 1:
303
+ reference_category = selected_categories[answer_position + 1]
304
+ mcq_question = self.task_config['mcq_questions']['before'].format(sound2=reference_category)
305
+ open_text_question = self.task_config['open_text_questions']['before'].format(sound2=reference_category)
306
+ else:
307
+ # Defensive fallback (unreachable for n_clips >= 2, since answer_position <= n_clips - 2)
308
+ correct_category = selected_categories[0]
309
+ mcq_question = self.task_config['mcq_questions']['first']
310
+ open_text_question = self.task_config['open_text_questions']['first']
311
+ question_type = 'first'
312
+
313
+ # Generate MCQ
314
+ mcq_data = self.question_generator.generate_category_mcq(
315
+ mcq_question,
316
+ correct_category,
317
+ selected_categories,
318
+ self.dataset.CATEGORIES
319
+ )
320
+
321
+ # Generate open-text question
322
+ open_text_data = self.question_generator.generate_category_open_text(
323
+ open_text_question,
324
+ correct_category
325
+ )
326
+
327
+ # Also generate a sequence question for open-text
328
+ sequence_question = self.task_config['open_text_questions']['sequence']
329
+ sequence_data = self.question_generator.generate_sequence_open_text(
330
+ sequence_question,
331
+ selected_categories
332
+ )
333
+
334
+ # Create metadata
335
+ metadata = {
336
+ 'id': sample_id,
337
+ 'audio_path': str(output_audio_path.relative_to(self.output_base.parent)),
338
+ 'n_clips': n_clips,
339
+ 'question_type': question_type,
340
+ 'audio_sequence': selected_categories,
341
+ 'correct_answer_category': correct_category,
342
+ 'source_files': filenames_list,
343
+ 'mcq_question': mcq_data['question'],
344
+ 'mcq_options': mcq_data['options'],
345
+ 'mcq_correct_answer': mcq_data['correct_answer'],
346
+ 'open_text_question': open_text_data['question'],
347
+ 'open_text_answer': open_text_data['correct_answer'],
348
+ 'sequence_question': sequence_data['question'],
349
+ 'sequence_answer': sequence_data['correct_answer']
350
+ }
351
+
352
+ self.logger.info(f"Generated order sample {sample_id}: {question_type}, {n_clips} clips")
353
+
354
+ return metadata
355
+
356
+ def generate_dataset(self) -> tuple:
357
+ """
358
+ Generate the complete order task dataset.
359
+
360
+ Uses generate_sample_durations_for_task() to pre-generate exact sample durations
361
+ that sum to exactly the target task duration. This guarantees:
362
+ - Exact coverage of target duration
363
+ - No estimation errors from average-based calculation
364
+
365
+ Returns:
366
+ Tuple of (mcq_csv_path, open_text_csv_path, sequence_csv_path)
367
+ """
368
+ # Generate sample durations upfront (guarantees exact total duration)
369
+ sample_durations = generate_sample_durations_for_task(
370
+ self.task_duration_hours,
371
+ self.min_clip_duration,
372
+ self.max_clip_duration
373
+ )
374
+ num_samples = len(sample_durations)
375
+
376
+ self.logger.info(f"Generating {num_samples} order task samples (target: {self.task_duration_hours}h, exact fill)...")
377
+
378
+ # Calculate effective max clips each sample can use (accounting for silence reduction)
379
+ # This matches the logic in generate_sample()
380
+ max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10)
381
+ sample_effective_max_clips = []
382
+
383
+ for duration in sample_durations:
384
+ max_clips, _ = get_max_clip_num_to_be_joined(
385
+ duration,
386
+ self.source_clip_duration,
387
+ self.min_silence_ms
388
+ )
389
+ # Apply the same constraints as generate_sample()
390
+ effective_max = min(max_clips, max_clips_per_sample, len(self.dataset.CATEGORIES))
391
+ sample_effective_max_clips.append(effective_max)
392
+
393
+ # Create capacity-aware balanced question type distribution
394
+ # Categorize question types by clip requirements
395
+ question_types = self.task_config['question_types']
396
+
397
+ # Separate into tiers based on clip requirements
398
+ basic_types = ['first', 'last', 'after', 'before'] # Need >= 2 clips
399
+ advanced_types = ['second', 'second_last'] # Need >= min_clips_for_second
400
+
401
+ # Count how many samples can support each tier (use effective max, not raw max)
402
+ samples_for_basic = sum(1 for emc in sample_effective_max_clips if emc >= 2)
403
+ samples_for_advanced = sum(1 for emc in sample_effective_max_clips if emc >= self.min_clips_for_second)
404
+
405
+ # Create list of (sample_idx, duration, effective_max_clips)
406
+ sample_info = [(i, sample_durations[i], sample_effective_max_clips[i]) for i in range(num_samples)]
407
+
408
+ # Sort by capacity (descending) - assign advanced types to high-capacity samples
409
+ sample_info.sort(key=lambda x: x[2], reverse=True)
410
+
411
+ # Calculate distribution: prefer advanced types for longer clips
412
+ samples_per_type = num_samples // len(question_types)
413
+ remainder = num_samples % len(question_types)
414
+
415
+ # Build assignment pool - advanced types first (for high-capacity samples)
416
+ assignment_pool = []
417
+ for qtype in advanced_types:
418
+ count = samples_per_type + (1 if remainder > 0 else 0)
419
+ assignment_pool.extend([qtype] * count)
420
+ remainder = max(0, remainder - 1)
421
+
422
+ for qtype in basic_types:
423
+ count = samples_per_type + (1 if remainder > 0 else 0)
424
+ assignment_pool.extend([qtype] * count)
425
+ remainder = max(0, remainder - 1)
426
+
427
+ # Assign question types based on capacity
428
+ balanced_assignments = [None] * num_samples
429
+
430
+ for idx, (sample_idx, duration, capacity) in enumerate(sample_info):
431
+ target_qtype = assignment_pool[idx]
432
+
433
+ # Validate and adjust if needed
434
+ valid_types = self._get_valid_question_types(capacity)
435
+
436
+ if target_qtype not in valid_types:
437
+ # Assign a valid alternative - prefer similar types
438
+ if target_qtype in advanced_types and any(t in valid_types for t in basic_types):
439
+ # Downgrade to basic type
440
+ target_qtype = random.choice([t for t in basic_types if t in valid_types])
441
+ else:
442
+ # Fallback to any valid type
443
+ target_qtype = random.choice(valid_types)
444
+
445
+ balanced_assignments[sample_idx] = target_qtype
446
+
447
+ # Log the actual distribution after capacity-aware assignment
448
+ from collections import Counter
449
+ type_dist = Counter(balanced_assignments)
450
+ self.logger.info(f"Balanced question type distribution (after capacity-aware assignment): {dict(sorted(type_dist.items()))}")
451
+
452
+ all_metadata = []
453
+
454
+ for i, target_duration in enumerate(sample_durations):
455
+ metadata = self.generate_sample(i, target_question_type=balanced_assignments[i], target_duration_seconds=target_duration)
456
+ all_metadata.append(metadata)
+
+ # Save MCQ CSV
457
+ mcq_csv_path = self.output_base / 'order_mcq.csv'
458
+ self._save_mcq_csv(all_metadata, mcq_csv_path)
459
+
460
+ # Save open-text CSV
461
+ open_text_csv_path = self.output_base / 'order_open_text.csv'
462
+ self._save_open_text_csv(all_metadata, open_text_csv_path)
463
+
464
+ # Save sequence CSV
465
+ sequence_csv_path = self.output_base / 'order_sequence.csv'
466
+ self._save_sequence_csv(all_metadata, sequence_csv_path)
467
+
468
+ # Save metadata CSV
469
+ metadata_csv_path = self.output_base / 'order_metadata.csv'
470
+ self._save_metadata_csv(all_metadata, metadata_csv_path)
471
+
472
+ self.logger.info(f"Order task dataset generation complete!")
473
+ self.logger.info(f" - MCQ CSV: {mcq_csv_path}")
474
+ self.logger.info(f" - Open-text CSV: {open_text_csv_path}")
475
+ self.logger.info(f" - Sequence CSV: {sequence_csv_path}")
476
+ self.logger.info(f" - Metadata CSV: {metadata_csv_path}")
477
+ self.logger.info(f" - Audio files: {self.audio_output}")
478
+
479
+ return mcq_csv_path, open_text_csv_path, sequence_csv_path
480
+
481
+ def _save_mcq_csv(self, metadata_list: List[Dict], output_path: Path):
482
+ """Save MCQ format CSV."""
483
+ with open(output_path, 'w', newline='') as f:
484
+ writer = csv.writer(f)
485
+ # Header
486
+ writer.writerow([
487
+ 'question', 'id', 'audio_path',
488
+ 'optionA', 'optionB', 'optionC', 'optionD',
489
+ 'correct', 'question_type', 'audio_sequence'
490
+ ])
491
+
492
+ # Data rows
493
+ for meta in metadata_list:
494
+ writer.writerow([
495
+ meta['mcq_question'],
496
+ meta['id'],
497
+ meta['audio_path'],
498
+ meta['mcq_options']['A'],
499
+ meta['mcq_options']['B'],
500
+ meta['mcq_options']['C'],
501
+ meta['mcq_options']['D'],
502
+ meta['mcq_correct_answer'],
503
+ meta['question_type'],
504
+ str(meta['audio_sequence'])
505
+ ])
506
+
507
+ def _save_open_text_csv(self, metadata_list: List[Dict], output_path: Path):
508
+ """Save open-text format CSV."""
509
+ with open(output_path, 'w', newline='') as f:
510
+ writer = csv.writer(f)
511
+ # Header
512
+ writer.writerow([
513
+ 'question', 'id', 'audio_path', 'answer',
514
+ 'question_type', 'audio_sequence'
515
+ ])
516
+
517
+ # Data rows
518
+ for meta in metadata_list:
519
+ writer.writerow([
520
+ meta['open_text_question'],
521
+ meta['id'],
522
+ meta['audio_path'],
523
+ meta['open_text_answer'],
524
+ meta['question_type'],
525
+ str(meta['audio_sequence'])
526
+ ])
527
+
528
+ def _save_sequence_csv(self, metadata_list: List[Dict], output_path: Path):
529
+ """Save sequence question CSV."""
530
+ with open(output_path, 'w', newline='') as f:
531
+ writer = csv.writer(f)
532
+ # Header
533
+ writer.writerow([
534
+ 'question', 'id', 'audio_path', 'answer', 'audio_sequence'
535
+ ])
536
+
537
+ # Data rows
538
+ for meta in metadata_list:
539
+ writer.writerow([
540
+ meta['sequence_question'],
541
+ meta['id'],
542
+ meta['audio_path'],
543
+ meta['sequence_answer'],
544
+ str(meta['audio_sequence'])
545
+ ])
546
+
547
+ def _save_metadata_csv(self, metadata_list: List[Dict], output_path: Path):
548
+ """Save detailed metadata CSV."""
549
+ with open(output_path, 'w', newline='') as f:
550
+ writer = csv.writer(f)
551
+ # Header
552
+ writer.writerow([
553
+ 'id', 'audio_path', 'n_clips', 'question_type',
554
+ 'audio_sequence', 'correct_answer', 'source_files'
555
+ ])
556
+
557
+ # Data rows
558
+ for meta in metadata_list:
559
+ writer.writerow([
560
+ meta['id'],
561
+ meta['audio_path'],
562
+ meta['n_clips'],
563
+ meta['question_type'],
564
+ str(meta['audio_sequence']),
565
+ meta['correct_answer_category'],
566
+ str(meta['source_files'])
567
+ ])
568
+
569
+
570
+ def main(config_path: str = None):
571
+ """Main entry point for order task generation."""
572
+ import yaml
573
+
574
+ # Load configuration
575
+ if config_path is None:
576
+ config_path = Path(__file__).parent.parent / 'config.yaml'
577
+
578
+ with open(config_path, 'r') as f:
579
+ config = yaml.safe_load(f)
580
+
581
+ # Set random seed
582
+ set_random_seed(config['random_seed'])
583
+
584
+ # Setup logger
585
+ logger = setup_logger(
586
+ 'order_task',
587
+ log_file=str(Path(config['output']['base_path']) / config['logging']['log_file']),
588
+ level=config['logging']['level'],
589
+ console_output=config['logging']['console_output']
590
+ )
591
+
592
+ # Generate dataset
593
+ generator = OrderTaskGenerator(config, logger)
594
+ generator.generate_dataset()
595
+
596
+
597
+ if __name__ == '__main__':
598
+ main()
tasks/task_volume.py ADDED
@@ -0,0 +1,732 @@
1
+ """
2
+ Task 4: Volume - Generate volume comparison questions
3
+
4
+ This task joins multiple audio sources with different volume levels
5
+ and asks questions about the loudest or softest sound.
6
+ """
7
+
8
+ import csv
9
+ import random
10
+ import math
11
+ from pathlib import Path
12
+ from typing import Dict, List, Tuple, Optional
13
+
14
+ import sys
15
+ sys.path.append(str(Path(__file__).parent.parent))
16
+
17
+ from utils import (
18
+ AudioProcessor, ESC50Dataset, QuestionGenerator, LLMQuestionGenerator,
19
+ setup_logger, set_random_seed, calculate_num_samples_for_task,
20
+ generate_single_clip_duration, get_max_clip_num_to_be_joined,
21
+ build_clip_sequence_with_silences, generate_sample_durations_for_task,
22
+ get_lufs_loudness, normalize_to_lufs
23
+ )
24
+
25
+
26
+ class VolumeTaskGenerator:
27
+ """Generator for volume comparison task dataset."""
28
+
29
+ def __init__(self, config: Dict, logger):
30
+ """
31
+ Initialize volume task generator.
32
+
33
+ Args:
34
+ config: Configuration dictionary
35
+ logger: Logger instance
36
+ """
37
+ self.config = config
38
+ self.logger = logger
39
+ self.task_config = config['tasks']['volume']
40
+
41
+ # Initialize components
42
+ self.dataset = ESC50Dataset(
43
+ config['esc50']['metadata_path'],
44
+ config['esc50']['audio_path'],
45
+ config # Pass config for class subset loading
46
+ )
47
+ self.audio_processor = AudioProcessor(
48
+ crossfade_duration=config['audio']['crossfade_duration'],
49
+ silence_duration=config['audio']['silence_duration'],
50
+ with_silence=config['audio']['with_silence'],
51
+ normalize=config['audio']['normalize'],
52
+ normalize_target_dBFS=config['audio']['normalize_target_dBFS'],
53
+ synthetic_silence_path=config['synthetic_silence']['path']
54
+ )
55
+ self.question_generator = QuestionGenerator(
56
+ num_options=config['mcq']['num_options'],
57
+ option_labels=config['mcq']['option_labels'],
58
+ distractor_strategy=config['mcq']['distractor_strategy']
59
+ )
60
+
61
+ # Initialize LLM question generator
62
+ self.llm_enabled = config.get('llm', {}).get('enabled', False)
63
+ self.llm_generator = LLMQuestionGenerator(
64
+ enabled=self.llm_enabled,
65
+ template_questions=self.task_config
66
+ )
67
+
68
+ # Duration settings from config
69
+ self.min_clip_duration = config['audio']['min_clip_duration']
70
+ self.max_clip_duration = config['audio']['max_clip_duration']
71
+ # Duration of individual source clips (ESC-50 default is 5s)
72
+ self.source_clip_duration = config['audio'].get('source_clip_duration', 5.0)
73
+ self.min_silence_ms = config['audio'].get('min_silence_duration', 100)
74
+ self.max_extra_silence_per_gap_ms = config['audio'].get('max_extra_silence_per_gap', 500)
75
+ self.crossfade_ms = config['audio'].get('crossfade_duration', 0)
76
+ self.task_duration_hours = self.task_config['task_duration_size']
77
+
78
+ # Volume task specific settings
79
+ self.normalize_to_baseline = self.task_config.get('normalize_to_baseline', True)
80
+ self.baseline_dBFS = self.task_config.get('baseline_dBFS', -20.0)
81
+ self.use_same_clip_different_volumes = self.task_config.get('use_same_clip_different_volumes', False)
82
+ self.repetitions_per_source = self.task_config.get('repetitions_per_source', [2, 3, 4])
83
+ if isinstance(self.repetitions_per_source, int):
84
+ self.repetitions_per_source = [self.repetitions_per_source]
85
+
86
+ # Volume gap multipliers (similar to duration task)
87
+ self.multiplier_max_loudness = self.task_config.get('multiplier_max_loudness', 1.5)
88
+ self.multiplier_min_loudness = self.task_config.get('multiplier_min_loudness', 0.5)
89
+ self.reject_if_gap_not_met = self.task_config.get('reject_if_gap_not_met', True)
90
+
91
+ # LUFS vs dBFS loudness measurement option
92
+ # LUFS (Loudness Units Full Scale) measures PERCEIVED loudness
93
+ # dBFS measures RMS amplitude - does NOT account for frequency sensitivity
94
+ # LUFS is recommended for comparing different sound types
95
+ self.use_lufs = self.task_config.get('use_lufs', True)
96
+ self.baseline_lufs = self.task_config.get('baseline_lufs', -23.0) # EBU R128 standard
97
+
98
+ # Set up output paths
99
+ self.output_base = Path(config['output']['base_path']) / 'volume'
100
+ self.output_base.mkdir(parents=True, exist_ok=True)
101
+ self.audio_output = self.output_base / 'audios'
102
+ self.audio_output.mkdir(parents=True, exist_ok=True)
103
+
104
+ # Create balanced sampling pool for num_clips
105
+ self.clips_count_pool = []
106
+
107
+ def _normalize_to_baseline(self, audio: "AudioSegment") -> "AudioSegment":
108
+ """
109
+ Normalize audio to the baseline loudness level.
110
+
111
+ Uses LUFS (perceived loudness) if use_lufs=True, otherwise dBFS.
112
+ This ensures all clips start from the same perceived loudness before
113
+ applying volume adjustments.
114
+
115
+ Args:
116
+ audio: Input audio segment
117
+
118
+ Returns:
119
+ Normalized audio segment
120
+ """
121
+ if not self.normalize_to_baseline:
122
+ return audio
123
+
124
+ if self.use_lufs:
125
+ # Use LUFS-based normalization (perceived loudness)
126
+ normalized = normalize_to_lufs(audio, self.baseline_lufs)
127
+ self.logger.debug(
128
+ f"Normalized to baseline LUFS: {get_lufs_loudness(audio):.2f} -> {get_lufs_loudness(normalized):.2f} LUFS"
129
+ )
130
+ return normalized
131
+ else:
132
+ # Use dBFS normalization (RMS amplitude)
133
+ change_in_dBFS = self.baseline_dBFS - audio.dBFS
134
+ normalized = audio.apply_gain(change_in_dBFS)
135
+ self.logger.debug(
136
+ f"Normalized to baseline dBFS: {audio.dBFS:.2f} -> {normalized.dBFS:.2f} dBFS"
137
+ )
138
+ return normalized
139
+
140
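Under the dBFS branch this is just a gain offset; a standalone pydub sketch (assuming pydub is installed; the -20 dBFS target mirrors the config default):

```python
from pydub.generators import Sine

# Sketch of dBFS baseline normalization: apply the gain that lands on the target.
audio = Sine(440).to_audio_segment(duration=1000)   # 1s test tone
target_dBFS = -20.0
normalized = audio.apply_gain(target_dBFS - audio.dBFS)
print(round(normalized.dBFS, 1))   # -20.0
```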
+ def _get_amplitude_loudness(self, audio: "AudioSegment") -> float:
141
+ """
142
+ Get the loudness of an audio clip.
143
+
144
+ Uses LUFS (perceived loudness) if use_lufs=True, otherwise dBFS.
145
+
146
+ Args:
147
+ audio: Input audio segment
148
+
149
+ Returns:
150
+ Loudness in LUFS or dBFS depending on configuration
151
+ """
152
+ if self.use_lufs:
153
+ return get_lufs_loudness(audio)
154
+ else:
155
+ return audio.dBFS
156
+
157
+ def _verify_loudness_gap(
158
+ self,
159
+ volume_levels: List[float],
160
+ question_type: str
161
+ ) -> Tuple[bool, int, Dict]:
162
+ """
163
+ Verify that loudness gap constraint is satisfied.
164
+
165
+ For MAX_LOUDNESS: max_volume >= second_max × multiplier_max
166
+ For MIN_LOUDNESS: min_volume <= second_min × multiplier_min
167
+
168
+ Since we work with dB (logarithmic), the gap is in dB difference:
169
+ - For max: max_dB - second_max_dB >= required_gap_dB
170
+ - For min: second_min_dB - min_dB >= required_gap_dB
171
+
172
+ The multiplier translates to dB: 1.5x linear = ~3.5dB, 2x = ~6dB
173
+
174
+ Args:
175
+ volume_levels: List of volume adjustments in dB
176
+ question_type: "max_loudness" or "min_loudness"
177
+
178
+ Returns:
179
+ Tuple of (gap_satisfied, answer_idx, metadata)
180
+ """
181
+ import math
182
+
183
+ sorted_levels = sorted(volume_levels, reverse=True) # Highest first
184
+
185
+ if question_type == "max_loudness":
186
+ max_level = sorted_levels[0]
187
+ second_max = sorted_levels[1] if len(sorted_levels) > 1 else sorted_levels[0]
188
+
189
+ # Convert multiplier to dB difference
190
+ # multiplier 1.5 means 1.5x louder in amplitude = 20*log10(1.5) ≈ 3.5 dB
191
+ required_gap_dB = 20 * math.log10(self.multiplier_max_loudness)
192
+ actual_gap_dB = max_level - second_max
193
+
194
+ gap_satisfied = actual_gap_dB >= required_gap_dB
195
+ answer_idx = volume_levels.index(max_level)
196
+
197
+ metadata = {
198
+ 'max_level_dB': max_level,
199
+ 'second_max_dB': second_max,
200
+ 'required_gap_dB': required_gap_dB,
201
+ 'actual_gap_dB': actual_gap_dB,
202
+ 'multiplier': self.multiplier_max_loudness
203
+ }
204
+
205
+ else: # min_loudness
206
+ min_level = sorted_levels[-1]
207
+ second_min = sorted_levels[-2] if len(sorted_levels) > 1 else sorted_levels[-1]
208
+
209
+ # For min, we want min to be multiplier times softer
210
+ # multiplier 0.5 means 0.5x amplitude = 20*log10(0.5) ≈ -6 dB
211
+ # So second_min - min_level should be >= 6 dB
212
+ required_gap_dB = abs(20 * math.log10(self.multiplier_min_loudness))
213
+ actual_gap_dB = second_min - min_level
214
+
215
+ gap_satisfied = actual_gap_dB >= required_gap_dB
216
+ answer_idx = volume_levels.index(min_level)
217
+
218
+ metadata = {
219
+ 'min_level_dB': min_level,
220
+ 'second_min_dB': second_min,
221
+ 'required_gap_dB': required_gap_dB,
222
+ 'actual_gap_dB': actual_gap_dB,
223
+ 'multiplier': self.multiplier_min_loudness
224
+ }
225
+
226
+ return gap_satisfied, answer_idx, metadata
227
+
228
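The multiplier-to-dB conversion used above, worked through as numbers:

```python
import math

# Amplitude-ratio multipliers expressed as dB gaps (20*log10 for amplitude).
for m in (1.5, 2.0, 0.5):
    print(m, round(20 * math.log10(m), 2))
# 1.5 3.52   -> a 1.5x louder clip must sit ~3.5 dB above the runner-up
# 2.0 6.02
# 0.5 -6.02  -> the gap check uses abs(), so 0.5x means ~6 dB below
```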
+ def generate_volume_levels(self, n_clips: int, question_type: str = None) -> List[float]:
229
+ """
230
+ Generate volume levels dynamically based on multiplier constraints.
231
+
232
+ The levels are generated to ensure proper gap for the question type:
233
+ - For max_loudness: the loudest is clearly distinguishable (gap = multiplier_max)
234
+ - For min_loudness: the softest is clearly distinguishable (gap = multiplier_min)
235
+
236
+ Args:
237
+ n_clips: Number of clips
238
+ question_type: "max_loudness" or "min_loudness" to ensure proper gap
239
+
240
+ Returns:
241
+ List of volume adjustments in dB (integers)
242
+ """
243
+ # Base spacing between adjacent volume levels (minimum audible difference)
244
+ # 6 dB = 2x amplitude, 12 dB = 4x amplitude (clearly distinguishable)
245
+ min_diff = 12 # 12 dB is a very noticeable difference (~4x amplitude; perceived loudness roughly doubles per 10 dB)
246
+
247
+ # Calculate required gap based on multiplier (round up to nearest int)
248
+ if question_type == "max_loudness":
249
+ required_gap = int(math.ceil(20 * math.log10(self.multiplier_max_loudness)))
250
+ elif question_type == "min_loudness":
251
+ required_gap = int(math.ceil(abs(20 * math.log10(self.multiplier_min_loudness))))
252
+ else:
253
+ required_gap = min_diff
254
+
255
+ # Ensure gap is at least min_diff
256
+ required_gap = max(required_gap, min_diff)
257
+
258
+ if question_type == "max_loudness":
259
+ # Generate levels where max has clear gap from others
260
+ # Max level (answer) at a high value - MUCH louder
261
+ max_level = 18 # dB adjustment = ~8x louder than baseline
262
+
263
+ # Other levels should be at least required_gap below max
264
+ # Spread them out with min_diff spacing
265
+ other_levels = []
266
+ current_level = max_level - required_gap
267
+ for i in range(n_clips - 1):
268
+ other_levels.append(current_level)
269
+ current_level -= min_diff
270
+
271
+ selected_levels = other_levels + [max_level]
272
+
273
+ elif question_type == "min_loudness":
274
+ # Generate levels where min has clear gap from others
275
+ # Min level (answer) at a low value - MUCH quieter
276
+ min_level = -24 # dB adjustment = ~1/16th of baseline volume
277
+
278
+ # Other levels should be at least required_gap above min
279
+ # Spread them out with min_diff spacing
280
+ other_levels = []
281
+ current_level = min_level + required_gap
282
+ for i in range(n_clips - 1):
283
+ other_levels.append(current_level)
284
+ current_level += min_diff
285
+
286
+ selected_levels = [min_level] + other_levels
287
+
288
+ else:
289
+ # Default: evenly spaced levels centered around 0
290
+ total_range = (n_clips - 1) * min_diff
291
+ start_level = -total_range // 2
292
+ selected_levels = [start_level + i * min_diff for i in range(n_clips)]
293
+
294
+ # Shuffle to randomize order in the audio
295
+ random.shuffle(selected_levels)
296
+
297
+ return selected_levels
298
+
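To make the construction concrete, here is what the max_loudness branch produces for a hypothetical n_clips=4 with required_gap = min_diff = 12 dB (the numbers mirror the defaults hard-coded in this method, not values read from config):

```python
# Reproduces the max_loudness branch for n_clips=4, required_gap=min_diff=12
max_level, required_gap, min_diff, n_clips = 18, 12, 12, 4
others, level = [], max_level - required_gap
for _ in range(n_clips - 1):
    others.append(level)
    level -= min_diff
print(others + [max_level])  # [6, -6, -18, 18] -> shuffled before use
```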
299
+ def generate_sample(self, sample_id: int, target_question_type: str = None, target_duration_seconds: float = None) -> Dict:
300
+ """
301
+ Generate a single volume task sample.
302
+
303
+ Pipeline:
304
+ 1. Pick dataset -> pick class -> pick audio clip
305
+ 2. NORMALIZE all clips to baseline dBFS (critical for controlled comparison)
306
+ 3. Apply different volume adjustments to each clip
307
+ 4. Concatenate clips with silences
308
+
309
+ Optionally: use same clip with different volume levels if configured.
310
+
311
+ Args:
312
+ sample_id: Sample ID number
313
+ target_question_type: Target question type for balanced distribution
314
+ target_duration_seconds: Pre-generated target duration (from generate_sample_durations_for_task)
315
+
316
+ Returns:
317
+ Dictionary with sample metadata
318
+ """
319
+ # Use pre-generated duration or generate one (backward compatibility)
320
+ if target_duration_seconds is not None:
321
+ clip_duration_seconds = target_duration_seconds
322
+ else:
323
+ clip_duration_seconds = generate_single_clip_duration(
324
+ self.min_clip_duration,
325
+ self.max_clip_duration
326
+ )
327
+
328
+ # Calculate how many clips we need using the new helper
329
+ max_clips, remainder_seconds = get_max_clip_num_to_be_joined(
330
+ clip_duration_seconds,
331
+ self.source_clip_duration,
332
+ self.min_silence_ms
333
+ )
334
+
335
+ max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10)
336
+
337
+ # Silence reduction strategy: subsample from [max(2, max_clips-3), min(max_clips, max_clips_per_sample, n_categories)]
338
+ # This ensures we use close to max_clips that fit, reducing excessive silence
339
+
340
+ # Calculate valid range for this sample's duration
341
+ min_clips_for_sample = max(2, max_clips - 3) # At least 2, preferably max_clips-3
342
+ max_clips_for_sample = min(max_clips, max_clips_per_sample, len(self.dataset.CATEGORIES))
343
+
344
+ # Validate range
345
+ if max_clips_for_sample < 2:
346
+ raise ValueError(
347
+ f"Sample {sample_id}: Cannot generate volume task - need at least 2 clips. "
348
+ f"max_clips={max_clips}, max_clips_per_sample={max_clips_per_sample}, "
349
+ f"duration={clip_duration_seconds:.1f}s. Increase min_clip_duration."
350
+ )
351
+
352
+ if min_clips_for_sample > max_clips_for_sample:
353
+ raise ValueError(
354
+ f"Sample {sample_id}: Invalid clip range - min_clips ({min_clips_for_sample}) > max_clips ({max_clips_for_sample}). "
355
+ f"max_clips={max_clips}, max_clips_per_sample={max_clips_per_sample}, duration={clip_duration_seconds:.1f}s"
356
+ )
357
+
358
+ # Randomly select from valid range (NO balanced pool for volume task)
359
+ n_clips = random.randint(min_clips_for_sample, max_clips_for_sample)
360
+ n_clips = max(2, n_clips) # Ensure at least 2 for volume comparison
361
+
362
+ # Pre-select question type to determine answer position
363
+ # Use target question type if provided, otherwise randomly select
364
+ if target_question_type is not None:
365
+ question_type = target_question_type
366
+ else:
367
+ question_type = random.choice(self.task_config['question_types'])
368
+
369
+ # Generate volume levels and verify gap constraint
370
+ max_attempts = 10
371
+ gap_satisfied = False
372
+ volume_levels = None
373
+ gap_metadata = None
374
+
375
+ for attempt in range(max_attempts):
376
+ volume_levels = self.generate_volume_levels(n_clips, question_type)
377
+ gap_satisfied, answer_idx, gap_metadata = self._verify_loudness_gap(
378
+ volume_levels, question_type
379
+ )
380
+
381
+ if gap_satisfied:
382
+ break
383
+
384
+ self.logger.debug(
385
+ f"Sample {sample_id} attempt {attempt+1}: gap not satisfied, "
386
+ f"required={gap_metadata['required_gap_dB']:.1f}dB, "
387
+ f"actual={gap_metadata['actual_gap_dB']:.1f}dB"
388
+ )
389
+
390
+ if not gap_satisfied and self.reject_if_gap_not_met:
391
+ self.logger.warning(
392
+ f"Sample {sample_id} rejected: loudness gap not satisfied after {max_attempts} attempts"
393
+ )
394
+ return None
395
+
396
+ # Determine answer position based on question type
397
+ if question_type == 'max_loudness':
398
+ answer_idx = volume_levels.index(max(volume_levels))
399
+ else: # min_loudness
400
+ answer_idx = volume_levels.index(min(volume_levels))
401
+
402
+ # Select answer category from least-used categories
403
+ answer_category = self.dataset.get_least_used_categories(1)[0]
404
+
405
+ # Determine if using same clip with different volumes
406
+ if self.use_same_clip_different_volumes:
407
+ # Use ONE source clip repeated at different volume levels
408
+ selected_categories = [answer_category] * n_clips
409
+ # Track usage
410
+ self.dataset.category_usage_counts[answer_category] += 1
411
+ correct_category = answer_category
412
+ else:
413
+ # Use different source clips (original behavior)
414
+ # Sample remaining categories, ensuring balanced distribution
415
+ if n_clips <= len(self.dataset.CATEGORIES):
416
+ other_categories = self.dataset.get_least_used_categories(
417
+ n_clips - 1,
418
+ exclude=[answer_category]
419
+ )
420
+ else:
421
+ # Need more clips than unique categories
422
+ other_categories = self.dataset.get_least_used_categories(
423
+ min(n_clips - 1, len(self.dataset.CATEGORIES) - 1),
424
+ exclude=[answer_category]
425
+ )
426
+ # Add random repetitions if needed
427
+ while len(other_categories) < n_clips - 1:
428
+ other_categories.append(random.choice(self.dataset.CATEGORIES))
429
+
430
+ # Arrange categories with answer at correct position
431
+ selected_categories = []
432
+ other_idx = 0
433
+ for i in range(n_clips):
434
+ if i == answer_idx:
435
+ selected_categories.append(answer_category)
436
+ else:
437
+ selected_categories.append(other_categories[other_idx])
438
+ other_idx += 1
439
+
440
+ # Track usage of answer category
441
+ self.dataset.category_usage_counts[answer_category] += 1
442
+
443
+ # CRITICAL BUG FIX: Verify answer_category is actually at answer_idx
444
+ if selected_categories[answer_idx] != answer_category:
445
+ self.logger.error(f"Sample {sample_id}: Answer mismatch! Expected {answer_category} at index {answer_idx}, got {selected_categories[answer_idx]}")
446
+ correct_category = selected_categories[answer_idx]
447
+ else:
448
+ correct_category = answer_category
449
+
450
+ # Sample files and process audio
451
+ audio_segments = []
452
+ filenames_list = []
453
+ original_loudness = []
454
+ final_loudness = []
455
+
456
+ if self.use_same_clip_different_volumes:
457
+ # Load one file and repeat it with different volumes
458
+ filename, filepath = self.dataset.sample_file_from_category(answer_category)
459
+ base_audio = self.audio_processor.load_audio(filepath)
460
+ original_loudness_val = self._get_amplitude_loudness(base_audio)
461
+
462
+ # Normalize to baseline first
463
+ base_audio_normalized = self._normalize_to_baseline(base_audio)
464
+
465
+ for i in range(n_clips):
466
+ # Apply volume adjustment to normalized audio
467
+ audio_adjusted = self.audio_processor.adjust_volume(
468
+ base_audio_normalized,
469
+ volume_levels[i]
470
+ )
471
+ audio_segments.append(audio_adjusted)
472
+ filenames_list.append(filename)
473
+ original_loudness.append(original_loudness_val)
474
+ final_loudness.append(self._get_amplitude_loudness(audio_adjusted))
475
+ else:
476
+ # Use different files (original behavior but with normalization)
477
+ for i, category in enumerate(selected_categories):
478
+ filename, filepath = self.dataset.sample_file_from_category(category)
479
+ audio = self.audio_processor.load_audio(filepath)
480
+
481
+ # Record original loudness
482
+ orig_loud = self._get_amplitude_loudness(audio)
483
+ original_loudness.append(orig_loud)
484
+
485
+ # STEP 1: Normalize to baseline dBFS
486
+ audio_normalized = self._normalize_to_baseline(audio)
487
+
488
+ # STEP 2: Apply volume adjustment (relative to baseline)
489
+ audio_adjusted = self.audio_processor.adjust_volume(
490
+ audio_normalized,
491
+ volume_levels[i]
492
+ )
493
+
494
+ audio_segments.append(audio_adjusted)
495
+ filenames_list.append(filename)
496
+ final_loudness.append(self._get_amplitude_loudness(audio_adjusted))
497
+
498
+ # Build final audio with guaranteed silences between clips
499
+ output_audio_path = self.audio_output / f"{sample_id}.wav"
500
+ final_audio = build_clip_sequence_with_silences(
501
+ audio_segments,
502
+ clip_duration_seconds,
503
+ min_silence_ms=self.min_silence_ms,
504
+ max_extra_silence_per_gap_ms=self.max_extra_silence_per_gap_ms,
505
+ crossfade_ms=self.crossfade_ms
506
+ )
507
+
508
+ # Save the audio
509
+ final_audio.export(str(output_audio_path), format="wav")
510
+
511
+ # Generate MCQ
512
+ mcq_question = self.task_config['mcq_questions'][question_type]
513
+ mcq_data = self.question_generator.generate_category_mcq(
514
+ mcq_question,
515
+ correct_category,
516
+ selected_categories,
517
+ self.dataset.CATEGORIES
518
+ )
519
+
520
+ # Generate open-text question
521
+ open_text_question = self.task_config['open_text_questions'][question_type]
522
+ open_text_data = self.question_generator.generate_category_open_text(
523
+ open_text_question,
524
+ correct_category
525
+ )
526
+
527
+ # Create category to volume mapping
528
+ category_volumes = {
529
+ selected_categories[i]: volume_levels[i]
530
+ for i in range(n_clips)
531
+ }
532
+
533
+ # Create metadata
534
+ metadata = {
535
+ 'id': sample_id,
536
+ 'audio_path': str(output_audio_path.relative_to(self.output_base.parent)),
537
+ 'n_clips': n_clips,
538
+ 'question_type': question_type,
539
+ 'audio_sequence': selected_categories,
540
+ 'volume_levels_db': volume_levels,
541
+ 'category_volumes': category_volumes,
542
+ 'correct_answer_category': correct_category,
543
+ 'correct_volume_db': volume_levels[answer_idx],
544
+ 'source_files': filenames_list,
545
+ 'use_same_clip': self.use_same_clip_different_volumes,
546
+ 'baseline_dBFS': self.baseline_dBFS if self.normalize_to_baseline else None,
547
+ 'original_loudness_dBFS': original_loudness,
548
+ 'final_loudness_dBFS': final_loudness,
549
+ 'gap_satisfied': gap_satisfied,
550
+ 'gap_metadata': gap_metadata,
551
+ 'mcq_question': mcq_data['question'],
552
+ 'mcq_options': mcq_data['options'],
553
+ 'mcq_correct_answer': mcq_data['correct_answer'],
554
+ 'open_text_question': open_text_data['question'],
555
+ 'open_text_answer': open_text_data['correct_answer']
556
+ }
557
+
558
+ self.logger.info(
559
+ f"Generated volume sample {sample_id}: {question_type}, {n_clips} clips, "
560
+ f"volumes={volume_levels}, gap_satisfied={gap_satisfied}, "
561
+ f"gap={gap_metadata['actual_gap_dB']:.1f}dB (required={gap_metadata['required_gap_dB']:.1f}dB)"
562
+ )
563
+
564
+ return metadata
565
+
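The normalize-then-adjust two-step in the loop above reduces to two `apply_gain` calls in pydub. A minimal sketch, assuming a placeholder input file and a -20 dBFS baseline:

```python
from pydub import AudioSegment

clip = AudioSegment.from_file("some_clip.wav", format="wav")  # placeholder path

baseline_dBFS = -20.0
clip = clip.apply_gain(baseline_dBFS - clip.dBFS)  # step 1: normalize to baseline
clip = clip.apply_gain(12)                         # step 2: per-clip volume offset
print(clip.dBFS)                                   # ~ -8 dBFS
```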
566
+ def generate_dataset(self) -> tuple:
567
+ """
568
+ Generate the complete volume task dataset.
569
+
570
+ Uses generate_sample_durations_for_task() to pre-generate exact sample durations
571
+ that sum to exactly the target task duration. This guarantees:
572
+ - Exact coverage of target duration
573
+ - No estimation errors from average-based calculation
574
+
575
+ Returns:
576
+ Tuple of (mcq_csv_path, open_text_csv_path)
577
+ """
578
+ # Generate sample durations upfront (guarantees exact total duration)
579
+ sample_durations = generate_sample_durations_for_task(
580
+ self.task_duration_hours,
581
+ self.min_clip_duration,
582
+ self.max_clip_duration
583
+ )
584
+ num_samples = len(sample_durations)
585
+
586
+ self.logger.info(f"Generating {num_samples} volume task samples (target: {self.task_duration_hours}h, exact fill)...")
587
+
588
+ # Create balanced question type distribution (NO clips balancing for volume task)
589
+ question_types = self.task_config['question_types']
590
+ balanced_question_types = []
591
+ samples_per_type = num_samples // len(question_types)
592
+ remainder = num_samples % len(question_types)
593
+
594
+ for qtype in question_types:
595
+ count = samples_per_type + (1 if remainder > 0 else 0)
596
+ balanced_question_types.extend([qtype] * count)
597
+ remainder = max(0, remainder - 1)
598
+
599
+ random.shuffle(balanced_question_types)
600
+ from collections import Counter
601
+ type_dist = Counter(balanced_question_types)
602
+ self.logger.info(f"Balanced question type distribution: {dict(sorted(type_dist.items()))}")
603
+
604
+ all_metadata = []
605
+
606
+ for i, target_duration in enumerate(sample_durations):
607
+ metadata = self.generate_sample(i, target_question_type=balanced_question_types[i], target_duration_seconds=target_duration)
608
+ if metadata is not None: # generate_sample returns None when the loudness gap is rejected
+ all_metadata.append(metadata)
609
+
+ # Save MCQ CSV
+ mcq_csv_path = self.output_base / 'volume_mcq.csv'
610
+ self._save_mcq_csv(all_metadata, mcq_csv_path)
611
+
612
+ # Save open-text CSV
613
+ open_text_csv_path = self.output_base / 'volume_open_text.csv'
614
+ self._save_open_text_csv(all_metadata, open_text_csv_path)
615
+
616
+ # Save metadata CSV
617
+ metadata_csv_path = self.output_base / 'volume_metadata.csv'
618
+ self._save_metadata_csv(all_metadata, metadata_csv_path)
619
+
620
+ self.logger.info(f"Volume task dataset generation complete!")
621
+ self.logger.info(f" - MCQ CSV: {mcq_csv_path}")
622
+ self.logger.info(f" - Open-text CSV: {open_text_csv_path}")
623
+ self.logger.info(f" - Metadata CSV: {metadata_csv_path}")
624
+ self.logger.info(f" - Audio files: {self.audio_output}")
625
+
626
+ return mcq_csv_path, open_text_csv_path
627
+
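The round-robin question-type split used above can be checked on its own. A self-contained sketch with toy numbers:

```python
import random
from collections import Counter

num_samples, question_types = 11, ["max_loudness", "min_loudness"]
per_type, remainder = divmod(num_samples, len(question_types))

balanced = []
for qtype in question_types:
    balanced.extend([qtype] * (per_type + (1 if remainder > 0 else 0)))
    remainder = max(0, remainder - 1)

random.shuffle(balanced)
print(Counter(balanced))  # Counter({'max_loudness': 6, 'min_loudness': 5})
```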
628
+ def _save_mcq_csv(self, metadata_list: List[Dict], output_path: Path):
629
+ """Save MCQ format CSV."""
630
+ with open(output_path, 'w', newline='') as f:
631
+ writer = csv.writer(f)
632
+ # Header
633
+ writer.writerow([
634
+ 'question', 'id', 'audio_path',
635
+ 'optionA', 'optionB', 'optionC', 'optionD',
636
+ 'correct', 'question_type', 'audio_sequence',
637
+ 'category_volumes'
638
+ ])
639
+
640
+ # Data rows
641
+ for meta in metadata_list:
642
+ writer.writerow([
643
+ meta['mcq_question'],
644
+ meta['id'],
645
+ meta['audio_path'],
646
+ meta['mcq_options']['A'],
647
+ meta['mcq_options']['B'],
648
+ meta['mcq_options']['C'],
649
+ meta['mcq_options']['D'],
650
+ meta['mcq_correct_answer'],
651
+ meta['question_type'],
652
+ str(meta['audio_sequence']),
653
+ str(meta['category_volumes'])
654
+ ])
655
+
656
+ def _save_open_text_csv(self, metadata_list: List[Dict], output_path: Path):
657
+ """Save open-text format CSV."""
658
+ with open(output_path, 'w', newline='') as f:
659
+ writer = csv.writer(f)
660
+ # Header
661
+ writer.writerow([
662
+ 'question', 'id', 'audio_path', 'answer',
663
+ 'question_type', 'audio_sequence', 'category_volumes'
664
+ ])
665
+
666
+ # Data rows
667
+ for meta in metadata_list:
668
+ writer.writerow([
669
+ meta['open_text_question'],
670
+ meta['id'],
671
+ meta['audio_path'],
672
+ meta['open_text_answer'],
673
+ meta['question_type'],
674
+ str(meta['audio_sequence']),
675
+ str(meta['category_volumes'])
676
+ ])
677
+
678
+ def _save_metadata_csv(self, metadata_list: List[Dict], output_path: Path):
679
+ """Save detailed metadata CSV."""
680
+ with open(output_path, 'w', newline='') as f:
681
+ writer = csv.writer(f)
682
+ # Header
683
+ writer.writerow([
684
+ 'id', 'audio_path', 'n_clips', 'question_type',
685
+ 'audio_sequence', 'volume_levels_db', 'correct_answer_category',
686
+ 'correct_volume_db', 'source_files'
687
+ ])
688
+
689
+ # Data rows
690
+ for meta in metadata_list:
691
+ writer.writerow([
692
+ meta['id'],
693
+ meta['audio_path'],
694
+ meta['n_clips'],
695
+ meta['question_type'],
696
+ str(meta['audio_sequence']),
697
+ str(meta['volume_levels_db']),
698
+ meta['correct_answer_category'],
699
+ meta['correct_volume_db'],
700
+ str(meta['source_files'])
701
+ ])
702
+
703
+
704
+ def main(config_path: str = None):
705
+ """Main entry point for volume task generation."""
706
+ import yaml
707
+
708
+ # Load configuration
709
+ if config_path is None:
710
+ config_path = Path(__file__).parent.parent / 'config.yaml'
711
+
712
+ with open(config_path, 'r') as f:
713
+ config = yaml.safe_load(f)
714
+
715
+ # Set random seed
716
+ set_random_seed(config['random_seed'])
717
+
718
+ # Setup logger
719
+ logger = setup_logger(
720
+ 'volume_task',
721
+ log_file=str(Path(config['output']['base_path']) / config['logging']['log_file']),
722
+ level=config['logging']['level'],
723
+ console_output=config['logging']['console_output']
724
+ )
725
+
726
+ # Generate dataset
727
+ generator = VolumeTaskGenerator(config, logger)
728
+ generator.generate_dataset()
729
+
730
+
731
+ if __name__ == '__main__':
732
+ main()
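Assuming the repository layout shown in this commit, the generator can be run directly or driven from another script (the config path here is illustrative):

```python
# python tasks/task_volume.py   -- resolves config.yaml one level above tasks/
from tasks.task_volume import main

main(config_path="config.yaml")  # or pass an explicit config
```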
utils/__init__.py ADDED
@@ -0,0 +1,50 @@
1
+ """
2
+ Utility module initialization.
3
+ """
4
+
5
+ from .audio_utils import (
6
+ AudioProcessor, set_random_seed,
7
+ calculate_num_samples_for_task, generate_sample_durations_for_task,
8
+ generate_single_clip_duration,
9
+ concatenate_to_target_duration,
10
+ get_max_clip_num_to_be_joined,
11
+ build_clip_sequence_with_silences,
12
+ distribute_remainder_as_silences,
13
+ repeat_clips_to_fill_duration,
14
+ build_consecutive_sources_for_count_task,
15
+ build_random_order_for_count_task,
16
+ build_count_task_audio,
17
+ calculate_duration_slot_distribution,
18
+ build_duration_task_audio,
19
+ get_lufs_loudness,
20
+ normalize_to_lufs
21
+ )
22
+ from .dataset_utils import ESC50Dataset, PreprocessedESC50Dataset
23
+ from .logger import setup_logger
24
+ from .question_utils import QuestionGenerator
25
+ from .llm_utils import LLMQuestionGenerator
26
+
27
+ __all__ = [
28
+ 'AudioProcessor',
29
+ 'ESC50Dataset',
30
+ 'PreprocessedESC50Dataset',
31
+ 'QuestionGenerator',
32
+ 'LLMQuestionGenerator',
33
+ 'setup_logger',
34
+ 'set_random_seed',
35
+ 'calculate_num_samples_for_task',
36
+ 'generate_sample_durations_for_task',
37
+ 'generate_single_clip_duration',
38
+ 'concatenate_to_target_duration',
39
+ 'get_max_clip_num_to_be_joined',
40
+ 'build_clip_sequence_with_silences',
41
+ 'distribute_remainder_as_silences',
42
+ 'repeat_clips_to_fill_duration',
43
+ 'build_consecutive_sources_for_count_task',
44
+ 'build_random_order_for_count_task',
45
+ 'build_count_task_audio',
46
+ 'calculate_duration_slot_distribution',
47
+ 'build_duration_task_audio',
48
+ 'get_lufs_loudness',
49
+ 'normalize_to_lufs'
50
+ ]
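With this init module, downstream scripts can pull everything from one place. A usage sketch:

```python
from utils import AudioProcessor, set_random_seed, setup_logger

set_random_seed(42)
logger = setup_logger("demo")
proc = AudioProcessor()
```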
utils/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (1.28 kB)
utils/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (1.16 kB)
utils/__pycache__/audio_utils.cpython-312.pyc ADDED
Binary file (48 kB)
utils/__pycache__/audio_utils.cpython-314.pyc ADDED
Binary file (45.1 kB)
utils/__pycache__/dataset_utils.cpython-312.pyc ADDED
Binary file (26.2 kB)
utils/__pycache__/llm_utils.cpython-312.pyc ADDED
Binary file (5.87 kB)
utils/__pycache__/logger.cpython-312.pyc ADDED
Binary file (2.33 kB)
utils/__pycache__/question_utils.cpython-312.pyc ADDED
Binary file (9.7 kB)
utils/audio_utils.py ADDED
@@ -0,0 +1,1388 @@
1
+ """
2
+ Audio processing utilities for temporal reasoning dataset generation.
3
+ """
4
+
5
+ import os
6
+ import random
7
+ from pathlib import Path
8
+ from typing import Dict, List, Optional, Tuple, Union
9
+
10
+ import numpy as np
11
+ from pydub import AudioSegment
12
+
13
+ try:
14
+ import pyloudnorm as pyln
15
+ PYLOUDNORM_AVAILABLE = True
16
+ except ImportError:
17
+ PYLOUDNORM_AVAILABLE = False
18
+
19
+ from .logger import setup_logger
20
+
21
+ logger = setup_logger(__name__)
22
+
23
+
24
+ def get_lufs_loudness(audio: AudioSegment) -> float:
25
+ """
26
+ Calculate integrated LUFS loudness (perceived loudness) of an audio segment.
27
+
28
+ LUFS (Loudness Units Full Scale) is the broadcast standard for measuring
29
+ perceived loudness. It accounts for human hearing sensitivity to different
30
+ frequencies using K-weighting.
31
+
32
+ Args:
33
+ audio: Input audio segment (pydub AudioSegment)
34
+
35
+ Returns:
36
+ Loudness in LUFS (negative values, typically -70 to 0)
37
+ Returns dBFS if pyloudnorm is not available (fallback)
38
+ """
39
+ if not PYLOUDNORM_AVAILABLE:
40
+ logger.warning("pyloudnorm not available, falling back to dBFS")
41
+ return audio.dBFS
42
+
43
+ # Convert pydub AudioSegment to numpy array
44
+ samples = np.array(audio.get_array_of_samples())
45
+
46
+ # Handle stereo by reshaping
47
+ if audio.channels == 2:
48
+ samples = samples.reshape((-1, 2))
49
+
50
+ # Normalize to float [-1, 1]
51
+ if audio.sample_width == 1:
52
+ samples = samples.astype(np.float64) / 128.0 - 1.0
53
+ elif audio.sample_width == 2:
54
+ samples = samples.astype(np.float64) / 32768.0
55
+ elif audio.sample_width == 4:
56
+ samples = samples.astype(np.float64) / 2147483648.0
57
+ else:
58
+ samples = samples.astype(np.float64) / 32768.0 # default to 16-bit
59
+
60
+ # Create meter with sample rate
61
+ meter = pyln.Meter(audio.frame_rate)
62
+
63
+ # Measure integrated loudness
64
+ try:
65
+ loudness = meter.integrated_loudness(samples)
66
+ # Handle -inf for silent audio
67
+ if np.isinf(loudness):
68
+ loudness = -70.0 # Return very quiet value instead of -inf
69
+ return loudness
70
+ except Exception as e:
71
+ logger.warning(f"LUFS measurement failed: {e}, falling back to dBFS")
72
+ return audio.dBFS
73
+
74
+
75
+ def normalize_to_lufs(audio: AudioSegment, target_lufs: float = -23.0) -> AudioSegment:
76
+ """
77
+ Normalize audio to a target LUFS level (perceived loudness normalization).
78
+
79
+ This is superior to dBFS normalization for comparing different sound types
80
+ because it accounts for human hearing sensitivity.
81
+
82
+ Args:
83
+ audio: Input audio segment
84
+ target_lufs: Target loudness level in LUFS (default: -23 LUFS, EBU R128 standard)
85
+
86
+ Returns:
87
+ Loudness-normalized audio segment
88
+ """
89
+ if not PYLOUDNORM_AVAILABLE:
90
+ logger.warning("pyloudnorm not available, falling back to dBFS normalization")
91
+ change_db = target_lufs - audio.dBFS
92
+ return audio.apply_gain(change_db)
93
+
94
+ current_lufs = get_lufs_loudness(audio)
95
+
96
+ # Calculate required gain change
97
+ gain_db = target_lufs - current_lufs
98
+
99
+ # Apply gain
100
+ normalized = audio.apply_gain(gain_db)
101
+
102
+ logger.debug(f"Normalized LUFS: {current_lufs:.2f} -> {get_lufs_loudness(normalized):.2f} LUFS")
103
+
104
+ return normalized
105
+
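Together these two helpers give a measure/normalize round trip. A minimal usage sketch (placeholder path; requires pyloudnorm, otherwise both fall back to dBFS):

```python
from pydub import AudioSegment

clip = AudioSegment.from_file("some_clip.wav", format="wav")  # placeholder
print(get_lufs_loudness(clip))                   # e.g. -31.2 LUFS
clip_norm = normalize_to_lufs(clip, target_lufs=-23.0)
print(get_lufs_loudness(clip_norm))              # ~ -23.0 LUFS
```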
106
+
107
+ class AudioProcessor:
108
+ """Handles audio loading, processing, and concatenation."""
109
+
110
+ def __init__(
111
+ self,
112
+ crossfade_duration: int = 500,
113
+ silence_duration: int = 1000,
114
+ with_silence: bool = True,
115
+ normalize: bool = False,
116
+ normalize_target_dBFS: float = -20.0,
117
+ synthetic_silence_path: Optional[str] = None
118
+ ):
119
+ """
120
+ Initialize the audio processor.
121
+
122
+ Args:
123
+ crossfade_duration: Duration of crossfade in milliseconds
124
+ silence_duration: Duration of silence between clips in milliseconds
125
+ with_silence: Whether to add silence between clips
126
+ normalize: Whether to normalize audio levels
127
+ normalize_target_dBFS: Target dBFS level for normalization
128
+ synthetic_silence_path: Path to synthetic silence audio files
129
+ """
130
+ self.crossfade_duration = crossfade_duration
131
+ self.silence_duration = silence_duration
132
+ self.with_silence = with_silence
133
+ self.normalize = normalize
134
+ self.normalize_target_dBFS = normalize_target_dBFS
135
+ self.synthetic_silence_path = synthetic_silence_path
136
+ self._silence_cache = {}
137
+
138
+ def load_audio(self, audio_path: str) -> AudioSegment:
139
+ """
140
+ Load an audio file.
141
+
142
+ Args:
143
+ audio_path: Path to the audio file
144
+
145
+ Returns:
146
+ Loaded audio segment
147
+ """
148
+ try:
149
+ audio = AudioSegment.from_file(audio_path, format="wav")
150
+ logger.debug(f"Loaded audio: {audio_path}, duration: {len(audio)}ms")
151
+ return audio
152
+ except Exception as e:
153
+ logger.error(f"Error loading audio {audio_path}: {e}")
154
+ raise
155
+
156
+ def normalize_audio(self, audio: AudioSegment, target_dBFS: Optional[float] = None) -> AudioSegment:
157
+ """
158
+ Normalize audio to a target dBFS level.
159
+
160
+ Args:
161
+ audio: Input audio segment
162
+ target_dBFS: Target dBFS level (uses default if None)
163
+
164
+ Returns:
165
+ Normalized audio segment
166
+ """
167
+ if target_dBFS is None:
168
+ target_dBFS = self.normalize_target_dBFS
169
+
170
+ change_in_dBFS = target_dBFS - audio.dBFS
171
+ normalized = audio.apply_gain(change_in_dBFS)
172
+ logger.debug(f"Normalized audio: {audio.dBFS:.2f} dBFS -> {normalized.dBFS:.2f} dBFS")
173
+ return normalized
174
+
175
+ def adjust_volume(self, audio: AudioSegment, volume_db: float) -> AudioSegment:
176
+ """
177
+ Adjust audio volume by a specific dB amount.
178
+
179
+ Args:
180
+ audio: Input audio segment
181
+ volume_db: Volume adjustment in dB (positive = louder, negative = quieter)
182
+
183
+ Returns:
184
+ Volume-adjusted audio segment
185
+ """
186
+ adjusted = audio.apply_gain(volume_db)
187
+ logger.debug(f"Adjusted volume by {volume_db} dB: {audio.dBFS:.2f} -> {adjusted.dBFS:.2f} dBFS")
188
+ return adjusted
189
+
190
+ def get_silence(self, duration: Optional[int] = None) -> AudioSegment:
191
+ """
192
+ Get a silence audio segment, using synthetic silence if available.
193
+
194
+ Args:
195
+ duration: Duration in milliseconds (uses default if None)
196
+
197
+ Returns:
198
+ Silence audio segment
199
+ """
200
+ if duration is None:
201
+ duration = self.silence_duration
202
+
203
+ # Check cache first
204
+ if duration in self._silence_cache:
205
+ return self._silence_cache[duration]
206
+
207
+ # Try to load synthetic silence
208
+ if self.synthetic_silence_path and os.path.exists(self.synthetic_silence_path):
209
+ silence_files = list(Path(self.synthetic_silence_path).glob("*.wav"))
210
+ if silence_files:
211
+ silence = self.load_audio(str(random.choice(silence_files)))
212
+ # Adjust duration if needed
213
+ if len(silence) < duration:
214
+ # Repeat the silence
215
+ repetitions = (duration // len(silence)) + 1
216
+ silence = silence * repetitions
217
+ silence = silence[:duration]
218
+ self._silence_cache[duration] = silence
219
+ logger.debug(f"Using synthetic silence: {duration}ms")
220
+ return silence
221
+
222
+ # Fall back to pure silence
223
+ silence = AudioSegment.silent(duration=duration)
224
+ self._silence_cache[duration] = silence
225
+ logger.debug(f"Using pure silence: {duration}ms")
226
+ return silence
227
+
228
+ def concatenate_audios(
229
+ self,
230
+ audio_list: List[AudioSegment],
231
+ normalize_each: bool = False,
232
+ volume_adjustments: Optional[List[float]] = None
233
+ ) -> AudioSegment:
234
+ """
235
+ Concatenate multiple audio segments with crossfade and optional silence.
236
+
237
+ Args:
238
+ audio_list: List of audio segments to concatenate
239
+ normalize_each: Whether to normalize each audio before concatenation
240
+ volume_adjustments: Optional list of volume adjustments (in dB) for each audio
241
+
242
+ Returns:
243
+ Concatenated audio segment
244
+ """
245
+ if not audio_list:
246
+ raise ValueError("audio_list cannot be empty")
247
+
248
+ if len(audio_list) == 1:
249
+ audio = audio_list[0]
250
+ if normalize_each and self.normalize:
251
+ audio = self.normalize_audio(audio)
252
+ if volume_adjustments and len(volume_adjustments) > 0:
253
+ audio = self.adjust_volume(audio, volume_adjustments[0])
254
+ return audio
255
+
256
+ # Process first audio
257
+ merged = audio_list[0]
258
+ if normalize_each and self.normalize:
259
+ merged = self.normalize_audio(merged)
260
+ if volume_adjustments and len(volume_adjustments) > 0:
261
+ merged = self.adjust_volume(merged, volume_adjustments[0])
262
+
263
+ # Concatenate remaining audios
264
+ for i, audio in enumerate(audio_list[1:], start=1):
265
+ # Process current audio
266
+ current = audio
267
+ if normalize_each and self.normalize:
268
+ current = self.normalize_audio(current)
269
+ if volume_adjustments and len(volume_adjustments) > i:
270
+ current = self.adjust_volume(current, volume_adjustments[i])
271
+
272
+ # Add silence if configured
273
+ if self.with_silence:
274
+ silence = self.get_silence()
275
+ # Crossfade between audio and silence for smooth transition
276
+ merged = merged.append(silence, crossfade=self.crossfade_duration)
277
+
278
+ # Append current audio WITHOUT crossfade to avoid cutting it
279
+ # The crossfade with silence already provides smooth transition
280
+ merged = merged.append(current, crossfade=0)
281
+
282
+ logger.debug(f"Concatenated {len(audio_list)} audio segments, total duration: {len(merged)}ms")
283
+ return merged
284
+
285
+ def concatenate_audio_files(
286
+ self,
287
+ audio_paths: List[str],
288
+ output_path: str,
289
+ normalize_each: bool = False,
290
+ volume_adjustments: Optional[List[float]] = None,
291
+ target_durations: Optional[List[float]] = None
292
+ ) -> Tuple[AudioSegment, dict]:
293
+ """
294
+ Load, concatenate, and save multiple audio files.
295
+
296
+ Args:
297
+ audio_paths: List of paths to audio files
298
+ output_path: Path to save the concatenated audio
299
+ normalize_each: Whether to normalize each audio before concatenation
300
+ volume_adjustments: Optional list of volume adjustments (in dB) for each audio
301
+ target_durations: Optional list of target durations (in seconds) for each clip
302
+
303
+ Returns:
304
+ Tuple of (concatenated audio segment, metadata dict)
305
+ """
306
+ # Load all audio files
307
+ audio_segments = []
308
+ for i, path in enumerate(audio_paths):
309
+ audio = self.load_audio(path)
310
+
311
+ # Adjust duration if specified
312
+ if target_durations and i < len(target_durations):
313
+ target_ms = int(target_durations[i] * 1000)
314
+ audio = trim_or_repeat_audio(audio, target_ms)
315
+ logger.debug(f"Adjusted clip {i} to {len(audio)}ms (target: {target_ms}ms)")
316
+
317
+ audio_segments.append(audio)
318
+
319
+ # Concatenate
320
+ merged = self.concatenate_audios(audio_segments, normalize_each, volume_adjustments)
321
+
322
+ # Save
323
+ output_path = Path(output_path)
324
+ output_path.parent.mkdir(parents=True, exist_ok=True)
325
+ merged.export(str(output_path), format="wav")
326
+ logger.info(f"Saved concatenated audio: {output_path}")
327
+
328
+ # Create metadata
329
+ metadata = {
330
+ "output_path": str(output_path),
331
+ "source_files": audio_paths,
332
+ "num_sources": len(audio_paths),
333
+ "total_duration_ms": len(merged),
334
+ "total_duration_s": len(merged) / 1000.0,
335
+ "individual_durations_ms": [len(a) for a in audio_segments],
336
+ "individual_durations_s": [len(a) / 1000.0 for a in audio_segments],
337
+ "target_durations_s": target_durations if target_durations else [],
338
+ "volume_adjustments_db": volume_adjustments if volume_adjustments else []
339
+ }
340
+
341
+ return merged, metadata
342
+
343
+
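A typical end-to-end use of the class, with placeholder paths and per-clip gains:

```python
proc = AudioProcessor(crossfade_duration=500, silence_duration=1000)
merged, meta = proc.concatenate_audio_files(
    ["a.wav", "b.wav", "c.wav"],       # placeholder inputs
    output_path="out/merged.wav",
    volume_adjustments=[0, -6, 6],     # dB offset per clip
)
print(meta["total_duration_s"])
```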
344
+ def generate_sample_durations_for_task(
345
+ task_duration_hours: float,
346
+ min_clip_duration: float,
347
+ max_clip_duration: float
348
+ ) -> list:
349
+ """
350
+ Generate sample durations that exactly fill the target task duration.
351
+
352
+ Algorithm:
353
+ 1. Start with remaining = total_seconds
354
+ 2. While remaining >= min_clip_duration:
355
+ - Sample d ~ Uniform(min, min(max, remaining))
356
+ - Append d to durations list
357
+ - Subtract d from remaining
358
+ 3. Return shuffled list of durations
359
+
360
+ This ensures:
361
+ - Total of all durations ≈ task_duration (within min_clip_duration tolerance)
362
+ - Each duration is uniformly sampled within valid range
363
+ - No overshoot of target duration
364
+
365
+ Args:
366
+ task_duration_hours: Total duration for the task in hours
367
+ min_clip_duration: Minimum duration per clip in seconds
368
+ max_clip_duration: Maximum duration per clip in seconds
369
+
370
+ Returns:
371
+ List of sample durations in seconds (shuffled)
372
+ """
373
+ task_duration_seconds = task_duration_hours * 3600
374
+ remaining = task_duration_seconds
375
+ durations = []
376
+
377
+ while remaining >= min_clip_duration:
378
+ # Cap max at remaining to avoid overshoot
379
+ effective_max = min(max_clip_duration, remaining)
380
+
381
+ # If remaining is less than min, we can't fit another sample
382
+ if effective_max < min_clip_duration:
383
+ break
384
+
385
+ # Sample uniformly within valid range
386
+ d = random.uniform(min_clip_duration, effective_max)
387
+ durations.append(d)
388
+ remaining -= d
389
+
390
+ # Shuffle to randomize order (durations were generated sequentially)
391
+ random.shuffle(durations)
392
+
393
+ total_duration = sum(durations)
394
+ logger.info(f"Task duration target: {task_duration_hours}h ({task_duration_seconds:.1f}s)")
395
+ logger.info(f"Generated {len(durations)} sample durations, total: {total_duration:.1f}s")
396
+ logger.info(f"Duration range: [{min(durations):.1f}s, {max(durations):.1f}s], "
397
+ f"mean: {total_duration/len(durations):.1f}s")
398
+ logger.info(f"Unused remainder: {remaining:.1f}s ({remaining/task_duration_seconds*100:.2f}%)")
399
+
400
+ return durations
401
+
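The fill loop is short enough to reproduce with toy numbers (60 s target, clips in [8, 12] s):

```python
import random

remaining, durations = 60.0, []
while remaining >= 8.0:
    d = random.uniform(8.0, min(12.0, remaining))
    durations.append(d)
    remaining -= d

print(sum(durations), remaining)  # total within 8 s of target; remainder < 8 s
```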
402
+
403
+ def calculate_num_samples_for_task(
404
+ task_duration_hours: float,
405
+ min_clip_duration: float,
406
+ max_clip_duration: float
407
+ ) -> int:
408
+ """
409
+ Calculate number of samples needed to fill the task duration.
410
+
411
+ DEPRECATED: Use generate_sample_durations_for_task() instead for exact duration filling.
412
+ This function is kept for backward compatibility but uses average-based estimation.
413
+
414
+ Args:
415
+ task_duration_hours: Total duration for the task in hours
416
+ min_clip_duration: Minimum duration per clip in seconds
417
+ max_clip_duration: Maximum duration per clip in seconds
418
+
419
+ Returns:
420
+ Number of samples to generate (estimate)
421
+ """
422
+ task_duration_seconds = task_duration_hours * 3600
423
+ avg_clip_duration = (min_clip_duration + max_clip_duration) / 2
424
+ num_samples = int(task_duration_seconds / avg_clip_duration)
425
+
426
+ logger.info(f"Task duration: {task_duration_hours}h ({task_duration_seconds}s)")
427
+ logger.info(f"Avg clip duration: {avg_clip_duration}s (min: {min_clip_duration}s, max: {max_clip_duration}s)")
428
+ logger.info(f"Calculated number of samples: {num_samples}")
429
+
430
+ return max(1, num_samples) # At least 1 sample
431
+
432
+
433
+ def generate_single_clip_duration(
434
+ min_duration: float,
435
+ max_duration: float
436
+ ) -> float:
437
+ """
438
+ Generate a random clip duration between min and max.
439
+
440
+ Args:
441
+ min_duration: Minimum duration in seconds
442
+ max_duration: Maximum duration in seconds
443
+
444
+ Returns:
445
+ Random duration in seconds
446
+ """
447
+ return random.uniform(min_duration, max_duration)
448
+
449
+
450
+ def concatenate_to_target_duration(
451
+ base_audio: AudioSegment,
452
+ target_duration_seconds: float,
453
+ crossfade_ms: int = 0
454
+ ) -> AudioSegment:
455
+ """
456
+ Concatenate a base audio clip to reach target duration.
457
+
458
+ This takes a 5-second ESC-50 clip and repeats it to create a longer clip.
459
+
460
+ Args:
461
+ base_audio: Original 5s audio segment
462
+ target_duration_seconds: Target duration in seconds
463
+ crossfade_ms: Crossfade between repetitions in milliseconds
464
+
465
+ Returns:
466
+ Audio segment of target duration
467
+ """
468
+ target_duration_ms = int(target_duration_seconds * 1000)
469
+ base_duration_ms = len(base_audio)
470
+
471
+ if target_duration_ms <= base_duration_ms:
472
+ # Just trim if target is shorter
473
+ return base_audio[:target_duration_ms]
474
+
475
+ # Calculate number of repetitions needed
476
+ num_repetitions = (target_duration_ms // base_duration_ms) + 1
477
+
478
+ # Concatenate with crossfade
479
+ result = base_audio
480
+ for i in range(1, num_repetitions):
481
+ if crossfade_ms > 0:
482
+ result = result.append(base_audio, crossfade=crossfade_ms)
483
+ else:
484
+ result = result + base_audio
485
+
486
+ # Stop if we've reached target
487
+ if len(result) >= target_duration_ms:
488
+ break
489
+
490
+ # Trim to exact duration
491
+ return result[:target_duration_ms]
492
+
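For example, a 5 s clip stretched to a 12 s target takes three repetitions and a trim (a silent segment stands in for a real clip here):

```python
from pydub import AudioSegment

base_audio = AudioSegment.silent(duration=5_000)  # stand-in for a 5 s clip
stretched = concatenate_to_target_duration(base_audio, 12.0, crossfade_ms=0)
assert len(stretched) == 12_000  # pydub lengths are in milliseconds
```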
493
+
494
+ def set_random_seed(seed: int):
495
+ """Set random seed for reproducibility."""
496
+ random.seed(seed)
497
+ np.random.seed(seed)
498
+ logger.info(f"Random seed set to: {seed}")
499
+
500
+
501
+ def get_max_clip_num_to_be_joined(
502
+ target_duration_seconds: float,
503
+ source_clip_duration_seconds: float,
504
+ min_silence_ms: int = 100
505
+ ) -> Tuple[int, float]:
506
+ """
507
+ Calculate the maximum number of source clips needed to reach target duration.
508
+
509
+ Pipeline: pick dataset -> pick class -> pick audio clip -> get duration ->
510
+ concatenate clips to reach target duration -> modulo to get num clips ->
511
+ insert silences randomly based on remainder.
512
+
513
+ Args:
514
+ target_duration_seconds: Target total duration in seconds
515
+ source_clip_duration_seconds: Duration of each source clip (e.g., 5s for ESC-50)
516
+ min_silence_ms: Minimum silence between clips in milliseconds
517
+
518
+ Returns:
519
+ Tuple of (num_clips_needed, remainder_seconds_for_silences)
520
+ - num_clips_needed: How many source clips to concatenate
521
+ - remainder_seconds_for_silences: Extra time to distribute as random silences
522
+
523
+ Example (assuming min_silence_ms=0):
524
+ target=30s, source=5s -> (6, 0.0) - exactly 6 clips, no extra silence
525
+ target=32s, source=5s -> (6, 2.0) - 6 clips + 2s distributed as silences
+ With the default min_silence_ms=100, gap silence is reserved first, so the
+ clip count can drop (e.g. target=30s, source=5s -> (5, 4.6)).
526
+ """
527
+ target_ms = target_duration_seconds * 1000
528
+ source_ms = source_clip_duration_seconds * 1000
529
+
530
+ # Account for minimum silence between each pair of clips
531
+ # If we have N clips, we have (N-1) gaps for silence
532
+ # Each gap needs at least min_silence_ms
533
+
534
+ # Start by computing raw number of clips (floor division)
535
+ num_clips = int(target_ms // source_ms)
536
+ num_clips = max(1, num_clips) # At least 1 clip
537
+
538
+ # Total audio content from clips
539
+ clips_duration_ms = num_clips * source_ms
540
+
541
+ # Minimum required silence for gaps
542
+ num_gaps = max(0, num_clips - 1)
543
+ min_total_silence_ms = num_gaps * min_silence_ms
544
+
545
+ # Check if we need to reduce clips to fit silences
546
+ while num_clips > 1 and (clips_duration_ms + min_total_silence_ms) > target_ms:
547
+ num_clips -= 1
548
+ clips_duration_ms = num_clips * source_ms
549
+ num_gaps = num_clips - 1
550
+ min_total_silence_ms = num_gaps * min_silence_ms
551
+
552
+ # Calculate remainder for extra silences
553
+ remainder_ms = target_ms - clips_duration_ms - min_total_silence_ms
554
+ remainder_seconds = max(0, remainder_ms / 1000.0)
555
+
556
+ logger.debug(
557
+ f"get_max_clip_num: target={target_duration_seconds}s, source={source_clip_duration_seconds}s "
558
+ f"-> {num_clips} clips, {remainder_seconds:.3f}s remainder for extra silences"
559
+ )
560
+
561
+ return num_clips, remainder_seconds
562
+
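Worked example with the default 100 ms gap silence (the values follow directly from the arithmetic above):

```python
n, rem = get_max_clip_num_to_be_joined(30.0, 5.0, min_silence_ms=100)
print(n, rem)  # (5, 4.6): six clips would need 30.5 s, so one is dropped
```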
563
+
564
+ def build_clip_sequence_with_silences(
565
+ audio_segments: List[AudioSegment],
566
+ target_duration_seconds: float,
567
+ min_silence_ms: int = 100,
568
+ max_extra_silence_per_gap_ms: int = 500,
569
+ crossfade_ms: int = 0
570
+ ) -> AudioSegment:
571
+ """
572
+ Build a final audio clip by concatenating segments with guaranteed silences.
573
+
574
+ Ensures:
575
+ 1. All clips are joined with at least min_silence_ms between them
576
+ 2. Any remainder duration is distributed as random extra silences in gaps
577
+ 3. Final duration matches target_duration_seconds exactly
578
+
579
+ Args:
580
+ audio_segments: List of audio segments to concatenate
581
+ target_duration_seconds: Target total duration in seconds
582
+ min_silence_ms: Minimum silence between each pair of clips (always inserted)
583
+ max_extra_silence_per_gap_ms: Maximum extra silence to add per gap
584
+ crossfade_ms: Crossfade duration in ms (applied when joining)
585
+
586
+ Returns:
587
+ Concatenated audio segment of exact target duration
588
+ """
589
+ if not audio_segments:
590
+ raise ValueError("audio_segments cannot be empty")
591
+
592
+ target_ms = int(target_duration_seconds * 1000)
593
+
594
+ if len(audio_segments) == 1:
595
+ # Single clip: just trim/repeat to target
596
+ audio = audio_segments[0]
597
+ if len(audio) >= target_ms:
598
+ return audio[:target_ms]
599
+ else:
600
+ # Repeat to reach target
601
+ return concatenate_to_target_duration(audio, target_duration_seconds, crossfade_ms)
602
+
603
+ # Calculate total audio content duration
604
+ total_audio_ms = sum(len(seg) for seg in audio_segments)
605
+ num_gaps = len(audio_segments) - 1
606
+
607
+ # Minimum silence needed
608
+ min_total_silence_ms = num_gaps * min_silence_ms
609
+
610
+ # Available time for extra silences
611
+ available_extra_ms = target_ms - total_audio_ms - min_total_silence_ms
612
+
613
+ if available_extra_ms < 0:
614
+ # Not enough room - need to trim clips
615
+ logger.warning(
616
+ f"Clips too long for target duration. Total audio: {total_audio_ms}ms, "
617
+ f"target: {target_ms}ms. Will trim final result."
618
+ )
619
+ available_extra_ms = 0
620
+
621
+ # Distribute extra silence randomly across gaps
622
+ extra_silences_ms = distribute_remainder_as_silences(
623
+ available_extra_ms,
624
+ num_gaps,
625
+ max_extra_silence_per_gap_ms
626
+ )
627
+
628
+ # Build the final audio
629
+ result = audio_segments[0]
630
+
631
+ for i, audio in enumerate(audio_segments[1:]):
632
+ # Calculate total silence for this gap
633
+ gap_silence_ms = min_silence_ms + extra_silences_ms[i]
634
+
635
+ # Add silence
636
+ silence = AudioSegment.silent(duration=gap_silence_ms)
637
+
638
+ if crossfade_ms > 0 and crossfade_ms < gap_silence_ms:
639
+ # Crossfade audio->silence for smooth transition, but NOT silence->audio
640
+ result = result.append(silence, crossfade=crossfade_ms)
641
+ result = result.append(audio, crossfade=0) # No crossfade to avoid cutting audio
642
+ else:
643
+ result = result + silence + audio
644
+
645
+ # Trim to exact target duration
646
+ if len(result) > target_ms:
647
+ result = result[:target_ms]
648
+ elif len(result) < target_ms:
649
+ # Pad with silence if slightly short
650
+ padding = AudioSegment.silent(duration=target_ms - len(result))
651
+ result = result + padding
652
+
653
+ logger.debug(
654
+ f"Built clip sequence: {len(audio_segments)} segments, "
655
+ f"final duration: {len(result)}ms (target: {target_ms}ms)"
656
+ )
657
+
658
+ return result
659
+
660
+
661
+ def distribute_remainder_as_silences(
662
+ remainder_ms: float,
663
+ num_gaps: int,
664
+ max_per_gap_ms: int = 500
665
+ ) -> List[int]:
666
+ """
667
+ Distribute remainder time as random silences across gaps.
668
+
669
+ Args:
670
+ remainder_ms: Total extra time to distribute (in ms)
671
+ num_gaps: Number of gaps between clips
672
+ max_per_gap_ms: Maximum extra silence per gap
673
+
674
+ Returns:
675
+ List of extra silence durations (in ms) for each gap
676
+ """
677
+ if num_gaps <= 0:
678
+ return []
679
+
680
+ remainder_ms = int(max(0, remainder_ms))
681
+
682
+ if remainder_ms == 0:
683
+ return [0] * num_gaps
684
+
685
+ # Generate random weights for distribution
686
+ weights = [random.random() for _ in range(num_gaps)]
687
+ total_weight = sum(weights)
688
+
689
+ if total_weight == 0:
690
+ # Fallback to uniform distribution
691
+ weights = [1.0] * num_gaps
692
+ total_weight = num_gaps
693
+
694
+ # Distribute proportionally, respecting max_per_gap
695
+ extra_silences = []
696
+ remaining = remainder_ms
697
+
698
+ for i, w in enumerate(weights):
699
+ if i == num_gaps - 1:
700
+ # Last gap gets whatever is left
701
+ extra = min(remaining, max_per_gap_ms)
702
+ else:
703
+ proportion = w / total_weight
704
+ extra = int(remainder_ms * proportion)
705
+ extra = min(extra, max_per_gap_ms, remaining)
706
+
707
+ extra_silences.append(extra)
708
+ remaining -= extra
709
+ total_weight -= w
710
+
711
+ # If there's still remainder (due to max_per_gap limits), do another pass
712
+ while remaining > 0:
713
+ for i in range(num_gaps):
714
+ if extra_silences[i] < max_per_gap_ms and remaining > 0:
715
+ add = min(remaining, max_per_gap_ms - extra_silences[i])
716
+ extra_silences[i] += add
717
+ remaining -= add
718
+ if remaining > 0:
719
+ # Can't distribute more (all gaps at max)
720
+ break
721
+
722
+ logger.debug(f"Distributed {remainder_ms}ms across {num_gaps} gaps: {extra_silences}")
723
+
724
+ return extra_silences
725
+
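The distribution is random, but its invariants are checkable. A small sketch with toy values:

```python
extras = distribute_remainder_as_silences(1500, num_gaps=4, max_per_gap_ms=500)
assert len(extras) == 4
assert all(0 <= e <= 500 for e in extras)
assert sum(extras) == 1500  # total capacity is 2000 ms, so the top-up pass fills fully
```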
726
+
727
+ def repeat_clips_to_fill_duration(
728
+ source_audios: List[AudioSegment],
729
+ source_categories: List[str],
730
+ target_duration_seconds: float,
731
+ source_clip_duration_seconds: float = 5.0,
732
+ min_silence_ms: int = 100
733
+ ) -> Tuple[List[AudioSegment], List[str], int]:
734
+ """
735
+ Repeat source clips to fill target duration, cycling through all sources.
736
+
737
+ This ensures all unique sources appear and are repeated proportionally.
738
+
739
+ Args:
740
+ source_audios: List of unique source audio segments
741
+ source_categories: List of category names corresponding to source_audios
742
+ target_duration_seconds: Target total duration
743
+ source_clip_duration_seconds: Duration of each source clip
744
+ min_silence_ms: Minimum silence between clips
745
+
746
+ Returns:
747
+ Tuple of (expanded_audio_list, expanded_categories, num_clips)
748
+ """
749
+ num_clips, remainder = get_max_clip_num_to_be_joined(
750
+ target_duration_seconds,
751
+ source_clip_duration_seconds,
752
+ min_silence_ms
753
+ )
754
+
755
+ num_sources = len(source_audios)
756
+
757
+ if num_sources == 0:
758
+ raise ValueError("source_audios cannot be empty")
759
+
760
+ # Build expanded lists by cycling through sources
761
+ expanded_audios = []
762
+ expanded_categories = []
763
+
764
+ for i in range(num_clips):
765
+ idx = i % num_sources
766
+ expanded_audios.append(source_audios[idx])
767
+ expanded_categories.append(source_categories[idx])
768
+
769
+ logger.debug(
770
+ f"Repeated {num_sources} sources to {num_clips} clips for "
771
+ f"{target_duration_seconds}s target duration"
772
+ )
773
+
774
+ return expanded_audios, expanded_categories, num_clips
775
+
776
+
777
+ def build_consecutive_sources_for_count_task(
778
+ source_audios: List[AudioSegment],
779
+ source_categories: List[str],
780
+ target_duration_seconds: float,
781
+ source_clip_duration_seconds: float = 5.0,
782
+ min_silence_between_sources_ms: int = 100,
783
+ max_extra_silence_per_gap_ms: int = 500,
784
+ crossfade_within_source_ms: int = 50
785
+ ) -> Tuple[AudioSegment, List[str], dict]:
786
+ """
787
+ Build audio for COUNT task with consecutive same-class clips.
788
+
789
+ For count task, same-class clips must be consecutive (AAA BBB CCC) so they
790
+ are perceived as ONE sound source. Silences are only inserted BETWEEN
791
+ different classes, not within same-class repetitions.
792
+
793
+ Pipeline: pick classes -> for each class concatenate clips consecutively ->
794
+ insert silences only between different classes -> distribute remainder
795
+
796
+ Args:
797
+ source_audios: List of unique source audio segments (one per class)
798
+ source_categories: List of category names
799
+ target_duration_seconds: Target total duration
800
+ source_clip_duration_seconds: Duration of each source clip
801
+ min_silence_between_sources_ms: Minimum silence between different sources
802
+ max_extra_silence_per_gap_ms: Max extra silence per gap for remainder distribution
803
+ crossfade_within_source_ms: Small crossfade within same-source repetitions
804
+
805
+ Returns:
806
+ Tuple of (final_audio, category_sequence, metadata_dict)
807
+ """
808
+ target_ms = int(target_duration_seconds * 1000)
809
+ source_ms = int(source_clip_duration_seconds * 1000)
810
+ num_sources = len(source_audios)
811
+
812
+ if num_sources == 0:
813
+ raise ValueError("source_audios cannot be empty")
814
+
815
+ # Calculate total clips needed
816
+ num_clips, remainder_seconds = get_max_clip_num_to_be_joined(
817
+ target_duration_seconds,
818
+ source_clip_duration_seconds,
819
+ min_silence_between_sources_ms
820
+ )
821
+
822
+ # Safety check: if more sources than clips can fit, warn
823
+ if num_sources > num_clips:
824
+ logger.warning(
825
+ f"More sources ({num_sources}) than clips that fit ({num_clips}). "
826
+ f"Each source needs at least 1 clip, so output may exceed target duration. "
827
+ f"Consider capping n_unique_audios <= max_clips in task_count.py"
828
+ )
829
+ # Each source gets exactly 1 rep if there are more sources than clips
830
+ num_clips = num_sources # This will exceed target but ensures each source is included
831
+
832
+ # Distribute clips across sources as evenly as possible
833
+ # Each source gets at least 1 clip since num_sources <= num_clips
834
+ base_reps = num_clips // num_sources
835
+ extra_reps = num_clips % num_sources
836
+
837
+ repetitions_per_source = []
838
+ for i in range(num_sources):
839
+ reps = base_reps + (1 if i < extra_reps else 0)
840
+ repetitions_per_source.append(reps)
841
+
842
+ # Shuffle repetition assignment to add variety
843
+ random.shuffle(repetitions_per_source)
844
+
845
+ # Build each source's audio block (consecutive clips of same class)
846
+ source_blocks = []
847
+ category_sequence = []
848
+
849
+ for i, (audio, category, reps) in enumerate(zip(source_audios, source_categories, repetitions_per_source)):
850
+ if reps == 0:
851
+ continue
852
+
853
+ # Concatenate same-source clips with minimal/no gap (just small crossfade)
854
+ block = audio
855
+ for _ in range(reps - 1):
856
+ if crossfade_within_source_ms > 0:
857
+ block = block.append(audio, crossfade=crossfade_within_source_ms)
858
+ else:
859
+ block = block + audio
860
+
861
+ source_blocks.append(block)
862
+ category_sequence.append(category)
863
+
864
+ # Now we have N source blocks, need to join them with silences
865
+ # Number of gaps = num_source_blocks - 1
866
+ num_gaps = len(source_blocks) - 1
867
+
868
+ if num_gaps <= 0:
869
+ # Only one source block
870
+ final_audio = source_blocks[0]
871
+ else:
872
+ # Calculate total audio duration from blocks
873
+ total_blocks_ms = sum(len(block) for block in source_blocks)
874
+ min_total_silence_ms = num_gaps * min_silence_between_sources_ms
875
+
876
+ # Available for extra silences
877
+ available_extra_ms = target_ms - total_blocks_ms - min_total_silence_ms
878
+ available_extra_ms = max(0, available_extra_ms)
879
+
880
+ # Distribute extra silence across gaps
881
+ extra_silences = distribute_remainder_as_silences(
882
+ available_extra_ms,
883
+ num_gaps,
884
+ max_extra_silence_per_gap_ms
885
+ )
886
+
887
+ # Build final audio with silences between source blocks
888
+ final_audio = source_blocks[0]
889
+ for i, block in enumerate(source_blocks[1:]):
890
+ gap_silence_ms = min_silence_between_sources_ms + extra_silences[i]
891
+ silence = AudioSegment.silent(duration=gap_silence_ms)
892
+ final_audio = final_audio + silence + block
893
+
894
+ # Trim or pad to exact target duration
895
+ if len(final_audio) > target_ms:
896
+ final_audio = final_audio[:target_ms]
897
+ elif len(final_audio) < target_ms:
898
+ padding = AudioSegment.silent(duration=target_ms - len(final_audio))
899
+ final_audio = final_audio + padding
900
+
901
+ # Create metadata
902
+ metadata = {
903
+ 'num_unique_sources': num_sources,
904
+ 'total_clips': num_clips,
905
+ 'ordering_mode': 'consecutive',
906
+ 'repetitions_per_source': dict(zip(source_categories, repetitions_per_source)),
907
+ 'target_duration_ms': target_ms,
908
+ 'actual_duration_ms': len(final_audio),
909
+ 'num_gaps_between_sources': num_gaps
910
+ }
911
+
912
+ logger.debug(
913
+ f"Count task (consecutive): {num_sources} sources, {num_clips} total clips, "
914
+ f"reps={repetitions_per_source}, duration={len(final_audio)}ms"
915
+ )
916
+
917
+ return final_audio, category_sequence, metadata
918
+
919
+
920
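The even split above is worth seeing in isolation: integer division gives every source a base count, and the remainder is spread one clip at a time over the first `extra_reps` sources (the subsequent shuffle randomizes which sources get the extras). A minimal sketch in pure Python:

```python
# Even split of num_clips over num_sources, mirroring the logic above.
num_clips, num_sources = 8, 3
base_reps, extra_reps = divmod(num_clips, num_sources)
reps = [base_reps + (1 if i < extra_reps else 0) for i in range(num_sources)]
assert sum(reps) == num_clips
print(reps)  # [3, 3, 2]
```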
+ def build_random_order_for_count_task(
+     source_audios: List[AudioSegment],
+     source_categories: List[str],
+     target_duration_seconds: float,
+     source_clip_duration_seconds: float = 5.0,
+     min_silence_ms: int = 100,
+     max_extra_silence_per_gap_ms: int = 500
+ ) -> Tuple[AudioSegment, List[str], dict]:
+     """
+     Build audio for COUNT task with RANDOM ordering of clips.
+
+     Clips from different sources are shuffled randomly (A B A C B A C...).
+     This tests whether the model can recognize recurring sounds as the same source.
+     Silences are inserted between ALL clips (same or different source).
+
+     Pipeline:
+         1. Calculate total clips needed
+         2. Distribute clips across sources
+         3. Create expanded list with all clip instances
+         4. Shuffle randomly
+         5. Insert silences between ALL clips
+         6. Distribute remainder as extra random silences
+
+     Args:
+         source_audios: List of unique source audio segments (one per class)
+         source_categories: List of category names
+         target_duration_seconds: Target total duration
+         source_clip_duration_seconds: Duration of each source clip
+         min_silence_ms: Minimum silence between ALL clips
+         max_extra_silence_per_gap_ms: Max extra silence per gap
+
+     Returns:
+         Tuple of (final_audio, clip_sequence, metadata_dict)
+     """
+     target_ms = int(target_duration_seconds * 1000)
+     source_ms = int(source_clip_duration_seconds * 1000)
+     num_sources = len(source_audios)
+
+     if num_sources == 0:
+         raise ValueError("source_audios cannot be empty")
+
+     # Calculate total clips needed
+     num_clips, remainder_seconds = get_max_clip_num_to_be_joined(
+         target_duration_seconds,
+         source_clip_duration_seconds,
+         min_silence_ms
+     )
+
+     # Safety check: if more sources than clips can fit, warn and cap sources
+     if num_sources > num_clips:
+         logger.warning(
+             f"More sources ({num_sources}) than clips that fit ({num_clips}). "
+             f"Each source needs at least 1 clip, so output may exceed target duration. "
+             f"Consider capping n_unique_audios <= max_clips in task_count.py"
+         )
+         # Each source gets exactly 1 rep if there are more sources than clips
+         num_clips = num_sources  # This will exceed target but ensures each source is included
+
+     # Distribute clips across sources as evenly as possible
+     base_reps = num_clips // num_sources  # At least 1 since num_sources <= num_clips (after cap)
+     extra_reps = num_clips % num_sources
+
+     repetitions_per_source = []
+     for i in range(num_sources):
+         reps = base_reps + (1 if i < extra_reps else 0)
+         repetitions_per_source.append(reps)
+
+     # Build expanded list of (audio, category) pairs
+     expanded_clips = []
+     for audio, category, reps in zip(source_audios, source_categories, repetitions_per_source):
+         for _ in range(reps):
+             expanded_clips.append((audio, category))
+
+     # Shuffle the clips randomly
+     random.shuffle(expanded_clips)
+
+     # Extract shuffled audios and categories
+     shuffled_audios = [clip[0] for clip in expanded_clips]
+     clip_sequence = [clip[1] for clip in expanded_clips]
+
+     # Build final audio with silences between ALL clips
+     final_audio = build_clip_sequence_with_silences(
+         shuffled_audios,
+         target_duration_seconds,
+         min_silence_ms=min_silence_ms,
+         max_extra_silence_per_gap_ms=max_extra_silence_per_gap_ms,
+         crossfade_ms=0  # No crossfade for random ordering
+     )
+
+     # Create metadata
+     metadata = {
+         'num_unique_sources': num_sources,
+         'total_clips': len(expanded_clips),
+         'ordering_mode': 'random',
+         'repetitions_per_source': dict(zip(source_categories, repetitions_per_source)),
+         'clip_sequence': clip_sequence,
+         'target_duration_ms': target_ms,
+         'actual_duration_ms': len(final_audio),
+         'num_gaps': len(expanded_clips) - 1
+     }
+
+     logger.debug(
+         f"Count task (random): {num_sources} sources, {len(expanded_clips)} clips, "
+         f"sequence={clip_sequence[:5]}..., duration={len(final_audio)}ms"
+     )
+
+     return final_audio, clip_sequence, metadata
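A usage sketch for the random-ordering builder, using synthetic tones in place of ESC-50 clips. The import path, tone frequencies, and category names are illustrative assumptions, and the call relies on the helpers (`get_max_clip_num_to_be_joined`, `build_clip_sequence_with_silences`) defined earlier in this module:

```python
from pydub.generators import Sine

from utils.audio_utils import build_random_order_for_count_task  # assumed import path

# Three 1-second tones stand in for three source clips.
sources = [Sine(freq).to_audio_segment(duration=1000) for freq in (220, 440, 880)]
categories = ['dog', 'rooster', 'siren']

audio, sequence, meta = build_random_order_for_count_task(
    sources, categories,
    target_duration_seconds=15.0,
    source_clip_duration_seconds=1.0,
)
print(meta['total_clips'], sequence[:5])  # e.g. 13 ['siren', 'dog', 'dog', ...]
```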
+ def build_count_task_audio(
+     source_audios: List[AudioSegment],
+     source_categories: List[str],
+     target_duration_seconds: float,
+     ordering_mode: str = "random",
+     source_clip_duration_seconds: float = 5.0,
+     min_silence_ms: int = 100,
+     max_extra_silence_per_gap_ms: int = 500,
+     crossfade_within_source_ms: int = 50
+ ) -> Tuple[AudioSegment, List[str], dict]:
+     """
+     Build audio for COUNT task with configurable ordering mode.
+
+     Args:
+         source_audios: List of unique source audio segments (one per class)
+         source_categories: List of category names
+         target_duration_seconds: Target total duration
+         ordering_mode: "random" or "consecutive"
+             - "random": Clips shuffled (A B A C B A C) - tests sound recognition
+             - "consecutive": Same-source grouped (AAA BBB CCC) - easier
+         source_clip_duration_seconds: Duration of each source clip
+         min_silence_ms: Minimum silence between clips
+         max_extra_silence_per_gap_ms: Max extra silence per gap
+         crossfade_within_source_ms: Crossfade for consecutive mode only
+
+     Returns:
+         Tuple of (final_audio, clip_sequence, metadata_dict)
+     """
+     if ordering_mode == "consecutive":
+         return build_consecutive_sources_for_count_task(
+             source_audios,
+             source_categories,
+             target_duration_seconds,
+             source_clip_duration_seconds,
+             min_silence_ms,
+             max_extra_silence_per_gap_ms,
+             crossfade_within_source_ms
+         )
+     else:  # random (default)
+         return build_random_order_for_count_task(
+             source_audios,
+             source_categories,
+             target_duration_seconds,
+             source_clip_duration_seconds,
+             min_silence_ms,
+             max_extra_silence_per_gap_ms
+         )
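The dispatcher makes difficulty a one-flag switch; continuing the sketch above (same assumed `sources` and `categories`):

```python
# "consecutive" groups repeats (AAA BBB CCC); "random" interleaves them.
easy_audio, easy_seq, _ = build_count_task_audio(
    sources, categories, 15.0,
    ordering_mode="consecutive", source_clip_duration_seconds=1.0,
)
hard_audio, hard_seq, _ = build_count_task_audio(
    sources, categories, 15.0,
    ordering_mode="random", source_clip_duration_seconds=1.0,
)
```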
+ # =============================================================================
+ # DURATION TASK FUNCTIONS
+ # =============================================================================
+
+ def calculate_duration_slot_distribution(
+     target_total_duration_s: float,
+     effective_durations: Dict[str, float],
+     target_category: str,
+     question_type: str,
+     multiplier_longest: float = 1.5,
+     multiplier_shortest: float = 0.5,
+     min_silence_between_sources_ms: int = 100
+ ) -> Tuple[Dict[str, int], bool, Dict]:
+     """
+     Calculate how many repetitions each source gets for the duration task.
+
+     For LONGEST: target gets max repetitions, backgrounds get 1 each.
+     For SHORTEST: target gets 1, backgrounds share the remaining duration.
+
+     Args:
+         target_total_duration_s: Target total audio duration
+         effective_durations: Dict mapping category -> effective duration in seconds
+         target_category: The category that should be longest/shortest
+         question_type: "longest" or "shortest"
+         multiplier_longest: target >= max_background * this
+         multiplier_shortest: target <= min_background * this
+         min_silence_between_sources_ms: Minimum silence between different sources
+
+     Returns:
+         Tuple of (slot_distribution, gap_satisfied, metadata)
+         slot_distribution: Dict mapping category -> number of repetitions
+         gap_satisfied: Whether the duration gap constraint is met
+         metadata: Additional info about the calculation
+     """
+     categories = list(effective_durations.keys())
+     n_sources = len(categories)
+
+     if n_sources < 2:
+         # Single source - always satisfies the constraint
+         reps = max(1, int(target_total_duration_s / effective_durations[target_category]))
+         return {target_category: reps}, True, {'note': 'single_source'}
+
+     # Total silence between sources
+     total_silence_s = (n_sources - 1) * min_silence_between_sources_ms / 1000.0
+     available_for_audio_s = target_total_duration_s - total_silence_s
+
+     background_categories = [c for c in categories if c != target_category]
+
+     if question_type == "longest":
+         # Backgrounds get 1 rep each
+         background_duration_s = sum(effective_durations[c] for c in background_categories)
+
+         # Remaining for target
+         remaining_for_target_s = available_for_audio_s - background_duration_s
+         target_duration_per_rep = effective_durations[target_category]
+
+         # Calculate reps for target
+         target_reps = max(1, int(remaining_for_target_s / target_duration_per_rep))
+         actual_target_duration = target_reps * target_duration_per_rep
+
+         # Verify gap
+         max_background_duration = max(effective_durations[c] for c in background_categories)
+         required_target_duration = max_background_duration * multiplier_longest
+         gap_satisfied = actual_target_duration >= required_target_duration
+
+         slot_distribution = {c: 1 for c in background_categories}
+         slot_distribution[target_category] = target_reps
+
+         metadata = {
+             'available_for_audio_s': available_for_audio_s,
+             'background_duration_s': background_duration_s,
+             'remaining_for_target_s': remaining_for_target_s,
+             'target_reps': target_reps,
+             'actual_target_duration_s': actual_target_duration,
+             'max_background_duration_s': max_background_duration,
+             'required_target_duration_s': required_target_duration,
+             'multiplier_used': multiplier_longest
+         }
+
+     else:  # shortest
+         # Target gets 1 rep
+         target_duration_s = effective_durations[target_category]
+
+         # Remaining for backgrounds
+         remaining_for_backgrounds_s = available_for_audio_s - target_duration_s
+
+         # Distribute remaining to backgrounds as evenly as possible
+         # while ensuring each background is longer than target * 1/multiplier
+         slot_distribution = {target_category: 1}
+
+         # Calculate minimum required duration for each background
+         min_background_required = target_duration_s / multiplier_shortest
+
+         background_reps = {}
+         for cat in background_categories:
+             eff_dur = effective_durations[cat]
+             # How many reps are needed to exceed min_background_required?
+             min_reps = max(1, int(min_background_required / eff_dur) + 1)
+             background_reps[cat] = min_reps
+
+         # Check if we have room for all backgrounds
+         total_background_needed = sum(
+             background_reps[c] * effective_durations[c]
+             for c in background_categories
+         )
+
+         if total_background_needed <= remaining_for_backgrounds_s:
+             # Distribute extra reps
+             extra_available = remaining_for_backgrounds_s - total_background_needed
+
+             # Add extra reps to backgrounds proportionally
+             while extra_available > 0:
+                 added_any = False
+                 for cat in background_categories:
+                     eff_dur = effective_durations[cat]
+                     if extra_available >= eff_dur:
+                         background_reps[cat] += 1
+                         extra_available -= eff_dur
+                         added_any = True
+                 if not added_any:
+                     break
+
+             slot_distribution.update(background_reps)
+             gap_satisfied = True
+         else:
+             # Not enough room - use minimum reps anyway
+             slot_distribution.update(background_reps)
+             gap_satisfied = False
+
+         # Calculate actual durations
+         actual_durations = {
+             cat: slot_distribution[cat] * effective_durations[cat]
+             for cat in categories
+         }
+         min_background_actual = min(
+             actual_durations[c] for c in background_categories
+         )
+
+         # Re-verify gap
+         gap_satisfied = actual_durations[target_category] <= min_background_actual * multiplier_shortest
+
+         metadata = {
+             'available_for_audio_s': available_for_audio_s,
+             'target_duration_s': target_duration_s,
+             'remaining_for_backgrounds_s': remaining_for_backgrounds_s,
+             'min_background_required_s': min_background_required,
+             'actual_durations_s': actual_durations,
+             'min_background_actual_s': min_background_actual,
+             'multiplier_used': multiplier_shortest
+         }
+
+     return slot_distribution, gap_satisfied, metadata
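A worked example of the "longest" branch may help; the durations below are illustrative, not taken from the dataset, and the import path is an assumption:

```python
from utils.audio_utils import calculate_duration_slot_distribution  # assumed path

effective = {'dog': 2.0, 'rain': 3.0, 'siren': 2.5}
dist, ok, info = calculate_duration_slot_distribution(
    target_total_duration_s=30.0,
    effective_durations=effective,
    target_category='dog',
    question_type='longest',
)
# Silence budget: 2 gaps * 0.1s = 0.2s, leaving 29.8s for audio.
# Backgrounds take 1 rep each (3.0 + 2.5 = 5.5s); 'dog' then gets
# int(24.3 / 2.0) = 12 reps = 24.0s >= 1.5 * 3.0s, so the gap holds.
print(dist, ok)  # {'rain': 1, 'siren': 1, 'dog': 12} True
```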
+ def build_duration_task_audio(
+     source_audio_lists: Dict[str, List[AudioSegment]],
+     slot_distribution: Dict[str, int],
+     effective_durations: Dict[str, float],
+     target_total_duration_s: float,
+     min_silence_between_sources_ms: int = 100,
+     max_extra_silence_per_gap_ms: int = 500,
+     crossfade_within_source_ms: int = 50
+ ) -> Tuple[AudioSegment, List[str], Dict]:
+     """
+     Build audio for DURATION task with consecutive ordering per source.
+
+     Structure: [SourceA × n] + silence + [SourceB × m] + silence + ...
+     Order of sources is randomized to avoid patterns.
+
+     Args:
+         source_audio_lists: Dict mapping category -> list of audio segments
+         slot_distribution: Dict mapping category -> number of repetitions
+         effective_durations: Dict mapping category -> effective duration per clip
+         target_total_duration_s: Target total duration
+         min_silence_between_sources_ms: Min silence between different sources
+         max_extra_silence_per_gap_ms: Max extra silence per gap
+         crossfade_within_source_ms: Crossfade between same-source repetitions
+
+     Returns:
+         Tuple of (final_audio, category_sequence, metadata)
+     """
+     categories = list(slot_distribution.keys())
+
+     # Randomize source order
+     random.shuffle(categories)
+
+     # Build audio blocks for each source
+     source_blocks = []
+     category_sequence = []
+     actual_durations = {}
+     block_durations_ms = []  # Track duration of each block for timestamp calculation
+
+     for category in categories:
+         reps = slot_distribution[category]
+         audio_list = source_audio_lists[category]
+
+         if reps == 0:
+             continue
+
+         # Build block for this source
+         block = audio_list[0]
+         for i in range(1, reps):
+             # Use same clip or cycle through available clips
+             next_clip = audio_list[i % len(audio_list)]
+
+             # Crossfade within same source
+             if crossfade_within_source_ms > 0:
+                 if len(block) > crossfade_within_source_ms and len(next_clip) > crossfade_within_source_ms:
+                     block = block.append(next_clip, crossfade=crossfade_within_source_ms)
+                 else:
+                     block = block + next_clip
+             else:
+                 block = block + next_clip
+
+         source_blocks.append((category, block))
+         block_durations_ms.append(len(block))
+         category_sequence.extend([category] * reps)
+         actual_durations[category] = len(block) / 1000.0
+
+     # Calculate total audio duration and available extra silence
+     total_audio_ms = sum(len(block) for _, block in source_blocks)
+     num_gaps = len(source_blocks) - 1
+     min_total_silence_ms = num_gaps * min_silence_between_sources_ms
+
+     target_ms = int(target_total_duration_s * 1000)
+     available_extra_ms = target_ms - total_audio_ms - min_total_silence_ms
+
+     # Distribute extra silence
+     if available_extra_ms > 0 and num_gaps > 0:
+         extra_silences = distribute_remainder_as_silences(
+             available_extra_ms,
+             num_gaps,
+             max_extra_silence_per_gap_ms
+         )
+     else:
+         extra_silences = [0] * max(num_gaps, 1)
+
+     # Concatenate with silences and track timestamps
+     source_timestamps = []  # List of (category, start_ms, end_ms)
+     current_position_ms = 0
+
+     if len(source_blocks) == 1:
+         cat, block = source_blocks[0]
+         final_audio = block
+         source_timestamps.append((cat, 0, len(block)))
+     else:
+         cat, block = source_blocks[0]
+         final_audio = block
+         source_timestamps.append((cat, 0, len(block)))
+         current_position_ms = len(block)
+
+         for i, (cat, block) in enumerate(source_blocks[1:]):
+             gap_silence_ms = min_silence_between_sources_ms + extra_silences[i]
+             silence = AudioSegment.silent(duration=gap_silence_ms)
+
+             # Prefer crossfading from audio -> silence for a smooth transition,
+             # but avoid crossfading silence -> audio (it cuts the start of the next clip).
+             # Conditions for a safe crossfade:
+             #   - crossfade length must be less than the gap silence
+             #   - both segments must be longer than the crossfade
+             crossfade_ms = min(500, gap_silence_ms)
+             if 0 < crossfade_ms < gap_silence_ms and len(final_audio) > crossfade_ms and len(block) > crossfade_ms:
+                 final_audio = final_audio.append(silence, crossfade=crossfade_ms)
+                 # Append next block without crossfade to avoid trimming its start
+                 final_audio = final_audio.append(block, crossfade=0)
+                 # The crossfade overlaps audio and silence, shortening the gap by
+                 # crossfade_ms, so account for that when tracking the block start
+                 start_ms = current_position_ms + gap_silence_ms - crossfade_ms
+                 end_ms = start_ms + len(block)
+                 source_timestamps.append((cat, start_ms, end_ms))
+                 current_position_ms = end_ms
+             else:
+                 # Fall back to simple concatenation
+                 final_audio = final_audio + silence + block
+                 start_ms = current_position_ms + gap_silence_ms
+                 end_ms = start_ms + len(block)
+                 source_timestamps.append((cat, start_ms, end_ms))
+                 current_position_ms = end_ms
+
+     # Adjust to target duration
+     if len(final_audio) > target_ms:
+         final_audio = final_audio[:target_ms]
+     elif len(final_audio) < target_ms:
+         padding = AudioSegment.silent(duration=target_ms - len(final_audio))
+         final_audio = final_audio + padding
+
+     # Build timestamp string: "category1 start-end, category2 start-end, ..."
+     timestamp_parts = []
+     for cat, start_ms, end_ms in source_timestamps:
+         start_s = round(start_ms / 1000.0, 2)
+         end_s = round(end_ms / 1000.0, 2)
+         duration_s = round((end_ms - start_ms) / 1000.0, 2)
+         timestamp_parts.append(f"{cat} {start_s}s-{end_s}s ({duration_s}s)")
+     timestamp_string = ", ".join(timestamp_parts)
+
+     metadata = {
+         'source_order': [cat for cat, _ in source_blocks],
+         'slot_distribution': slot_distribution,
+         'actual_durations_s': actual_durations,
+         'total_audio_ms': total_audio_ms,
+         'num_gaps': num_gaps,
+         'final_duration_ms': len(final_audio),
+         'source_timestamps': source_timestamps,  # List of (category, start_ms, end_ms)
+         'timestamp_string': timestamp_string  # Human-readable format
+     }
+
+     logger.debug(
+         f"Duration task audio: {len(source_blocks)} sources, "
+         f"order={metadata['source_order']}, duration={len(final_audio)}ms"
+     )
+
+     return final_audio, category_sequence, metadata
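A sketch tying the two duration helpers together, again with synthetic tones; the import paths, clip values, and expected output are assumptions:

```python
from pydub.generators import Sine

from utils.audio_utils import build_duration_task_audio  # assumed path

effective = {'dog': 2.0, 'rain': 3.0}
clips = {cat: [Sine(330).to_audio_segment(duration=int(dur * 1000))]
         for cat, dur in effective.items()}

audio, seq, meta = build_duration_task_audio(
    clips,
    slot_distribution={'dog': 5, 'rain': 1},
    effective_durations=effective,
    target_total_duration_s=15.0,
)
# Source order is randomized; the string looks like, e.g.,
# "dog 0.0s-9.8s (9.8s), rain 9.9s-12.9s (3.0s)"
print(meta['timestamp_string'])
```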
utils/dataset_utils.py ADDED
@@ -0,0 +1,536 @@
+ """
+ ESC-50 dataset utilities for loading and sampling audio data.
+ """
+
+ import csv
+ import json
+ import random
+ from pathlib import Path
+ from typing import Dict, List, Optional, Tuple
+
+ import pandas as pd
+
+ from .logger import setup_logger
+
+ logger = setup_logger(__name__)
+
+
+ def load_or_create_class_subset(config: dict, all_categories: List[str]) -> List[str]:
+     """
+     Load persisted class subset or create a new one.
+
+     Args:
+         config: Configuration dictionary with dataset.use_class_subset, etc.
+         all_categories: List of all available categories
+
+     Returns:
+         List of category names to use (either subset or all)
+     """
+     dataset_config = config.get('dataset', {})
+     use_subset = dataset_config.get('use_class_subset', False)
+
+     if not use_subset:
+         logger.info(f"Using all {len(all_categories)} classes")
+         return all_categories
+
+     num_classes = dataset_config.get('num_classes_subset', len(all_categories))
+     persist_path = Path(dataset_config.get('subset_persist_path', 'class_subset.json'))
+     subset_seed = dataset_config.get('subset_seed', 42)
+
+     # Try to load existing subset
+     if persist_path.exists():
+         try:
+             with open(persist_path, 'r') as f:
+                 data = json.load(f)
+             subset = data.get('classes', [])
+
+             # Validate subset
+             if len(subset) == num_classes and all(c in all_categories for c in subset):
+                 logger.info(f"Loaded persisted class subset from {persist_path}: {len(subset)} classes")
+                 return subset
+             else:
+                 logger.warning("Invalid persisted subset, regenerating...")
+         except Exception as e:
+             logger.warning(f"Failed to load persisted subset: {e}, regenerating...")
+
+     # Create new subset
+     random.seed(subset_seed)
+     subset = random.sample(all_categories, min(num_classes, len(all_categories)))
+     subset.sort()  # Sort for consistency
+
+     # Persist subset
+     persist_path.parent.mkdir(parents=True, exist_ok=True)
+     with open(persist_path, 'w') as f:
+         json.dump({
+             'classes': subset,
+             'num_classes': len(subset),
+             'seed': subset_seed,
+             'total_available': len(all_categories)
+         }, f, indent=2)
+
+     logger.info(f"Created and persisted new class subset: {len(subset)} classes to {persist_path}")
+     return subset
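A minimal sketch of the subset round-trip (the config keys mirror the pipeline's YAML config; the values here are assumptions):

```python
from utils.dataset_utils import ESC50Dataset, load_or_create_class_subset  # assumed path

cfg = {'dataset': {
    'use_class_subset': True,
    'num_classes_subset': 10,
    'subset_persist_path': 'class_subset.json',
    'subset_seed': 42,
}}
subset = load_or_create_class_subset(cfg, ESC50Dataset.ALL_CATEGORIES)
print(len(subset))  # 10; the identical list is returned on every later run
```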
+ class ESC50Dataset:
+     """Handler for ESC-50 dataset."""
+
+     # All 50 ESC-50 sound categories
+     ALL_CATEGORIES = [
+         'dog', 'chirping_birds', 'vacuum_cleaner', 'thunderstorm', 'door_wood_knock',
+         'can_opening', 'crow', 'clapping', 'fireworks', 'chainsaw', 'airplane',
+         'mouse_click', 'pouring_water', 'train', 'sheep', 'water_drops', 'church_bells',
+         'clock_alarm', 'keyboard_typing', 'wind', 'footsteps', 'frog', 'cow',
+         'brushing_teeth', 'car_horn', 'crackling_fire', 'helicopter', 'drinking_sipping',
+         'rain', 'insects', 'laughing', 'hen', 'engine', 'breathing', 'crying_baby',
+         'hand_saw', 'coughing', 'glass_breaking', 'snoring', 'toilet_flush', 'pig',
+         'washing_machine', 'clock_tick', 'sneezing', 'rooster', 'sea_waves', 'siren',
+         'cat', 'door_wood_creaks', 'crickets'
+     ]
+
+     def __init__(self, metadata_path: str, audio_path: str, config: Optional[dict] = None):
+         """
+         Initialize ESC-50 dataset handler.
+
+         Args:
+             metadata_path: Path to esc50.csv metadata file
+             audio_path: Path to audio directory
+             config: Optional configuration dict with dataset.use_class_subset settings
+         """
+         self.metadata_path = Path(metadata_path)
+         self.audio_path = Path(audio_path)
+         self.config = config or {}
+         self.df = None
+         self.category_to_target = {}
+         self.target_to_category = {}
+
+         # Load class subset if configured
+         self.CATEGORIES = load_or_create_class_subset(self.config, self.ALL_CATEGORIES)
+         self.category_usage_counts = {cat: 0 for cat in self.CATEGORIES}
+
+         self.load_metadata()
+
+     def load_metadata(self):
+         """Load ESC-50 metadata CSV."""
+         try:
+             self.df = pd.read_csv(self.metadata_path)
+             logger.info(f"Loaded ESC-50 metadata: {len(self.df)} files")
+
+             # Create category mappings
+             for target, category in zip(self.df['target'], self.df['category']):
+                 self.category_to_target[category] = target
+                 self.target_to_category[target] = category
+
+             logger.info(f"Found {len(self.category_to_target)} unique categories")
+         except Exception as e:
+             logger.error(f"Error loading metadata: {e}")
+             raise
+
+     def get_files_by_category(self, category: str) -> List[str]:
+         """
+         Get all audio files for a specific category.
+
+         Args:
+             category: Sound category name
+
+         Returns:
+             List of filenames for the category
+         """
+         if category not in self.category_to_target:
+             raise ValueError(f"Unknown category: {category}")
+
+         target = self.category_to_target[category]
+         files = self.df[self.df['target'] == target]['filename'].tolist()
+         return files
+
+     def get_files_by_target(self, target: int) -> List[str]:
+         """
+         Get all audio files for a specific target ID.
+
+         Args:
+             target: Target class ID (0-49)
+
+         Returns:
+             List of filenames for the target
+         """
+         files = self.df[self.df['target'] == target]['filename'].tolist()
+         return files
+
+     def sample_categories(self, n: int, exclude: Optional[List[str]] = None) -> List[str]:
+         """
+         Sample n unique random categories from the active subset.
+
+         Args:
+             n: Number of categories to sample
+             exclude: Optional list of categories to exclude
+
+         Returns:
+             List of sampled category names
+         """
+         available = [c for c in self.CATEGORIES if c not in (exclude or [])]
+         if n > len(available):
+             raise ValueError(f"Cannot sample {n} categories from subset, only {len(available)} available (subset size: {len(self.CATEGORIES)})")
+         return random.sample(available, n)
+
+     def sample_targets(self, n: int, exclude: Optional[List[int]] = None) -> List[int]:
+         """
+         Sample n unique random targets from the active subset.
+
+         Args:
+             n: Number of targets to sample
+             exclude: Optional list of targets to exclude
+
+         Returns:
+             List of sampled target IDs corresponding to categories in the subset
+         """
+         # Get targets corresponding to categories in the subset
+         available_targets = [self.category_to_target[cat] for cat in self.CATEGORIES]
+         available = [t for t in available_targets if t not in (exclude or [])]
+         if n > len(available):
+             raise ValueError(f"Cannot sample {n} targets from subset, only {len(available)} available (subset size: {len(self.CATEGORIES)})")
+         return random.sample(available, n)
+
+     def sample_file_from_category(self, category: str) -> Tuple[str, str]:
+         """
+         Sample a random audio file from a category.
+
+         Args:
+             category: Sound category name
+
+         Returns:
+             Tuple of (filename, full_path)
+         """
+         files = self.get_files_by_category(category)
+         filename = random.choice(files)
+         full_path = str(self.audio_path / filename)
+         return filename, full_path
+
+     def sample_file_from_target(self, target: int) -> Tuple[str, str, str]:
+         """
+         Sample a random audio file from a target.
+
+         Args:
+             target: Target class ID
+
+         Returns:
+             Tuple of (filename, category, full_path)
+         """
+         files = self.get_files_by_target(target)
+         filename = random.choice(files)
+         category = self.target_to_category[target]
+         full_path = str(self.audio_path / filename)
+         return filename, category, full_path
+
+     def get_category_from_filename(self, filename: str) -> str:
+         """Get category name from filename."""
+         row = self.df[self.df['filename'] == filename]
+         if len(row) == 0:
+             raise ValueError(f"Unknown filename: {filename}")
+         return row.iloc[0]['category']
+
+     def get_file_path(self, filename: str) -> str:
+         """Get full path for a filename."""
+         return str(self.audio_path / filename)
+
+     def sample_categories_balanced(self, n: int, exclude: Optional[List[str]] = None,
+                                    answer_category: Optional[str] = None) -> List[str]:
+         """
+         Sample n unique categories with balanced usage tracking.
+
+         This method ensures that over many samples, all categories appear
+         roughly equally as answers by preferentially sampling underused categories.
+
+         Args:
+             n: Number of categories to sample
+             exclude: Optional list of categories to exclude
+             answer_category: If provided, ensures this category is included and tracks it
+
+         Returns:
+             List of sampled category names with answer_category first if provided
+         """
+         available = [c for c in self.CATEGORIES if c not in (exclude or [])]
+         if n > len(available):
+             raise ValueError(f"Cannot sample {n} categories, only {len(available)} available")
+
+         if answer_category:
+             # Track answer category usage
+             self.category_usage_counts[answer_category] += 1
+
+             # Remove answer category from available and sample the rest
+             available = [c for c in available if c != answer_category]
+             other_categories = random.sample(available, n - 1)
+             return [answer_category] + other_categories
+         else:
+             # Sample without specific answer category
+             return random.sample(available, n)
+
+     def get_least_used_categories(self, n: int, exclude: Optional[List[str]] = None) -> List[str]:
+         """
+         Get n categories that have been used least as answers.
+
+         Args:
+             n: Number of categories to get
+             exclude: Optional list of categories to exclude
+
+         Returns:
+             List of least-used category names
+         """
+         available = [c for c in self.CATEGORIES if c not in (exclude or [])]
+         if n > len(available):
+             raise ValueError(f"Cannot get {n} categories, only {len(available)} available")
+
+         # Sort by usage count (ascending) and take n least used
+         sorted_categories = sorted(available, key=lambda c: self.category_usage_counts[c])
+
+         # Among least used, get all with same minimum count
+         min_count = self.category_usage_counts[sorted_categories[0]]
+         candidates = [c for c in sorted_categories if self.category_usage_counts[c] == min_count]
+
+         if len(candidates) >= n:
+             # Randomly sample from least used
+             return random.sample(candidates, n)
+         else:
+             # Take all minimum and fill with next tier
+             result = candidates.copy()
+             remaining = n - len(result)
+             next_tier = [c for c in sorted_categories if c not in candidates][:remaining]
+             result.extend(next_tier)
+             return result
+
+     def get_category_usage_stats(self) -> Dict[str, int]:
+         """Get current category usage statistics."""
+         return self.category_usage_counts.copy()
+
+     def reset_category_usage(self):
+         """Reset category usage tracking."""
+         self.category_usage_counts = {cat: 0 for cat in self.CATEGORIES}
+         logger.info("Reset category usage tracking")
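The two balancing helpers are meant to be used together: pick the least-used class as the answer, then let the balanced sampler fill in distractors. A sketch, assuming the standard ESC-50 layout on disk (the paths are assumptions):

```python
from utils.dataset_utils import ESC50Dataset  # assumed path

ds = ESC50Dataset('ESC-50-master/meta/esc50.csv', 'ESC-50-master/audio')  # assumed paths
for _ in range(100):
    answer = ds.get_least_used_categories(1)[0]
    cats = ds.sample_categories_balanced(4, answer_category=answer)
    # cats[0] is the tracked answer; cats[1:] are distractors.

counts = ds.get_category_usage_stats().values()
print(max(counts) - min(counts))  # stays <= 1 with this pattern
```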
+ class PreprocessedESC50Dataset(ESC50Dataset):
+     """
+     Handler for preprocessed ESC-50 dataset with effective durations.
+
+     Extends ESC50Dataset to use trimmed audio files and effective duration
+     metadata from amplitude-based preprocessing.
+     """
+
+     def __init__(
+         self,
+         metadata_path: str,
+         audio_path: str,
+         preprocessed_path: str,
+         config: Optional[dict] = None
+     ):
+         """
+         Initialize preprocessed ESC-50 dataset handler.
+
+         Args:
+             metadata_path: Path to original esc50.csv metadata file
+             audio_path: Path to original audio directory (fallback)
+             preprocessed_path: Path to preprocessed data directory
+             config: Optional configuration dict with dataset.use_class_subset settings
+         """
+         super().__init__(metadata_path, audio_path, config)
+
+         self.preprocessed_path = Path(preprocessed_path)
+         self.trimmed_audio_path = self.preprocessed_path / "trimmed_audio"
+         self.effective_durations_path = self.preprocessed_path / "effective_durations.csv"
+
+         # Load effective durations
+         self.effective_df = None
+         self.load_effective_durations()
+
+     def load_effective_durations(self):
+         """Load effective durations from preprocessed CSV."""
+         try:
+             self.effective_df = pd.read_csv(self.effective_durations_path)
+             logger.info(f"Loaded effective durations for {len(self.effective_df)} clips")
+
+             # Create quick lookup dictionaries
+             self.filename_to_effective = dict(
+                 zip(self.effective_df['filename'], self.effective_df['effective_duration_s'])
+             )
+             self.filename_to_category = dict(
+                 zip(self.effective_df['filename'], self.effective_df['category'])
+             )
+
+             # Category-level statistics
+             self.category_effective_stats = self.effective_df.groupby('category').agg({
+                 'effective_duration_s': ['mean', 'std', 'min', 'max', 'count']
+             }).round(4)
+             self.category_effective_stats.columns = ['mean', 'std', 'min', 'max', 'count']
+
+             logger.info("Created effective duration lookup tables")
+
+         except Exception as e:
+             logger.error(f"Error loading effective durations: {e}")
+             raise
+
+     def get_effective_duration(self, filename: str) -> float:
+         """
+         Get effective duration for a specific file.
+
+         Args:
+             filename: Audio filename
+
+         Returns:
+             Effective duration in seconds
+         """
+         if filename not in self.filename_to_effective:
+             logger.warning(f"No effective duration for {filename}, using default 5.0s")
+             return 5.0
+         return self.filename_to_effective[filename]
+
+     def get_category_effective_stats(self, category: str) -> Dict:
+         """
+         Get effective duration statistics for a category.
+
+         Args:
+             category: Category name
+
+         Returns:
+             Dict with mean, std, min, max, count
+         """
+         if category not in self.category_effective_stats.index:
+             return {'mean': 5.0, 'std': 0.0, 'min': 5.0, 'max': 5.0, 'count': 0}
+
+         stats = self.category_effective_stats.loc[category]
+         return {
+             'mean': stats['mean'],
+             'std': stats['std'],
+             'min': stats['min'],
+             'max': stats['max'],
+             'count': int(stats['count'])
+         }
+
+     def get_files_by_category_with_durations(self, category: str) -> List[Dict]:
+         """
+         Get all files for a category with their effective durations.
+
+         Args:
+             category: Category name
+
+         Returns:
+             List of dicts with filename, effective_duration_s, filepath
+         """
+         cat_df = self.effective_df[self.effective_df['category'] == category]
+
+         results = []
+         for _, row in cat_df.iterrows():
+             results.append({
+                 'filename': row['filename'],
+                 'effective_duration_s': row['effective_duration_s'],
+                 'filepath': str(self.trimmed_audio_path / row['filename']),
+                 'raw_duration_s': row['raw_duration_s'],
+                 'peak_amplitude_db': row['peak_amplitude_db']
+             })
+
+         return results
+
+     def sample_file_from_category_with_duration(
+         self,
+         category: str,
+         min_effective_duration: Optional[float] = None,
+         max_effective_duration: Optional[float] = None
+     ) -> Tuple[str, str, float]:
+         """
+         Sample a file from category with optional duration constraints.
+
+         Args:
+             category: Category name
+             min_effective_duration: Minimum effective duration (optional)
+             max_effective_duration: Maximum effective duration (optional)
+
+         Returns:
+             Tuple of (filename, filepath, effective_duration_s)
+         """
+         files = self.get_files_by_category_with_durations(category)
+
+         # Filter by duration if constraints provided
+         if min_effective_duration is not None:
+             files = [f for f in files if f['effective_duration_s'] >= min_effective_duration]
+         if max_effective_duration is not None:
+             files = [f for f in files if f['effective_duration_s'] <= max_effective_duration]
+
+         if not files:
+             # Fallback to any file from category
+             logger.warning(f"No files match duration constraints for {category}, using any file")
+             files = self.get_files_by_category_with_durations(category)
+
+         selected = random.choice(files)
+         return selected['filename'], selected['filepath'], selected['effective_duration_s']
+
+     def sample_files_from_category_to_reach_duration(
+         self,
+         category: str,
+         target_duration_s: float,
+         prefer_same_file: bool = True
+     ) -> Tuple[List[str], List[str], float]:
+         """
+         Sample files from a category to reach a target total effective duration.
+
+         Args:
+             category: Category name
+             target_duration_s: Target total effective duration
+             prefer_same_file: If True, try repeating same file first
+
+         Returns:
+             Tuple of (filenames_list, filepaths_list, actual_total_duration_s)
+         """
+         files = self.get_files_by_category_with_durations(category)
+
+         if not files:
+             raise ValueError(f"No files found for category: {category}")
+
+         selected_filenames = []
+         selected_filepaths = []
+         total_duration = 0.0
+
+         if prefer_same_file:
+             # Sort by effective duration descending (prefer longer clips)
+             files_sorted = sorted(files, key=lambda x: x['effective_duration_s'], reverse=True)
+             selected_file = files_sorted[0]
+
+             # Calculate how many repetitions are needed
+             reps_needed = max(1, int(target_duration_s / selected_file['effective_duration_s']) + 1)
+
+             for _ in range(reps_needed):
+                 selected_filenames.append(selected_file['filename'])
+                 selected_filepaths.append(selected_file['filepath'])
+                 total_duration += selected_file['effective_duration_s']
+
+                 if total_duration >= target_duration_s:
+                     break
+         else:
+             # Use different files
+             random.shuffle(files)
+             file_idx = 0
+
+             while total_duration < target_duration_s:
+                 selected_file = files[file_idx % len(files)]
+                 selected_filenames.append(selected_file['filename'])
+                 selected_filepaths.append(selected_file['filepath'])
+                 total_duration += selected_file['effective_duration_s']
+                 file_idx += 1
+
+                 # Safety limit
+                 if file_idx > 100:
+                     logger.warning(f"Hit safety limit when sampling files for {category}")
+                     break
+
+         return selected_filenames, selected_filepaths, total_duration
+
+     def get_categories_sorted_by_effective_duration(self, ascending: bool = True) -> List[str]:
+         """
+         Get categories sorted by their mean effective duration.
+
+         Args:
+             ascending: If True, shortest first; if False, longest first
+
+         Returns:
+             List of category names sorted by mean effective duration
+         """
+         sorted_stats = self.category_effective_stats.sort_values('mean', ascending=ascending)
+         return sorted_stats.index.tolist()
+
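A usage sketch of the preprocessed handler; all paths are assumptions, and it requires the preprocessing step to have produced `effective_durations.csv` and the trimmed clips:

```python
from utils.dataset_utils import PreprocessedESC50Dataset  # assumed path

ds = PreprocessedESC50Dataset(
    metadata_path='ESC-50-master/meta/esc50.csv',  # assumed
    audio_path='ESC-50-master/audio',              # assumed
    preprocessed_path='preprocessed_esc50',        # assumed
)
# One clip with at least 2s of effective (post-trim) sound:
fn, path, dur = ds.sample_file_from_category_with_duration('dog', min_effective_duration=2.0)
# Enough repeats of one clip to cover ~12s of effective audio:
names, paths, total = ds.sample_files_from_category_to_reach_duration('dog', 12.0)
print(dur, total, ds.get_categories_sorted_by_effective_duration()[:3])
```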
utils/llm_utils.py ADDED
@@ -0,0 +1,144 @@
+ """
2
+ LLM-based question generation utilities.
3
+
4
+ Supports multiple LLM providers for generating natural, lexically consistent questions.
5
+ """
6
+
7
+ import os
8
+ import random
9
+ from typing import Dict, List, Optional, Tuple
10
+ import json
11
+
12
+ from .logger import setup_logger
13
+
14
+ logger = setup_logger(__name__)
15
+
16
+
17
+ class LLMQuestionGenerator:
18
+ """Generate questions using local Llama 3.1 8B Instruct LLM."""
19
+
20
+ def __init__(
21
+ self,
22
+ enabled: bool = False,
23
+ template_questions: Optional[Dict] = None
24
+ ):
25
+ """
26
+ Initialize LLM question generator.
27
+
28
+ Args:
29
+ enabled: Whether LLM generation is enabled
30
+ template_questions: Template questions for fallback
31
+ """
32
+ self.enabled = enabled
33
+ self.template_questions = template_questions or {}
34
+
35
+ if not self.enabled:
36
+ logger.info("LLM generation disabled, using templates")
37
+ return
38
+
39
+ # TODO: Initialize local Llama 3.1 8B model connection
40
+ # This will be implemented based on your local LLM setup
41
+ logger.info("LLM generation enabled (local Llama 3.1 8B)")
42
+ logger.warning("Local LLM integration not yet implemented, falling back to templates")
43
+
44
+
45
+ def generate_count_questions(
46
+ self,
47
+ correct_count: int,
48
+ categories_present: List[str],
49
+ generate_both: bool = True
50
+ ) -> Dict:
51
+ """
52
+ Generate count task questions.
53
+
54
+ Args:
55
+ correct_count: Correct number of unique sounds
56
+ categories_present: List of sound categories in the audio
57
+ generate_both: Whether to generate both MCQ and open-text
58
+
59
+ Returns:
60
+ Dictionary with mcq_question and/or open_text_question
61
+ """
62
+ # TODO: Implement LLM generation when enabled
63
+ # For now, always use templates
64
+ return self._generate_count_template(correct_count)
65
+
66
+ def generate_category_questions(
67
+ self,
68
+ task_type: str,
69
+ correct_category: str,
70
+ categories_present: List[str],
71
+ context: Optional[Dict] = None
72
+ ) -> Dict:
73
+ """
74
+ Generate questions where the answer is a sound category.
75
+
76
+ Args:
77
+ task_type: Type of task (duration, order, volume)
78
+ correct_category: Correct answer category
79
+ categories_present: All categories in the audio
80
+ context: Additional context (e.g., question_type, reference_sound)
81
+
82
+ Returns:
83
+ Dictionary with mcq_question and open_text_question
84
+ """
85
+ # TODO: Implement LLM generation when enabled
86
+ # For now, always use templates
87
+ return self._generate_category_template(task_type, correct_category, context)
88
+
89
+ def _generate_count_template(self, correct_count: int) -> Dict:
90
+ """Generate count questions from templates."""
91
+ mcq_templates = self.template_questions.get("count", {}).get("mcq", [
92
+ "What is the number of distinct sound sources in the audio file?",
93
+ "How many different types of sounds can be identified in this recording?"
94
+ ])
95
+ open_templates = self.template_questions.get("count", {}).get("open_text", [
96
+ "How many distinct sound sources are present in the audio?",
97
+ "Count the number of unique sounds in this recording."
98
+ ])
99
+
100
+ return {
101
+ "mcq_question": random.choice(mcq_templates),
102
+ "open_text_question": random.choice(open_templates)
103
+ }
104
+
105
+ def _generate_category_template(
106
+ self,
107
+ task_type: str,
108
+ correct_category: str,
109
+ context: Optional[Dict]
110
+ ) -> Dict:
111
+ """Generate category questions from templates."""
112
+ context = context or {}
113
+
114
+ if task_type == "duration":
115
+ q_type = context.get("question_type", "shortest")
116
+ mcq_q = f"Which of the following sounds is heard for the {q_type} duration?"
117
+ open_q = f"Which sound is heard for the {q_type} duration in the audio?"
118
+
119
+ elif task_type == "order":
120
+ q_subtype = context.get("question_subtype", "first")
121
+ if q_subtype == "first":
122
+ mcq_q = "Which sound appears first in the audio clip?"
123
+ open_q = "What is the first sound you hear in the audio?"
124
+ elif q_subtype == "last":
125
+ mcq_q = "Which sound appears last in the audio clip?"
126
+ open_q = "What is the last sound you hear in the audio?"
127
+ elif q_subtype == "after":
128
+ ref = context.get("reference_sound", "")
129
+ mcq_q = f"Which sound comes after {ref}?"
130
+ open_q = f"What sound comes after {ref}?"
131
+ else:
132
+ ref = context.get("reference_sound", "")
133
+ mcq_q = f"Which sound comes before {ref}?"
134
+ open_q = f"What sound comes before {ref}?"
135
+
136
+ else: # volume
137
+ q_type = context.get("question_type", "loudest")
138
+ mcq_q = f"Which sound is the {q_type} in the audio?"
139
+ open_q = f"Identify the {q_type} sound in the audio clip."
140
+
141
+ return {
142
+ "mcq_question": mcq_q,
143
+ "open_text_question": open_q
144
+ }
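With the local LLM still stubbed out, the generator is effectively a template picker; a sketch of the fallback path (import path assumed):

```python
from utils.llm_utils import LLMQuestionGenerator  # assumed path

gen = LLMQuestionGenerator(enabled=False)  # templates only
qs = gen.generate_category_questions(
    task_type='order',
    correct_category='dog',
    categories_present=['dog', 'rain', 'siren'],
    context={'question_subtype': 'after', 'reference_sound': 'rain'},
)
print(qs['mcq_question'])        # Which sound comes after rain?
print(qs['open_text_question'])  # What sound comes after rain?
```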