Upload folder using huggingface_hub
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +19 -0
- DOCS.md +1296 -0
- README.md +112 -0
- config.yaml +348 -0
- llm_answer_generator.py +268 -0
- main.py +272 -0
- preprocess_esc50.py +714 -0
- requirements.txt +6 -0
- run_llm_answers_all.sh +28 -0
- run_pipeline.sh +166 -0
- synthetic_silences/silent_1.wav +3 -0
- synthetic_silences/silent_10.wav +3 -0
- synthetic_silences/silent_11.wav +3 -0
- synthetic_silences/silent_12.wav +3 -0
- synthetic_silences/silent_13.wav +3 -0
- synthetic_silences/silent_14.wav +3 -0
- synthetic_silences/silent_15.wav +3 -0
- synthetic_silences/silent_16.wav +3 -0
- synthetic_silences/silent_17.wav +3 -0
- synthetic_silences/silent_18.wav +3 -0
- synthetic_silences/silent_19.wav +0 -0
- synthetic_silences/silent_2.wav +3 -0
- synthetic_silences/silent_20.wav +3 -0
- synthetic_silences/silent_3.wav +3 -0
- synthetic_silences/silent_4.wav +3 -0
- synthetic_silences/silent_5.wav +3 -0
- synthetic_silences/silent_6.wav +3 -0
- synthetic_silences/silent_7.wav +3 -0
- synthetic_silences/silent_8.wav +3 -0
- synthetic_silences/silent_9.wav +3 -0
- tasks/__pycache__/task_count.cpython-312.pyc +0 -0
- tasks/__pycache__/task_duration.cpython-312.pyc +0 -0
- tasks/__pycache__/task_order.cpython-312.pyc +0 -0
- tasks/__pycache__/task_volume.cpython-312.pyc +0 -0
- tasks/task_count.py +472 -0
- tasks/task_duration.py +820 -0
- tasks/task_order.py +598 -0
- tasks/task_volume.py +732 -0
- utils/__init__.py +50 -0
- utils/__pycache__/__init__.cpython-312.pyc +0 -0
- utils/__pycache__/__init__.cpython-314.pyc +0 -0
- utils/__pycache__/audio_utils.cpython-312.pyc +0 -0
- utils/__pycache__/audio_utils.cpython-314.pyc +0 -0
- utils/__pycache__/dataset_utils.cpython-312.pyc +0 -0
- utils/__pycache__/llm_utils.cpython-312.pyc +0 -0
- utils/__pycache__/logger.cpython-312.pyc +0 -0
- utils/__pycache__/question_utils.cpython-312.pyc +0 -0
- utils/audio_utils.py +1388 -0
- utils/dataset_utils.py +536 -0
- utils/llm_utils.py +144 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,22 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
synthetic_silences/silent_1.wav filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
synthetic_silences/silent_10.wav filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
synthetic_silences/silent_11.wav filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
synthetic_silences/silent_12.wav filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
synthetic_silences/silent_13.wav filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
synthetic_silences/silent_14.wav filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
synthetic_silences/silent_15.wav filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
synthetic_silences/silent_16.wav filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
synthetic_silences/silent_17.wav filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
synthetic_silences/silent_18.wav filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
synthetic_silences/silent_2.wav filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
synthetic_silences/silent_20.wav filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
synthetic_silences/silent_3.wav filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
synthetic_silences/silent_4.wav filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
synthetic_silences/silent_5.wav filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
synthetic_silences/silent_6.wav filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
synthetic_silences/silent_7.wav filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
synthetic_silences/silent_8.wav filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
synthetic_silences/silent_9.wav filter=lfs diff=lfs merge=lfs -text
|
DOCS.md
ADDED
|
@@ -0,0 +1,1296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# TREA 2.0 - Technical Documentation
|
| 2 |
+
|
| 3 |
+
Comprehensive technical documentation for the TREA 2.0 audio dataset generation pipeline. This document covers the complete implementation including algorithms, mathematical formulations, configuration parameters, preprocessing details, and capacity-aware balancing mechanisms.
|
| 4 |
+
|
| 5 |
+
**For Quick Start Guide**: See [README.md](README.md)
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## Table of Contents
|
| 10 |
+
|
| 11 |
+
1. [Pipeline Overview](#pipeline-overview)
|
| 12 |
+
2. [How Sample Durations Are Generated](#how-sample-durations-are-generated)
|
| 13 |
+
3. [Configuration Reference](#configuration-reference)
|
| 14 |
+
4. [ESC-50 Preprocessing](#esc-50-preprocessing-duration-task-only)
|
| 15 |
+
5. [Audio Utilities](#audio-utilities)
|
| 16 |
+
6. [Task: COUNT](#task-count)
|
| 17 |
+
7. [Task: DURATION](#task-duration)
|
| 18 |
+
8. [Task: ORDER](#task-order)
|
| 19 |
+
9. [Task: VOLUME](#task-volume)
|
| 20 |
+
10. [Deterministic Balancing Mechanisms](#deterministic-balancing-mechanisms)
|
| 21 |
+
11. [Rejection Logic and Retry Mechanisms](#rejection-logic-and-retry-mechanisms)
|
| 22 |
+
12. [Command-Line Arguments](#command-line-arguments)
|
| 23 |
+
13. [Summary](#summary)
|
| 24 |
+
|
| 25 |
+
---
|
| 26 |
+
|
| 27 |
+
## Pipeline Overview
|
| 28 |
+
|
| 29 |
+
### Architecture
|
| 30 |
+
|
| 31 |
+
The pipeline generates four types of audio-based question-answering samples:
|
| 32 |
+
|
| 33 |
+
| Task | Question Type | Example Question |
|
| 34 |
+
|------|---------------|------------------|
|
| 35 |
+
| **COUNT** | Counting unique sounds | "How many unique sounds do you hear?" |
|
| 36 |
+
| **DURATION** | Temporal comparison | "Which sound plays for the longest duration?" |
|
| 37 |
+
| **ORDER** | Temporal ordering | "Which sound plays first/last/after X?" |
|
| 38 |
+
| **VOLUME** | Loudness comparison | "Which sound is the loudest/softest?" |
|
| 39 |
+
|
| 40 |
+
### Directory Structure
|
| 41 |
+
|
| 42 |
+
```
|
| 43 |
+
pipeline/
|
| 44 |
+
├── main.py # Entry point - orchestrates all tasks
|
| 45 |
+
├── config.yaml # All configuration parameters
|
| 46 |
+
├── tasks/
|
| 47 |
+
│ ├── task_count.py # CountTaskGenerator class
|
| 48 |
+
│ ├── task_duration.py # DurationTaskGenerator class
|
| 49 |
+
│ ├── task_order.py # OrderTaskGenerator class
|
| 50 |
+
│ └── task_volume.py # VolumeTaskGenerator class
|
| 51 |
+
├── utils/
|
| 52 |
+
│ ├── __init__.py # Exports all utilities
|
| 53 |
+
│ ├── audio_utils.py # Audio processing functions
|
| 54 |
+
│ ├── dataset_utils.py # ESC50Dataset, PreprocessedESC50Dataset
|
| 55 |
+
│ ├── question_utils.py # QuestionGenerator
|
| 56 |
+
│ ├── llm_utils.py # LLMQuestionGenerator
|
| 57 |
+
│ └── logger.py # setup_logger
|
| 58 |
+
└── output/ # Generated outputs
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
### Data Flow
|
| 62 |
+
|
| 63 |
+
```
|
| 64 |
+
ESC-50 Dataset (2000 clips, 50 categories, 5s each)
|
| 65 |
+
↓
|
| 66 |
+
[DURATION TASK ONLY] Preprocessing Script (preprocess_esc50.py)
|
| 67 |
+
├── Detects sound regions using adaptive noise-floor thresholding
|
| 68 |
+
├── Trims leading/trailing silence (keeps internal structure)
|
| 69 |
+
└── Calculates effective durations
|
| 70 |
+
↓
|
| 71 |
+
ESC-50_preprocessed/
|
| 72 |
+
├── effective_durations.csv (metadata with effective durations)
|
| 73 |
+
└── trimmed_audio/*.wav (edge-trimmed clips)
|
| 74 |
+
↓
|
| 75 |
+
Pipeline (task-specific generation with balancing)
|
| 76 |
+
├── COUNT: Uses raw ESC-50 clips
|
| 77 |
+
├── DURATION: Uses preprocessed clips with effective durations
|
| 78 |
+
├── ORDER: Uses raw ESC-50 clips
|
| 79 |
+
└── VOLUME: Uses raw ESC-50 clips (normalized then volume-adjusted)
|
| 80 |
+
↓
|
| 81 |
+
output/{task}/
|
| 82 |
+
├── audios/*.wav (generated audio samples)
|
| 83 |
+
├── {task}_mcq.csv (multiple choice questions)
|
| 84 |
+
├── {task}_open_text.csv (open-ended questions)
|
| 85 |
+
└── {task}_metadata.csv (detailed metadata)
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
### Entry Point: `main.py`
|
| 89 |
+
|
| 90 |
+
The main orchestration happens via individual task runner functions:
|
| 91 |
+
|
| 92 |
+
```python
|
| 93 |
+
def run_count_task(config: dict, logger):
|
| 94 |
+
generator = CountTaskGenerator(config, logger)
|
| 95 |
+
generator.dataset.reset_category_usage()
|
| 96 |
+
generator.generate_dataset()
|
| 97 |
+
|
| 98 |
+
def run_duration_task(config: dict, logger):
|
| 99 |
+
generator = DurationTaskGenerator(config, logger)
|
| 100 |
+
generator.dataset.reset_category_usage()
|
| 101 |
+
generator.generate_dataset()
|
| 102 |
+
|
| 103 |
+
def run_order_task(config: dict, logger):
|
| 104 |
+
generator = OrderTaskGenerator(config, logger)
|
| 105 |
+
generator.dataset.reset_category_usage()
|
| 106 |
+
generator.generate_dataset()
|
| 107 |
+
|
| 108 |
+
def run_volume_task(config: dict, logger):
|
| 109 |
+
generator = VolumeTaskGenerator(config, logger)
|
| 110 |
+
generator.dataset.reset_category_usage()
|
| 111 |
+
generator.generate_dataset()
|
| 112 |
+
```
|
| 113 |
+
|
| 114 |
+
---
|
| 115 |
+
|
| 116 |
+
## How Sample Durations Are Generated
|
| 117 |
+
|
| 118 |
+
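**IMPORTANT**: Sample durations are generated upfront to **fill the target task duration as completely as possible**: the total never exceeds the target, and the unfilled remainder is always shorter than `min_clip_duration`.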
|
| 119 |
+
|
| 120 |
+
### The Algorithm
|
| 121 |
+
|
| 122 |
+
Located in `utils/audio_utils.py`:
|
| 123 |
+
|
| 124 |
+
```python
|
| 125 |
+
def generate_sample_durations_for_task(
|
| 126 |
+
task_duration_hours: float,
|
| 127 |
+
min_clip_duration: float,
|
| 128 |
+
max_clip_duration: float
|
| 129 |
+
) -> list:
|
| 130 |
+
"""
|
| 131 |
+
Generate sample durations that exactly fill the target task duration.
|
| 132 |
+
"""
|
| 133 |
+
task_duration_seconds = task_duration_hours * 3600
|
| 134 |
+
remaining = task_duration_seconds
|
| 135 |
+
durations = []
|
| 136 |
+
|
| 137 |
+
while remaining >= min_clip_duration:
|
| 138 |
+
# Cap max at remaining to avoid overshoot
|
| 139 |
+
effective_max = min(max_clip_duration, remaining)
|
| 140 |
+
|
| 141 |
+
# If remaining is less than min, we can't fit another sample
|
| 142 |
+
if effective_max < min_clip_duration:
|
| 143 |
+
break
|
| 144 |
+
|
| 145 |
+
# Sample uniformly within valid range
|
| 146 |
+
d = random.uniform(min_clip_duration, effective_max)
|
| 147 |
+
durations.append(d)
|
| 148 |
+
remaining -= d
|
| 149 |
+
|
| 150 |
+
# Shuffle to randomize order
|
| 151 |
+
random.shuffle(durations)
|
| 152 |
+
|
| 153 |
+
return durations
|
| 154 |
+
```
|
| 155 |
+
|
| 156 |
+
1. Start with `remaining = total_seconds`
|
| 157 |
+
2. While `remaining >= min_clip_duration`:
|
| 158 |
+
- Sample `d ~ Uniform(min, min(max, remaining))`
|
| 159 |
+
- Append `d` to durations list
|
| 160 |
+
- Subtract `d` from remaining
|
| 161 |
+
3. Shuffle and return
|
| 162 |
+
|
| 163 |
+
### Mathematical Properties
|
| 164 |
+
|
| 165 |
+
**Guarantee**: $\sum_{i=1}^{N} d_i \leq T$ and $T - \sum d_i < d_{\min}$
|
| 166 |
+
|
| 167 |
+
Where:
|
| 168 |
+
- $T$ = total task duration
|
| 169 |
+
- $d_i$ = duration of sample $i$
|
| 170 |
+
- $d_{\min}$ = minimum clip duration
|
| 171 |
+
- $N$ = number of samples generated (variable, not fixed!)
|
| 172 |
+
|
| 173 |
+
**Each duration**: $d_i \sim \text{Uniform}(d_{\min}, \min(d_{\max}, \text{remaining}_i))$
|
| 174 |
+
|
| 175 |
+
### Example
|
| 176 |
+
|
| 177 |
+
With `task_duration_size = 1.0` hours (3600s), `min = 20s`, `max = 60s`:
|
| 178 |
+
|
| 179 |
+
```
|
| 180 |
+
remaining=3600 → d₁=45.2s → remaining=3554.8
|
| 181 |
+
remaining=3554.8 → d₂=28.7s → remaining=3526.1
|
| 182 |
+
remaining=3526.1 → d₃=52.1s → remaining=3474.0
|
| 183 |
+
...
|
| 184 |
+
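remaining=35.2 → d₈₉=31.6s → remaining=3.6 (upper bound capped at remaining; 3.6 < min, so generation stops)
|
| 185 |
+
```
|
| 186 |
+
|
| 187 |
+
Result: 89 samples totaling 3596.4s, i.e. within `min` (20s) of the 3600s target. The sample count varies from run to run (a fixed-size estimate would have been 90).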
|
| 188 |
+
|
| 189 |
+
### Where It's Called
|
| 190 |
+
|
| 191 |
+
Each task's `generate_dataset()` method uses this:
|
| 192 |
+
|
| 193 |
+
```python
|
| 194 |
+
def generate_dataset(self) -> tuple:
|
| 195 |
+
# Generate all durations upfront
|
| 196 |
+
sample_durations = generate_sample_durations_for_task(
|
| 197 |
+
self.task_duration_hours,
|
| 198 |
+
self.min_clip_duration,
|
| 199 |
+
self.max_clip_duration
|
| 200 |
+
)
|
| 201 |
+
num_samples = len(sample_durations)
|
| 202 |
+
|
| 203 |
+
self.logger.info(f"Generating {num_samples} samples...")
|
| 204 |
+
|
| 205 |
+
# Each sample uses its pre-assigned duration
|
| 206 |
+
for i, target_duration in enumerate(sample_durations):
|
| 207 |
+
metadata = self.generate_sample(i, target_duration=target_duration, ...)
|
| 208 |
+
```
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
---
|
| 212 |
+
|
| 213 |
+
## Configuration Reference
|
| 214 |
+
|
| 215 |
+
All parameters are defined in `config.yaml`.
|
| 216 |
+
|
| 217 |
+
### Dataset Class Subset Configuration
|
| 218 |
+
|
| 219 |
+
```yaml
|
| 220 |
+
dataset:
|
| 221 |
+
use_class_subset: false # Enable to use only a subset of ESC-50 classes
|
| 222 |
+
num_classes_subset: 40 # Number of classes for train/val/test (e.g., 40 of 50)
|
| 223 |
+
subset_persist_path: "output/class_subset.json" # Path to save/load class subset
|
| 224 |
+
subset_seed: 42 # Random seed for subset selection (persisted)
|
| 225 |
+
```
|
| 226 |
+
|
| 227 |
+
**Purpose**: Create in-distribution (ID) splits using a subset of classes, then optionally evaluate on out-of-distribution (OOD) data generated from all classes.
|
| 228 |
+
|
| 229 |
+
**Workflow**:
|
| 230 |
+
1. Set `use_class_subset: true` and `num_classes_subset: 40`
|
| 231 |
+
2. Run pipeline - 40 classes randomly selected and saved to `class_subset.json`
|
| 232 |
+
3. Generate train/val/test splits - all use same 40 classes
|
| 233 |
+
4. For OOD test: Set `use_class_subset: false`, use different output path
|
| 234 |
+
|
| 235 |
+
### Global Audio Parameters
|
| 236 |
+
|
| 237 |
+
```yaml
|
| 238 |
+
audio:
|
| 239 |
+
min_clip_duration: 20.0 # Minimum generated clip duration (seconds)
|
| 240 |
+
max_clip_duration: 60.0 # Maximum generated clip duration (seconds)
|
| 241 |
+
source_clip_duration: 5.0 # ESC-50 clip length (seconds)
|
| 242 |
+
|
| 243 |
+
# Silence and crossfade parameters (applied to ALL tasks)
|
| 244 |
+
min_silence_duration: 100 # Minimum silence ALWAYS between clips (ms)
|
| 245 |
+
max_extra_silence_per_gap: 500 # Max extra silence per gap when distributing remainder (ms)
|
| 246 |
+
crossfade_duration: 500 # Crossfade between audio-silence transitions (ms) for smooth joins
|
| 247 |
+
crossfade_within_source: 50 # Small crossfade within same-source repetitions (ms) for COUNT task
|
| 248 |
+
with_silence: true # Enable silence insertion between clips
|
| 249 |
+
|
| 250 |
+
normalize: false
|
| 251 |
+
normalize_target_dBFS: -20.0
|
| 252 |
+
```
|
| 253 |
+
|
| 254 |
+
### Task-Specific Parameters
|
| 255 |
+
|
| 256 |
+
#### COUNT Task
|
| 257 |
+
```yaml
|
| 258 |
+
count:
|
| 259 |
+
enabled: true
|
| 260 |
+
task_duration_size: 2.0 # Hours of total audio to generate
|
| 261 |
+
max_clips_per_sample: 10 # Maximum unique sounds per sample (1 to 10)
|
| 262 |
+
ordering_mode: "random" # "random" (shuffled clips) or "consecutive" (grouped by source)
|
| 263 |
+
|
| 264 |
+
# CAPACITY-AWARE ANSWER BALANCING:
|
| 265 |
+
# - Creates balanced distribution of answers from 1 to max_clips_per_sample
|
| 266 |
+
# - Sorts samples by capacity (max_clips each can fit)
|
| 267 |
+
# - Assigns higher targets to high-capacity samples
|
| 268 |
+
# - Clamps targets to what actually fits (reduces excessive silence)
|
| 269 |
+
```
|
| 270 |
+
|
| 271 |
+
#### DURATION Task
|
| 272 |
+
```yaml
|
| 273 |
+
duration:
|
| 274 |
+
enabled: true
|
| 275 |
+
task_duration_size: 2.0
|
| 276 |
+
preprocessed_data_path: "/home/debarpanb1/TREA_2.0/ESC-50_preprocessed"
|
| 277 |
+
question_types: ["shortest", "longest"]
|
| 278 |
+
num_unique_sources: 10 # Can be int or list (e.g., [2,3,4,5])
|
| 279 |
+
ordering_methods: ["consecutive"] # Only consecutive for duration task
|
| 280 |
+
|
| 281 |
+
# Preprocessing parameters (adaptive noise-floor thresholding)
|
| 282 |
+
threshold_strategy: "noise_floor" # Adaptive per-clip (recommended)
|
| 283 |
+
noise_floor_percentile: 2.0 # Use 2nd percentile as noise floor
|
| 284 |
+
noise_floor_delta_db: 5.0 # Threshold = noise_floor + 5dB
|
| 285 |
+
min_sound_duration_ms: 25 # Filter transient spikes
|
| 286 |
+
|
| 287 |
+
# Gap multipliers
|
| 288 |
+
multiplier_longest: 1.5 # Target must be ≥ 1.5x max background
|
| 289 |
+
multiplier_shortest: 0.75 # Target must be ≤ 0.75x min background (changed from 0.5)
|
| 290 |
+
min_effective_duration_per_source: 1.0 # Minimum duration per source (seconds)
|
| 291 |
+
|
| 292 |
+
reject_if_gap_not_met: true
|
| 293 |
+
sample_different_clips_same_class: true
|
| 294 |
+
```
|
| 295 |
+
|
| 296 |
+
#### ORDER Task
|
| 297 |
+
```yaml
|
| 298 |
+
order:
|
| 299 |
+
enabled: true
|
| 300 |
+
task_duration_size: 2.0
|
| 301 |
+
max_clips_per_sample: 10 # Cap for maximum clips to join
|
| 302 |
+
question_types: ["first", "last", "second", "second_last", "after", "before"]
|
| 303 |
+
min_clips_for_second_questions: 3 # "second" and "second_last" require ≥3 clips
|
| 304 |
+
allow_source_repetition: false # Each clip from unique source
|
| 305 |
+
|
| 306 |
+
# CAPACITY-AWARE QUESTION TYPE BALANCING:
|
| 307 |
+
# - Each question type appears equally across samples
|
| 308 |
+
# - Advanced types (second, second_last) assigned to high-capacity samples
|
| 309 |
+
# - Basic types (first, last, after, before) for lower-capacity samples
|
| 310 |
+
# - NO n_clips balancing: randomly samples from [max(2, max_clips-3), max_clips_per_sample]
|
| 311 |
+
```
|
| 312 |
+
|
| 313 |
+
#### VOLUME Task
|
| 314 |
+
```yaml
|
| 315 |
+
volume:
|
| 316 |
+
enabled: true
|
| 317 |
+
task_duration_size: 2.0
|
| 318 |
+
max_clips_per_sample: 10 # Cap for maximum clips with different volumes
|
| 319 |
+
question_types: ["max_loudness", "min_loudness"]
|
| 320 |
+
|
| 321 |
+
# Normalization (CRITICAL for controlled volume comparison)
|
| 322 |
+
normalize_to_baseline: true
|
| 323 |
+
baseline_dBFS: -20.0 # All clips normalized to this level first
|
| 324 |
+
use_lufs: false # DISABLED - LUFS normalization would give every clip the same perceived loudness!
|
| 325 |
+
baseline_lufs: -23.0 # EBU R128 standard (not used when use_lufs=false)
|
| 326 |
+
|
| 327 |
+
# Volume gap constraints (multipliers)
|
| 328 |
+
multiplier_max_loudness: 4.0 # Max must be ≥ 4x second-loudest (~12 dB)
|
| 329 |
+
multiplier_min_loudness: 0.25 # Min must be ≤ 0.25x second-softest (~12 dB)
|
| 330 |
+
reject_if_gap_not_met: true
|
| 331 |
+
|
| 332 |
+
# Source clip options
|
| 333 |
+
use_same_clip_different_volumes: false # Use different clips (not same clip repeated)
|
| 334 |
+
repetitions_per_source: [2, 3, 4] # If same clip used, how many repetitions
|
| 335 |
+
|
| 336 |
+
# QUESTION TYPE BALANCING: Each question type appears equally across samples
|
| 337 |
+
# NO n_clips balancing: randomly samples from [max(2, max_clips-3), max_clips_per_sample]
|
| 338 |
+
```
|
| 339 |
+
|
| 340 |
+
---
|
| 341 |
+
|
| 342 |
+
## ESC-50 Preprocessing (Duration Task Only)
|
| 343 |
+
|
| 344 |
+
**File**: `preprocess_esc50.py`
|
| 345 |
+
**Purpose**: Preprocess ESC-50 clips for duration task by detecting actual sound regions and trimming silence.
|
| 346 |
+
|
| 347 |
+
### Why Preprocessing?
|
| 348 |
+
|
| 349 |
+
The DURATION task compares sound durations. Raw ESC-50 clips have variable amounts of leading/trailing silence, which would make duration comparisons ambiguous. Preprocessing:
|
| 350 |
+
|
| 351 |
+
1. **Detects actual sound regions** using adaptive amplitude thresholding
|
| 352 |
+
2. **Trims leading and trailing silence** (preserves internal structure)
|
| 353 |
+
3. **Calculates effective duration** (sum of all sound regions)
|
| 354 |
+
4. **Generates metadata CSV** with per-clip durations
|
| 355 |
+
|
| 356 |
+
### Preprocessing Pipeline
|
| 357 |
+
|
| 358 |
+
```
|
| 359 |
+
Raw ESC-50 clip (5s with silence)
|
| 360 |
+
↓
|
| 361 |
+
1. Load audio and convert to amplitude array
|
| 362 |
+
2. Compute RMS envelope (frame-by-frame energy)
|
| 363 |
+
3. Convert RMS to dB values
|
| 364 |
+
4. Apply adaptive threshold strategy
|
| 365 |
+
5. Detect contiguous sound regions
|
| 366 |
+
6. Trim edges (only if silence >= 100ms)
|
| 367 |
+
7. Calculate effective duration
|
| 368 |
+
8. Save trimmed audio + metadata
|
| 369 |
+
```
|
| 370 |
+
|
| 371 |
+
### Adaptive Noise-Floor Thresholding
|
| 372 |
+
|
| 373 |
+
The preprocessing uses an **adaptive per-clip threshold** strategy:
|
| 374 |
+
|
| 375 |
+
```python
|
| 376 |
+
# Strategy: 'noise_floor' (adaptive, recommended)
|
| 377 |
+
noise_floor_db = np.percentile(db_values, noise_floor_percentile) # e.g., 2nd percentile
|
| 378 |
+
absolute_threshold = noise_floor_db + noise_floor_delta_db # e.g., +5 dB above noise floor
|
| 379 |
+
```
|
| 380 |
+
|
| 381 |
+
**Key Parameters** (from `config.yaml`):
|
| 382 |
+
```yaml
|
| 383 |
+
duration:
|
| 384 |
+
threshold_strategy: "noise_floor" # Adaptive per-clip (recommended)
|
| 385 |
+
noise_floor_percentile: 2.0 # Use 2nd percentile as noise floor estimate
|
| 386 |
+
noise_floor_delta_db: 5.0 # Threshold = noise_floor + 5 dB
|
| 387 |
+
min_sound_duration_ms: 25 # Filter out transient spikes < 25ms
|
| 388 |
+
```
|
| 389 |
+
|
| 390 |
+
**Why Adaptive?**
|
| 391 |
+
- Each clip has different background noise levels
|
| 392 |
+
- Fixed threshold (e.g., -40 dB) works poorly across diverse sounds
|
| 393 |
+
- Adaptive threshold adjusts per-clip based on its own noise floor
|
| 394 |
+
|
| 395 |
+
**Alternative** (legacy):
|
| 396 |
+
```yaml
|
| 397 |
+
threshold_strategy: "peak_relative" # threshold = peak_dB - 20 dB (fixed offset)
|
| 398 |
+
amplitude_threshold_db: -20.0
|
| 399 |
+
```
|
| 400 |
+
|
| 401 |
+
### Edge Trimming Strategy
|
| 402 |
+
|
| 403 |
+
**ADAPTIVE EDGE-ONLY TRIMMING** - preserves natural periodicity:
|
| 404 |
+
|
| 405 |
+
```python
|
| 406 |
+
def extract_sound_with_edges_trimmed(audio, regions, min_silence_to_trim_ms=100, buffer_ratio=0.1):
|
| 407 |
+
"""
|
| 408 |
+
Trim ONLY leftmost and rightmost silence IF significant.
|
| 409 |
+
Preserves ALL internal structure (perfect for periodic sounds).
|
| 410 |
+
"""
|
| 411 |
+
leading_silence_ms = regions[0][0] # Time before first sound
|
| 412 |
+
trailing_silence_ms = len(audio) - regions[-1][1] # Time after last sound
|
| 413 |
+
|
| 414 |
+
# Only trim if silence >= 100ms
|
| 415 |
+
if leading_silence_ms >= min_silence_to_trim_ms:
|
| 416 |
+
buffer_ms = max(200, int(leading_silence_ms * 0.1)) # Keep 10% as buffer
|
| 417 |
+
trim_start_ms = max(0, regions[0][0] - buffer_ms)
|
| 418 |
+
else:
|
| 419 |
+
trim_start_ms = 0 # Keep from start
|
| 420 |
+
|
| 421 |
+
# Similar for trailing silence
|
| 422 |
+
...
|
| 423 |
+
|
| 424 |
+
return audio[trim_start_ms:trim_end_ms]
|
| 425 |
+
```
|
| 426 |
+
|
| 427 |
+
**Why Edge-Only?**
|
| 428 |
+
- Clock ticks, footsteps, typing have periodic silence between sounds
|
| 429 |
+
- Removing internal silences destroys natural rhythm
|
| 430 |
+
- Edge trimming removes irrelevant silence while preserving periodicity
|
| 431 |
+
|
| 432 |
+
### Output Files
|
| 433 |
+
|
| 434 |
+
```
|
| 435 |
+
ESC-50_preprocessed/
|
| 436 |
+
├── effective_durations.csv
|
| 437 |
+
│ ├── filename
|
| 438 |
+
│ ├── category
|
| 439 |
+
│ ├── raw_duration_s (original 5.0s)
|
| 440 |
+
│ ├── final_duration_s (after edge trimming)
|
| 441 |
+
│ ├── effective_duration_s (sum of sound regions)
|
| 442 |
+
│ ├── num_sound_regions
|
| 443 |
+
│ ├── peak_amplitude_db
|
| 444 |
+
│ ├── avg_rms_db
|
| 445 |
+
│ └── threshold_strategy, noise_floor_percentile, noise_floor_delta_db
|
| 446 |
+
└── trimmed_audio/
|
| 447 |
+
├── 1-100032-A-0.wav (edge-trimmed clips)
|
| 448 |
+
└── ...
|
| 449 |
+
```
|
| 450 |
+
|
| 451 |
+
### Running Preprocessing
|
| 452 |
+
|
| 453 |
+
```bash
|
| 454 |
+
# Using config defaults
|
| 455 |
+
python preprocess_esc50.py --config config.yaml
|
| 456 |
+
|
| 457 |
+
# Override parameters
|
| 458 |
+
python preprocess_esc50.py --config config.yaml \
|
| 459 |
+
--threshold-strategy noise_floor \
|
| 460 |
+
--noise-floor-percentile 2.0 \
|
| 461 |
+
--noise-floor-delta-db 5.0 \
|
| 462 |
+
--min-sound-ms 25
|
| 463 |
+
|
| 464 |
+
# Don't save trimmed audio (only CSV)
|
| 465 |
+
python preprocess_esc50.py --config config.yaml --no-trimmed-audio
|
| 466 |
+
```
|
| 467 |
+
|
| 468 |
+
### Preprocessing Statistics Example
|
| 469 |
+
|
| 470 |
+
```
|
| 471 |
+
ESC-50 Preprocessing Summary
|
| 472 |
+
============================================================
|
| 473 |
+
Total clips processed: 2000
|
| 474 |
+
Successfully processed: 2000
|
| 475 |
+
|
| 476 |
+
Raw duration statistics:
|
| 477 |
+
Mean: 5.000s Std: 0.000s Min: 5.000s Max: 5.000s
|
| 478 |
+
|
| 479 |
+
Final duration statistics (edges trimmed):
|
| 480 |
+
Mean: 4.723s Std: 0.412s Min: 2.134s Max: 5.000s
|
| 481 |
+
|
| 482 |
+
Effective duration statistics (sum of sound regions):
|
| 483 |
+
Mean: 3.856s Std: 0.823s Min: 0.542s Max: 4.982s
|
| 484 |
+
|
| 485 |
+
Comparison:
|
| 486 |
+
Avg effective: 3.856s
|
| 487 |
+
Avg final: 4.723s
|
| 488 |
+
Difference: 0.867s (internal silences preserved)
|
| 489 |
+
|
| 490 |
+
Average edge trimming reduction: 5.5%
|
| 491 |
+
```
|
| 492 |
+
|
| 493 |
+
### How Duration Task Uses Preprocessed Data
|
| 494 |
+
|
| 495 |
+
The `DurationTaskGenerator` loads preprocessed data:
|
| 496 |
+
|
| 497 |
+
```python
|
| 498 |
+
self.preprocessed_dataset = PreprocessedESC50Dataset(
|
| 499 |
+
metadata_csv=config['tasks']['duration']['preprocessed_data_path'] + '/effective_durations.csv',
|
| 500 |
+
audio_dir=config['tasks']['duration']['preprocessed_data_path'] + '/trimmed_audio'
|
| 501 |
+
)
|
| 502 |
+
|
| 503 |
+
# Calculate average effective duration for slot distribution
|
| 504 |
+
effective_durations = self.preprocessed_dataset.metadata_df['effective_duration_s']
|
| 505 |
+
self.avg_effective_duration = effective_durations.mean() # ~3.856s
|
| 506 |
+
```
|
| 507 |
+
|
| 508 |
+
---
|
| 509 |
+
|
| 510 |
+
## Audio Utilities
|
| 511 |
+
|
| 512 |
+
Located in `utils/audio_utils.py`.
|
| 513 |
+
|
| 514 |
+
### `generate_single_clip_duration(min_duration, max_duration) → float`
|
| 515 |
+
|
| 516 |
+
**Purpose**: Generate a random target clip duration using UNIFORM sampling.
|
| 517 |
+
|
| 518 |
+
**Implementation**:
|
| 519 |
+
```python
|
| 520 |
+
def generate_single_clip_duration(min_duration: float, max_duration: float) -> float:
|
| 521 |
+
return random.uniform(min_duration, max_duration)
|
| 522 |
+
```
|
| 523 |
+
|
| 524 |
+
**Mathematical Formulation**:
|
| 525 |
+
$$d \sim \text{Uniform}(d_{\min}, d_{\max})$$
|
| 526 |
+
|
| 527 |
+
With default values (20s, 60s):
|
| 528 |
+
- Mean: $\mu = \frac{20 + 60}{2} = 40$ seconds
|
| 529 |
+
- Standard Deviation: $\sigma = \frac{60 - 20}{\sqrt{12}} \approx 11.5$ seconds
|
| 530 |
+
|
| 531 |
+
---
|
| 532 |
+
|
| 533 |
+
### `get_max_clip_num_to_be_joined(target_duration_s, source_duration_s, min_silence_ms) → Tuple[int, float]`
|
| 534 |
+
|
| 535 |
+
**Purpose**: Calculate maximum number of source clips that can fit in target duration.
|
| 536 |
+
|
| 537 |
+
**Returns**: Tuple of (max_clips, remainder_seconds)
|
| 538 |
+
|
| 539 |
+
**Implementation** (conceptual):
|
| 540 |
+
```python
|
| 541 |
+
def get_max_clip_num_to_be_joined(target_s, source_s, min_silence_ms):
|
| 542 |
+
silence_s = min_silence_ms / 1000.0
|
| 543 |
+
# Each clip + silence except last
|
| 544 |
+
effective_unit = source_s + silence_s
|
| 545 |
+
max_clips = int((target_s + silence_s) / effective_unit)
|
| 546 |
+
remainder = target_s - (max_clips * source_s + (max_clips - 1) * silence_s)
|
| 547 |
+
return max_clips, remainder
|
| 548 |
+
```
|
| 549 |
+
|
| 550 |
+
**Mathematical Formula**:
|
| 551 |
+
$$N_{\max} = \left\lfloor \frac{T + g}{S + g} \right\rfloor$$
|
| 552 |
+
|
| 553 |
+
Where:
|
| 554 |
+
- $T$ = target duration (seconds)
|
| 555 |
+
- $S$ = source clip duration (5.0s for ESC-50)
|
| 556 |
+
- $g$ = minimum silence gap (seconds)
|
| 557 |
+
|
| 558 |
+
---
|
| 559 |
+
|
| 560 |
+
### `build_count_task_audio(source_audios, source_categories, target_duration, ...)`
|
| 561 |
+
|
| 562 |
+
**Purpose**: Build the final audio for COUNT task.
|
| 563 |
+
|
| 564 |
+
**Parameters**:
|
| 565 |
+
- `source_audios`: List of AudioSegment objects (one per category)
|
| 566 |
+
- `source_categories`: List of category names
|
| 567 |
+
- `target_duration`: Target total duration in seconds
|
| 568 |
+
- `ordering_mode`: "random" or "consecutive"
|
| 569 |
+
- `source_clip_duration_seconds`: Duration of each source clip
|
| 570 |
+
- `min_silence_ms`, `max_extra_silence_per_gap_ms`: Silence parameters
|
| 571 |
+
|
| 572 |
+
**Returns**: Tuple of (final_audio, clip_sequence, build_metadata)
|
| 573 |
+
|
| 574 |
+
---
|
| 575 |
+
|
| 576 |
+
### `build_duration_task_audio(...)`
|
| 577 |
+
|
| 578 |
+
**Purpose**: Build audio for DURATION task with slot distribution.
|
| 579 |
+
|
| 580 |
+
---
|
| 581 |
+
|
| 582 |
+
### `build_clip_sequence_with_silences(clips, target_duration_s, min_silence_ms, max_extra_silence_per_gap_ms, crossfade_ms)`
|
| 583 |
+
|
| 584 |
+
**Purpose**: Concatenate clips with random silence gaps and smooth crossfades.
|
| 585 |
+
|
| 586 |
+
**Algorithm**:
|
| 587 |
+
1. Calculate total audio content duration
|
| 588 |
+
2. Calculate minimum required silence: `(n_clips - 1) × min_silence_ms`
|
| 589 |
+
3. Calculate available extra time: `target_duration - total_audio - min_silence`
|
| 590 |
+
4. Distribute extra time randomly across gaps (up to `max_extra_silence_per_gap_ms` per gap)
|
| 591 |
+
5. Build sequence with crossfades:
|
| 592 |
+
- Audio → Silence: crossfade for smooth transition
|
| 593 |
+
- Silence → Audio: No crossfade (preserves audio start)
|
| 594 |
+
|
| 595 |
+
**Crossfade Benefits**:
|
| 596 |
+
- Smooth transitions between audio and silence
|
| 597 |
+
- Reduces clicks/pops at audio boundaries
|
| 598 |
+
- Preserves natural sound attack (no crossfade at audio start)
|
| 599 |
+
|
| 600 |
+
---
|
| 601 |
+
|
| 602 |
+
## Task: COUNT
|
| 603 |
+
|
| 604 |
+
**File**: `tasks/task_count.py`
|
| 605 |
+
**Class**: `CountTaskGenerator`
|
| 606 |
+
|
| 607 |
+
### Complete Flow
|
| 608 |
+
|
| 609 |
+
```
|
| 610 |
+
CountTaskGenerator.__init__(config, logger)
|
| 611 |
+
↓
|
| 612 |
+
Initialize:
|
| 613 |
+
- ESC50Dataset (loads metadata, tracks category usage)
|
| 614 |
+
- AudioProcessor
|
| 615 |
+
- QuestionGenerator
|
| 616 |
+
- LLMQuestionGenerator (if enabled)
|
| 617 |
+
↓
|
| 618 |
+
generate_dataset()
|
| 619 |
+
↓
|
| 620 |
+
1. num_samples = calculate_num_samples_for_task(task_duration_hours, min, max)
|
| 621 |
+
2. Create balanced_answers list from max_clips_per_sample (answers 1..max_clips_per_sample)
|
| 622 |
+
3. Shuffle balanced_answers
|
| 623 |
+
4. For each sample:
|
| 624 |
+
generate_sample(sample_id, target_unique_count=balanced_answers[i])
|
| 625 |
+
5. Save CSVs
|
| 626 |
+
```
|
| 627 |
+
|
| 628 |
+
### Key Method: `generate_sample(sample_id, target_unique_count)`
|
| 629 |
+
|
| 630 |
+
**Pipeline**:
|
| 631 |
+
1. Generate random target duration: `clip_duration_seconds = generate_single_clip_duration(min, max)`
|
| 632 |
+
2. Calculate max clips: `max_clips, remainder = get_max_clip_num_to_be_joined(...)`
|
| 633 |
+
3. Cap `n_unique_audios` at min(target_unique_count, max_clips, 50)
|
| 634 |
+
4. Select categories: `selected_categories = dataset.get_least_used_categories(n_unique_audios)`
|
| 635 |
+
5. Track usage: Increment `category_usage_counts` for each selected category
|
| 636 |
+
6. Sample one file per category: `dataset.sample_file_from_category(category)`
|
| 637 |
+
7. Load source audios
|
| 638 |
+
8. Build final audio: `build_count_task_audio(source_audios, categories, target_duration, ordering_mode, ...)`
|
| 639 |
+
9. Export audio file
|
| 640 |
+
10. Generate MCQ and open-text questions
|
| 641 |
+
11. Return metadata dict
|
| 642 |
+
|
| 643 |
+
### Balanced Answer Distribution (Updated with max_clips_per_sample)
|
| 644 |
+
|
| 645 |
+
```python
|
| 646 |
+
# In generate_dataset()
|
| 647 |
+
max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10) # Single number: 10
|
| 648 |
+
possible_answers = list(range(1, max_clips_per_sample + 1)) # [1, 2, 3, ..., 10]
|
| 649 |
+
|
| 650 |
+
samples_per_answer = num_samples // len(possible_answers)
|
| 651 |
+
remainder = num_samples % len(possible_answers)
|
| 652 |
+
|
| 653 |
+
balanced_answers = []
|
| 654 |
+
for answer in possible_answers:
|
| 655 |
+
count = samples_per_answer + (1 if remainder > 0 else 0)
|
| 656 |
+
balanced_answers.extend([answer] * count)
|
| 657 |
+
remainder = max(0, remainder - 1)
|
| 658 |
+
|
| 659 |
+
random.shuffle(balanced_answers)
|
| 660 |
+
```
|
| 661 |
+
|
| 662 |
+
**For 90 samples, max_clips_per_sample=10**: Each answer (1-10) appears exactly 9 times.
|
| 663 |
+
|
| 664 |
+
### Silence Reduction Strategy (NEW)
|
| 665 |
+
|
| 666 |
+
Each sample's target answer is capped at what actually fits in the duration:
|
| 667 |
+
|
| 668 |
+
```python
|
| 669 |
+
# In generate_sample()
|
| 670 |
+
max_clips, _ = get_max_clip_num_to_be_joined(clip_duration_seconds, source_clip_duration, min_silence_ms)
|
| 671 |
+
|
| 672 |
+
if target_unique_count is not None:
|
| 673 |
+
# Cap target at what actually fits (reduces silence)
|
| 674 |
+
n_unique_audios = min(target_unique_count, max_clips, len(CATEGORIES))
|
| 675 |
+
```
|
| 676 |
+
|
| 677 |
+
**Example**:
|
| 678 |
+
- Target answer from balanced pool: **8 unique sounds**
|
| 679 |
+
- Duration allows: **max_clips = 7**
|
| 680 |
+
- Actual n_unique_audios: **min(8, 7) = 7** ✓ (uses max possible, reduces silence)
|
| 681 |
+
|
| 682 |
+
**Why?** Prevents excessive silence when target exceeds what fits in duration.
|
| 683 |
+
|
| 684 |
+
---
|
| 685 |
+
|
| 686 |
+
## Task: DURATION
|
| 687 |
+
|
| 688 |
+
**File**: `tasks/task_duration.py`
|
| 689 |
+
**Class**: `DurationTaskGenerator`
|
| 690 |
+
|
| 691 |
+
### Complete Flow
|
| 692 |
+
|
| 693 |
+
```
|
| 694 |
+
DurationTaskGenerator.__init__(config, logger)
|
| 695 |
+
↓
|
| 696 |
+
Initialize:
|
| 697 |
+
- PreprocessedESC50Dataset (uses effective_durations.csv)
|
| 698 |
+
- Calculate avg_effective_duration from preprocessed data
|
| 699 |
+
- AudioProcessor, QuestionGenerator
|
| 700 |
+
- Load multiplier_longest, multiplier_shortest from config
|
| 701 |
+
↓
|
| 702 |
+
generate_dataset()
|
| 703 |
+
↓
|
| 704 |
+
1. num_samples = calculate_num_samples_for_task(...)
|
| 705 |
+
2. Create balanced question types: ["longest"] * N/2 + ["shortest"] * N/2 (e.g., 45 + 45 for 90 samples)
|
| 706 |
+
3. Shuffle balanced_types
|
| 707 |
+
4. While len(samples) < num_samples:
|
| 708 |
+
generate_sample(sample_idx, question_type=balanced_types[idx])
|
| 709 |
+
If returns None → increment rejection_count, continue
|
| 710 |
+
5. Save CSVs
|
| 711 |
+
```
|
| 712 |
+
|
| 713 |
+
### Key Methods
|
| 714 |
+
|
| 715 |
+
#### `_calculate_max_clips_and_sources(target_duration_s, question_type)`
|
| 716 |
+
|
| 717 |
+
**Purpose**: Determine valid number of sources based on question type and duration.
|
| 718 |
+
|
| 719 |
+
**For LONGEST**:
|
| 720 |
+
- Target needs ≥2 clips to beat backgrounds by 1.5x
|
| 721 |
+
- `min_valid_sources = 2`
|
| 722 |
+
- `max_valid_sources = max_clips - 2 + 1`
|
| 723 |
+
|
| 724 |
+
**For SHORTEST**:
|
| 725 |
+
- Target gets 1 clip
|
| 726 |
+
- Each background needs ≥2 clips to be 2x target
|
| 727 |
+
- `max_valid_sources = 1 + (max_clips - 1) // 2`
|
| 728 |
+
|
| 729 |
+
```python
|
| 730 |
+
# Filter config values to valid range, then pick RANDOMLY
|
| 731 |
+
valid_config_sources = [n for n in num_sources_config if min_valid <= n <= max_valid]
|
| 732 |
+
n_sources = random.choice(valid_config_sources)
|
| 733 |
+
```
|
| 734 |
+
|
| 735 |
+
#### `_try_generate_sample(sample_id, question_type)`
|
| 736 |
+
|
| 737 |
+
**Full Algorithm**:
|
| 738 |
+
1. Generate target duration: `generate_single_clip_duration(min, max)`
|
| 739 |
+
2. Calculate max_clips and n_sources: `_calculate_max_clips_and_sources(...)`
|
| 740 |
+
3. Select target category (least used)
|
| 741 |
+
4. Select background categories (from remaining least used)
|
| 742 |
+
5. Calculate slot distribution based on question_type
|
| 743 |
+
6. For each category, select source files and generate clip durations
|
| 744 |
+
7. Load and trim clips
|
| 745 |
+
8. Calculate total effective duration per category
|
| 746 |
+
9. Verify gap constraint
|
| 747 |
+
10. If gap not satisfied, try `_try_improve_slot_distribution()`
|
| 748 |
+
11. If still not satisfied, return None (triggers retry)
|
| 749 |
+
12. Build audio and generate questions
|
| 750 |
+
13. Return metadata
|
| 751 |
+
|
| 752 |
+
#### `_try_improve_slot_distribution(slot_distribution, durations, question_type, max_clips)`
|
| 753 |
+
|
| 754 |
+
**Purpose**: Redistribute slots to satisfy gap constraint.
|
| 755 |
+
|
| 756 |
+
---
|
| 757 |
+
|
| 758 |
+
## Task: ORDER
|
| 759 |
+
|
| 760 |
+
**File**: `tasks/task_order.py`
|
| 761 |
+
**Class**: `OrderTaskGenerator`
|
| 762 |
+
|
| 763 |
+
### Complete Flow
|
| 764 |
+
|
| 765 |
+
```
|
| 766 |
+
OrderTaskGenerator.__init__(config, logger)
|
| 767 |
+
↓
|
| 768 |
+
Initialize ESC50Dataset, AudioProcessor, QuestionGenerator
|
| 769 |
+
↓
|
| 770 |
+
generate_dataset()
|
| 771 |
+
↓
|
| 772 |
+
1. Generate sample durations upfront (exact fill)
|
| 773 |
+
2. num_samples = len(sample_durations)
|
| 774 |
+
3. Create balanced question_types distribution
|
| 775 |
+
4. For each sample:
|
| 776 |
+
generate_sample(sample_id, target_question_type=balanced_types[i])
|
| 777 |
+
→ n_clips randomly selected from [max(2, max_clips-3), min(max_clips, max_clips_per_sample)]
|
| 778 |
+
5. Save CSVs
|
| 779 |
+
```
|
| 780 |
+
|
| 781 |
+
### Key Method: `_get_valid_question_types(n_clips)`
|
| 782 |
+
|
| 783 |
+
Filters question types based on clip count:
|
| 784 |
+
- `second`, `second_last`: require `n_clips >= min_clips_for_second_questions` (default: 4)
|
| 785 |
+
- `after`, `before`: require `n_clips >= 2`
|
| 786 |
+
- `first`, `last`: always valid
|
| 787 |
+
|
| 788 |
+
### Key Method: `generate_sample(sample_id, target_question_type, target_duration_seconds)`
|
| 789 |
+
|
| 790 |
+
**Algorithm**:
|
| 791 |
+
1. Use pre-generated `target_duration_seconds` (from sample_durations)
|
| 792 |
+
2. Calculate max_clips from duration: `get_max_clip_num_to_be_joined(...)`
|
| 793 |
+
3. **Silence reduction - randomly select n_clips**:
|
| 794 |
+
```python
|
| 795 |
+
min_clips = max(2, max_clips - 3)
|
| 796 |
+
max_clips_allowed = min(max_clips, max_clips_per_sample, len(CATEGORIES))
|
| 797 |
+
if min_clips > max_clips_allowed: # Handle edge case
|
| 798 |
+
min_clips = max_clips_allowed
|
| 799 |
+
n_clips = random.randint(min_clips, max_clips_allowed)
|
| 800 |
+
```
|
| 801 |
+
4. Get valid question types for n_clips
|
| 802 |
+
5. Select answer position based on question type:
|
| 803 |
+
- `first` → position 0
|
| 804 |
+
- `last` → position n_clips - 1
|
| 805 |
+
- `second` → position 1
|
| 806 |
+
- `second_last` → position n_clips - 2
|
| 807 |
+
- `after` → random position 1 to n-1
|
| 808 |
+
- `before` → random position 0 to n-2
|
| 809 |
+
6. Select categories using least-used balancing (answer first, then others)
|
| 810 |
+
7. Build audio with `build_clip_sequence_with_silences` (includes crossfade)
|
| 811 |
+
8. Generate questions including sequence question
|
| 812 |
+
9. Return metadata
|
| 813 |
+
|
| 814 |
+
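**Silence Reduction**: `n_clips` is drawn from the top of the feasible range, from `max(2, max_clips - 3)` up to `min(max_clips, max_clips_per_sample, len(CATEGORIES))`. It therefore never exceeds what fits in the target duration and normally stays close to it, which avoids excessive silence.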
|
| 815 |
+
|
| 816 |
+
---
|
| 817 |
+
|
| 818 |
+
## Task: VOLUME
|
| 819 |
+
|
| 820 |
+
**File**: `tasks/task_volume.py`
|
| 821 |
+
**Class**: `VolumeTaskGenerator`
|
| 822 |
+
|
| 823 |
+
### Complete Flow
|
| 824 |
+
|
| 825 |
+
```
|
| 826 |
+
VolumeTaskGenerator.__init__(config, logger)
|
| 827 |
+
↓
|
| 828 |
+
Initialize ESC50Dataset, AudioProcessor, QuestionGenerator
|
| 829 |
+
Load multiplier_max_loudness, multiplier_min_loudness, baseline normalization settings
|
| 830 |
+
↓
|
| 831 |
+
generate_dataset()
|
| 832 |
+
↓
|
| 833 |
+
1. Generate sample durations upfront (exact fill)
|
| 834 |
+
2. num_samples = len(sample_durations)
|
| 835 |
+
3. Create balanced clips_count_pool from 2 to max_clips_per_sample
|
| 836 |
+
4. Create balanced question_types: ["max_loudness"] * N/2 + ["min_loudness"] * N/2
|
| 837 |
+
5. Shuffle both pools
|
| 838 |
+
6. Store clips_count_pool as instance variable
|
| 839 |
+
7. For each sample:
|
| 840 |
+
generate_sample(sample_id, target_question_type=balanced_types[i])
|
| 841 |
+
→ Uses clips_count_pool.pop(0) internally, capped at max_clips_that_fit
|
| 842 |
+
→ Normalizes clips to baseline, applies volume adjustments
|
| 843 |
+
→ Verifies gap constraints (up to 10 attempts)
|
| 844 |
+
8. Save CSVs
|
| 845 |
+
```
|
| 846 |
+
|
| 847 |
+
### Key Methods
|
| 848 |
+
|
| 849 |
+
#### `_normalize_to_baseline(audio)`
|
| 850 |
+
|
| 851 |
+
```python
|
| 852 |
+
def _normalize_to_baseline(self, audio):
|
| 853 |
+
if not self.normalize_to_baseline:
|
| 854 |
+
return audio
|
| 855 |
+
change_in_dBFS = self.baseline_dBFS - audio.dBFS
|
| 856 |
+
return audio.apply_gain(change_in_dBFS)
|
| 857 |
+
```
|
| 858 |
+
|
| 859 |
+
#### `_verify_loudness_gap(volume_levels, question_type)`
|
| 860 |
+
|
| 861 |
+
**For MAX_LOUDNESS**:
|
| 862 |
+
```python
|
| 863 |
+
required_gap_dB = 20 * math.log10(self.multiplier_max_loudness) # ≈ 3.52 dB
|
| 864 |
+
actual_gap_dB = max_level - second_max
|
| 865 |
+
gap_satisfied = actual_gap_dB >= required_gap_dB
|
| 866 |
+
```
|
| 867 |
+
|
| 868 |
+
**For MIN_LOUDNESS**:
|
| 869 |
+
```python
|
| 870 |
+
required_gap_dB = abs(20 * math.log10(self.multiplier_min_loudness)) # ≈ 6.02 dB
|
| 871 |
+
actual_gap_dB = second_min - min_level
|
| 872 |
+
gap_satisfied = actual_gap_dB >= required_gap_dB
|
| 873 |
+
```
|
| 874 |
+
|
| 875 |
+
#### Volume Level Generation
|
| 876 |
+
|
| 877 |
+
Volume levels are generated to satisfy gap constraints:
|
| 878 |
+
- For `max_loudness`: target gets +gap_dB above baseline, backgrounds at/below baseline
|
| 879 |
+
- For `min_loudness`: target gets -gap_dB below baseline, backgrounds at/above baseline
|
| 880 |
+
|
| 881 |
+
---
|
| 882 |
+
|
| 883 |
+
## Deterministic Balancing Mechanisms
|
| 884 |
+
|
| 885 |
+
### Overview
|
| 886 |
+
|
| 887 |
+
The pipeline ensures balanced distributions across multiple dimensions with **capacity-aware assignment**.
|
| 888 |
+
|
| 889 |
+
### 1. Capacity-Aware Answer Balancing (COUNT Task)
|
| 890 |
+
|
| 891 |
+
Each possible answer (1-10) appears equally often, but **higher targets are assigned to samples with higher capacity**.
|
| 892 |
+
|
| 893 |
+
```python
|
| 894 |
+
# Calculate capacity for each sample
|
| 895 |
+
for duration in sample_durations:
|
| 896 |
+
max_clips, _ = get_max_clip_num_to_be_joined(duration, source_clip_duration, min_silence_ms)
|
| 897 |
+
max_for_sample = min(max_clips, max_clips_per_sample, len(CATEGORIES))
|
| 898 |
+
sample_max_clips.append(max_for_sample)
|
| 899 |
+
|
| 900 |
+
# Create balanced pool
|
| 901 |
+
possible_answers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
|
| 902 |
+
samples_per_answer = num_samples // len(possible_answers)
|
| 903 |
+
remainder = num_samples % len(possible_answers)
|
| 904 |
+
|
| 905 |
+
assignment_pool = []
|
| 906 |
+
for answer in possible_answers:
|
| 907 |
+
count = samples_per_answer + (1 if remainder > 0 else 0)
|
| 908 |
+
assignment_pool.extend([answer] * count)
|
| 909 |
+
remainder = max(0, remainder - 1)
|
| 910 |
+
|
| 911 |
+
# Sort samples by capacity (descending)
|
| 912 |
+
sample_info.sort(key=lambda x: x[2], reverse=True)
|
| 913 |
+
|
| 914 |
+
# Sort pool descending - assign high targets first
|
| 915 |
+
assignment_pool.sort(reverse=True)
|
| 916 |
+
|
| 917 |
+
# Assign targets, clamped to capacity
|
| 918 |
+
for idx, (sample_idx, duration, capacity) in enumerate(sample_info):
|
| 919 |
+
target = min(assignment_pool[idx], capacity)
|
| 920 |
+
balanced_assignments[sample_idx] = target
|
| 921 |
+
```
|
| 922 |
+
|
| 923 |
+
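**Guarantee**: Each answer value appears equally often in the assignment pool, and the highest targets go to the samples with the most capacity. Because every target is then clamped to its sample's capacity (`min(assignment_pool[idx], capacity)`), the final answer distribution can skew slightly toward lower values when too few samples can fit the highest targets.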
|
| 924 |
+
|
| 925 |
+
### 2. Capacity-Aware Question Type Balancing (ORDER Task)
|
| 926 |
+
|
| 927 |
+
ORDER task uses **capacity-aware balancing** - advanced question types assigned to high-capacity samples.
|
| 928 |
+
|
| 929 |
+
```python
|
| 930 |
+
# Separate question types by requirements
|
| 931 |
+
basic_types = ['first', 'last', 'after', 'before'] # Need >= 2 clips
|
| 932 |
+
advanced_types = ['second', 'second_last'] # Need >= min_clips_for_second_questions (default: 4)
|
| 933 |
+
|
| 934 |
+
# Sort samples by capacity (descending)
|
| 935 |
+
sample_info.sort(key=lambda x: x[2], reverse=True)
|
| 936 |
+
|
| 937 |
+
# Build assignment pool - advanced types first
|
| 938 |
+
samples_per_type = num_samples // len(question_types)
|
| 939 |
+
remainder = num_samples % len(question_types)
|
| 940 |
+
|
| 941 |
+
assignment_pool = []
|
| 942 |
+
# Add advanced types first (for high-capacity samples)
|
| 943 |
+
for qtype in advanced_types:
|
| 944 |
+
count = samples_per_type + (1 if remainder > 0 else 0)
|
| 945 |
+
assignment_pool.extend([qtype] * count)
|
| 946 |
+
remainder = max(0, remainder - 1)
|
| 947 |
+
|
| 948 |
+
# Then basic types
|
| 949 |
+
for qtype in basic_types:
|
| 950 |
+
count = samples_per_type + (1 if remainder > 0 else 0)
|
| 951 |
+
assignment_pool.extend([qtype] * count)
|
| 952 |
+
remainder = max(0, remainder - 1)
|
| 953 |
+
|
| 954 |
+
# Assign with validation
|
| 955 |
+
for idx, (sample_idx, duration, capacity) in enumerate(sample_info):
|
| 956 |
+
target_qtype = assignment_pool[idx]
|
| 957 |
+
valid_types = _get_valid_question_types(capacity)
|
| 958 |
+
|
| 959 |
+
if target_qtype not in valid_types:
|
| 960 |
+
# Downgrade to valid type
|
| 961 |
+
target_qtype = random.choice(valid_types)
|
| 962 |
+
|
| 963 |
+
balanced_assignments[sample_idx] = target_qtype
|
| 964 |
+
```
|
| 965 |
+
|
| 966 |
+
### 3. Simple Question Type Balancing (DURATION, VOLUME Tasks)
|
| 967 |
+
|
| 968 |
+
```python
|
| 969 |
+
# DURATION: 2 types → N/2 each
|
| 970 |
+
# VOLUME: 2 types → N/2 each
|
| 971 |
+
|
| 972 |
+
samples_per_type = num_samples // len(question_types)
|
| 973 |
+
remainder = num_samples % len(question_types)
|
| 974 |
+
|
| 975 |
+
balanced_types = []
|
| 976 |
+
for qtype in question_types:
|
| 977 |
+
count = samples_per_type + (1 if remainder > 0 else 0)
|
| 978 |
+
balanced_types.extend([qtype] * count)
|
| 979 |
+
remainder = max(0, remainder - 1)
|
| 980 |
+
|
| 981 |
+
random.shuffle(balanced_types)
|
| 982 |
+
```
|
| 983 |
+
|
| 984 |
+
### 4. Category Usage Balancing
|
| 985 |
+
|
| 986 |
+
All 50 ESC-50 categories are used equally via least-used selection:
|
| 987 |
+
|
| 988 |
+
```python
|
| 989 |
+
def get_least_used_categories(self, n: int, exclude: List[str] = None) -> List[str]:
|
| 990 |
+
# Sort categories by usage count
|
| 991 |
+
sorted_cats = sorted(
|
| 992 |
+
self.category_usage_counts.items(),
|
| 993 |
+
key=lambda x: (x[1], x[0]) # Sort by count, then alphabetically for ties
|
| 994 |
+
)
|
| 995 |
+
# Filter excluded and return first n
|
| 996 |
+
available = [cat for cat, _ in sorted_cats if cat not in (exclude or [])]
|
| 997 |
+
return available[:n]
|
| 998 |
+
```
|
| 999 |
+
|
| 1000 |
+
Each task calls `reset_category_usage()` at the start to ensure independent balancing.
|
| 1001 |
+
|
| 1002 |
+
### 5. N_Clips Selection Strategy
|
| 1003 |
+
|
| 1004 |
+
**COUNT Task**: Uses capacity-aware answer balancing (see #1 above)
|
| 1005 |
+
|
| 1006 |
+
**ORDER and VOLUME Tasks**: Use **silence reduction strategy** (NOT balanced):
|
| 1007 |
+
```python
|
| 1008 |
+
# Randomly sample n_clips from valid range to minimize silence
|
| 1009 |
+
min_clips = max(2, max_clips - 3)
|
| 1010 |
+
max_clips_allowed = min(max_clips, max_clips_per_sample, len(CATEGORIES))
|
| 1011 |
+
|
| 1012 |
+
if min_clips > max_clips_allowed:
|
| 1013 |
+
min_clips = max_clips_allowed # Handle edge case
|
| 1014 |
+
|
| 1015 |
+
n_clips = random.randint(min_clips, max_clips_allowed)
|
| 1016 |
+
```
|
| 1017 |
+
|
| 1018 |
+
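This keeps `n_clips` within 3 of the maximum that fits in the target duration (and never below 2 unless the allowed maximum itself is lower), which limits excessive silence while still varying the number of clips per sample.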
|
| 1019 |
+
|
| 1020 |
+
---
|
| 1021 |
+
|
| 1022 |
+
## Rejection Logic and Retry Mechanisms
|
| 1023 |
+
|
| 1024 |
+
### When Samples Are Rejected
|
| 1025 |
+
|
| 1026 |
+
Rejections occur only in tasks with gap constraints:
|
| 1027 |
+
|
| 1028 |
+
1. **DURATION Task**: Gap constraint not satisfied
|
| 1029 |
+
- LONGEST: target_duration < max_background × 1.5
|
| 1030 |
+
- SHORTEST: target_duration > min_background × 0.5
|
| 1031 |
+
|
| 1032 |
+
2. **VOLUME Task**: Gap constraint not satisfied
|
| 1033 |
+
- MAX_LOUDNESS: actual_gap_dB < required_gap_dB (3.52 dB)
|
| 1034 |
+
- MIN_LOUDNESS: actual_gap_dB < required_gap_dB (6.02 dB)
|
| 1035 |
+
|
| 1036 |
+
### DURATION Task Retry Logic
|
| 1037 |
+
|
| 1038 |
+
```python
|
| 1039 |
+
def generate_dataset(self):
|
| 1040 |
+
all_metadata = []
|
| 1041 |
+
sample_idx = 0
|
| 1042 |
+
type_idx = 0
|
| 1043 |
+
|
| 1044 |
+
while len(all_metadata) < num_samples and type_idx < len(balanced_types) * 2:
|
| 1045 |
+
question_type = balanced_types[type_idx % len(balanced_types)]
|
| 1046 |
+
|
| 1047 |
+
metadata = self.generate_sample(sample_idx, question_type)
|
| 1048 |
+
|
| 1049 |
+
if metadata is not None:
|
| 1050 |
+
all_metadata.append(metadata)
|
| 1051 |
+
sample_idx += 1
|
| 1052 |
+
# If None, sample was rejected - just move to next
|
| 1053 |
+
|
| 1054 |
+
type_idx += 1
|
| 1055 |
+
```
|
| 1056 |
+
|
| 1057 |
+
### Rejection Rate Calculation
|
| 1058 |
+
|
| 1059 |
+
$$\text{Rejection Rate} = \frac{\text{rejections}}{\text{rejections} + \text{successes}} \times 100\%$$
|
| 1060 |
+
|
| 1061 |
+
---
|
| 1062 |
+
|
| 1063 |
+
## Complete Task Creation Explanation
|
| 1064 |
+
|
| 1065 |
+
### How Each Task Is Generated (Step-by-Step)
|
| 1066 |
+
|
| 1067 |
+
#### COUNT TASK - "How many unique sounds?"
|
| 1068 |
+
|
| 1069 |
+
**Goal**: Create audio with N unique sound sources, ask how many distinct sounds exist.
|
| 1070 |
+
|
| 1071 |
+
**Process**:
|
| 1072 |
+
1. **Preprocessing**: None (uses raw ESC-50 clips)
|
| 1073 |
+
2. **Duration Generation**: `target_duration ~ Uniform(20s, 60s)` per sample
|
| 1074 |
+
3. **Calculate Max Clips**: `max_clips = get_max_clip_num_to_be_joined(target_duration, 5s, 100ms)`
|
| 1075 |
+
- Example: 45s duration → ~8 clips of 5s each with 100ms silence between
|
| 1076 |
+
4. **Balanced Answer Selection**: Pre-generated pool of answers [1,2,3,...,10] balanced equally
|
| 1077 |
+
- Target answer (e.g., 5 unique sounds) selected from pool
|
| 1078 |
+
5. **Silence Reduction**: Cap target at `min(target_answer, max_clips)`
|
| 1079 |
+
- If target=8 but max_clips=6 → use 6 (prevents excessive silence)
|
| 1080 |
+
6. **Category Selection**: Pick N least-used categories from ESC-50 (balancing)
|
| 1081 |
+
7. **Audio Construction**:
|
| 1082 |
+
- Load one file per category
|
| 1083 |
+
- Calculate repetitions needed: `total_clips = max_clips`
|
| 1084 |
+
- Distribute repetitions across N sources
|
| 1085 |
+
- **Ordering mode**:
|
| 1086 |
+
- `random`: Shuffle clips (A B A C B...) - harder, tests recognition
|
| 1087 |
+
- `consecutive`: Group same-source (AAA BBB CCC) - easier
|
| 1088 |
+
8. **Silence Insertion**:
|
| 1089 |
+
- Minimum 100ms silence between EVERY clip
|
| 1090 |
+
- Extra silence (up to 500ms per gap) distributed from remainder
|
| 1091 |
+
- **Crossfade**: 50ms within same-source, 500ms at audio-silence boundaries
|
| 1092 |
+
9. **Question Generation**: MCQ + open-text asking "How many unique sounds?"
|
| 1093 |
+
10. **Export**: Save audio WAV + metadata
|
| 1094 |
+
|
| 1095 |
+
**Example**:
|
| 1096 |
+
- Target duration: 40s
|
| 1097 |
+
- Max clips that fit: 7 clips (7×5s + 6×0.1s = 35.6s)
|
| 1098 |
+
- Target answer: 3 unique sounds
|
| 1099 |
+
- Actual: 3 unique sounds (7 total clips: 3+2+2 repetitions)
|
| 1100 |
+
- Ordering: Random shuffle → [A B A C B A C]
|
| 1101 |
+
- Result: Audio with 3 distinct sounds, some repeated, with silences and crossfades
|
| 1102 |
+
|
| 1103 |
+
#### DURATION TASK - "Which sound is longest/shortest?"
|
| 1104 |
+
|
| 1105 |
+
**Goal**: Create audio where one sound has clearly longest/shortest duration compared to others.
|
| 1106 |
+
|
| 1107 |
+
**Process**:
|
| 1108 |
+
1. **Preprocessing** (preprocess_esc50.py - REQUIRED):
|
| 1109 |
+
- Load raw ESC-50 clips
|
| 1110 |
+
- Detect sound regions using adaptive noise-floor thresholding
|
| 1111 |
+
- Trim leading/trailing silence (preserve internal structure)
|
| 1112 |
+
- Calculate effective duration per clip
|
| 1113 |
+
- Save trimmed audio + effective_durations.csv
|
| 1114 |
+
2. **Duration Generation**: `target_duration ~ Uniform(20s, 60s)` per sample
|
| 1115 |
+
3. **Calculate Max Clips**: Based on average effective duration (~3.86s)
|
| 1116 |
+
4. **Determine N Sources**: Based on question type and max_clips
|
| 1117 |
+
- **LONGEST**: Target needs ≥2 clips, backgrounds get 1 each → `n_sources ≤ max_clips - 1`
|
| 1118 |
+
- **SHORTEST**: Target gets 1 clip, backgrounds need ≥2 each → `n_sources ≤ 1 + (max_clips-1)//2`
|
| 1119 |
+
5. **Category Selection**: Pick target + backgrounds from least-used categories
|
| 1120 |
+
6. **Slot Distribution**: Allocate clips to each source
|
| 1121 |
+
- LONGEST: Give most clips to target, 1 to each background
|
| 1122 |
+
- SHORTEST: Give 1 to target, multiple to each background
|
| 1123 |
+
7. **Clip Selection**: For each source, select clips from preprocessed dataset
|
| 1124 |
+
8. **Gap Verification**:
|
| 1125 |
+
- LONGEST: `target_duration ≥ max_background × 1.5` ✓
|
| 1126 |
+
- SHORTEST: `target_duration ≤ min_background × 0.75` ✓
|
| 1127 |
+
- If gap not satisfied: Try redistributing slots, or reject sample
|
| 1128 |
+
9. **Audio Construction**:
|
| 1129 |
+
- Load trimmed clips
|
| 1130 |
+
- Concatenate with consecutive ordering (preserve periodicity)
|
| 1131 |
+
- Insert silences with crossfades
|
| 1132 |
+
10. **Question Generation**: "Which sound is longest/shortest?"
|
| 1133 |
+
11. **Export**: Audio + metadata
|
| 1134 |
+
|
| 1135 |
+
**Example**:
|
| 1136 |
+
- Question type: LONGEST
|
| 1137 |
+
- Target duration: 50s, max_clips: 12
|
| 1138 |
+
- N sources: 4 (target + 3 backgrounds)
|
| 1139 |
+
- Slot distribution: Target=6 clips (6×3.8s=22.8s), Backgrounds=2 clips each (2×3.8s=7.6s)
|
| 1140 |
+
- Gap check: 22.8s ≥ 7.6s × 1.5 = 11.4s ✓
|
| 1141 |
+
- Result: Target sound clearly longest
|
| 1142 |
+
|
| 1143 |
+
#### ORDER TASK - "Which sound is first/last/after X?"
|
| 1144 |
+
|
| 1145 |
+
**Goal**: Create ordered sequence of sounds, ask about temporal relationships.
|
| 1146 |
+
|
| 1147 |
+
**Process**:
|
| 1148 |
+
1. **Preprocessing**: None (uses raw ESC-50)
|
| 1149 |
+
2. **Duration Generation**: Pre-generated durations to exactly fill task duration
|
| 1150 |
+
3. **Calculate Max Clips**: `get_max_clip_num_to_be_joined(target_duration, 5s, 100ms)`
|
| 1151 |
+
4. **N_Clips Selection (silence reduction, not balanced)**: n_clips is randomly sampled per sample
|
| 1152 |
+
- Lower bound: `max(2, max_clips - 3)`
|
| 1153 |
+
- Upper bound: `min(max_clips, max_clips_per_sample, len(CATEGORIES))`
|
| 1154 |
+
5. **Question Type Selection**: From balanced pool (first, last, second, after, before, second_last)
|
| 1155 |
+
6. **Answer Position Determination**: Based on question type
|
| 1156 |
+
- `first` → position 0
|
| 1157 |
+
- `last` → position n_clips-1
|
| 1158 |
+
- `second` → position 1
|
| 1159 |
+
- `second_last` → position n_clips-2
|
| 1160 |
+
- `after`/`before` → random valid position
|
| 1161 |
+
7. **Category Selection**: Answer category at determined position, others from least-used
|
| 1162 |
+
8. **Audio Construction**:
|
| 1163 |
+
- Load one clip per position
|
| 1164 |
+
- Build sequence with silences (min 100ms + random extra up to 500ms per gap)
|
| 1165 |
+
- **Crossfade**: 500ms at audio-silence boundaries for smooth transitions
|
| 1166 |
+
9. **Question Generation**:
|
| 1167 |
+
- MCQ: "Which sound is first?" with 4 options
|
| 1168 |
+
- Open-text: "What is the first sound?" + full sequence
|
| 1169 |
+
10. **Export**: Audio + metadata
|
| 1170 |
+
|
| 1171 |
+
**Example**:
|
| 1172 |
+
- max_clips: 7 → n_clips sampled from [4, 7]; this sample draws 4 ✓
|
| 1173 |
+
- Question: "Which sound is second?"
|
| 1174 |
+
- Answer position: 1 (0-indexed)
|
| 1175 |
+
- Sequence: [dog, cat, bird, rain] → Answer: cat
|
| 1176 |
+
- Audio: 4 clips in order with silences and crossfades
|
| 1177 |
+
|
| 1178 |
+
#### VOLUME TASK - "Which sound is loudest/softest?"
|
| 1179 |
+
|
| 1180 |
+
**Goal**: Create audio with clips at different volume levels, ask about loudness comparison.
|
| 1181 |
+
|
| 1182 |
+
**Process**:
|
| 1183 |
+
1. **Preprocessing**: None (uses raw ESC-50)
|
| 1184 |
+
2. **Duration Generation**: Pre-generated durations
|
| 1185 |
+
3. **Calculate Max Clips**: `get_max_clip_num_to_be_joined(...)`
|
| 1186 |
+
4. **N_Clips Selection (silence reduction, not balanced)**: Randomly sampled from [max(2, max_clips-3), min(max_clips, max_clips_per_sample)]
|
| 1187 |
+
5. **Question Type Selection**: "max_loudness" or "min_loudness" (balanced 50/50)
|
| 1188 |
+
6. **Volume Level Generation**: Create n_clips volume adjustments (in dB)
|
| 1189 |
+
- Ensure gap constraint (multiplier 4.0 for max, 0.25 for min)
|
| 1190 |
+
- Example: [+12dB, 0dB, -6dB] → max at +12dB has ≥12dB gap from second
|
| 1191 |
+
7. **Gap Verification** (up to 10 attempts):
|
| 1192 |
+
- MAX: `max_level - second_max ≥ 20×log10(4.0) ≈ 12dB`
|
| 1193 |
+
- MIN: `second_min - min_level ≥ 20×log10(4.0) ≈ 12dB`
|
| 1194 |
+
- If not satisfied: Regenerate levels or reject
|
| 1195 |
+
8. **Category Selection**: Answer at determined position, others from least-used
|
| 1196 |
+
9. **Audio Construction**:
|
| 1197 |
+
- Load clips
|
| 1198 |
+
- **CRITICAL: Normalize all to baseline (-20 dBFS)** → ensures controlled comparison
|
| 1199 |
+
- Apply volume adjustments to normalized clips
|
| 1200 |
+
- Concatenate with silences and crossfades
|
| 1201 |
+
10. **Question Generation**: "Which sound has maximum/minimum loudness?"
|
| 1202 |
+
11. **Export**: Audio + metadata with volume levels
|
| 1203 |
+
|
| 1204 |
+
**Example**:
|
| 1205 |
+
- max_clips: 6 → n_clips sampled from [3, 6]; this sample draws 3 ✓
|
| 1206 |
+
- Question: "max_loudness"
|
| 1207 |
+
- Volume levels: [+12dB, 0dB, -6dB]
|
| 1208 |
+
- Gap check: 12 - 0 = 12dB ≥ 12dB ✓
|
| 1209 |
+
- Process: Normalize all clips to -20dBFS, then adjust to [-8dBFS, -20dBFS, -26dBFS]
|
| 1210 |
+
- Result: First sound clearly loudest
|
| 1211 |
+
|
| 1212 |
+
### Key Innovations
|
| 1213 |
+
|
| 1214 |
+
1. **Crossfade Everywhere**: Smooth transitions at audio-silence boundaries (500ms), small crossfade within same-source repetitions (50ms)
|
| 1215 |
+
2. **Adaptive Preprocessing**: Noise-floor thresholding adapts per-clip (duration task)
|
| 1216 |
+
3. **Silence Reduction**: ORDER/VOLUME tasks sample n_clips from [max(2, max_clips-3), min(max_clips, max_clips_per_sample)] to minimize silence
|
| 1217 |
+
4. **Balanced Distribution**:
|
| 1218 |
+
- **COUNT**: Balances answers (1 to max_clips_per_sample) + question types
|
| 1219 |
+
- **ORDER/VOLUME**: Balances question types only (n_clips uses silence reduction)
|
| 1220 |
+
5. **Category Balancing**: Least-used selection ensures all 50 ESC-50 categories used evenly
|
| 1221 |
+
6. **Gap Constraints**: Mathematical guarantees for duration/volume comparisons
|
| 1222 |
+
7. **Exact Duration Filling**: Pre-generate sample durations to exactly fill task duration (no wasted time)
|
| 1223 |
+
|
| 1224 |
+
---
|
| 1225 |
+
|
| 1226 |
+
## Command-Line Arguments
|
| 1227 |
+
|
| 1228 |
+
### Main Pipeline (`main.py`)
|
| 1229 |
+
|
| 1230 |
+
```bash
|
| 1231 |
+
python main.py [OPTIONS]
|
| 1232 |
+
|
| 1233 |
+
Options:
|
| 1234 |
+
--config, -c PATH Path to config YAML (default: config.yaml)
|
| 1235 |
+
--tasks, -t TASKS Specific tasks to run (choices: count, duration, order, volume)
|
| 1236 |
+
--output, -o PATH Custom output directory (overrides config)
|
| 1237 |
+
|
| 1238 |
+
Examples:
|
| 1239 |
+
# Run all enabled tasks with default config
|
| 1240 |
+
python main.py
|
| 1241 |
+
|
| 1242 |
+
# Run specific tasks only
|
| 1243 |
+
python main.py --tasks count order
|
| 1244 |
+
|
| 1245 |
+
# Use custom config and output
|
| 1246 |
+
python main.py --config my_config.yaml --output ./my_dataset
|
| 1247 |
+
```
|
| 1248 |
+
|
| 1249 |
+
### Preprocessing Script (`preprocess_esc50.py`)
|
| 1250 |
+
|
| 1251 |
+
```bash
|
| 1252 |
+
python preprocess_esc50.py [OPTIONS]
|
| 1253 |
+
|
| 1254 |
+
Options:
|
| 1255 |
+
--config PATH Path to config YAML (default: config.yaml)
|
| 1256 |
+
--threshold-strategy STRATEGY "noise_floor" or "peak_relative"
|
| 1257 |
+
--threshold-db FLOAT Threshold in dB (for peak_relative)
|
| 1258 |
+
--noise-floor-percentile FLOAT Percentile for noise floor estimation
|
| 1259 |
+
--noise-floor-delta-db FLOAT Delta above noise floor in dB
|
| 1260 |
+
--min-sound-ms INT Minimum sound duration in ms
|
| 1261 |
+
--no-trimmed-audio Skip saving trimmed audio files
|
| 1262 |
+
--output-dir PATH Custom output directory
|
| 1263 |
+
|
| 1264 |
+
Examples:
|
| 1265 |
+
# Use config defaults
|
| 1266 |
+
python preprocess_esc50.py --config config.yaml
|
| 1267 |
+
|
| 1268 |
+
# Override threshold parameters
|
| 1269 |
+
python preprocess_esc50.py --config config.yaml \
|
| 1270 |
+
--threshold-strategy noise_floor \
|
| 1271 |
+
--noise-floor-percentile 2.0 \
|
| 1272 |
+
--noise-floor-delta-db 5.0 \
|
| 1273 |
+
--min-sound-ms 25
|
| 1274 |
+
|
| 1275 |
+
# Generate metadata only (no trimmed audio)
|
| 1276 |
+
python preprocess_esc50.py --config config.yaml --no-trimmed-audio
|
| 1277 |
+
```
|
| 1278 |
+
|
| 1279 |
+
---
|
| 1280 |
+
|
| 1281 |
+
## Summary
|
| 1282 |
+
|
| 1283 |
+
The TREA 2.0 pipeline generates balanced, constraint-satisfying audio QA samples through:
|
| 1284 |
+
|
| 1285 |
+
1. **Preprocessing** (Duration only): Adaptive noise-floor thresholding + edge trimming
|
| 1286 |
+
2. **Exact Duration Filling**: Pre-generate sample durations to sum exactly to task duration
|
| 1287 |
+
3. **Capacity-Aware Balancing**:
|
| 1288 |
+
- **COUNT**: High answer targets → high-capacity samples
|
| 1289 |
+
- **ORDER**: Advanced question types → high-capacity samples
|
| 1290 |
+
4. **Silence Reduction**: ORDER/VOLUME randomly sample n_clips from [max(2, max_clips-3), min(max_clips, max_clips_per_sample)]
|
| 1291 |
+
5. **Crossfade Transitions**: Smooth audio-silence boundaries (500ms) + within-source (50ms)
|
| 1292 |
+
6. **Category Balancing**: Least-used selection ensures even ESC-50 category distribution
|
| 1293 |
+
7. **Gap Constraints**: Mathematical guarantees (1.5x for longest, 0.75x for shortest, 4.0x/0.25x for volume)
|
| 1294 |
+
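8. **Retry Mechanisms**: Failed samples are rejected and generation moves on to the next attempt until the target count is reached (the DURATION loop caps attempts at 2× the length of the balanced question-type list)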
|
| 1295 |
+
|
| 1296 |
+
All randomness is seeded (`random_seed: 42`) for reproducibility.
|
README.md
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# TREA 2.0 Pipeline
|
| 2 |
+
|
| 3 |
+
Audio question-answering dataset generator using ESC-50. Creates four task types: COUNT, DURATION, ORDER, and VOLUME.
|
| 4 |
+
|
| 5 |
+
## Quick Start
|
| 6 |
+
|
| 7 |
+
```bash
|
| 8 |
+
# 1. Install dependencies
|
| 9 |
+
pip install -r requirements.txt
|
| 10 |
+
|
| 11 |
+
# 2. Preprocess ESC-50 (required for DURATION task only)
|
| 12 |
+
python preprocess_esc50.py --config config.yaml
|
| 13 |
+
|
| 14 |
+
# 3. Generate datasets
|
| 15 |
+
python main.py --config config.yaml
|
| 16 |
+
```
|
| 17 |
+
|
| 18 |
+
## Configuration
|
| 19 |
+
|
| 20 |
+
Edit `config.yaml` to set:
|
| 21 |
+
- **Task duration**: `task_duration_size` (hours) per task
|
| 22 |
+
- **Clip duration range**: `min_clip_duration` to `max_clip_duration` (seconds)
|
| 23 |
+
- **ESC-50 paths**: Point to your ESC-50 dataset location
|
| 24 |
+
- **Enable/disable tasks**: Set `enabled: true/false` for each task
|
| 25 |
+
|
| 26 |
+
## Key Files
|
| 27 |
+
|
| 28 |
+
- **`config.yaml`** - All configuration parameters
|
| 29 |
+
- **`main.py`** - Pipeline entry point (runs all tasks)
|
| 30 |
+
- **`preprocess_esc50.py`** - Preprocess ESC-50 for duration task
|
| 31 |
+
- **`tasks/task_*.py`** - Individual task generators
|
| 32 |
+
|
| 33 |
+
## Tasks
|
| 34 |
+
|
| 35 |
+
| Task | Question | Example |
|
| 36 |
+
|------|----------|---------|
|
| 37 |
+
| **COUNT** | "How many unique sounds?" | Audio with 5 distinct sound types |
|
| 38 |
+
| **DURATION** | "Which sound is longest/shortest?" | Compare sound durations |
|
| 39 |
+
| **ORDER** | "Which sound is first/last/after X?" | Temporal sequence questions |
|
| 40 |
+
| **VOLUME** | "Which sound is loudest/softest?" | Loudness comparison |
|
| 41 |
+
|
| 42 |
+
## Output Structure
|
| 43 |
+
|
| 44 |
+
```
|
| 45 |
+
output/{task}/
|
| 46 |
+
├── audios/*.wav # Generated audio files
|
| 47 |
+
├── {task}_mcq.csv # Multiple choice questions
|
| 48 |
+
├── {task}_open_text.csv # Open-ended questions
|
| 49 |
+
└── {task}_metadata.csv # Detailed metadata
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
## Shell scripts (quick)
|
| 53 |
+
|
| 54 |
+
Use the provided shell helpers for simple runs.
|
| 55 |
+
|
| 56 |
+
Run full pipeline (uses `python main.py` under the hood):
|
| 57 |
+
|
| 58 |
+
```bash
|
| 59 |
+
# Make executable and run (from pipeline/)
|
| 60 |
+
./run_pipeline.sh
|
| 61 |
+
|
| 62 |
+
# With custom config, tasks, and output
|
| 63 |
+
./run_pipeline.sh --config my_config.yaml --tasks count,order --output ./my_dataset
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
Run the LLM answer generation across splits (uses `llm_answer_generator.py`):
|
| 67 |
+
|
| 68 |
+
```bash
|
| 69 |
+
# Processes open_text CSVs across splits/tasks defined in the script
|
| 70 |
+
./run_llm_answers_all.sh
|
| 71 |
+
|
| 72 |
+
# Or run per-file with the helper script directly
|
| 73 |
+
python llm_answer_generator.py --input /path/to/count_open_text.csv --mode open_text --task count
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
## Advanced Usage
|
| 78 |
+
|
| 79 |
+
```bash
|
| 80 |
+
# Run specific tasks only
|
| 81 |
+
python main.py --tasks count order
|
| 82 |
+
|
| 83 |
+
# Use custom config
|
| 84 |
+
python main.py --config my_config.yaml
|
| 85 |
+
|
| 86 |
+
# Custom output directory
|
| 87 |
+
python main.py --output /path/to/output
|
| 88 |
+
|
| 89 |
+
# Preprocess with custom parameters
|
| 90 |
+
python preprocess_esc50.py --config config.yaml \
|
| 91 |
+
--threshold-strategy noise_floor \
|
| 92 |
+
--noise-floor-percentile 2.0 \
|
| 93 |
+
--noise-floor-delta-db 5.0
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
## Documentation
|
| 97 |
+
|
| 98 |
+
See **`DOCS.md`** for complete technical documentation including:
|
| 99 |
+
- Mathematical formulations
|
| 100 |
+
- Detailed algorithm explanations
|
| 101 |
+
- Configuration parameter reference
|
| 102 |
+
- Preprocessing pipeline details
|
| 103 |
+
- Balancing mechanisms
|
| 104 |
+
|
| 105 |
+
## Requirements
|
| 106 |
+
|
| 107 |
+
- Python 3.8+
|
| 108 |
+
- pydub
|
| 109 |
+
- numpy
|
| 110 |
+
- pandas
|
| 111 |
+
- tqdm
|
| 112 |
+
- pyyaml
|
config.yaml
ADDED
|
@@ -0,0 +1,348 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Temporal Reasoning Audio Dataset Pipeline Configuration
|
| 2 |
+
##uniform distribution for clip duration
|
| 3 |
+
##not mixing datasets
|
| 4 |
+
|
| 5 |
+
##count
|
| 6 |
+
##pick dataset -> pick class -> pick audio clip -> get duration -> concatenate clips to reach target duration -> modulo to get num clips -> inserting silences randomly based on remainder
|
| 7 |
+
|
| 8 |
+
##duration
|
| 9 |
+
##amplitude based filtering -> normalize -> threshold based selection
|
| 10 |
+
##gap between audio clips - x2/1.5 the shorter one -> add as param
|
| 11 |
+
##different clips of the same class can be concatenated to reach target duration
|
| 12 |
+
##consecutive ordering only
|
| 13 |
+
##based on n unique sources and total clips we can have -> shortest and longest duration calculation
|
| 14 |
+
|
| 15 |
+
##reject datapoint if same target audio clip cannot be repeated to maintain the gap - arg
|
| 16 |
+
##sample different clip from the same class -> check if different clips can be used to fill the gap - arg
|
| 17 |
+
|
| 18 |
+
##amplitude filtered durations in metadata csv
|
| 19 |
+
|
| 20 |
+
##get_max_clip_num_to_be_joined()
|
| 21 |
+
##pick dataset -> pick class -> pick audio clip -> get duration -> concatenate clips to reach target duration -> modulo to get num clips -> inserting silences randomly based on remainder
|
| 22 |
+
|
| 23 |
+
##ensure_silence_between_clips()
|
| 24 |
+
##silence should always be there between two clips
|
| 25 |
+
|
| 26 |
+
##order
|
| 27 |
+
##repeat target clips
|
| 28 |
+
##second and second last - modify question types
|
| 29 |
+
|
| 30 |
+
##volume
|
| 31 |
+
##amplitude average loudness for an audio clip -> repetitions but same clip(argument) -> different volume levels based on dB levels
|
| 32 |
+
|
| 33 |
+
##add crossfade
|
| 34 |
+
|
| 35 |
+
##trimming - threshold separately for each audio clip - normalize 1 and 0 - get threshold -> trim -> concatenate
|
| 36 |
+
##leftmost and rightmost silence trimming
|
| 37 |
+
##buffer for trimming - cutting early and cutting a bit late to avoid cutting important parts
|
| 38 |
+
##periodicity effect
|
| 39 |
+
|
| 40 |
+
##volume - trim and get average loudness -> normalize -> adjust volume levels
|
| 41 |
+
|
| 42 |
+
##number of clips per samples to avoid silence
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# ESC-50 Dataset paths (each clip is 5 seconds)
|
| 46 |
+
esc50:
|
| 47 |
+
audio_path: "/path/to/ESC-50_github/audio"
|
| 48 |
+
metadata_path: "/path/to/ESC-50_github/meta/esc50.csv"
|
| 49 |
+
|
| 50 |
+
# Synthetic silence audio for concatenation
|
| 51 |
+
synthetic_silence:
|
| 52 |
+
path: "/path/to/synthetic_silences"
|
| 53 |
+
|
| 54 |
+
# Output configuration
|
| 55 |
+
output:
|
| 56 |
+
base_path: "/path/to/pipeline/test_ood"
|
| 57 |
+
# Dataset class-subset configuration
|
| 58 |
+
# Use this to create datasets (train/val/test) from a persistent subset
|
| 59 |
+
# of classes (e.g. use 40 of 50 classes for in-distribution splits and
|
| 60 |
+
# optionally create an OOD test set using all 50 classes).
|
| 61 |
+
dataset:
|
| 62 |
+
use_class_subset: false # if false, use all available classes
|
| 63 |
+
num_classes_subset: 40 # number of classes to use for train/val/test
|
| 64 |
+
subset_persist_path: "/path/to/class_subset.json"
|
| 65 |
+
subset_seed: 42 # RNG seed when sampling the subset (persisted)
|
| 66 |
+
|
| 67 |
+
# Audio generation parameters
|
| 68 |
+
audio:
|
| 69 |
+
# Duration range for each GENERATED clip (in seconds)
|
| 70 |
+
# Original ESC-50 clips are 5s and will be concatenated to create clips in this range
|
| 71 |
+
min_clip_duration: 20.0 # Minimum duration for each generated clip
|
| 72 |
+
max_clip_duration: 60.0 # Maximum duration for each generated clip
|
| 73 |
+
|
| 74 |
+
# Crossfade and silence
|
| 75 |
+
crossfade_duration: 500 # Crossfade between audio and silence (milliseconds) for smooth transitions
|
| 76 |
+
silence_duration: 1000 # Default silence between clips (milliseconds)
|
| 77 |
+
min_silence_duration: 100 # Minimum silence ALWAYS inserted between clips (milliseconds)
|
| 78 |
+
max_extra_silence_per_gap: 500 # Maximum extra silence per gap when distributing remainder
|
| 79 |
+
crossfade_within_source: 50 # Small crossfade within same-source repetitions (count task)
|
| 80 |
+
with_silence: true # Add silence between clips
|
| 81 |
+
# Duration (seconds) of individual source clips (ESC-50 are 5s by default).
|
| 82 |
+
# Used to compute how many source clips are concatenated to reach a target
|
| 83 |
+
# generated clip duration. Change only if your source clips differ.
|
| 84 |
+
source_clip_duration: 5.0
|
| 85 |
+
|
| 86 |
+
# Audio normalization
|
| 87 |
+
normalize: false
|
| 88 |
+
normalize_target_dBFS: -20.0
|
| 89 |
+
|
| 90 |
+
# Random seed for reproducibility
|
| 91 |
+
random_seed: 42
|
| 92 |
+
|
| 93 |
+
# LLM for question generation (local Llama 3.1 8B)
|
| 94 |
+
llm:
|
| 95 |
+
enabled: false # Set to true to use LLM for question generation
|
| 96 |
+
|
| 97 |
+
# Task-specific configurations
|
| 98 |
+
tasks:
|
| 99 |
+
count:
|
| 100 |
+
enabled: true
|
| 101 |
+
# Total duration for ALL samples in this task combined (in hours)
|
| 102 |
+
# Pipeline will calculate number of samples based on min/max clip durations
|
| 103 |
+
task_duration_size: 2.0 # hours
|
| 104 |
+
|
| 105 |
+
# Maximum unique sound sources per sample (single number)
|
| 106 |
+
# Actual number will be subsampled from max(1, max_clips-3) to min(max_clips, max_clips_per_sample)
|
| 107 |
+
max_clips_per_sample: 10
|
| 108 |
+
|
| 109 |
+
# Ordering mode for repeated clips of same source:
|
| 110 |
+
# "random": Clips are shuffled randomly (A B A C B A C...) - tests recognition of recurring sounds
|
| 111 |
+
# "consecutive": Same-source clips grouped together (AAA BBB CCC) - easier, just count blocks
|
| 112 |
+
ordering_mode: "random"
|
| 113 |
+
|
| 114 |
+
# Question templates for MCQ
|
| 115 |
+
mcq_questions:
|
| 116 |
+
- "What is the number of distinct sound sources in the audio file?"
|
| 117 |
+
- "How many different types of sounds can be identified in this recording?"
|
| 118 |
+
- "How many unique types of sound are present in this audio?"
|
| 119 |
+
- "Identify the count of different sound sources in this clip."
|
| 120 |
+
- "What is the total number of unique sounds heard in this audio?"
|
| 121 |
+
- "How many distinct sound categories are there in this audio file?"
|
| 122 |
+
- "Determine the number of unique sound sources in this recording."
|
| 123 |
+
- "How many separate sound sources are included in the audio?"
|
| 124 |
+
- "What is the total number of unique sound types in this audio?"
|
| 125 |
+
- "How many different sound sources can be heard in this clip?"
|
| 126 |
+
# Question templates for open-text
|
| 127 |
+
open_text_questions:
|
| 128 |
+
- "How many distinct sound sources are present in the audio?"
|
| 129 |
+
- "Count the number of unique sounds in this recording."
|
| 130 |
+
- "What is the total count of different sound categories heard?"
|
| 131 |
+
- "Identify and count all unique sound types in the clip."
|
| 132 |
+
|
| 133 |
+
duration:
|
| 134 |
+
enabled: true
|
| 135 |
+
# Total duration for ALL samples in this task combined (in hours)
|
| 136 |
+
task_duration_size: 2.0 # hours
|
| 137 |
+
|
| 138 |
+
# Number of unique sound sources per sample (can be single int or list)
|
| 139 |
+
# Single int (e.g., 15): randomly samples from 1 to 15 (like count/order tasks)
|
| 140 |
+
# List (e.g., [2,3,4]): randomly picks from the list
|
| 141 |
+
# The script will automatically generate repetition patterns to create
|
| 142 |
+
# shortest/longest variations based on the target clip duration
|
| 143 |
+
num_unique_sources: 10
|
| 144 |
+
|
| 145 |
+
# Ordering: only keep "consecutive" so repeated segments of the same
|
| 146 |
+
# source remain grouped together, ensuring that multiple consecutive
|
| 147 |
+
# clips of the same audio yield the longest duration unambiguously.
|
| 148 |
+
ordering_methods: ["consecutive"]
|
| 149 |
+
|
| 150 |
+
# =====================================================
|
| 151 |
+
# Amplitude-based filtering parameters (preprocessing)
|
| 152 |
+
# =====================================================
|
| 153 |
+
# RELATIVE dB threshold below peak to consider as silence
|
| 154 |
+
# For each clip: silence_threshold = clip_peak_dB + amplitude_threshold_db
|
| 155 |
+
# Example: If clip peak is -5 dB and threshold is -20, silence threshold = -25 dB
|
| 156 |
+
# Based on ESC-50 analysis: -20 dB gives ~60% effective duration (good balance)
|
| 157 |
+
# More aggressive (removes more silence): -15 dB
|
| 158 |
+
# More conservative (keeps more sound): -25 dB
|
| 159 |
+
amplitude_threshold_db: -20.0
|
| 160 |
+
|
| 161 |
+
# Minimum duration of sound region to keep (milliseconds)
|
| 162 |
+
# Filters out very short transient noise spikes
|
| 163 |
+
# ESC-50 is curated, so 20-30ms is sufficient
|
| 164 |
+
min_sound_duration_ms: 25
|
| 165 |
+
|
| 166 |
+
# =====================================================
|
| 167 |
+
# Adaptive threshold strategy
|
| 168 |
+
# =====================================================
|
| 169 |
+
# "peak_relative": threshold = peak_dB + amplitude_threshold_db (fixed offset from peak)
|
| 170 |
+
# - Simple but not adaptive to actual noise levels
|
| 171 |
+
# "noise_floor": threshold = percentile(dB, N) + delta_dB (RECOMMENDED)
|
| 172 |
+
# - Fully adaptive per-clip based on its own noise floor
|
| 173 |
+
# - Each clip analyzed independently - no fixed dB values needed
|
| 174 |
+
# - Better for diverse audio with varying noise levels
|
| 175 |
+
threshold_strategy: "noise_floor"
|
| 176 |
+
|
| 177 |
+
# Noise floor estimation percentile (used when threshold_strategy = noise_floor)
|
| 178 |
+
# Lower percentile = more conservative estimate of background noise
|
| 179 |
+
# Example: 5 = use the 5th percentile of dB values as the noise floor estimate (better for sparse sounds)
|
| 180 |
+
noise_floor_percentile: 2.0
|
| 181 |
+
|
| 182 |
+
# Delta above noise floor (dB) to set as threshold
|
| 183 |
+
# This is relative to EACH clip's own noise floor, not a fixed dB value
|
| 184 |
+
# Example: 8 dB above the clip's noise floor works well for most ESC-50 clips
|
| 185 |
+
# Higher = more aggressive (raises the threshold, removes more), Lower = more conservative (keeps more)
|
| 186 |
+
noise_floor_delta_db: 5.0
|
| 187 |
+
|
| 188 |
+
# Path to preprocessed ESC-50 data (effective durations + trimmed audio)
|
| 189 |
+
preprocessed_data_path: "/path/to/ESC-50_preprocessed"
|
| 190 |
+
|
| 191 |
+
# =====================================================
|
| 192 |
+
# Duration gap multipliers
|
| 193 |
+
# =====================================================
|
| 194 |
+
# For LONGEST questions: target_effective >= max_background × multiplier_longest
|
| 195 |
+
multiplier_longest: 1.5
|
| 196 |
+
# For SHORTEST questions: target_effective <= min_background × multiplier_shortest
|
| 197 |
+
# Using 0.75 instead of 0.5 for smaller gaps (easier to distinguish)
|
| 198 |
+
multiplier_shortest: 0.75
|
| 199 |
+
|
| 200 |
+
# Minimum effective duration per source (seconds)
|
| 201 |
+
# Clips with less than this duration are harder to distinguish
|
| 202 |
+
min_effective_duration_per_source: 1.0
|
| 203 |
+
|
| 204 |
+
# =====================================================
|
| 205 |
+
# Fallback/rejection options
|
| 206 |
+
# =====================================================
|
| 207 |
+
# Reject sample if duration gap cannot be satisfied
|
| 208 |
+
reject_if_gap_not_met: true
|
| 209 |
+
# Try different clips from same class if one clip isn't enough
|
| 210 |
+
sample_different_clips_same_class: true
|
| 211 |
+
|
| 212 |
+
# Question types
|
| 213 |
+
question_types: ["shortest", "longest"]
|
| 214 |
+
# MCQ questions
|
| 215 |
+
mcq_questions:
|
| 216 |
+
shortest: "Which of the following sounds is heard for the shortest duration?"
|
| 217 |
+
longest: "Which of the following sounds is heard for the longest duration?"
|
| 218 |
+
# Open-text questions
|
| 219 |
+
open_text_questions:
|
| 220 |
+
shortest: "Which sound is heard for the shortest duration in the audio?"
|
| 221 |
+
longest: "Which sound is heard for the longest duration in the audio?"
|
| 222 |
+
|
| 223 |
+
order:
|
| 224 |
+
enabled: true
|
| 225 |
+
# Total duration for ALL samples in this task combined (in hours)
|
| 226 |
+
task_duration_size: 2.0 # hours
|
| 227 |
+
|
| 228 |
+
# Maximum clips to join per sample (minimum 2 for ordering)
|
| 229 |
+
# Actual number will be subsampled from max(2, max_clips-3) to min(max_clips, max_clips_per_sample)
|
| 230 |
+
max_clips_per_sample: 10
|
| 231 |
+
|
| 232 |
+
# Whether to allow repeating clips from the same source category
|
| 233 |
+
# If true: sequence could be [dog, dog, cat, bird] (same clip repeated)
|
| 234 |
+
# If false: sequence is always unique sources
|
| 235 |
+
allow_source_repetition: false
|
| 236 |
+
|
| 237 |
+
# Minimum clips needed for "second" and "second_last" questions
|
| 238 |
+
# Set to 4 to ensure second and second_last refer to different positions
|
| 239 |
+
# (with 3 clips, both would refer to the middle clip at position 1; NOTE: the value below is currently 3)
|
| 240 |
+
min_clips_for_second_questions: 3
|
| 241 |
+
|
| 242 |
+
# Question types: "first", "last", "after", "before", "second", "second_last"
|
| 243 |
+
# "second" and "second_last" only generated when n_clips >= min_clips_for_second_questions
|
| 244 |
+
question_types: ["first", "last", "after", "before", "second", "second_last"]
|
| 245 |
+
|
| 246 |
+
# MCQ question templates
|
| 247 |
+
mcq_questions:
|
| 248 |
+
first: "Which sound appears first in the audio clip?"
|
| 249 |
+
last: "Which sound appears last in the audio clip?"
|
| 250 |
+
after: "Which sound comes after {sound1}?"
|
| 251 |
+
before: "Which sound comes before {sound2}?"
|
| 252 |
+
second: "Which sound appears second in the audio clip?"
|
| 253 |
+
second_last: "Which sound appears second to last in the audio clip?"
|
| 254 |
+
# Open-text question templates
|
| 255 |
+
open_text_questions:
|
| 256 |
+
first: "What is the first sound you hear in the audio?"
|
| 257 |
+
last: "What is the last sound you hear in the audio?"
|
| 258 |
+
after: "What sound comes after {sound1}?"
|
| 259 |
+
before: "What sound comes before {sound2}?"
|
| 260 |
+
second: "What is the second sound you hear in the audio?"
|
| 261 |
+
second_last: "What sound is second to last in the audio?"
|
| 262 |
+
sequence: "List the sounds in the order they appear in the audio."
|
| 263 |
+
|
| 264 |
+
volume:
|
| 265 |
+
enabled: true
|
| 266 |
+
# Total duration for ALL samples in this task combined (in hours)
|
| 267 |
+
task_duration_size: 2.0 # hours
|
| 268 |
+
|
| 269 |
+
# Maximum clips with different volumes per sample
|
| 270 |
+
# Actual number will be subsampled from max(2, max_clips-3) to min(max_clips, max_clips_per_sample)
|
| 271 |
+
max_clips_per_sample: 10
|
| 272 |
+
|
| 273 |
+
# =====================================================
|
| 274 |
+
# Normalization settings (CRITICAL for volume comparison)
|
| 275 |
+
# =====================================================
|
| 276 |
+
# All clips are FIRST normalized to baseline, THEN volume adjusted
|
| 277 |
+
# This ensures volume differences are controlled and comparable
|
| 278 |
+
normalize_to_baseline: true
|
| 279 |
+
baseline_dBFS: -20.0 # Normalize all clips to this level first (used if use_lufs=false)
|
| 280 |
+
|
| 281 |
+
# =====================================================
|
| 282 |
+
# LUFS (Perceived Loudness) Settings
|
| 283 |
+
# =====================================================
|
| 284 |
+
# LUFS (Loudness Units relative to Full Scale) measures PERCEIVED loudness
|
| 285 |
+
# Unlike dBFS which only measures RMS amplitude, LUFS accounts for
|
| 286 |
+
# human hearing sensitivity to different frequencies (K-weighting)
|
| 287 |
+
#
|
| 288 |
+
# IMPORTANT: For volume comparison task, we DISABLE LUFS normalization!
|
| 289 |
+
# LUFS makes everything the same perceived loudness, defeating the purpose.
|
| 290 |
+
# Instead, we normalize to a baseline dBFS then apply LARGE volume adjustments.
|
| 291 |
+
use_lufs: false # DISABLED for audible volume differences
|
| 292 |
+
baseline_lufs: -23.0 # EBU R128 standard (not used when use_lufs=false)
|
| 293 |
+
|
| 294 |
+
# =====================================================
|
| 295 |
+
# Volume gap multipliers (similar to duration task)
|
| 296 |
+
# =====================================================
|
| 297 |
+
# For MAX_LOUDNESS questions: target_loudness >= second_loudest × multiplier_max
|
| 298 |
+
# Multiplier 2.5 = ~8dB difference = clearly audible
|
| 299 |
+
# Multiplier 4.0 = ~12dB difference = very obvious (4x perceived loudness)
|
| 300 |
+
multiplier_max_loudness: 4.0
|
| 301 |
+
|
| 302 |
+
# For MIN_LOUDNESS questions: target_loudness <= second_softest × multiplier_min
|
| 303 |
+
# Multiplier 0.25 = ~12dB quieter = clearly distinguishable
|
| 304 |
+
multiplier_min_loudness: 0.25
|
| 305 |
+
|
| 306 |
+
# Reject sample if loudness gap cannot be satisfied
|
| 307 |
+
reject_if_gap_not_met: true
|
| 308 |
+
|
| 309 |
+
# =====================================================
|
| 310 |
+
# Source clip options
|
| 311 |
+
# =====================================================
|
| 312 |
+
# If true: same clip can be repeated at different volumes
|
| 313 |
+
# If false: always use different source clips (default behavior)
|
| 314 |
+
use_same_clip_different_volumes: false
|
| 315 |
+
|
| 316 |
+
# If use_same_clip_different_volumes is true, how many repetitions per source?
|
| 317 |
+
# Can be a single int or list for variety
|
| 318 |
+
repetitions_per_source: [2, 3, 4]
|
| 319 |
+
|
| 320 |
+
# Question types: "max_loudness", "min_loudness"
|
| 321 |
+
question_types: ["max_loudness", "min_loudness"]
|
| 322 |
+
|
| 323 |
+
# MCQ questions
|
| 324 |
+
mcq_questions:
|
| 325 |
+
max_loudness: "Which sound has the maximum loudness in the audio?"
|
| 326 |
+
min_loudness: "Which sound has the minimum loudness in the audio?"
|
| 327 |
+
# Open-text questions
|
| 328 |
+
open_text_questions:
|
| 329 |
+
max_loudness: "Identify the sound with maximum loudness in the audio clip."
|
| 330 |
+
min_loudness: "Identify the sound with minimum loudness in the audio clip."
|
| 331 |
+
order_volume: "List the sounds in order from maximum to minimum loudness."
|
| 332 |
+
|
| 333 |
+
# MCQ options configuration
|
| 334 |
+
mcq:
|
| 335 |
+
num_options: 4
|
| 336 |
+
option_labels: ["A", "B", "C", "D"]
|
| 337 |
+
# Strategy for generating distractor options
|
| 338 |
+
# "present_only": only use sounds present in audio
|
| 339 |
+
# "mixed": mix of present and absent sounds
|
| 340 |
+
# "balanced": balanced distribution
|
| 341 |
+
distractor_strategy: "balanced"
|
| 342 |
+
|
| 343 |
+
# Logging configuration
|
| 344 |
+
logging:
|
| 345 |
+
level: "INFO" # DEBUG, INFO, WARNING, ERROR, CRITICAL
|
| 346 |
+
log_file: "pipeline.log"
|
| 347 |
+
console_output: true
|
| 348 |
+
|
llm_answer_generator.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import argparse
|
| 3 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 4 |
+
import torch
|
| 5 |
+
import random
|
| 6 |
+
|
| 7 |
+
# Convert MCQ CSV to NL answers using a text-only LLM (meta-llama/Llama-3.1-8B-Instruct)
|
| 8 |
+
# Adds: (1) stronger LLM-driven variability for duration/volume in open_text mode via system prompt
|
| 9 |
+
# (2) --one_word_ratio (default 0.2) to skip forward pass for a fraction of rows,
|
| 10 |
+
# outputting the normalized (underscore-removed) answer only.
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def convert_to_natural_phrase(val):
    """Turn an underscore-separated token into a space-separated phrase.

    Values that are not strings, or strings without an underscore, are
    returned unchanged.
    """
    has_underscore = isinstance(val, str) and "_" in val
    return val.replace("_", " ") if has_underscore else val
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def generate_answer(tokenizer, model, question, correct_value, device, mode="mcq"):
    """Generate a natural language answer using a text-only LLM.

    Args:
        tokenizer: Tokenizer exposing ``apply_chat_template`` and ``decode``.
        model: Causal LM exposing ``generate``.
        question: The question text inserted into the user prompt.
        correct_value: The short ground-truth answer; underscores are turned
            into spaces before it is placed in the prompt.
        device: Device the encoded prompt is moved to before generation.
        mode: "mcq" (default) uses the original MCQ-oriented prompt.
            "open_text" uses a direct rewrite prompt for provided
            question/answer pairs.

    Returns:
        The decoded, whitespace-stripped text generated after the prompt.
    """
    correct_value = convert_to_natural_phrase(correct_value)

    if mode == "open_text":
        system_preamble = (
            "You convert (Question, short Answer) into EXACTLY ONE natural English sentence that answers the Question.\n\n"
            "HARD RULES:\n"
            "- Output exactly ONE sentence. No newlines, no bullet points, no labels, no quotes.\n"
            "- Use ONLY the provided Answer content as the factual answer; do not add any new facts.\n"
            "- Be concise and direct.\n"
            "- Do NOT include any numbers unless the question is a COUNT question.\n"
            "- Vary phrasing strongly across items; avoid repeating the same structure.\n\n"
            "VARIABILITY REQUIREMENT (IMPORTANT):\n"
            "- For all questions, you MUST vary sentence structure.\n"
            "- Randomly choose ONE of these patterns each time:\n"
            " (A) Start with the sound name (Answer) -> then the relation.\n"
            " (B) Start with the relation -> then the sound name (Answer).\n"
            " (C) Use an 'it`s...' style clause after the Answer.\n"
            " (D) Use a short, natural rephrase with different verbs (e.g., lasts, continues, stands out, comes through).\n"
            "- Do not always use 'The sound with the ... is ...' — that pattern should be rare.\n\n"
            "TASK HANDLING (infer from the Question):\n"
            "- COUNT questions (how many / count / number):\n"
            " * If Answer is numeric, write it EITHER as digits (e.g., 10) OR as a word (e.g., ten). Do NOT include both.\n"
            "- DURATION questions (longest/shortest):\n"
            " * Clearly state longest vs shortest, and use the Answer as the sound name. Do not include any numbers.\n"
            "- VOLUME questions (minimum/maximum loudness, quietest/loudest):\n"
            " * Match minimum vs maximum loudness and use the Answer as the sound name. No dB values.\n"
            "- ORDER questions (first/second/before/after/second-to-last):\n"
            " * Match the requested relation and use the Answer as the sound name.\n\n"
            "Return only the sentence."
        )

        user_prompt = (
            f"Question: {question}\n"
            f"Answer: {correct_value}\n"
            "Rewrite the answer as a single, natural sentence that directly answers the question."
        )
    else:
        system_preamble = (
            "You are a helpful assistant that converts multiple-choice QA pairs into natural language answers.\n"
            "CRITICAL RULES:\n"
            "1. Write as a human would naturally speak - vary sentence structure and avoid repetitive patterns\n"
            "2. Keep responses concise but natural and affirmative avoiding words like 'might/may' or 'could' - one clear sentence\n"
            "3. Do not mention 'among the options/among the following' even if the question mentions it. This natural language statement is supposed to be a direct answer.\n"
            "4. Do NOT invent sounds.\n"
            "5. Do not reason to answer the question, you're just supposed to provide the correct mcq answer as a natural language answer in a single sentence.\n"
            "Return only the natural language answer, nothing else."
        )
        user_prompt = (
            f"Now, given the question: '{question}' and the correct answer: '{correct_value}', "
            f"write one natural-language answer as you would expect from a human."
        )

    # Chat format
    messages = [
        {"role": "system", "content": system_preamble},
        {"role": "user", "content": user_prompt},
    ]
    # return_dict=True yields input_ids together with attention_mask. Passing
    # the mask explicitly matters because pad_token_id is set to eos_token_id
    # below, so generate() cannot infer the mask from the ids on its own.
    encoded = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(device)

    prompt_length = encoded["input_ids"].shape[1]

    with torch.no_grad():
        output = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_new_tokens=64,
            do_sample=True,
            temperature=0.8,
            top_p=0.9,
            repetition_penalty=1.05,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Keep only the newly generated tokens; the prompt is echoed at the front.
    generated_ids = output[0, prompt_length:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    print(f"Model response: {response}")
    return response
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def detect_csv_format(df):
    """
    Work out which CSV layout ``df`` uses and return its column mapping.

    Layouts are checked in this order:
    - original MCQ format ("correct", "id", "audio_path" present)
    - perturbed MCQ format ("answer", "idx", "new_audio_path" present)
    - open-text format ("answer" and "question" present)

    Returns a dict with the keys "id_col", "audio_path_col", "answer_col",
    "question_col" and "format_type". Raises ValueError when no layout matches.
    """
    columns = df.columns.tolist()

    def has_all(*names):
        return all(name in columns for name in names)

    def layout(format_type, id_col, audio_path_col, answer_col):
        return {
            "id_col": id_col,
            "audio_path_col": audio_path_col,
            "answer_col": answer_col,
            "question_col": "question",
            "format_type": format_type,
        }

    # Original format (count.csv)
    if has_all("correct", "id", "audio_path"):
        return layout("original", "id", "audio_path", "correct")

    # Perturbed format (count_perturbed.csv)
    if has_all("answer", "idx", "new_audio_path"):
        return layout("perturbed", "idx", "new_audio_path", "answer")

    # Open-text format; id/audio_path are optional here
    if has_all("answer", "question"):
        optional_id = "id" if "id" in columns else None
        optional_audio = "audio_path" if "audio_path" in columns else None
        return layout("open_text", optional_id, optional_audio, "answer")

    raise ValueError(f"Unknown CSV format. Columns found: {columns}")
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def main():
    """CLI entry point: append an ``open_text_answer`` column to a CSV.

    Reads the input CSV, detects its layout, and for every row either copies
    the underscore-normalized short answer (with probability
    ``--one_word_ratio``) or asks the LLM to phrase it as a sentence. The
    result is written to ``--output``, or back to ``--input`` when no output
    path is given.

    Raises:
        ValueError: If ``--mode`` does not match the detected CSV layout, or
            an MCQ row's correct-option letter is not one of A/B/C/D.
    """
    parser = argparse.ArgumentParser(
        description="Convert CSV to NL answers (MCQ or open-text) using meta-llama/Llama-3.1-8B-Instruct"
    )
    parser.add_argument("--input", required=True, help="Input CSV file")
    parser.add_argument("--output", required=False, help="Output CSV file (defaults to input for in-place append)")
    parser.add_argument(
        "--mode",
        required=True,
        choices=["mcq", "open_text"],
        help="Conversion mode: mcq -> convert MCQ correct option to natural answer; open_text -> rewrite provided short answer to a natural sentence",
    )
    parser.add_argument(
        "--task",
        required=True,
        choices=["count", "duration", "order", "volume"],
        help="Task type this CSV belongs to (used for bookkeeping/logging)",
    )

    # NEW: one-word skipping
    parser.add_argument(
        "--one_word_ratio",
        type=float,
        default=0.2,
        help="Fraction of samples to output as just the normalized one-word/phrase answer (no LLM forward pass). Default 0.2",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=123,
        help="Random seed for reproducible one_word sampling.",
    )

    args = parser.parse_args()
    random.seed(args.seed)

    # Read and validate the CSV BEFORE loading the model, so a wrong file or
    # mode fails immediately instead of after the model has been loaded.
    df = pd.read_csv(args.input)

    # Detect CSV format and get column mappings
    format_info = detect_csv_format(df)
    print(f"Detected CSV format: {format_info['format_type']}")
    is_open_text = format_info["format_type"] == "open_text"

    # Validate requested mode against detected CSV format
    if args.mode == "mcq" and is_open_text:
        raise ValueError(
            "Requested mode=mcq but input appears to be open_text format. Use --mode open_text or supply an MCQ CSV."
        )
    if args.mode == "open_text" and not is_open_text:
        raise ValueError(
            "Requested mode=open_text but input does not appear to be open_text format. Use --mode mcq or supply an open_text CSV."
        )

    print("Loading meta-llama/Llama-3.1-8B-Instruct tokenizer and model...")
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct", use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-3.1-8B-Instruct",
        torch_dtype="auto",
        device_map="auto",
    )
    model.eval()

    output_path = args.output if args.output else args.input
    device = model.device

    question_col = format_info["question_col"]
    answer_col = format_info["answer_col"]
    option_map = {"A": "optionA", "B": "optionB", "C": "optionC", "D": "optionD"}
    total_rows = len(df)

    # One answer per input row, in row order. A plain list is assigned
    # positionally, so an empty CSV simply produces an empty column.
    nl_answers = []

    for position, (_, row) in enumerate(df.iterrows(), start=1):
        question = row[question_col]

        # Resolve correct_value from CSV format
        if is_open_text:
            correct_value = row[answer_col]
        else:
            raw_letter = row[answer_col]
            # Tolerate lowercase / padded letters; fail clearly on anything else.
            correct_letter = str(raw_letter).strip().upper()
            if correct_letter not in option_map:
                raise ValueError(
                    f"Row {position}: unsupported correct option {raw_letter!r}; expected one of A, B, C, D."
                )
            correct_value = row[option_map[correct_letter]]

        # Normalize underscores BEFORE deciding one_word skip
        correct_value = convert_to_natural_phrase(correct_value)

        print(f"[{position}/{total_rows}] Q: {question} | Ans: {correct_value}")

        # one_word_ratio fraction: one-word/phrase answer, no forward pass
        if random.random() < args.one_word_ratio:
            nl_answer = correct_value
            print(f"Skipped LLM (one_word_ratio). Output: {nl_answer}")
        else:
            nl_answer = generate_answer(
                tokenizer,
                model,
                question,
                correct_value,
                device,
                mode=("open_text" if is_open_text else "mcq"),
            )

        nl_answers.append(nl_answer)

    # Add the answers as a new column on the original frame to preserve all fields
    df["open_text_answer"] = nl_answers
    df.to_csv(output_path, index=False)
    print(f"Appended natural language answers to {output_path}")
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
if __name__ == "__main__":
|
| 268 |
+
main()
|
main.py
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Main pipeline runner for temporal reasoning audio dataset generation.
|
| 3 |
+
|
| 4 |
+
This script orchestrates the generation of all task datasets.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import argparse
|
| 8 |
+
import sys
|
| 9 |
+
import yaml
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import List, Optional
|
| 12 |
+
|
| 13 |
+
# Add project root to path
|
| 14 |
+
sys.path.append(str(Path(__file__).parent))
|
| 15 |
+
|
| 16 |
+
from utils import setup_logger, set_random_seed
|
| 17 |
+
from tasks.task_count import CountTaskGenerator
|
| 18 |
+
from tasks.task_duration import DurationTaskGenerator
|
| 19 |
+
from tasks.task_order import OrderTaskGenerator
|
| 20 |
+
from tasks.task_volume import VolumeTaskGenerator
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def load_config(config_path: str) -> dict:
    """Load configuration from YAML file.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        The parsed top-level mapping.

    Raises:
        ValueError: If the file is empty or its top level is not a mapping.
    """
    # Decode as UTF-8 explicitly so parsing does not depend on the platform's
    # default locale encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # safe_load returns None for an empty document; fail here with a clear
    # message rather than with a TypeError on the first config[...] lookup.
    if not isinstance(config, dict):
        raise ValueError(
            f"Config file {config_path} must contain a top-level mapping, got {type(config).__name__}"
        )
    return config
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _log_category_usage(generator, logger, answer_role: str) -> None:
    """Log min / max / mean of the generator dataset's category usage counts."""
    usage_stats = generator.dataset.get_category_usage_stats()
    sorted_stats = sorted(usage_stats.items(), key=lambda x: x[1], reverse=True)
    logger.info(f"Category usage statistics ({answer_role}):")
    if not sorted_stats:
        # Nothing recorded: avoid indexing an empty list / dividing by zero.
        logger.info(" No category usage recorded")
        return
    logger.info(f" Min usage: {sorted_stats[-1][1]} (category: {sorted_stats[-1][0]})")
    logger.info(f" Max usage: {sorted_stats[0][1]} (category: {sorted_stats[0][0]})")
    logger.info(f" Mean usage: {sum(usage_stats.values()) / len(usage_stats):.2f}")


def _run_task(config: dict, logger, task_key: str, make_generator, answer_role: str) -> None:
    """Run one task end to end: enabled check, banner, generation, usage report.

    ``make_generator`` is a zero-argument callable so the generator is only
    constructed when the task is enabled, after the start banner is logged.
    """
    label = task_key.capitalize()
    if not config['tasks'][task_key]['enabled']:
        logger.info(f"{label} task is disabled, skipping...")
        return

    logger.info("=" * 80)
    logger.info(f"STARTING {task_key.upper()} TASK GENERATION")
    logger.info("=" * 80)

    generator = make_generator()
    generator.dataset.reset_category_usage()  # Reset counter for this task
    generator.generate_dataset()

    _log_category_usage(generator, logger, answer_role)

    logger.info(f"{label} task completed successfully!")


def run_count_task(config: dict, logger):
    """Run the count task generation."""
    _run_task(
        config, logger, 'count',
        lambda: CountTaskGenerator(config, logger),
        "as answers",
    )


def run_duration_task(config: dict, logger):
    """Run the duration task generation."""
    _run_task(
        config, logger, 'duration',
        lambda: DurationTaskGenerator(config, logger),
        "as longest/shortest answers",
    )


def run_order_task(config: dict, logger):
    """Run the order task generation."""
    _run_task(
        config, logger, 'order',
        lambda: OrderTaskGenerator(config, logger),
        "as first/last/after/before answers",
    )


def run_volume_task(config: dict, logger):
    """Run the volume task generation."""
    _run_task(
        config, logger, 'volume',
        lambda: VolumeTaskGenerator(config, logger),
        "as loudest/softest answers",
    )
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def run_pipeline(
    config_path: str,
    tasks: Optional[List[str]] = None,
    output_path: Optional[str] = None
):
    """
    Drive the whole dataset generation pipeline from a single config file.

    Args:
        config_path: Path to configuration YAML file
        tasks: Optional list of specific tasks to run (default: all enabled tasks)
        output_path: Optional custom output path (overrides config)

    Any exception raised by a task is logged with its traceback and re-raised.
    """
    banner = "=" * 80

    config = load_config(config_path)

    # A caller-supplied output path takes precedence over the configured one
    if output_path:
        config['output']['base_path'] = output_path

    output_base = Path(config['output']['base_path'])
    output_base.mkdir(parents=True, exist_ok=True)

    set_random_seed(config['random_seed'])

    # Main logger writes into the output directory
    logger = setup_logger(
        'pipeline',
        log_file=str(output_base / config['logging']['log_file']),
        level=config['logging']['level'],
        console_output=config['logging']['console_output']
    )

    logger.info(banner)
    logger.info("TEMPORAL REASONING AUDIO DATASET GENERATION PIPELINE")
    logger.info(banner)
    logger.info(f"Configuration: {config_path}")
    logger.info(f"Output directory: {output_base}")
    logger.info(f"Random seed: {config['random_seed']}")
    logger.info(f"ESC-50 audio path: {config['esc50']['audio_path']}")
    logger.info(f"ESC-50 metadata path: {config['esc50']['metadata_path']}")

    # Task name -> runner, in execution order
    runners = {
        'count': run_count_task,
        'duration': run_duration_task,
        'order': run_order_task,
        'volume': run_volume_task
    }

    if not tasks:
        selected = runners
        logger.info("Running all enabled tasks")
    else:
        selected = {name: runner for name, runner in runners.items() if name in tasks}
        logger.info(f"Running specific tasks: {', '.join(tasks)}")

    for name, runner in selected.items():
        try:
            runner(config, logger)
        except Exception as e:
            logger.error(f"Error running {name} task: {e}", exc_info=True)
            raise

    logger.info(banner)
    logger.info("PIPELINE COMPLETED SUCCESSFULLY!")
    logger.info(banner)
    logger.info(f"All outputs saved to: {output_base}")
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
def main():
    """Command-line entry point: parse arguments, find the config, run the pipeline.

    The config path is tried exactly as given first and, if that does not
    exist, relative to the directory containing this script. The process
    exits with status 1 when no config file is found or when the pipeline
    raises an exception.
    """
    cli = argparse.ArgumentParser(
        description="Temporal Reasoning Audio Dataset Generation Pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Run all tasks with default config
python main.py

# Run with custom config
python main.py --config my_config.yaml

# Run specific tasks only
python main.py --tasks count duration

# Use custom output directory
python main.py --output /path/to/output

# Combine options
python main.py --config custom.yaml --tasks count order --output ./my_dataset
"""
    )

    cli.add_argument(
        '--config', '-c',
        type=str,
        default='config.yaml',
        help='Path to configuration YAML file (default: config.yaml)'
    )

    cli.add_argument(
        '--tasks', '-t',
        nargs='+',
        choices=['count', 'duration', 'order', 'volume'],
        help='Specific tasks to run (default: all enabled tasks)'
    )

    cli.add_argument(
        '--output', '-o',
        type=str,
        help='Custom output directory (overrides config)'
    )

    args = cli.parse_args()

    # Resolve the config file: as given first, then next to this script.
    config_path = Path(args.config)
    if not config_path.exists():
        config_path = Path(__file__).parent / args.config
    if not config_path.exists():
        print(f"Error: Config file not found: {args.config}")
        sys.exit(1)

    # Top-level boundary: report any pipeline failure and exit non-zero.
    try:
        run_pipeline(
            config_path=str(config_path),
            tasks=args.tasks,
            output_path=args.output
        )
    except Exception as e:
        print(f"Pipeline failed with error: {e}")
        sys.exit(1)
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
if __name__ == '__main__':
|
| 272 |
+
main()
|
preprocess_esc50.py
ADDED
|
@@ -0,0 +1,714 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
ESC-50 Preprocessing Script for Duration Task
|
| 4 |
+
|
| 5 |
+
This script processes all ESC-50 audio clips to:
|
| 6 |
+
1. Apply amplitude-based filtering to detect actual sound regions
|
| 7 |
+
2. Calculate effective duration (portion containing actual sound)
|
| 8 |
+
3. Save trimmed audio files (with silence removed)
|
| 9 |
+
4. Generate a CSV with all metadata including effective durations
|
| 10 |
+
|
| 11 |
+
Usage:
|
| 12 |
+
python preprocess_esc50.py --config config.yaml
|
| 13 |
+
python preprocess_esc50.py --config config.yaml --threshold-db -40 --min-sound-ms 50
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import argparse
|
| 17 |
+
import os
|
| 18 |
+
import sys
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
from typing import Dict, List, Optional, Tuple
|
| 21 |
+
|
| 22 |
+
import numpy as np
|
| 23 |
+
import pandas as pd
|
| 24 |
+
from pydub import AudioSegment
|
| 25 |
+
from tqdm import tqdm
|
| 26 |
+
|
| 27 |
+
# Add parent directory to path for imports
|
| 28 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 29 |
+
|
| 30 |
+
from utils.logger import setup_logger
|
| 31 |
+
|
| 32 |
+
logger = setup_logger(__name__)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def get_amplitude_array(audio: AudioSegment) -> np.ndarray:
    """
    Convert AudioSegment to a mono numpy array of normalized amplitudes.

    Multi-channel audio (stereo or more) is down-mixed by averaging the
    channels of each frame; mono audio is passed through unchanged.

    Args:
        audio: Input audio segment

    Returns:
        Numpy array of amplitude values (normalized to -1 to 1), one value
        per frame
    """
    samples = np.array(audio.get_array_of_samples())

    # The flat sample array is treated as frame-interleaved: one row per
    # frame, one column per channel. Averaging the columns works for any
    # channel count, not just stereo.
    if audio.channels > 1:
        samples = samples.reshape((-1, audio.channels)).mean(axis=1)

    # Full-scale magnitude of a signed integer sample of this byte width.
    full_scale = float(2 ** (audio.sample_width * 8 - 1))
    return samples / full_scale
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def compute_rms_envelope(samples: np.ndarray, frame_size_ms: int, hop_size_ms: int,
                         sample_rate: int) -> Tuple[np.ndarray, np.ndarray]:
    """
    Compute RMS envelope of audio signal.

    The signal is cut into frames of ``frame_size_ms`` taken every
    ``hop_size_ms``; only complete frames are used, so a signal shorter than
    one frame yields two empty arrays.

    Args:
        samples: Audio samples as numpy array
        frame_size_ms: Frame size in milliseconds
        hop_size_ms: Hop size in milliseconds
        sample_rate: Audio sample rate

    Returns:
        Tuple of (rms_values, time_stamps_ms), where each time stamp is the
        start of the corresponding frame in milliseconds
    """
    # Clamp both sizes to at least one sample: at low sample rates the
    # millisecond values truncate to 0, which would make range() raise
    # (zero step) or produce empty frames whose mean is NaN.
    frame_size = max(1, int(sample_rate * frame_size_ms / 1000))
    hop_size = max(1, int(sample_rate * hop_size_ms / 1000))

    frame_starts = range(0, len(samples) - frame_size + 1, hop_size)

    rms_values = [
        np.sqrt(np.mean(samples[start:start + frame_size] ** 2))
        for start in frame_starts
    ]
    time_stamps = [start / sample_rate * 1000 for start in frame_starts]  # Convert to ms

    return np.array(rms_values), np.array(time_stamps)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def rms_to_db(rms: np.ndarray, reference: float = 1.0) -> np.ndarray:
    """
    Express RMS levels in decibels relative to a reference level.

    Args:
        rms: RMS values
        reference: Level that maps to 0 dB (default 1.0 for normalized audio)

    Returns:
        dB values; inputs at or below 1e-10 are floored to that value first
    """
    # Floor the input so that log10 never sees zero.
    floor = 1e-10
    ratio = np.maximum(rms, floor) / reference
    return 20 * np.log10(ratio)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def detect_sound_regions(
    audio: AudioSegment,
    threshold_db: float = -40.0,
    min_sound_duration_ms: int = 50,
    frame_size_ms: int = 20,
    hop_size_ms: int = 10,
    merge_gap_ms: int = 100,
    threshold_strategy: str = 'noise_floor',
    noise_floor_percentile: float = 10.0,
    noise_floor_delta_db: float = 15.0
) -> List[Tuple[int, int]]:
    """
    Detect regions in audio that contain actual sound (above threshold).

    Supports two threshold strategies:
    - 'noise_floor': threshold = percentile(db_values, p) + delta_db,
      capped at 1 dB below the clip's peak (adaptive per-clip)
    - 'peak_relative': threshold = peak_db + threshold_db (old behavior);
      any strategy value other than 'noise_floor' takes this path

    The 'noise_floor' strategy is recommended as it adapts to each clip's
    actual background noise level rather than using a fixed offset from peak.

    Regions shorter than min_sound_duration_ms are dropped first; the
    surviving regions are then merged when the gap between them is at most
    merge_gap_ms.

    Args:
        audio: Input audio segment
        threshold_db: dB offset from peak, normally negative (used if strategy='peak_relative')
        min_sound_duration_ms: Minimum duration of sound region to keep
        frame_size_ms: Frame size for RMS computation
        hop_size_ms: Hop size for RMS computation
        merge_gap_ms: Merge regions separated by at most this gap
        threshold_strategy: 'peak_relative' or 'noise_floor'
        noise_floor_percentile: Percentile for noise floor estimation (default 10)
        noise_floor_delta_db: dB above noise floor to set threshold (default 15)

    Returns:
        List of (start_ms, end_ms) tuples for sound regions; empty when the
        clip is shorter than one frame or is digitally silent
    """
    samples = get_amplitude_array(audio)
    sample_rate = audio.frame_rate

    # Compute RMS envelope
    rms_values, time_stamps = compute_rms_envelope(
        samples, frame_size_ms, hop_size_ms, sample_rate
    )

    if len(rms_values) == 0:
        return []

    # A digitally silent clip has no sound at all. Without this guard every
    # frame sits on the dB floor applied by rms_to_db, the threshold lands
    # below that floor, and the whole clip would be reported as one region.
    if not np.any(rms_values > 0):
        return []

    # Convert to dB
    db_values = rms_to_db(rms_values)

    # Compute threshold based on strategy
    peak_db = np.max(db_values)

    if threshold_strategy == 'noise_floor':
        # ADAPTIVE: Use noise floor (low percentile) + delta
        # This adapts to each clip's actual background noise level
        noise_floor_db = np.percentile(db_values, noise_floor_percentile)
        absolute_threshold = noise_floor_db + noise_floor_delta_db

        # Safeguard: don't exceed peak (would detect nothing)
        # Leave at least 1 dB below peak
        absolute_threshold = min(absolute_threshold, peak_db - 1.0)

        logger.debug(
            f"Noise-floor threshold: floor={noise_floor_db:.1f}dB (p{noise_floor_percentile}), "
            f"delta={noise_floor_delta_db}dB, threshold={absolute_threshold:.1f}dB, peak={peak_db:.1f}dB"
        )
    else:
        # OLD: peak-relative threshold
        absolute_threshold = peak_db + threshold_db  # threshold_db is negative
        logger.debug(
            f"Peak-relative threshold: peak={peak_db:.1f}dB, offset={threshold_db}dB, "
            f"threshold={absolute_threshold:.1f}dB"
        )

    # Find frames above threshold
    above_threshold = db_values > absolute_threshold

    # Walk the frames and collect contiguous above-threshold runs.
    # region_start is None while outside a run (0.0 is a valid start time).
    regions = []
    region_start = None

    for is_above, time_ms in zip(above_threshold, time_stamps):
        if is_above and region_start is None:
            # Start of new region
            region_start = time_ms
        elif not is_above and region_start is not None:
            # End of region: the first below-threshold frame closes it
            if time_ms - region_start >= min_sound_duration_ms:
                regions.append((int(region_start), int(time_ms)))
            region_start = None

    # Handle case where audio ends while still in a region
    if region_start is not None:
        region_end = time_stamps[-1] + hop_size_ms
        if region_end - region_start >= min_sound_duration_ms:
            regions.append((int(region_start), int(region_end)))

    # Merge regions that are close together
    merged_regions = []
    for start, end in regions:
        if merged_regions and start - merged_regions[-1][1] <= merge_gap_ms:
            # Extend the previous region to cover this one
            merged_regions[-1] = (merged_regions[-1][0], end)
        else:
            merged_regions.append((start, end))

    return merged_regions
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
def get_sound_regions(
    audio: AudioSegment,
    threshold_db: float = -40.0,
    min_sound_duration_ms: int = 50,
    threshold_strategy: str = 'noise_floor',
    noise_floor_percentile: float = 10.0,
    noise_floor_delta_db: float = 15.0
) -> List[Tuple[int, int]]:
    """
    Convenience wrapper around detect_sound_regions().

    Forwards the threshold settings and leaves the frame size, hop size and
    merge gap at detect_sound_regions()'s defaults.

    Args:
        audio: Input audio segment
        threshold_db: dB threshold below peak (used if strategy='peak_relative')
        min_sound_duration_ms: Minimum duration of sound region to keep
        threshold_strategy: 'peak_relative' or 'noise_floor'
        noise_floor_percentile: Percentile for noise floor estimation
        noise_floor_delta_db: dB above noise floor to set threshold

    Returns:
        List of (start_ms, end_ms) tuples for sound regions
    """
    detection_options = dict(
        threshold_db=threshold_db,
        min_sound_duration_ms=min_sound_duration_ms,
        threshold_strategy=threshold_strategy,
        noise_floor_percentile=noise_floor_percentile,
        noise_floor_delta_db=noise_floor_delta_db,
    )
    return detect_sound_regions(audio, **detection_options)
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
def extract_sound_with_edges_trimmed(
    audio: AudioSegment,
    regions: List[Tuple[int, int]],
    min_silence_to_trim_ms: int = 100,
    buffer_ratio: float = 0.1,
    min_buffer_ms: int = 200
) -> AudioSegment:
    """
    Extract audio with ONLY leftmost and rightmost silence removed IF present.

    Trimming is ADAPTIVE:
    - An edge is only considered if its silence >= min_silence_to_trim_ms
    - A buffer of silence is kept next to the sound to preserve transients:
      max(min_buffer_ms, buffer_ratio * silence duration)
    - Because the buffer is never shorter than min_buffer_ms, an edge whose
      silence is no longer than the buffer is left untouched

    Preserves all internal structure and silence between sounds.
    Perfect for periodic sounds (clock ticks, footsteps, typing).

    Args:
        audio: Input audio segment
        regions: List of (start_ms, end_ms) tuples for sound regions, in time order
        min_silence_to_trim_ms: Only trim if edge silence is at least this long (default 100ms)
        buffer_ratio: Fraction of the edge silence to keep as buffer (default 0.1 = 10%)
        min_buffer_ms: Lower bound on the kept buffer (default 200ms)
            Example with defaults: 500ms silence -> keep 200ms; 3000ms silence -> keep 300ms

    Returns:
        Audio segment with edges trimmed (or original if no sound regions were given)
    """
    if not regions:
        # No sound detected - return original
        return audio

    # Find the overall sound boundaries (first sound start, last sound end)
    first_sound_start_ms = regions[0][0]
    last_sound_end_ms = regions[-1][1]
    audio_duration_ms = len(audio)

    # Calculate actual silence durations at edges
    leading_silence_ms = first_sound_start_ms
    trailing_silence_ms = audio_duration_ms - last_sound_end_ms

    def _buffer_for(silence_ms):
        # Silence to keep beside the sound so transients are not clipped.
        return max(min_buffer_ms, int(silence_ms * buffer_ratio))

    # Default to keeping the full clip; move an edge only when its silence
    # is long enough to be worth trimming.
    trim_start_ms = 0
    if leading_silence_ms >= min_silence_to_trim_ms:
        trim_start_ms = max(0, first_sound_start_ms - _buffer_for(leading_silence_ms))

    trim_end_ms = audio_duration_ms
    if trailing_silence_ms >= min_silence_to_trim_ms:
        trim_end_ms = min(audio_duration_ms, last_sound_end_ms + _buffer_for(trailing_silence_ms))

    # Extract the edge-trimmed portion (internal structure preserved)
    trimmed_audio = audio[trim_start_ms:trim_end_ms]

    logger.debug(
        f"Edge trim: {audio_duration_ms}ms -> {len(trimmed_audio)}ms "
        f"(leading_silence={leading_silence_ms}ms, trailing_silence={trailing_silence_ms}ms, "
        f"trim_start={trim_start_ms}ms, trim_end={trim_end_ms}ms)"
    )

    return trimmed_audio
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
def extract_all_sound_regions(
    audio: AudioSegment,
    regions: List[Tuple[int, int]],
    crossfade_ms: int = 10,
    padding_ms: int = 20
) -> AudioSegment:
    """
    Cut out every sound region and splice the pieces together, dropping ALL silence.

    WARNING: This destroys natural periodicity! Prefer
    extract_sound_with_edges_trimmed() for most use cases. This function is
    kept for backward compatibility.

    Args:
        audio: Input audio segment
        regions: List of (start_ms, end_ms) tuples for sound regions
        crossfade_ms: Crossfade duration when joining regions
        padding_ms: Padding around each region to avoid cutting transients

    Returns:
        Audio segment containing only sound portions (internal silence removed),
        or the input unchanged when no regions are given
    """
    if not regions:
        return audio

    total_ms = len(audio)

    # Slice each region with padding on both sides, clamped to the clip.
    pieces = [
        audio[max(0, start_ms - padding_ms):min(total_ms, end_ms + padding_ms)]
        for start_ms, end_ms in regions
    ]

    joined, *remaining = pieces
    for piece in remaining:
        # Crossfade only when both sides are longer than the fade itself;
        # otherwise fall back to plain concatenation.
        can_crossfade = len(joined) > crossfade_ms and len(piece) > crossfade_ms
        if can_crossfade:
            joined = joined.append(piece, crossfade=crossfade_ms)
        else:
            joined = joined + piece

    return joined
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
def process_esc50_dataset(
    audio_dir: str,
    metadata_path: str,
    output_dir: str,
    threshold_db: float = -40.0,
    min_sound_duration_ms: int = 50,
    save_trimmed_audio: bool = True,
    threshold_strategy: str = 'noise_floor',
    noise_floor_percentile: float = 10.0,
    noise_floor_delta_db: float = 15.0
) -> pd.DataFrame:
    """
    Process every clip listed in the ESC-50 metadata and record its durations.

    For each clip the sound regions are detected, leading/trailing silence is
    trimmed with extract_sound_with_edges_trimmed() (internal structure is
    preserved), and level/duration measurements are taken from the trimmed
    audio. A clip that fails is logged and recorded as a row with empty
    measurements plus an 'error' message; processing continues.

    Results are written to "<output_dir>/effective_durations.csv", trimmed
    clips (optionally) to "<output_dir>/trimmed_audio/", and a summary is
    printed.

    Threshold strategies:
    - 'peak_relative': threshold = peak_db + threshold_db (fixed offset from peak)
    - 'noise_floor': threshold = percentile(db, p) + delta (adapts to noise floor)

    Args:
        audio_dir: Path to ESC-50 audio directory
        metadata_path: Path to ESC-50 metadata CSV
        output_dir: Output directory for processed files
        threshold_db: dB threshold for silence detection (peak_relative mode)
        min_sound_duration_ms: Minimum sound duration to keep
        save_trimmed_audio: Whether to save trimmed audio files
        threshold_strategy: 'peak_relative' or 'noise_floor' (recommended)
        noise_floor_percentile: Percentile for noise floor estimation (default 10)
        noise_floor_delta_db: dB above noise floor to set threshold (default 15)

    Returns:
        DataFrame with processed metadata
    """
    # Load original metadata
    original_metadata = pd.read_csv(metadata_path)
    logger.info(f"Loaded metadata for {len(original_metadata)} clips")

    # Create output directories
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    if save_trimmed_audio:
        trimmed_audio_dir = output_path / "trimmed_audio"
        trimmed_audio_dir.mkdir(exist_ok=True)

    # Columns describing the detection settings; identical for every row and
    # recorded only for the strategy actually in use.
    uses_noise_floor = threshold_strategy == 'noise_floor'
    settings_columns = {
        'threshold_strategy': threshold_strategy,
        'threshold_db_used': threshold_db if threshold_strategy == 'peak_relative' else None,
        'noise_floor_percentile': noise_floor_percentile if uses_noise_floor else None,
        'noise_floor_delta_db': noise_floor_delta_db if uses_noise_floor else None,
        'min_sound_duration_ms_used': min_sound_duration_ms,
    }

    source_dir = Path(audio_dir)
    results = []

    for _, row in tqdm(original_metadata.iterrows(), total=len(original_metadata),
                       desc="Processing ESC-50 clips"):
        filename = row['filename']
        # Columns copied straight from the metadata; shared by success and
        # failure rows so both keep the same leading column order.
        identity_columns = {
            'filename': filename,
            'category': row['category'],
            'fold': row['fold'],
            'target': row['target'],
            'esc10': row['esc10'],
        }

        try:
            # Load audio
            audio = AudioSegment.from_file(str(source_dir / filename), format="wav")
            raw_duration_s = len(audio) / 1000.0

            # Detect sound regions (using adaptive threshold)
            regions = get_sound_regions(
                audio,
                threshold_db=threshold_db,
                min_sound_duration_ms=min_sound_duration_ms,
                threshold_strategy=threshold_strategy,
                noise_floor_percentile=noise_floor_percentile,
                noise_floor_delta_db=noise_floor_delta_db
            )

            # Trim edges only (leftmost and rightmost silence)
            trimmed_audio = extract_sound_with_edges_trimmed(audio, regions)
            final_duration_s = len(trimmed_audio) / 1000.0

            # Peak and RMS level of the trimmed audio, in dB
            samples = get_amplitude_array(trimmed_audio)
            peak_amplitude = np.max(np.abs(samples))
            peak_amplitude_db = 20 * np.log10(peak_amplitude + 1e-10)
            rms = np.sqrt(np.mean(samples ** 2))
            avg_rms_db = 20 * np.log10(rms + 1e-10)

            # Effective duration = total length of the detected regions;
            # falls back to the trimmed length when nothing was detected.
            if regions:
                effective_duration_s = sum(end - start for start, end in regions) / 1000.0
            else:
                effective_duration_s = final_duration_s

            # Save trimmed audio under the original file name
            trimmed_filename = None
            if save_trimmed_audio:
                trimmed_filename = filename
                trimmed_audio.export(str(trimmed_audio_dir / trimmed_filename), format="wav")

            results.append({
                **identity_columns,
                'raw_duration_s': round(raw_duration_s, 4),
                'final_duration_s': round(final_duration_s, 4),
                'effective_duration_s': round(effective_duration_s, 4),
                'num_sound_regions': len(regions),
                'peak_amplitude_db': round(peak_amplitude_db, 2),
                'avg_rms_db': round(avg_rms_db, 2),
                'trimmed_filename': trimmed_filename,
                **settings_columns,
            })

        except Exception as e:
            # Best effort: record the failure and carry on with the next clip.
            logger.error(f"Error processing {filename}: {e}")
            results.append({
                **identity_columns,
                'raw_duration_s': None,
                'final_duration_s': None,
                'effective_duration_s': None,
                'num_sound_regions': 0,
                'peak_amplitude_db': None,
                'avg_rms_db': None,
                'trimmed_filename': None,
                **settings_columns,
                'error': str(e),
            })

    # Create DataFrame
    results_df = pd.DataFrame(results)

    # Save CSV
    csv_path = output_path / "effective_durations.csv"
    results_df.to_csv(csv_path, index=False)
    logger.info(f"Saved effective durations to {csv_path}")

    # Print summary statistics
    print_summary_statistics(results_df)

    return results_df
|
| 516 |
+
|
| 517 |
+
|
| 518 |
+
def print_summary_statistics(df: pd.DataFrame):
    """Print a human-readable summary of the processed dataset to stdout.

    Rows whose 'effective_duration_s' is missing are counted as errors and
    left out of every statistic.

    Args:
        df: Results table with the columns 'category', 'raw_duration_s',
            'final_duration_s' and 'effective_duration_s'
    """
    rule = "=" * 60
    print("\n" + rule)
    print("ESC-50 Preprocessing Summary")
    print(rule)

    # Filter out errors
    valid_df = df[df['effective_duration_s'].notna()]
    total_count = len(df)
    valid_count = len(valid_df)

    print(f"\nTotal clips processed: {total_count}")
    print(f"Successfully processed: {valid_count}")
    print(f"Errors: {total_count - valid_count}")

    # One mean/std/min/max section per duration column, in this order.
    sections = (
        ("\nRaw duration statistics:", 'raw_duration_s'),
        ("\nFinal duration statistics (edges trimmed, internal structure preserved):", 'final_duration_s'),
        ("\nEffective duration statistics (sum of sound regions only):", 'effective_duration_s'),
    )
    for heading, column in sections:
        values = valid_df[column]
        print(heading)
        print(f" Mean: {values.mean():.3f}s")
        print(f" Std: {values.std():.3f}s")
        print(f" Min: {values.min():.3f}s")
        print(f" Max: {values.max():.3f}s")

    raw_mean = valid_df['raw_duration_s'].mean()
    final_mean = valid_df['final_duration_s'].mean()
    effective_mean = valid_df['effective_duration_s'].mean()

    # Compare effective vs final
    print("\nComparison (final includes internal silences):")
    print(f" Avg effective: {effective_mean:.3f}s")
    print(f" Avg final: {final_mean:.3f}s")
    print(f" Difference: {final_mean - effective_mean:.3f}s (internal silences)")

    # Share of the raw duration removed by edge trimming
    reduction = (1 - final_mean / raw_mean) * 100
    print(f"\nAverage edge trimming reduction: {reduction:.1f}%")

    # Per-category statistics, sorted from longest to shortest mean
    print("\nEffective duration by category (top 10 longest):")
    category_stats = (
        valid_df.groupby('category')['effective_duration_s']
        .agg(['mean', 'std', 'min', 'max'])
        .sort_values('mean', ascending=False)
    )
    print(category_stats.head(10).to_string())

    print("\nEffective duration by category (top 10 shortest):")
    print(category_stats.tail(10).to_string())

    print("\n" + rule)
|
| 569 |
+
|
| 570 |
+
|
| 571 |
+
def load_config(config_path: str) -> dict:
    """Load configuration from YAML file.

    Args:
        config_path: Path to the YAML configuration file

    Returns:
        The parsed document as returned by yaml.safe_load
    """
    import yaml
    # Read as UTF-8 explicitly: the platform default codec can mis-decode
    # non-ASCII characters in the config on non-UTF-8 locales.
    with open(config_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
|
| 576 |
+
|
| 577 |
+
|
| 578 |
+
def main():
    """Command-line entry point: preprocess ESC-50 for the duration task.

    Each tunable is resolved in the same order: an explicit command-line
    value wins, then the ``tasks.duration`` section of the YAML config, then
    a built-in default. The resolved settings are logged and passed to
    ``process_esc50_dataset``.

    Returns:
        The value returned by ``process_esc50_dataset``.
    """
    # (flags, add_argument keyword arguments), in --help display order.
    option_table = [
        (('--config', '-c'), dict(
            type=str,
            default='config.yaml',
            help='Path to configuration file',
        )),
        (('--threshold-db',), dict(
            type=float,
            default=None,
            help='dB threshold below peak for silence detection (default: -40)',
        )),
        (('--min-sound-ms',), dict(
            type=int,
            default=None,
            help='Minimum sound duration in ms to keep (default: 50)',
        )),
        (('--output-dir',), dict(
            type=str,
            default=None,
            help='Output directory (default: from config or ESC-50_preprocessed)',
        )),
        (('--no-trimmed-audio',), dict(
            action='store_true',
            help='Do not save trimmed audio files (only save CSV)',
        )),
        (('--threshold-strategy',), dict(
            type=str,
            choices=['peak_relative', 'noise_floor'],
            default=None,
            help='Threshold strategy: peak_relative (old) or noise_floor (adaptive, recommended)',
        )),
        (('--noise-floor-percentile',), dict(
            type=float,
            default=None,
            help='Percentile for noise floor estimation (default: 10)',
        )),
        (('--noise-floor-delta-db',), dict(
            type=float,
            default=None,
            help='dB above noise floor to set threshold (default: 15)',
        )),
    ]

    cli = argparse.ArgumentParser(
        description="Preprocess ESC-50 dataset for duration task"
    )
    for flags, options in option_table:
        cli.add_argument(*flags, **options)
    args = cli.parse_args()

    config = load_config(args.config)

    # Locations of the raw ESC-50 audio and its metadata CSV.
    esc50_config = config.get('esc50', {})
    audio_dir = esc50_config.get('audio_path', '/home/debarpanb1/ESC-50_github/audio')
    metadata_path = esc50_config.get('metadata_path', '/home/debarpanb1/ESC-50_github/meta/esc50.csv')

    # Preprocessing parameters live under tasks.duration in the config.
    duration_config = config.get('tasks', {}).get('duration', {})

    def resolve(cli_value, config_key, fallback):
        """Return the CLI value if one was given, else the config entry or fallback."""
        if cli_value is not None:
            return cli_value
        return duration_config.get(config_key, fallback)

    threshold_db = resolve(args.threshold_db, 'amplitude_threshold_db', -40.0)
    min_sound_ms = resolve(args.min_sound_ms, 'min_sound_duration_ms', 50)
    output_dir = resolve(
        args.output_dir,
        'preprocessed_data_path',
        '/home/debarpanb1/TREA_2.0/ESC-50_preprocessed',
    )
    # noise_floor is the recommended strategy and therefore the default.
    threshold_strategy = resolve(args.threshold_strategy, 'threshold_strategy', 'noise_floor')
    noise_floor_percentile = resolve(args.noise_floor_percentile, 'noise_floor_percentile', 10.0)
    noise_floor_delta_db = resolve(args.noise_floor_delta_db, 'noise_floor_delta_db', 15.0)

    save_trimmed_audio = not args.no_trimmed_audio

    # Report the effective configuration before any work starts.
    banner = "=" * 60
    logger.info(banner)
    logger.info("ESC-50 Preprocessing Configuration")
    logger.info(banner)
    logger.info(f"Audio directory: {audio_dir}")
    logger.info(f"Metadata path: {metadata_path}")
    logger.info(f"Output directory: {output_dir}")
    logger.info(f"Threshold strategy: {threshold_strategy}")
    if threshold_strategy != 'peak_relative':
        logger.info(f" Noise floor percentile: {noise_floor_percentile}")
        logger.info(f" Noise floor delta dB: {noise_floor_delta_db}")
    else:
        logger.info(f" Peak-relative threshold dB: {threshold_db}")
    logger.info(f"Min sound duration (ms): {min_sound_ms}")
    logger.info("Adaptive edge trimming: only if silence >= 100ms, keep 10% buffer")
    logger.info(f"Save trimmed audio: {save_trimmed_audio}")
    logger.info(banner)

    results_df = process_esc50_dataset(
        audio_dir=audio_dir,
        metadata_path=metadata_path,
        output_dir=output_dir,
        threshold_db=threshold_db,
        min_sound_duration_ms=min_sound_ms,
        save_trimmed_audio=save_trimmed_audio,
        threshold_strategy=threshold_strategy,
        noise_floor_percentile=noise_floor_percentile,
        noise_floor_delta_db=noise_floor_delta_db,
    )

    logger.info("\nPreprocessing complete!")
    logger.info(f"Results saved to: {output_dir}")

    return results_df


# Run the preprocessing only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pyyaml
|
| 2 |
+
pandas
|
| 3 |
+
pydub
|
| 4 |
+
numpy
|
| 5 |
+
pyloudnorm
|
| 6 |
+
|
run_llm_answers_all.sh
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Run llm_answer_generator.py across dataset splits and tasks.
#
# For every split/task pair this processes the open_text CSV at
#   <script dir>/dataset_v2/<split>/<task>/<task>_open_text.csv
# Missing files are reported and skipped. Only open_text CSVs are handled here.
# Tasks: count, duration, order, volume.
#
# GPU selection: defaults to device 7. Set CUDA_VISIBLE_DEVICES before calling
# this script to use a different device (e.g. one assigned by a job scheduler).

set -euo pipefail

# Respect a device chosen by the caller; fall back to the historical default.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-7}"

# Resolve the script's directory once and derive everything else from it.
BASE_DIR="$(dirname "$0")"
PY_SCRIPT="${BASE_DIR}/llm_answer_generator.py"

DATA_SPLITS=(train validation test_large test_ood)
TASKS=(count duration order volume)

echo "Running LLM answer generation script across splits: ${DATA_SPLITS[*]} and tasks: ${TASKS[*]}"

for split in "${DATA_SPLITS[@]}"; do
  for task in "${TASKS[@]}"; do
    # open_text file
    ot_csv="${BASE_DIR}/dataset_v2/${split}/${task}/${task}_open_text.csv"
    if [ -f "${ot_csv}" ]; then
      echo "[OPEN_TEXT] Processing ${ot_csv}"
      python "${PY_SCRIPT}" --input "${ot_csv}" --mode open_text --task "${task}"
    else
      echo "[OPEN_TEXT] Not found: ${ot_csv}"
    fi
  done
done

echo "All tasks processed."
|
run_pipeline.sh
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
#
# Temporal Reasoning Audio Dataset Generation Pipeline
#
# Orchestrates the entire dataset creation process for all tasks.

# Abort as soon as any command fails.
set -o errexit

# Defaults; the command-line options parsed further down may replace them.
CONFIG_FILE='config.yaml'
OUTPUT_DIR=''
TASKS=''
PYTHON_CMD='python'

# ANSI colour codes, kept as literal backslash escapes; the print_* helpers
# expand them when a message is written.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # reset / no colour
|
| 23 |
+
|
| 24 |
+
# Function to print colored messages
|
| 25 |
+
# Print "$1" to stdout behind a blue [INFO] tag.
print_info() {
    # %b expands backslash escapes in the argument the same way `echo -e` does.
    printf '%b\n' "${BLUE}[INFO]${NC} $1"
}
|
| 28 |
+
|
| 29 |
+
# Print "$1" to stdout behind a green [SUCCESS] tag.
print_success() {
    # %b expands backslash escapes in the argument the same way `echo -e` does.
    printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}
|
| 32 |
+
|
| 33 |
+
# Print "$1" to stdout behind a yellow [WARNING] tag.
print_warning() {
    # %b expands backslash escapes in the argument the same way `echo -e` does.
    printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}
|
| 36 |
+
|
| 37 |
+
# Print "$1" to stdout behind a red [ERROR] tag.
print_error() {
    # %b expands backslash escapes in the argument the same way `echo -e` does.
    printf '%b\n' "${RED}[ERROR]${NC} $1"
}
|
| 40 |
+
|
| 41 |
+
# Function to print usage
|
| 42 |
+
# Write the help text to stdout. The here-document delimiter is unquoted on
# purpose so that $0 expands to the name the script was invoked with.
usage() {
    cat <<USAGE_TEXT
Usage: $0 [OPTIONS]

Temporal Reasoning Audio Dataset Generation Pipeline

OPTIONS:
-c, --config FILE Configuration file (default: config.yaml)
-o, --output DIR Output directory (overrides config)
-t, --tasks TASKS Specific tasks to run: count,duration,order,volume
(default: all enabled tasks)
-p, --python CMD Python command to use (default: python)
-h, --help Display this help message

EXAMPLES:
# Run all tasks with default config
$0

# Run with custom config
$0 --config my_config.yaml

# Run specific tasks only
$0 --tasks count,duration

# Use custom output directory
$0 --output /path/to/output

# Combine options
$0 --config custom.yaml --tasks count,order --output ./my_dataset

USAGE_TEXT
}
|
| 74 |
+
|
| 75 |
+
# Parse command line arguments
|
| 76 |
+
# Parse command line arguments.
while [[ $# -gt 0 ]]; do
    case "$1" in
        -c|--config|-o|--output|-t|--tasks|-p|--python)
            # Every one of these options takes a value. Without this check a
            # trailing option would make `shift 2` fail and, under `set -e`,
            # end the script with no explanation.
            if [[ $# -lt 2 ]]; then
                print_error "Option $1 requires an argument"
                usage
                exit 1
            fi
            case "$1" in
                -c|--config) CONFIG_FILE="$2" ;;
                -o|--output) OUTPUT_DIR="$2" ;;
                -t|--tasks)  TASKS="$2" ;;
                -p|--python) PYTHON_CMD="$2" ;;
            esac
            shift 2
            ;;
        -h|--help)
            usage
            exit 0
            ;;
        *)
            print_error "Unknown option: $1"
            usage
            exit 1
            ;;
    esac
done

# Get script directory
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

# Relative config paths are resolved against the script directory (as before);
# absolute paths are used unchanged instead of being prefixed with it.
case "$CONFIG_FILE" in
    /*) CONFIG_PATH="$CONFIG_FILE" ;;
    *)  CONFIG_PATH="$SCRIPT_DIR/$CONFIG_FILE" ;;
esac

# Check if config file exists
if [ ! -f "$CONFIG_PATH" ]; then
    print_error "Config file not found: $CONFIG_FILE"
    exit 1
fi

# Print header
RULE="================================================================================"
echo ""
echo "$RULE"
echo " TEMPORAL REASONING AUDIO DATASET GENERATION PIPELINE"
echo "$RULE"
echo ""
print_info "Configuration: $CONFIG_FILE"
print_info "Python command: $PYTHON_CMD"
if [ -n "$OUTPUT_DIR" ]; then
    print_info "Output directory: $OUTPUT_DIR"
fi
if [ -n "$TASKS" ]; then
    print_info "Tasks to run: $TASKS"
fi
echo ""

# The Python command may consist of several words (e.g. "python3 -u"), so
# split it on whitespace into an array instead of relying on unquoted
# expansion, which would also glob-expand its words.
read -r -a PYTHON_CMD_ARR <<< "$PYTHON_CMD"

# Check Python dependencies.
# The command is tested directly by `if`: with `set -e` active, a separate
# `[ $? -ne 0 ]` check afterwards would never run because the script would
# already have exited on the failing command.
print_info "Checking Python dependencies..."
if ! "${PYTHON_CMD_ARR[@]}" -c "import yaml, pandas, pydub" 2>/dev/null; then
    print_error "Missing required Python packages. Please install:"
    echo " pip install pyyaml pandas pydub"
    exit 1
fi
print_success "Dependencies OK"
echo ""

# Build the argument list as an array so paths containing spaces stay intact.
PYTHON_ARGS=("$SCRIPT_DIR/main.py" --config "$CONFIG_PATH")
if [ -n "$OUTPUT_DIR" ]; then
    PYTHON_ARGS+=(--output "$OUTPUT_DIR")
fi
if [ -n "$TASKS" ]; then
    # Convert comma-separated to separate words for Python argparse.
    read -r -a TASK_LIST <<< "${TASKS//,/ }"
    PYTHON_ARGS+=(--tasks "${TASK_LIST[@]}")
fi

# Run the pipeline
print_info "Starting pipeline..."
echo ""

# Tested directly by `if` so the failure banner is reachable under `set -e`.
if "${PYTHON_CMD_ARR[@]}" "${PYTHON_ARGS[@]}"; then
    echo ""
    echo "$RULE"
    print_success "PIPELINE COMPLETED SUCCESSFULLY!"
    echo "$RULE"
    echo ""
else
    echo ""
    echo "$RULE"
    print_error "PIPELINE FAILED!"
    echo "$RULE"
    echo ""
    exit 1
fi
|
synthetic_silences/silent_1.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ed8ddf138c2c59409bb4f1dbbf3fc910b486752b0c389dbb5dac6a4e68b8cbe5
|
| 3 |
+
size 263052
|
synthetic_silences/silent_10.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:35fab767d2262eb552485542c6e593a5d84b7080862c577b23c11385176c7767
|
| 3 |
+
size 274840
|
synthetic_silences/silent_11.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b01397619b480a22261daa7b018b59b5fd1baf1e3d4ed81161908def25112f17
|
| 3 |
+
size 324418
|
synthetic_silences/silent_12.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2da9f4814fd0c6d50aa68696079c8d0ee880ed37d583e88a20481fd88c54e612
|
| 3 |
+
size 310108
|
synthetic_silences/silent_13.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5999933e975cd5846ac152bf888a954ea9243fa2218429998d96ceffac54a7e0
|
| 3 |
+
size 121474
|
synthetic_silences/silent_14.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:42c8b935bd521534635cc4fea040023dbf420084b51d1e3529953d5d1593df48
|
| 3 |
+
size 209182
|
synthetic_silences/silent_15.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:00529829944fd650a368d6fe65e25a7f3d25d8d4ba932712b35dfa5608380c3e
|
| 3 |
+
size 160682
|
synthetic_silences/silent_16.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:469eb34930878ba69a3994da5c2160314ce0c8bf0157d83f4ad349052a0c197b
|
| 3 |
+
size 112534
|
synthetic_silences/silent_17.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5d788262618a55e51d12b0c2220ced172c0edf9072569ab010d48adc01607215
|
| 3 |
+
size 165986
|
synthetic_silences/silent_18.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:83b6ef068680eacd83ac3d0b2f282fb37e2f4f018b03e89ab9a129aeac27a054
|
| 3 |
+
size 257330
|
synthetic_silences/silent_19.wav
ADDED
|
Binary file (96.9 kB). View file
|
|
|
synthetic_silences/silent_2.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:01bbeb6e0c14200b30be0eb57484450ba5807954333fede2e4c59d32a7042eaf
|
| 3 |
+
size 310850
|
synthetic_silences/silent_20.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cda1e6a66b8cca7fc408f90cb6b8e8c13294fc33e8735a23dd72f1d36f9a991b
|
| 3 |
+
size 140232
|
synthetic_silences/silent_3.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d49fadd497b9af43be5afbb08070a6317000f15edc8924bf3c11b3fcbb140616
|
| 3 |
+
size 227846
|
synthetic_silences/silent_4.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:148c743ea43d3528a53395f579d4d337512de9d1fb3c5d5b66e55f3a5e9c4d0c
|
| 3 |
+
size 337068
|
synthetic_silences/silent_5.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:51976ce15c0272f14125acaa5529a88d6f085ce153ef64bdc662586e97cb5678
|
| 3 |
+
size 205426
|
synthetic_silences/silent_6.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dda249ab269984ae15d0a78b582455c053b1cddafb78c792cafbcbf3f682a087
|
| 3 |
+
size 329056
|
synthetic_silences/silent_7.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:27156909b1191624cff0b0478477f8c40e47581bbb0be24a84e9113bf88f36a1
|
| 3 |
+
size 146876
|
synthetic_silences/silent_8.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:815a88cc01def086ca4dc23c41359eea297ec39c179b114e6e608d27bd2d9a39
|
| 3 |
+
size 216452
|
synthetic_silences/silent_9.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f88a22998b27a0e18e1801aa63ef4b83315c243762478c9cf149db4338ebafdb
|
| 3 |
+
size 307884
|
tasks/__pycache__/task_count.cpython-312.pyc
ADDED
|
Binary file (19.7 kB). View file
|
|
|
tasks/__pycache__/task_duration.cpython-312.pyc
ADDED
|
Binary file (30.9 kB). View file
|
|
|
tasks/__pycache__/task_order.cpython-312.pyc
ADDED
|
Binary file (23.7 kB). View file
|
|
|
tasks/__pycache__/task_volume.cpython-312.pyc
ADDED
|
Binary file (27.7 kB). View file
|
|
|
tasks/task_count.py
ADDED
|
@@ -0,0 +1,472 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Task 1: Count - Generate counting questions
|
| 3 |
+
|
| 4 |
+
This task joins multiple audio sources and asks questions about counting
|
| 5 |
+
the number of unique sound sources in the audio.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import csv
|
| 9 |
+
import random
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Dict, List
|
| 12 |
+
|
| 13 |
+
import sys
|
| 14 |
+
sys.path.append(str(Path(__file__).parent.parent))
|
| 15 |
+
|
| 16 |
+
from utils import (
|
| 17 |
+
AudioProcessor, ESC50Dataset, QuestionGenerator, LLMQuestionGenerator,
|
| 18 |
+
setup_logger, set_random_seed, generate_sample_durations_for_task,
|
| 19 |
+
generate_single_clip_duration, build_count_task_audio,
|
| 20 |
+
get_max_clip_num_to_be_joined
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class CountTaskGenerator:
|
| 25 |
+
"""Generator for counting task dataset."""
|
| 26 |
+
|
| 27 |
+
def __init__(self, config: Dict, logger):
    """
    Build a count task generator from the pipeline configuration.

    Creates the dataset, audio processor, and question generators, copies
    the duration and ordering settings onto the instance, and creates the
    ``count`` and ``count/audios`` output directories.

    Args:
        config: Configuration dictionary
        logger: Logger instance
    """
    self.config = config
    self.logger = logger
    self.task_config = config['tasks']['count']

    # --- Collaborating components ---
    esc50_cfg = config['esc50']
    self.dataset = ESC50Dataset(
        esc50_cfg['metadata_path'],
        esc50_cfg['audio_path'],
        config  # full config, used for class subset loading
    )

    audio_cfg = config['audio']
    self.audio_processor = AudioProcessor(
        crossfade_duration=audio_cfg['crossfade_duration'],
        silence_duration=audio_cfg['silence_duration'],
        with_silence=audio_cfg['with_silence'],
        normalize=audio_cfg['normalize'],
        normalize_target_dBFS=audio_cfg['normalize_target_dBFS'],
        synthetic_silence_path=config['synthetic_silence']['path']
    )

    mcq_cfg = config['mcq']
    self.question_generator = QuestionGenerator(
        num_options=mcq_cfg['num_options'],
        option_labels=mcq_cfg['option_labels'],
        distractor_strategy=mcq_cfg['distractor_strategy']
    )

    # --- Question wording: LLM or templates ---
    self.llm_enabled = config.get('llm', {}).get('enabled', False)
    self.llm_generator = LLMQuestionGenerator(
        enabled=self.llm_enabled,
        template_questions=self.task_config
    )
    question_mode_message = (
        "LLM question generation enabled (local Llama 3.1 8B)"
        if self.llm_enabled
        else "Using template-based question generation"
    )
    logger.info(question_mode_message)

    # --- Duration settings ---
    self.min_clip_duration = audio_cfg['min_clip_duration']
    self.max_clip_duration = audio_cfg['max_clip_duration']
    self.source_clip_duration = audio_cfg.get('source_clip_duration', 5.0)
    self.min_silence_ms = audio_cfg.get('min_silence_duration', 100)
    self.max_extra_silence_per_gap_ms = audio_cfg.get('max_extra_silence_per_gap', 500)
    # Short crossfade between repetitions of the same source (consecutive mode)
    self.crossfade_within_source_ms = audio_cfg.get('crossfade_within_source', 50)
    self.task_duration_hours = self.task_config['task_duration_size']

    # --- Clip ordering ---
    # "random":      clips are shuffled (A B A C B A C) - tests sound recognition
    # "consecutive": clips of one source are grouped (AAA BBB CCC) - easier
    self.ordering_mode = self.task_config.get('ordering_mode', 'random')
    logger.info(f"Count task ordering mode: {self.ordering_mode}")

    # --- Output locations ---
    self.output_base = Path(config['output']['base_path']) / 'count'
    self.output_base.mkdir(parents=True, exist_ok=True)
    self.audio_output = self.output_base / 'audios'
    self.audio_output.mkdir(parents=True, exist_ok=True)
|
| 91 |
+
|
| 92 |
+
def create_sampling_list(self, parent_list: List, n_sampling: int) -> List:
    """
    Draw elements from a list at random, with replacement.

    Args:
        parent_list: List to draw from
        n_sampling: How many elements to draw

    Returns:
        List of ``n_sampling`` drawn elements (repeats are possible)
    """
    drawn = []
    for _ in range(n_sampling):
        # One independent random.choice call per draw.
        drawn.append(random.choice(parent_list))
    return drawn
|
| 104 |
+
|
| 105 |
+
def generate_sample(self, sample_id: int, target_unique_count: int = None, target_duration_seconds: float = None) -> Dict:
    """
    Generate one count task sample: build its audio, write it, describe it.

    Steps:
        1. Take the pre-generated target duration, or draw one if none given.
        2. Work out how many source clips fit into that duration.
        3. Settle on the number of unique classes (never more than fit, than
           ``max_clips_per_sample``, or than the available categories).
        4. Pick that many least-used categories and one file from each.
        5. Assemble the final audio with ``build_count_task_audio`` using
           the configured ordering mode ("random" or "consecutive").
        6. Export the audio as ``<sample_id>.wav`` and create the MCQ and
           open-text questions.

    Args:
        sample_id: Sample ID number
        target_unique_count: Desired number of unique sounds (for a balanced
            distribution); clamped to what this sample can hold
        target_duration_seconds: Pre-generated target duration (from
            generate_sample_durations_for_task)

    Returns:
        Dictionary with sample metadata

    Raises:
        ValueError: If no unique sound can be placed in the sample.
    """
    # Step 1: target duration (drawn here only for backward compatibility).
    clip_duration_seconds = target_duration_seconds
    if clip_duration_seconds is None:
        clip_duration_seconds = generate_single_clip_duration(
            self.min_clip_duration,
            self.max_clip_duration
        )

    # Step 2: how many clips fit; always allow at least one.
    max_clips, _ = get_max_clip_num_to_be_joined(
        clip_duration_seconds,
        self.source_clip_duration,
        self.min_silence_ms
    )
    max_clips = max(1, max_clips)

    max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10)

    # Step 3: upper bound on unique sounds - limited by what fits, by the
    # configured maximum, and by the number of categories available.
    max_unique_for_sample = min(max_clips, max_clips_per_sample, len(self.dataset.CATEGORIES))

    if max_unique_for_sample < 1:
        raise ValueError(
            f"Sample {sample_id}: Cannot generate sample - max_unique_for_sample={max_unique_for_sample}. "
            f"max_clips={max_clips}, max_clips_per_sample={max_clips_per_sample}, "
            f"available_categories={len(self.dataset.CATEGORIES)}, duration={clip_duration_seconds:.1f}s. "
            f"Increase min_clip_duration or reduce max_clips_per_sample."
        )

    if target_unique_count is None:
        # No target requested: any value in the valid range will do.
        n_unique_audios = random.randint(1, max_unique_for_sample)
    else:
        # A short sample cannot hold every possible answer, so clamp down.
        n_unique_audios = min(target_unique_count, max_unique_for_sample)
        if n_unique_audios != target_unique_count:
            self.logger.debug(
                f"Sample {sample_id}: Clamped target from {target_unique_count} to {n_unique_audios} "
                f"(duration={clip_duration_seconds:.1f}s can only fit {max_clips} clips)"
            )

    self.logger.debug(
        f"Sample {sample_id}: target={clip_duration_seconds:.1f}s, max_clips={max_clips}, "
        f"n_unique_audios={n_unique_audios}"
    )

    # Step 4: favour the least-used categories to keep class usage balanced,
    # and record that each chosen category has been used once more.
    selected_categories = self.dataset.get_least_used_categories(n_unique_audios)
    for chosen in selected_categories:
        self.dataset.category_usage_counts[chosen] += 1

    # One (filename, filepath) pair per chosen category, in category order.
    sampled_pairs = [
        self.dataset.sample_file_from_category(category)
        for category in selected_categories
    ]
    source_files = [pair[0] for pair in sampled_pairs]
    source_paths = [pair[1] for pair in sampled_pairs]
    source_categories = list(selected_categories)

    # All files are sampled before any audio is loaded.
    source_audios = [self.audio_processor.load_audio(path) for path in source_paths]

    # Step 5: assemble the final audio.
    final_audio, clip_sequence, build_metadata = build_count_task_audio(
        source_audios,
        source_categories,
        clip_duration_seconds,
        ordering_mode=self.ordering_mode,
        source_clip_duration_seconds=self.source_clip_duration,
        min_silence_ms=self.min_silence_ms,
        max_extra_silence_per_gap_ms=self.max_extra_silence_per_gap_ms,
        crossfade_within_source_ms=self.crossfade_within_source_ms
    )

    # Step 6: write the audio to disk.
    output_audio_path = self.audio_output / f"{sample_id}.wav"
    final_audio.export(str(output_audio_path), format="wav")

    # Question wording: from the LLM when enabled, otherwise from templates.
    if self.llm_enabled and self.llm_generator:
        llm_questions = self.llm_generator.generate_count_questions(
            correct_count=n_unique_audios,
            categories_present=list(set(clip_sequence))
        )
        mcq_question_text = llm_questions.get('mcq_question')
        open_text_question_text = llm_questions.get('open_text_question')
    else:
        mcq_question_text = random.choice(self.task_config['mcq_questions'])
        open_text_question_text = random.choice(self.task_config['open_text_questions'])

    # Multiple-choice question with options.
    mcq_data = self.question_generator.generate_count_mcq(
        mcq_question_text,
        n_unique_audios,
        self.dataset.CATEGORIES
    )

    # Open-text question and its answer.
    open_text_data = self.question_generator.generate_count_open_text(
        open_text_question_text,
        n_unique_audios
    )

    # len(final_audio) is divided by 1000 to report seconds.
    actual_duration_seconds = len(final_audio) / 1000.0
    total_clips = build_metadata['total_clips']

    metadata = {
        'id': sample_id,
        'audio_path': str(output_audio_path.relative_to(self.output_base.parent)),
        'n_unique_sounds': n_unique_audios,
        'total_clips': total_clips,
        'repetitions_per_source': build_metadata['repetitions_per_source'],
        'ordering_mode': self.ordering_mode,
        'source_files': source_files,
        'source_categories': source_categories,
        'clip_sequence': clip_sequence,
        'unique_categories': sorted(set(source_categories)),
        'target_duration_seconds': clip_duration_seconds,
        'actual_duration_seconds': actual_duration_seconds,
        'mcq_question': mcq_data['question'],
        'mcq_options': mcq_data['options'],
        'mcq_correct_answer': mcq_data['correct_answer'],
        'open_text_question': open_text_data['question'],
        'open_text_answer': open_text_data['correct_answer'],
        'llm_generated': self.llm_enabled
    }

    self.logger.info(
        f"Generated count sample {sample_id}: {n_unique_audios} unique sounds, "
        f"{total_clips} clips, {actual_duration_seconds:.1f}s"
    )

    return metadata
|
| 275 |
+
|
| 276 |
+
def generate_dataset(self) -> tuple:
    """
    Generate the complete count task dataset.

    Steps:
        1. Ask generate_sample_durations_for_task for one duration per sample.
        2. Compute, per sample, how many unique sounds it may contain: the
           clip count reported by get_max_clip_num_to_be_joined, capped by
           the configured 'max_clips_per_sample' and the number of categories.
        3. Build a pool holding (almost) the same number of every answer in
           1..max_clips_per_sample and hand the largest answers to the
           highest-capacity samples, clamping each to its sample's capacity.
        4. Generate every sample, then write the MCQ, open-text and metadata
           CSV files under self.output_base.

    Returns:
        Tuple of (mcq_csv_path, open_text_csv_path)

    Raises:
        ValueError: If 'max_clips_per_sample' is smaller than 1, because no
            answer could then be assigned to any sample.
    """
    sample_durations = generate_sample_durations_for_task(
        self.task_duration_hours,
        self.min_clip_duration,
        self.max_clip_duration
    )
    num_samples = len(sample_durations)
    self.logger.info(f"Generating {num_samples} count task samples (target: {self.task_duration_hours}h, actual: {sum(sample_durations)/3600:.2f}h)...")

    max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10)
    if max_clips_per_sample < 1:
        # Without this check the balancing below divides by zero.
        raise ValueError(
            f"Count task: max_clips_per_sample must be >= 1, got {max_clips_per_sample}"
        )

    # Per-sample capacity: limited by duration, config max and available categories.
    n_categories = len(self.dataset.CATEGORIES)
    sample_max_clips = []
    for duration in sample_durations:
        max_clips, _ = get_max_clip_num_to_be_joined(
            duration,
            self.source_clip_duration,
            self.min_silence_ms
        )
        sample_max_clips.append(min(max_clips, max_clips_per_sample, n_categories))

    # Balanced pool of answers; the first `remainder` answers get one extra
    # sample so the pool size equals num_samples exactly.
    samples_per_answer, remainder = divmod(num_samples, max_clips_per_sample)
    assignment_pool = []
    for answer in range(1, max_clips_per_sample + 1):
        count = samples_per_answer + (1 if answer <= remainder else 0)
        assignment_pool.extend([answer] * count)

    # Largest answers first, paired with the highest-capacity samples first.
    # sorted() is stable, so samples with equal capacity keep index order.
    assignment_pool.sort(reverse=True)
    samples_by_capacity = sorted(
        range(num_samples),
        key=lambda i: sample_max_clips[i],
        reverse=True
    )

    balanced_assignments = [None] * num_samples
    for pooled_answer, sample_idx in zip(assignment_pool, samples_by_capacity):
        # Clamp to what this sample can actually hold.
        balanced_assignments[sample_idx] = min(pooled_answer, sample_max_clips[sample_idx])

    # Log the actual distribution after capacity clamping
    from collections import Counter
    distribution = Counter(balanced_assignments)
    self.logger.info(f"Balanced answer distribution (after capacity-aware assignment): {dict(sorted(distribution.items()))}")

    all_metadata = []
    for i in range(num_samples):
        metadata = self.generate_sample(
            i,
            target_unique_count=balanced_assignments[i],
            target_duration_seconds=sample_durations[i]
        )
        all_metadata.append(metadata)

    # Save MCQ CSV
    mcq_csv_path = self.output_base / 'count_mcq.csv'
    self._save_mcq_csv(all_metadata, mcq_csv_path)

    # Save open-text CSV
    open_text_csv_path = self.output_base / 'count_open_text.csv'
    self._save_open_text_csv(all_metadata, open_text_csv_path)

    # Save metadata CSV
    metadata_csv_path = self.output_base / 'count_metadata.csv'
    self._save_metadata_csv(all_metadata, metadata_csv_path)

    self.logger.info("Count task dataset generation complete!")
    self.logger.info(f" - MCQ CSV: {mcq_csv_path}")
    self.logger.info(f" - Open-text CSV: {open_text_csv_path}")
    self.logger.info(f" - Metadata CSV: {metadata_csv_path}")
    self.logger.info(f" - Audio files: {self.audio_output}")

    return mcq_csv_path, open_text_csv_path
|
| 368 |
+
|
| 369 |
+
def _save_mcq_csv(self, metadata_list: List[Dict], output_path: Path):
    """
    Save MCQ format CSV.

    Writes one header row followed by one row per sample. The file is
    written as UTF-8 so question text containing non-ASCII characters does
    not depend on the platform's locale encoding.

    Args:
        metadata_list: Sample metadata dicts; each must provide
            'mcq_question', 'id', 'audio_path', 'mcq_options' (with keys
            'A'..'D'), 'mcq_correct_answer', 'source_files' and
            'unique_categories'.
        output_path: Destination CSV file (overwritten if it exists).
    """
    header = [
        'question', 'id', 'audio_path',
        'optionA', 'optionB', 'optionC', 'optionD',
        'correct', 'source_wavs', 'source_categories'
    ]
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        # NOTE: the 'source_categories' column is filled from
        # meta['unique_categories']; list values are stored as their str().
        writer.writerows(
            [
                meta['mcq_question'],
                meta['id'],
                meta['audio_path'],
                meta['mcq_options']['A'],
                meta['mcq_options']['B'],
                meta['mcq_options']['C'],
                meta['mcq_options']['D'],
                meta['mcq_correct_answer'],
                str(meta['source_files']),
                str(meta['unique_categories'])
            ]
            for meta in metadata_list
        )
|
| 394 |
+
|
| 395 |
+
def _save_open_text_csv(self, metadata_list: List[Dict], output_path: Path):
    """
    Save open-text format CSV.

    Writes one header row followed by one row per sample. The file is
    written as UTF-8 so question/answer text containing non-ASCII
    characters does not depend on the platform's locale encoding.

    Args:
        metadata_list: Sample metadata dicts; each must provide
            'open_text_question', 'id', 'audio_path', 'open_text_answer',
            'source_files' and 'unique_categories'.
        output_path: Destination CSV file (overwritten if it exists).
    """
    header = [
        'question', 'id', 'audio_path', 'answer',
        'source_wavs', 'source_categories'
    ]
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        # NOTE: the 'source_categories' column is filled from
        # meta['unique_categories']; list values are stored as their str().
        writer.writerows(
            [
                meta['open_text_question'],
                meta['id'],
                meta['audio_path'],
                meta['open_text_answer'],
                str(meta['source_files']),
                str(meta['unique_categories'])
            ]
            for meta in metadata_list
        )
|
| 415 |
+
|
| 416 |
+
def _save_metadata_csv(self, metadata_list: List[Dict], output_path: Path):
    """
    Save detailed metadata CSV.

    Writes one header row followed by one row per sample, encoded as UTF-8
    so the output does not depend on the platform's locale encoding.

    Args:
        metadata_list: Sample metadata dicts. 'id', 'audio_path',
            'total_clips', 'n_unique_sounds', 'source_files',
            'source_categories' and 'unique_categories' are required;
            'ordering_mode', 'target_duration_seconds',
            'actual_duration_seconds' and 'llm_generated' fall back to
            'random', 0, 0 and False respectively when absent.
        output_path: Destination CSV file (overwritten if it exists).
    """
    header = [
        'id', 'audio_path', 'total_clips', 'n_unique_sounds',
        'source_files', 'source_categories', 'unique_categories',
        'ordering_mode', 'target_duration_s', 'actual_duration_s', 'llm_generated'
    ]
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        # List-valued fields are stored as their str() representation.
        writer.writerows(
            [
                meta['id'],
                meta['audio_path'],
                meta['total_clips'],
                meta['n_unique_sounds'],
                str(meta['source_files']),
                str(meta['source_categories']),
                str(meta['unique_categories']),
                meta.get('ordering_mode', 'random'),
                meta.get('target_duration_seconds', 0),
                meta.get('actual_duration_seconds', 0),
                meta.get('llm_generated', False)
            ]
            for meta in metadata_list
        )
|
| 442 |
+
|
| 443 |
+
|
| 444 |
+
def main(config_path: str = None):
    """
    Main entry point for count task generation.

    Loads the YAML configuration, seeds the random generators, sets up the
    'count_task' logger and runs CountTaskGenerator.generate_dataset().

    Args:
        config_path: Path to the YAML configuration file. When None, the
            'config.yaml' located one directory above this file's directory
            is used.
    """
    import yaml

    # Load configuration
    if config_path is None:
        config_path = Path(__file__).parent.parent / 'config.yaml'

    # Read the config as UTF-8 explicitly so non-ASCII text in it (e.g.
    # question templates) does not depend on the platform's locale encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # Set random seed
    set_random_seed(config['random_seed'])

    # Setup logger; the log file lives under the configured output base path.
    logger = setup_logger(
        'count_task',
        log_file=str(Path(config['output']['base_path']) / config['logging']['log_file']),
        level=config['logging']['level'],
        console_output=config['logging']['console_output']
    )

    # Generate dataset
    generator = CountTaskGenerator(config, logger)
    generator.generate_dataset()


if __name__ == '__main__':
    main()
|
tasks/task_duration.py
ADDED
|
@@ -0,0 +1,820 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Task 2: Duration - Generate duration comparison questions
|
| 3 |
+
|
| 4 |
+
This task creates audio samples where sources have different effective durations
|
| 5 |
+
and asks questions about which sound is heard for the longest or shortest time.
|
| 6 |
+
|
| 7 |
+
Key features:
|
| 8 |
+
- Uses amplitude-filtered (preprocessed) audio clips with known effective durations
|
| 9 |
+
- First calculates max clips from total duration, then distributes slots
|
| 10 |
+
- Strategically distributes repetitions to ensure clear longest/shortest answers
|
| 11 |
+
- Consecutive ordering within sources, random order between sources
|
| 12 |
+
- Gap multipliers ensure unambiguous answers (e.g., longest is 1.5x longer than next)
|
| 13 |
+
- NO category preference - random selection to avoid bias
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import csv
|
| 17 |
+
import random
|
| 18 |
+
import math
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
from typing import Dict, List, Tuple, Optional
|
| 21 |
+
from collections import Counter
|
| 22 |
+
|
| 23 |
+
import sys
|
| 24 |
+
sys.path.append(str(Path(__file__).parent.parent))
|
| 25 |
+
|
| 26 |
+
from utils import (
|
| 27 |
+
AudioProcessor, PreprocessedESC50Dataset, QuestionGenerator, LLMQuestionGenerator,
|
| 28 |
+
setup_logger, set_random_seed, calculate_num_samples_for_task,
|
| 29 |
+
generate_single_clip_duration, get_max_clip_num_to_be_joined,
|
| 30 |
+
build_duration_task_audio, distribute_remainder_as_silences,
|
| 31 |
+
generate_sample_durations_for_task
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class DurationTaskGenerator:
|
| 36 |
+
"""Generator for duration comparison task dataset using preprocessed ESC-50."""
|
| 37 |
+
|
| 38 |
+
def __init__(self, config: Dict, logger):
    """
    Set up the duration task generator from the parsed configuration.

    Builds the preprocessed ESC-50 dataset wrapper, the audio processor and
    the question generators, caches the duration-related settings and
    creates the output directories '<base_path>/duration' and
    '<base_path>/duration/audios'.

    Args:
        config: Configuration dictionary
        logger: Logger instance

    Raises:
        KeyError: If a required configuration key is missing.
    """
    self.config = config
    self.logger = logger
    duration_cfg = config['tasks']['duration']
    self.task_config = duration_cfg

    # Preprocessed dataset (carries per-clip effective durations)
    esc50_cfg = config['esc50']
    self.dataset = PreprocessedESC50Dataset(
        metadata_path=esc50_cfg['metadata_path'],
        audio_path=esc50_cfg['audio_path'],
        preprocessed_path=duration_cfg['preprocessed_data_path'],
        config=config  # Pass config for class subset loading
    )

    # Mean effective duration over the preprocessed clips; used later to
    # estimate how many clips fit into a sample.
    effective_durations = self.dataset.effective_df['effective_duration_s']
    self.avg_effective_duration = effective_durations.mean()
    self.logger.info(f"Average effective duration: {self.avg_effective_duration:.2f}s")

    # Audio processor
    audio_cfg = config['audio']
    self.audio_processor = AudioProcessor(
        crossfade_duration=audio_cfg['crossfade_duration'],
        silence_duration=audio_cfg['silence_duration'],
        with_silence=audio_cfg['with_silence'],
        normalize=audio_cfg['normalize'],
        normalize_target_dBFS=audio_cfg['normalize_target_dBFS'],
        synthetic_silence_path=config['synthetic_silence']['path']
    )

    # Template-based question generator
    mcq_cfg = config['mcq']
    self.question_generator = QuestionGenerator(
        num_options=mcq_cfg['num_options'],
        option_labels=mcq_cfg['option_labels'],
        distractor_strategy=mcq_cfg['distractor_strategy']
    )

    # LLM question generator (disabled unless config['llm']['enabled'] is set)
    self.llm_enabled = config.get('llm', {}).get('enabled', False)
    self.llm_generator = LLMQuestionGenerator(
        enabled=self.llm_enabled,
        template_questions=duration_cfg
    )

    # Duration settings from config
    self.min_clip_duration = audio_cfg['min_clip_duration']
    self.max_clip_duration = audio_cfg['max_clip_duration']
    self.min_silence_ms = audio_cfg.get('min_silence_duration', 100)
    self.max_extra_silence_per_gap_ms = audio_cfg.get('max_extra_silence_per_gap', 500)
    self.crossfade_within_source_ms = audio_cfg.get('crossfade_within_source', 50)
    self.task_duration_hours = duration_cfg['task_duration_size']

    # Duration task specific settings
    self.multiplier_longest = duration_cfg.get('multiplier_longest', 1.5)
    self.multiplier_shortest = duration_cfg.get('multiplier_shortest', 0.75)
    self.reject_if_gap_not_met = duration_cfg.get('reject_if_gap_not_met', True)
    self.sample_different_clips = duration_cfg.get('sample_different_clips_same_class', True)
    # Minimum effective duration per source (seconds) - clips shorter than
    # this are harder to distinguish
    self.min_effective_duration_per_source = duration_cfg.get('min_effective_duration_per_source', 1.0)

    # Output locations (created eagerly)
    self.output_base = Path(config['output']['base_path']) / 'duration'
    self.output_base.mkdir(parents=True, exist_ok=True)
    self.audio_output = self.output_base / 'audios'
    self.audio_output.mkdir(parents=True, exist_ok=True)

    # Statistics tracking
    self.rejection_count = 0
    self.success_count = 0
|
| 111 |
+
|
| 112 |
+
def _calculate_max_clips_and_sources(
    self,
    target_duration_s: float,
    question_type: str
) -> Tuple[int, int, float]:
    """
    Calculate max clips possible and choose n_sources from config that satisfies gap.

    Key principle:
    1. Calculate the range of source counts that leaves enough clips for
       the gap constraint
    2. Filter config values to those within that range AND not larger than
       the number of available categories
    3. Pick RANDOMLY from the remaining config values (ensures variety)

    For LONGEST:
    - Target is planned with at least 2 clips, backgrounds get 1 each
    - max_sources = max_clips - 2 + 1
    - min_sources = 2 (need at least 1 background)

    For SHORTEST:
    - Target gets 1 clip
    - Each background is planned with at least 2 clips
    - max_sources = 1 + (max_clips - 1) // 2
    - min_sources = 2

    The actual gap (multiplier_longest / multiplier_shortest) is verified
    later against real effective durations; this method only reserves
    enough clip slots.

    Args:
        target_duration_s: Target total audio duration
        question_type: "longest" or "shortest" (any other value is treated
            as "shortest")

    Returns:
        Tuple of (max_clips, n_sources, remainder_s). remainder_s is returned
        exactly as reported by get_max_clip_num_to_be_joined, i.e. before
        max_clips is raised to the minimum of 2.

    Raises:
        ValueError: If no configured num_unique_sources value is usable.
    """
    # Get max clips using average effective duration
    max_clips, remainder_s = get_max_clip_num_to_be_joined(
        target_duration_s,
        self.avg_effective_duration,
        self.min_silence_ms
    )

    # Ensure at least 2 clips
    max_clips = max(2, max_clips)

    # Config value for n_sources:
    # a single int N means "any of 1..N", a list is used as given.
    num_sources_config = self.task_config.get('num_unique_sources', [2, 3, 4, 5])
    if isinstance(num_sources_config, int):
        num_sources_config = list(range(1, num_sources_config + 1))

    # At least the target plus one background in both question types.
    min_valid_sources = 2

    if question_type == "longest":
        # Reserve 2 clips for the target; every other clip may be a
        # distinct background source.
        min_target_clips = 2
        max_valid_sources = max_clips - min_target_clips + 1
    else:  # shortest
        # Target takes 1 clip; each background is planned with 2 clips.
        min_clips_per_background = 2
        max_backgrounds = (max_clips - 1) // min_clips_per_background
        max_valid_sources = max_backgrounds + 1  # +1 for target

    # Cap by the number of categories BEFORE choosing, so a config value
    # larger than the category count cannot be drawn and fail at random.
    n_categories = len(self.dataset.CATEGORIES)
    max_allowed_sources = min(max_valid_sources, n_categories)

    valid_config_sources = [
        n for n in num_sources_config
        if min_valid_sources <= n <= max_allowed_sources
    ]

    if not valid_config_sources:
        raise ValueError(
            f"Duration task: No valid num_unique_sources for {question_type} question. "
            f"Config values: {num_sources_config}, Valid range: [{min_valid_sources}, {max_allowed_sources}]. "
            f"max_clips={max_clips}, duration={target_duration_s:.1f}s, categories={n_categories}. "
            f"Increase min_clip_duration or adjust num_unique_sources config."
        )

    # Pick RANDOMLY from valid config values (ensures variety!)
    n_sources = random.choice(valid_config_sources)

    self.logger.debug(
        f"Max clips: {max_clips}, Question: {question_type}, "
        f"Valid range: [{min_valid_sources}, {max_allowed_sources}], "
        f"Valid config: {valid_config_sources}, Selected: {n_sources}"
    )

    return max_clips, n_sources, remainder_s
|
| 217 |
+
|
| 218 |
+
def _calculate_slot_distribution(
    self,
    max_clips: int,
    n_sources: int,
    effective_durations: Dict[str, float],
    target_category: str,
    question_type: str
) -> Tuple[Dict[str, int], bool, Dict]:
    """
    Calculate how many clips each source gets.

    For LONGEST: target gets (max_clips - n_backgrounds) clips (at least 1),
    backgrounds get 1 each.
    For SHORTEST: target gets 1, backgrounds share (max_clips - 1) as evenly
    as possible. Every background always receives at least 1 clip; when
    there are fewer remaining clips than backgrounds, each background gets
    exactly 1.

    Args:
        max_clips: Maximum number of clips that fit
        n_sources: Number of unique sources (not used here; the sources are
            taken from the keys of effective_durations)
        effective_durations: Dict mapping category -> effective duration
        target_category: The category that should be longest/shortest
        question_type: "longest" or "shortest" (any other value is treated
            as "shortest")

    Returns:
        Tuple of (slot_distribution, gap_satisfied, metadata), where
        slot_distribution maps category -> clip count (target first),
        gap_satisfied tells whether the multiplier gap holds, and metadata
        holds the numbers used for that check.
    """
    background_categories = [c for c in effective_durations if c != target_category]
    n_backgrounds = len(background_categories)

    if question_type == "longest":
        # Backgrounds take one clip each; the target takes everything else.
        target_clips = max(1, max_clips - n_backgrounds)

        slot_distribution = {target_category: target_clips}
        for cat in background_categories:
            slot_distribution[cat] = 1

        # Verify gap: target_duration >= max_background x multiplier
        target_duration = target_clips * effective_durations[target_category]
        max_background = max(
            (effective_durations[c] for c in background_categories),
            default=0
        )
        required_target = max_background * self.multiplier_longest
        gap_satisfied = target_duration >= required_target

        metadata = {
            'target_clips': target_clips,
            'target_duration_s': target_duration,
            'max_background_s': max_background,
            'required_target_s': required_target,
            'multiplier': self.multiplier_longest
        }

    else:  # shortest
        remaining_clips = max_clips - 1

        if n_backgrounds:
            clips_per_background, extra_clips = divmod(remaining_clips, n_backgrounds)
            if clips_per_background < 1:
                # Not enough clips to go round: give each background its
                # mandatory single clip and do not add leftovers on top.
                clips_per_background, extra_clips = 1, 0
        else:
            # No backgrounds at all: nothing to distribute (avoids dividing
            # by zero; the gap check below then compares against infinity).
            clips_per_background, extra_clips = 0, 0

        slot_distribution = {target_category: 1}
        for i, cat in enumerate(background_categories):
            # The first `extra_clips` backgrounds absorb the leftover clips.
            slot_distribution[cat] = clips_per_background + (1 if i < extra_clips else 0)

        # Verify gap: target_duration <= min_background x multiplier
        target_duration = effective_durations[target_category]
        min_background = min(
            (slot_distribution[c] * effective_durations[c] for c in background_categories),
            default=float('inf')
        )
        required_max_target = min_background * self.multiplier_shortest

        # CRITICAL: Target must still be at least min_effective_duration_per_source,
        # otherwise very short clips would be used and be indistinguishable.
        target_too_short = target_duration < self.min_effective_duration_per_source
        gap_satisfied = (target_duration <= required_max_target) and (not target_too_short)

        metadata = {
            'target_clips': 1,
            'target_duration_s': target_duration,
            'min_background_s': min_background,
            'required_max_target_s': required_max_target,
            'multiplier': self.multiplier_shortest,
            'target_too_short': target_too_short
        }

    return slot_distribution, gap_satisfied, metadata
|
| 308 |
+
|
| 309 |
+
def _try_generate_sample(
    self,
    sample_id: int,
    question_type: str,
    max_retries: int = 5,
    target_duration_seconds: float = None
) -> Optional[Dict]:
    """
    Attempt to build one duration sample, retrying on failure.

    Each attempt calls _generate_single_sample. An attempt counts as failed
    when that call raises (a warning is logged) or returns None (silently
    retried).

    Args:
        sample_id: Sample ID
        question_type: "longest" or "shortest"
        max_retries: Maximum number of attempts
        target_duration_seconds: Pre-generated target duration

    Returns:
        The metadata dict of the first successful attempt, or None when
        every attempt failed.
    """
    for attempt_no in range(1, max_retries + 1):
        try:
            sample = self._generate_single_sample(
                sample_id,
                question_type,
                target_duration_seconds=target_duration_seconds
            )
        except Exception as exc:
            # Best-effort generation: report the failure and try again.
            self.logger.warning(f"Sample {sample_id} attempt {attempt_no} failed: {exc}")
            continue

        if sample is not None:
            return sample

    return None
|
| 337 |
+
|
| 338 |
+
def _generate_single_sample(
    self,
    sample_id: int,
    question_type: str,
    target_duration_seconds: float = None
) -> Optional[Dict]:
    """
    Generate a single duration task sample.

    Pipeline:
        1. Use the pre-generated target duration (or generate one if not provided)
        2. Calculate max_clips and n_sources for that duration and question type
        3. Select n_sources categories through the dataset's least-used selection
        4. Pick the target category uniformly at random from the selection
        5. Sample one file per category to obtain per-clip effective durations
        6. Calculate the slot distribution and check the gap constraint
        7. If the gap is not met, optionally retry with a different target clip
        8. Load the audio clips, build the final audio and export it as WAV
        9. Generate the MCQ / open-text questions and assemble the metadata

    Args:
        sample_id: Sample ID number (also used as the output WAV file name)
        question_type: "longest" or "shortest"
        target_duration_seconds: Pre-generated target duration; when None a
            duration is drawn with generate_single_clip_duration

    Returns:
        Dictionary with sample metadata, or None when the gap constraint is
        not satisfied and self.reject_if_gap_not_met is set

    Raises:
        ValueError: If a category with at least one slot yields no audio file
    """
    # Step 1: Use pre-generated duration or generate one (backward compatibility)
    if target_duration_seconds is not None:
        target_duration_s = target_duration_seconds
    else:
        target_duration_s = generate_single_clip_duration(
            self.min_clip_duration,
            self.max_clip_duration
        )

    # Step 2 & 3: Calculate max_clips and n_sources (the remainder is not needed here)
    max_clips, n_sources, _ = self._calculate_max_clips_and_sources(
        target_duration_s,
        question_type
    )

    # Step 4: Select categories (least-used for balance, no duration preference)
    all_categories = self.dataset.get_least_used_categories(n_sources)

    # Step 5: Pick target category RANDOMLY from selected (no bias!)
    target_category = random.choice(all_categories)
    self.dataset.category_usage_counts[target_category] += 1

    # Step 6: Get effective durations by sampling one file per category.
    # min_effective_duration_per_source avoids clips too short to distinguish.
    effective_durations = {}
    selected_files = {}

    for category in all_categories:
        filename, filepath, eff_dur = self.dataset.sample_file_from_category_with_duration(
            category,
            min_effective_duration=self.min_effective_duration_per_source
        )
        effective_durations[category] = eff_dur
        selected_files[category] = {
            'filename': filename,
            'filepath': filepath,
            'effective_duration_s': eff_dur
        }

    # Step 7: Calculate slot distribution based on max_clips
    slot_distribution, gap_satisfied, calc_metadata = self._calculate_slot_distribution(
        max_clips=max_clips,
        n_sources=n_sources,
        effective_durations=effective_durations,
        target_category=target_category,
        question_type=question_type
    )

    # Step 8: If gap not satisfied, try adjustments
    if not gap_satisfied:
        # Try with different clips that have better durations. The helper
        # updates effective_durations / selected_files / slot_distribution in place.
        # NOTE(review): calc_metadata still describes the first calculation
        # even when this retry succeeds.
        if self.sample_different_clips:
            gap_satisfied = self._try_improve_gap_with_different_clips(
                question_type=question_type,
                target_category=target_category,
                all_categories=all_categories,
                max_clips=max_clips,
                n_sources=n_sources,
                effective_durations=effective_durations,
                selected_files=selected_files,
                slot_distribution=slot_distribution
            )

        if not gap_satisfied and self.reject_if_gap_not_met:
            self.rejection_count += 1
            self.logger.debug(
                f"Sample {sample_id} rejected: gap not satisfied "
                f"(type={question_type}, max_clips={max_clips}, sources={n_sources})"
            )
            return None

    # Step 9: Load audio clips based on slot distribution
    source_audio_lists = {}
    files_used = {}

    for category in all_categories:
        reps = slot_distribution.get(category, 0)
        if reps == 0:
            continue

        # Get files for this category
        if self.sample_different_clips and reps > 1:
            filenames, filepaths, _ = self.dataset.sample_files_from_category_to_reach_duration(
                category,
                reps * effective_durations[category],
                prefer_same_file=False
            )
        else:
            # Use same file repeated
            file_info = selected_files[category]
            filenames = [file_info['filename']] * reps
            filepaths = [file_info['filepath']] * reps

        # Load audio segments, keeping each one paired with its file name
        loaded = [
            (name, self.audio_processor.load_audio(path))
            for name, path in zip(filenames[:reps], filepaths[:reps])
        ]
        if not loaded:
            raise ValueError(
                f"No audio files available for category '{category}' "
                f"(needed {reps} clip(s))"
            )

        # If fewer files than slots were returned, cycle through the loaded
        # clips in order. Indexing by the ORIGINAL count is what makes this a
        # cycle; indexing by the growing list length would always give item 0.
        n_loaded = len(loaded)
        for extra in range(reps - n_loaded):
            loaded.append(loaded[extra % n_loaded])

        source_audio_lists[category] = [audio for _, audio in loaded]
        files_used[category] = [name for name, _ in loaded]

    # Step 10: Build final audio
    final_audio, category_sequence, build_metadata = build_duration_task_audio(
        source_audio_lists=source_audio_lists,
        slot_distribution=slot_distribution,
        effective_durations=effective_durations,
        target_total_duration_s=target_duration_s,
        min_silence_between_sources_ms=self.min_silence_ms,
        max_extra_silence_per_gap_ms=self.max_extra_silence_per_gap_ms,
        crossfade_within_source_ms=self.crossfade_within_source_ms
    )

    # Save audio
    output_audio_path = self.audio_output / f"{sample_id}.wav"
    final_audio.export(str(output_audio_path), format="wav")

    # Step 11: Generate questions
    correct_category = target_category
    present_categories = all_categories

    mcq_question = self.task_config['mcq_questions'][question_type]
    mcq_data = self.question_generator.generate_category_mcq(
        mcq_question,
        correct_category,
        present_categories,
        self.dataset.CATEGORIES
    )

    open_text_question = self.task_config['open_text_questions'][question_type]
    open_text_data = self.question_generator.generate_category_open_text(
        open_text_question,
        correct_category
    )

    # Nominal total per category: slot count times the per-clip effective duration
    actual_effective_durations = {
        cat: slot_distribution[cat] * effective_durations[cat]
        for cat in all_categories
        if cat in slot_distribution
    }

    # Create metadata
    metadata = {
        'id': sample_id,
        'audio_path': str(output_audio_path.relative_to(self.output_base.parent)),
        'question_type': question_type,
        'max_clips': max_clips,
        'n_unique_sources': n_sources,
        'target_category': target_category,
        'present_categories': present_categories,
        'source_order': build_metadata['source_order'],
        'slot_distribution': slot_distribution,
        'effective_durations_per_clip': effective_durations,
        'total_effective_durations': actual_effective_durations,
        'gap_satisfied': gap_satisfied,
        'multiplier_used': self.multiplier_longest if question_type == 'longest' else self.multiplier_shortest,
        'files_used': files_used,
        'target_duration_s': target_duration_s,
        # len() of the built audio is divided by 1000 to report seconds
        'actual_duration_s': len(final_audio) / 1000.0,
        'timestamp_string': build_metadata.get('timestamp_string', ''),
        'source_timestamps': build_metadata.get('source_timestamps', []),
        'mcq_question': mcq_data['question'],
        'mcq_options': mcq_data['options'],
        'mcq_correct_answer': mcq_data['correct_answer'],
        'open_text_question': open_text_data['question'],
        'open_text_answer': open_text_data['correct_answer'],
        'calc_metadata': calc_metadata
    }

    self.success_count += 1
    self.logger.info(
        f"Generated duration sample {sample_id}: {question_type}, "
        f"max_clips={max_clips}, sources={n_sources}, target={target_category}, "
        f"slots={slot_distribution}, gap_satisfied={gap_satisfied}"
    )

    return metadata
|
| 548 |
+
|
| 549 |
+
def _try_improve_gap_with_different_clips(
    self,
    question_type: str,
    target_category: str,
    all_categories: List[str],
    max_clips: int,
    n_sources: int,
    effective_durations: Dict[str, float],
    selected_files: Dict[str, Dict],
    slot_distribution: Dict[str, int]
) -> bool:
    """
    Try to improve gap satisfaction by selecting a different target clip.

    For LONGEST the longest available clip of the target category is chosen;
    for SHORTEST the shortest one. Clips shorter than
    self.min_effective_duration_per_source are skipped whenever at least one
    clip meets that minimum, because the slot calculation flags such a target
    as too short and the retry could then never succeed.

    The dictionaries passed in are updated IN PLACE:
        - effective_durations / selected_files get the replacement clip for
          the target category whenever a candidate exists
        - slot_distribution is replaced only when the gap becomes satisfied

    Args:
        question_type: "longest" or "shortest"
        target_category: Category whose clip is replaced
        all_categories: Categories present in the sample (not used here)
        max_clips: Forwarded to _calculate_slot_distribution
        n_sources: Forwarded to _calculate_slot_distribution
        effective_durations: Per-category clip duration in seconds (mutated)
        selected_files: Per-category selected file info (mutated)
        slot_distribution: Per-category slot counts (mutated on success)

    Returns:
        True if the gap is satisfied after the replacement
    """
    files = self.dataset.get_files_by_category_with_durations(target_category)

    # Prefer clips that meet the minimum effective duration; fall back to the
    # full list only when none qualifies.
    min_duration = self.min_effective_duration_per_source
    eligible = [f for f in files if f['effective_duration_s'] >= min_duration]
    candidates = eligible or list(files)

    if candidates:
        # min()/max() return the first extreme element, matching what a stable
        # sort followed by [0] would select.
        pick = max if question_type == "longest" else min
        best = pick(candidates, key=lambda f: f['effective_duration_s'])
        effective_durations[target_category] = best['effective_duration_s']
        selected_files[target_category] = {
            'filename': best['filename'],
            'filepath': best['filepath'],
            'effective_duration_s': best['effective_duration_s']
        }

    # Recalculate slot distribution with the (possibly) updated durations
    new_slots, gap_satisfied, _ = self._calculate_slot_distribution(
        max_clips=max_clips,
        n_sources=n_sources,
        effective_durations=effective_durations,
        target_category=target_category,
        question_type=question_type
    )

    if gap_satisfied:
        # Mutate the caller's dict instead of rebinding so the caller sees it.
        slot_distribution.clear()
        slot_distribution.update(new_slots)

    return gap_satisfied
|
| 604 |
+
|
| 605 |
+
def generate_sample(self, sample_id: int, target_question_type: str = None, target_duration_seconds: float = None) -> Optional[Dict]:
    """
    Public entry point: build one duration task sample, with retries.

    Args:
        sample_id: Sample ID number
        target_question_type: Question type to use; when empty or None one is
            drawn at random from the configured question types
        target_duration_seconds: Pre-generated target duration, forwarded unchanged

    Returns:
        Dictionary with sample metadata, or None if failed
    """
    if target_question_type:
        chosen_type = target_question_type
    else:
        chosen_type = random.choice(self.task_config['question_types'])

    return self._try_generate_sample(
        sample_id,
        chosen_type,
        target_duration_seconds=target_duration_seconds
    )
|
| 622 |
+
|
| 623 |
+
def generate_dataset(self) -> tuple:
    """
    Generate the complete duration task dataset.

    Sample durations are pre-generated with generate_sample_durations_for_task()
    (per the original author's note, so that they cover the configured task
    duration exactly). One sample is then generated per duration, cycling
    through a shuffled, balanced list of question types. The number of
    attempts is capped at twice the number of requested samples, so fewer
    samples than requested may be produced when many attempts fail.

    Writes three CSV files (MCQ, open-text, detailed metadata) into
    self.output_base.

    Returns:
        Tuple of (mcq_csv_path, open_text_csv_path)

    Raises:
        ValueError: If the task configuration lists no question types
    """
    # Generate sample durations upfront
    sample_durations = generate_sample_durations_for_task(
        self.task_duration_hours,
        self.min_clip_duration,
        self.max_clip_duration
    )
    num_samples = len(sample_durations)

    self.logger.info(
        f"Generating {num_samples} duration task samples "
        f"(target: {self.task_duration_hours}h, exact fill)..."
    )

    # Create balanced question type distribution
    question_types = self.task_config['question_types']
    if not question_types:
        raise ValueError("task_config['question_types'] must not be empty")

    # The first `extra` types each receive one additional sample.
    per_type, extra = divmod(num_samples, len(question_types))
    balanced_types = []
    for idx, qtype in enumerate(question_types):
        balanced_types.extend([qtype] * (per_type + (1 if idx < extra else 0)))

    random.shuffle(balanced_types)
    type_dist = Counter(balanced_types)
    self.logger.info(f"Balanced question type distribution: {dict(sorted(type_dist.items()))}")

    all_metadata = []
    max_attempts = len(balanced_types) * 2

    for attempt in range(max_attempts):
        if len(all_metadata) >= num_samples:
            break

        # The next sample id (and its duration slot) only advances on success,
        # so a failed attempt is retried with the same id and duration but the
        # next question type in the cycle.
        sample_idx = len(all_metadata)
        question_type = balanced_types[attempt % len(balanced_types)]

        metadata = self.generate_sample(
            sample_idx,
            question_type,
            target_duration_seconds=sample_durations[sample_idx]
        )
        if metadata is None:
            continue

        all_metadata.append(metadata)

        # Log progress only when the count actually advanced; otherwise the
        # same line would repeat on every failed attempt at a multiple of 50.
        if len(all_metadata) % 50 == 0:
            self.logger.info(
                f"Progress: {len(all_metadata)}/{num_samples} samples, "
                f"{self.rejection_count} rejections"
            )

    if len(all_metadata) < num_samples:
        self.logger.warning(
            f"Only {len(all_metadata)}/{num_samples} samples generated "
            f"after {max_attempts} attempts"
        )

    # Guard the rate against 0/0 (nothing generated and nothing rejected).
    total_outcomes = len(all_metadata) + self.rejection_count
    rejection_rate = (self.rejection_count / total_outcomes * 100) if total_outcomes else 0.0
    self.logger.info(
        f"Generation complete: {len(all_metadata)} samples, "
        f"{self.rejection_count} rejections "
        f"({rejection_rate:.1f}% rejection rate)"
    )

    # Save CSVs
    mcq_csv_path = self.output_base / 'duration_mcq.csv'
    self._save_mcq_csv(all_metadata, mcq_csv_path)

    open_text_csv_path = self.output_base / 'duration_open_text.csv'
    self._save_open_text_csv(all_metadata, open_text_csv_path)

    metadata_csv_path = self.output_base / 'duration_metadata.csv'
    self._save_metadata_csv(all_metadata, metadata_csv_path)

    self.logger.info("Duration task dataset generation complete!")
    self.logger.info(f" - MCQ CSV: {mcq_csv_path}")
    self.logger.info(f" - Open-text CSV: {open_text_csv_path}")
    self.logger.info(f" - Metadata CSV: {metadata_csv_path}")
    self.logger.info(f" - Audio files: {self.audio_output}")

    return mcq_csv_path, open_text_csv_path
|
| 709 |
+
|
| 710 |
+
def _save_mcq_csv(self, metadata_list: List[Dict], output_path: Path):
    """
    Write the multiple-choice CSV: a header row, then one row per sample.

    Args:
        metadata_list: Sample metadata dicts as produced by sample generation
        output_path: Destination CSV path (overwritten if it exists)
    """
    header = [
        'question', 'id', 'audio_path',
        'optionA', 'optionB', 'optionC', 'optionD',
        'correct', 'question_type', 'max_clips', 'n_sources',
        'target_category', 'slot_distribution', 'effective_durations'
    ]

    def as_row(meta):
        # Dict-valued fields are stored as their str() representation.
        return [
            meta['mcq_question'],
            meta['id'],
            meta['audio_path'],
            meta['mcq_options']['A'],
            meta['mcq_options']['B'],
            meta['mcq_options']['C'],
            meta['mcq_options']['D'],
            meta['mcq_correct_answer'],
            meta['question_type'],
            meta['max_clips'],
            meta['n_unique_sources'],
            meta['target_category'],
            str(meta['slot_distribution']),
            str(meta['total_effective_durations'])
        ]

    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(as_row(meta) for meta in metadata_list)
|
| 738 |
+
|
| 739 |
+
def _save_open_text_csv(self, metadata_list: List[Dict], output_path: Path):
    """
    Write the open-text CSV: a header row, then one row per sample.

    Args:
        metadata_list: Sample metadata dicts as produced by sample generation
        output_path: Destination CSV path (overwritten if it exists)
    """
    header = [
        'question', 'id', 'audio_path', 'answer',
        'question_type', 'max_clips', 'n_sources',
        'target_category', 'effective_durations'
    ]

    def as_row(meta):
        # The durations dict is stored as its str() representation.
        return [
            meta['open_text_question'],
            meta['id'],
            meta['audio_path'],
            meta['open_text_answer'],
            meta['question_type'],
            meta['max_clips'],
            meta['n_unique_sources'],
            meta['target_category'],
            str(meta['total_effective_durations'])
        ]

    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(as_row(meta) for meta in metadata_list)
|
| 761 |
+
|
| 762 |
+
def _save_metadata_csv(self, metadata_list: List[Dict], output_path: Path):
    """
    Write the detailed metadata CSV: a header row, then one row per sample.

    Target and actual durations are rounded to two decimals; list and dict
    fields are stored as their str() representation.

    Args:
        metadata_list: Sample metadata dicts as produced by sample generation
        output_path: Destination CSV path (overwritten if it exists)
    """
    header = [
        'id', 'audio_path', 'question_type', 'max_clips', 'n_sources',
        'target_category', 'present_categories', 'source_order',
        'slot_distribution', 'effective_durations_per_clip',
        'total_effective_durations', 'gap_satisfied', 'multiplier_used',
        'target_duration_s', 'actual_duration_s', 'clip_timestamps', 'files_used'
    ]

    def as_row(meta):
        return [
            meta['id'],
            meta['audio_path'],
            meta['question_type'],
            meta['max_clips'],
            meta['n_unique_sources'],
            meta['target_category'],
            str(meta['present_categories']),
            str(meta['source_order']),
            str(meta['slot_distribution']),
            str(meta['effective_durations_per_clip']),
            str(meta['total_effective_durations']),
            meta['gap_satisfied'],
            meta['multiplier_used'],
            round(meta['target_duration_s'], 2),
            round(meta['actual_duration_s'], 2),
            # The 'clip_timestamps' column is filled from 'timestamp_string'.
            meta.get('timestamp_string', ''),
            str(meta['files_used'])
        ]

    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(as_row(meta) for meta in metadata_list)
|
| 794 |
+
|
| 795 |
+
|
| 796 |
+
def main(config_path: str = None):
    """
    Main entry point for duration task generation.

    Loads the YAML configuration, seeds the random generators, sets up the
    'duration_task' logger and runs DurationTaskGenerator.generate_dataset().

    Args:
        config_path: Path to the YAML config; when None, 'config.yaml' in the
            parent of this file's directory is used
    """
    import yaml

    if config_path is None:
        config_path = Path(__file__).parent.parent / 'config.yaml'

    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)

    set_random_seed(config['random_seed'])

    # The log file lives under the configured output base path.
    log_file_path = Path(config['output']['base_path']) / config['logging']['log_file']
    logging_cfg = config['logging']
    logger = setup_logger(
        'duration_task',
        log_file=str(log_file_path),
        level=logging_cfg['level'],
        console_output=logging_cfg['console_output']
    )

    DurationTaskGenerator(config, logger).generate_dataset()


if __name__ == '__main__':
    main()
|
tasks/task_order.py
ADDED
|
@@ -0,0 +1,598 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Task 3: Order - Generate temporal ordering questions
|
| 3 |
+
|
| 4 |
+
This task joins multiple audio sources and asks questions about their temporal order
|
| 5 |
+
(first, last, what comes after, what comes before).
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import csv
|
| 9 |
+
import random
|
| 10 |
+
import math
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import Dict, List
|
| 13 |
+
|
| 14 |
+
import sys
|
| 15 |
+
sys.path.append(str(Path(__file__).parent.parent))
|
| 16 |
+
|
| 17 |
+
from utils import (
|
| 18 |
+
AudioProcessor, ESC50Dataset, QuestionGenerator, LLMQuestionGenerator,
|
| 19 |
+
setup_logger, set_random_seed, calculate_num_samples_for_task,
|
| 20 |
+
generate_single_clip_duration, get_max_clip_num_to_be_joined,
|
| 21 |
+
build_clip_sequence_with_silences, generate_sample_durations_for_task
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class OrderTaskGenerator:
|
| 26 |
+
"""Generator for temporal ordering task dataset."""
|
| 27 |
+
|
| 28 |
+
def __init__(self, config: Dict, logger):
    """
    Set up the order task generator from the global configuration.

    Builds the dataset, audio processor and question generators, reads the
    duration and order-specific settings, and creates the output directories
    '<base_path>/order' and '<base_path>/order/audios'.

    Args:
        config: Configuration dictionary
        logger: Logger instance
    """
    self.config = config
    self.logger = logger
    self.task_config = config['tasks']['order']

    # Initialize components
    esc50_cfg = config['esc50']
    self.dataset = ESC50Dataset(
        esc50_cfg['metadata_path'],
        esc50_cfg['audio_path'],
        config  # Pass config for class subset loading
    )

    audio_cfg = config['audio']
    self.audio_processor = AudioProcessor(
        crossfade_duration=audio_cfg['crossfade_duration'],
        silence_duration=audio_cfg['silence_duration'],
        with_silence=audio_cfg['with_silence'],
        normalize=audio_cfg['normalize'],
        normalize_target_dBFS=audio_cfg['normalize_target_dBFS'],
        synthetic_silence_path=config['synthetic_silence']['path']
    )

    mcq_cfg = config['mcq']
    self.question_generator = QuestionGenerator(
        num_options=mcq_cfg['num_options'],
        option_labels=mcq_cfg['option_labels'],
        distractor_strategy=mcq_cfg['distractor_strategy']
    )

    # Initialize LLM question generator (disabled unless config['llm']['enabled'] is set)
    self.llm_enabled = config.get('llm', {}).get('enabled', False)
    self.llm_generator = LLMQuestionGenerator(
        enabled=self.llm_enabled,
        template_questions=self.task_config
    )

    # Duration settings from config
    self.min_clip_duration = audio_cfg['min_clip_duration']
    self.max_clip_duration = audio_cfg['max_clip_duration']
    # Duration of individual source clips (ESC-50 default is 5s)
    self.source_clip_duration = audio_cfg.get('source_clip_duration', 5.0)
    self.min_silence_ms = audio_cfg.get('min_silence_duration', 100)
    self.max_extra_silence_per_gap_ms = audio_cfg.get('max_extra_silence_per_gap', 500)
    self.crossfade_ms = audio_cfg.get('crossfade_duration', 0)
    self.task_duration_hours = self.task_config['task_duration_size']

    # Order task specific settings
    self.allow_source_repetition = self.task_config.get('allow_source_repetition', False)
    self.min_clips_for_second = self.task_config.get('min_clips_for_second_questions', 4)

    # Set up output paths
    self.output_base = Path(config['output']['base_path']) / 'order'
    self.output_base.mkdir(parents=True, exist_ok=True)
    self.audio_output = self.output_base / 'audios'
    self.audio_output.mkdir(parents=True, exist_ok=True)
|
| 86 |
+
|
| 87 |
+
def _get_valid_question_types(self, n_clips: int) -> List[str]:
    """
    Return the configured question types usable with n_clips clips.

    Rules:
        - "second" / "second_last" need at least self.min_clips_for_second clips
        - "after" / "before" need at least 2 clips
        - every other type (e.g. "first", "last") is always allowed

    Args:
        n_clips: Number of clips in the sample

    Returns:
        The allowed types in configuration order, or ['first', 'last'] when
        none of the configured types is allowed
    """
    def is_allowed(qtype):
        if qtype in ('second', 'second_last'):
            return n_clips >= self.min_clips_for_second
        if qtype in ('after', 'before'):
            return n_clips >= 2
        return True  # first, last

    allowed = [qtype for qtype in self.task_config['question_types'] if is_allowed(qtype)]
    return allowed or ['first', 'last']
|
| 114 |
+
|
| 115 |
+
def generate_sample(self, sample_id: int, target_question_type: str = None, target_duration_seconds: float = None) -> Dict:
    """
    Generate a single order task sample.

    Pipeline: resolve the target duration -> compute how many source clips
    fit -> draw the number of clips -> pick the question type and the answer
    position -> pick categories and load one file per category -> join the
    clips with silences -> export the WAV -> build the MCQ, open-text and
    sequence questions.

    Args:
        sample_id: Sample ID number (also used as the output WAV file name)
        target_question_type: Target question type for balanced distribution;
            when None a valid type is drawn at random
        target_duration_seconds: Pre-generated target duration (from
            generate_sample_durations_for_task); when None a duration is
            generated from min_clip_duration / max_clip_duration

    Returns:
        Dictionary with sample metadata

    Raises:
        ValueError: If fewer than 2 clips fit, if the clip range is empty, or
            if target_question_type is not valid for the drawn clip count
    """
    # Use pre-generated duration or generate one (backward compatibility)
    if target_duration_seconds is not None:
        clip_duration_seconds = target_duration_seconds
    else:
        clip_duration_seconds = generate_single_clip_duration(
            self.min_clip_duration,
            self.max_clip_duration
        )

    # Calculate how many clips we need; the remainder is not used here.
    max_clips, _ = get_max_clip_num_to_be_joined(
        clip_duration_seconds,
        self.source_clip_duration,
        self.min_silence_ms
    )

    max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10)

    # Silence reduction strategy: subsample from
    # [max(2, max_clips-3), min(max_clips, max_clips_per_sample, n_categories)]
    # so the sample uses close to the number of clips that fit.
    min_clips_for_sample = max(2, max_clips - 3)  # At least 2, preferably max_clips-3
    max_clips_for_sample = min(max_clips, max_clips_per_sample, len(self.dataset.CATEGORIES))

    # Validate range
    if max_clips_for_sample < 2:
        raise ValueError(
            f"Sample {sample_id}: Cannot generate order task - need at least 2 clips. "
            f"max_clips={max_clips}, max_clips_per_sample={max_clips_per_sample}, "
            f"duration={clip_duration_seconds:.1f}s. Increase min_clip_duration."
        )

    if min_clips_for_sample > max_clips_for_sample:
        raise ValueError(
            f"Sample {sample_id}: Invalid clip range - min_clips ({min_clips_for_sample}) > max_clips ({max_clips_for_sample}). "
            f"max_clips={max_clips}, max_clips_per_sample={max_clips_per_sample}, duration={clip_duration_seconds:.1f}s"
        )

    # generate_dataset() assigns 'second'/'second_last' based on the sample's
    # MAXIMUM capacity. Drawing fewer than min_clips_for_second clips here
    # would make that pre-assigned type invalid and raise below, so lift the
    # lower bound for those types. It is clamped to the upper bound; if the
    # type cannot fit at all, the validation below still reports it.
    lowest_n_clips = min_clips_for_sample
    if target_question_type in ('second', 'second_last'):
        lowest_n_clips = min(
            max(lowest_n_clips, self.min_clips_for_second),
            max_clips_for_sample
        )

    # Randomly select from valid range (NO balanced pool for order task)
    n_clips = random.randint(lowest_n_clips, max_clips_for_sample)

    # Get valid question types for this n_clips
    valid_question_types = self._get_valid_question_types(n_clips)

    if not valid_question_types:
        raise ValueError(
            f"Sample {sample_id}: No valid question types for n_clips={n_clips}. "
            f"This should not happen - check _get_valid_question_types implementation."
        )

    # Pre-select question type to determine answer position
    if target_question_type is not None:
        if target_question_type not in valid_question_types:
            raise ValueError(
                f"Sample {sample_id}: target_question_type='{target_question_type}' not valid for n_clips={n_clips}. "
                f"Valid types: {valid_question_types}. Balanced distribution should only assign valid types."
            )
        question_type = target_question_type
    else:
        question_type = random.choice(valid_question_types)

    # Determine answer position (0-indexed) based on question type
    if question_type == 'first':
        answer_position = 0
    elif question_type == 'last':
        answer_position = n_clips - 1
    elif question_type == 'second':
        answer_position = 1
    elif question_type == 'second_last':
        answer_position = n_clips - 2
    elif question_type == 'after':
        # Answer follows a reference clip, so position 1 to n-1
        answer_position = random.randint(1, n_clips - 1) if n_clips >= 2 else 0
    else:  # before
        # Answer precedes a reference clip, so position 0 to n-2
        answer_position = random.randint(0, n_clips - 2) if n_clips >= 2 else 0

    # Select answer category from least-used categories
    answer_category = self.dataset.get_least_used_categories(1)[0]

    # Sample remaining categories, ensuring balanced distribution
    if n_clips <= len(self.dataset.CATEGORIES):
        other_categories = self.dataset.get_least_used_categories(
            n_clips - 1,
            exclude=[answer_category]
        )
    else:
        # Need more clips than unique categories - sample with some repetition.
        # NOTE(review): n_clips is capped at len(CATEGORIES) above, so this
        # branch looks unreachable with the current bounds.
        other_categories = self.dataset.get_least_used_categories(
            min(n_clips - 1, len(self.dataset.CATEGORIES) - 1),
            exclude=[answer_category]
        )
        # Add random repetitions if needed
        while len(other_categories) < n_clips - 1:
            other_categories.append(random.choice(self.dataset.CATEGORIES))

    # Arrange categories with answer at correct position
    selected_categories = []
    other_idx = 0
    for i in range(n_clips):
        if i == answer_position:
            selected_categories.append(answer_category)
        else:
            selected_categories.append(other_categories[other_idx])
            other_idx += 1

    # Track usage of answer category
    self.dataset.category_usage_counts[answer_category] += 1

    # Sample one file from each category and load audio
    audio_segments = []
    filenames_list = []

    for category in selected_categories:
        filename, filepath = self.dataset.sample_file_from_category(category)
        audio_segments.append(self.audio_processor.load_audio(filepath))
        filenames_list.append(filename)

    # Build final audio with guaranteed silences between clips
    output_audio_path = self.audio_output / f"{sample_id}.wav"
    final_audio = build_clip_sequence_with_silences(
        audio_segments,
        clip_duration_seconds,
        min_silence_ms=self.min_silence_ms,
        max_extra_silence_per_gap_ms=self.max_extra_silence_per_gap_ms,
        crossfade_ms=self.crossfade_ms
    )

    # Save the audio
    final_audio.export(str(output_audio_path), format="wav")

    # Verify answer_category is actually at answer_position; if not, trust
    # the arranged sequence so the stored answer matches the audio.
    if selected_categories[answer_position] != answer_category:
        self.logger.error(f"Sample {sample_id}: Answer mismatch! Expected {answer_category} at position {answer_position}, got {selected_categories[answer_position]}")
        correct_category = selected_categories[answer_position]
    else:
        correct_category = answer_category

    # Pick the question templates for the chosen question type
    mcq_templates = self.task_config['mcq_questions']
    open_text_templates = self.task_config['open_text_questions']

    if question_type in ('first', 'last', 'second', 'second_last'):
        mcq_question = mcq_templates[question_type]
        open_text_question = open_text_templates[question_type]

    elif question_type == 'after':
        if answer_position > 0:
            # Reference is the sound before answer_position
            reference_category = selected_categories[answer_position - 1]
            mcq_question = mcq_templates['after'].format(sound1=reference_category)
            open_text_question = open_text_templates['after'].format(sound1=reference_category)
        else:
            # Defensive fallback (needs n_clips < 2, which is rejected above).
            # Record the type as 'first' so the metadata matches the question
            # actually asked, like the 'before' fallback below.
            correct_category = selected_categories[0]
            mcq_question = mcq_templates['first']
            open_text_question = open_text_templates['first']
            question_type = 'first'

    else:  # before
        if answer_position < n_clips - 1:
            # Reference is the sound after answer_position
            reference_category = selected_categories[answer_position + 1]
            mcq_question = mcq_templates['before'].format(sound2=reference_category)
            open_text_question = open_text_templates['before'].format(sound2=reference_category)
        else:
            # Fallback to 'first' if only 1 clip
            correct_category = selected_categories[0]
            mcq_question = mcq_templates['first']
            open_text_question = open_text_templates['first']
            question_type = 'first'

    # Generate MCQ
    mcq_data = self.question_generator.generate_category_mcq(
        mcq_question,
        correct_category,
        selected_categories,
        self.dataset.CATEGORIES
    )

    # Generate open-text question
    open_text_data = self.question_generator.generate_category_open_text(
        open_text_question,
        correct_category
    )

    # Also generate a sequence question for open-text
    sequence_question = open_text_templates['sequence']
    sequence_data = self.question_generator.generate_sequence_open_text(
        sequence_question,
        selected_categories
    )

    # Create metadata
    metadata = {
        'id': sample_id,
        'audio_path': str(output_audio_path.relative_to(self.output_base.parent)),
        'n_clips': n_clips,
        'question_type': question_type,
        'audio_sequence': selected_categories,
        'correct_answer_category': correct_category,
        'source_files': filenames_list,
        'mcq_question': mcq_data['question'],
        'mcq_options': mcq_data['options'],
        'mcq_correct_answer': mcq_data['correct_answer'],
        'open_text_question': open_text_data['question'],
        'open_text_answer': open_text_data['correct_answer'],
        'sequence_question': sequence_data['question'],
        'sequence_answer': sequence_data['correct_answer']
    }

    self.logger.info(f"Generated order sample {sample_id}: {question_type}, {n_clips} clips")

    return metadata
|
| 355 |
+
|
| 356 |
+
def generate_dataset(self) -> tuple:
    """
    Generate the complete order task dataset.

    Sample durations are pre-generated with
    generate_sample_durations_for_task(), then each sample is given a
    question type from a balanced pool (advanced types go to the samples that
    can hold the most clips), generated, and written to four CSV files.

    Returns:
        Tuple of (mcq_csv_path, open_text_csv_path, sequence_csv_path).
        The metadata CSV is also written but its path is not returned.

    Raises:
        ValueError: If the task configuration lists no question types
    """
    # Generate sample durations upfront
    sample_durations = generate_sample_durations_for_task(
        self.task_duration_hours,
        self.min_clip_duration,
        self.max_clip_duration
    )
    num_samples = len(sample_durations)

    self.logger.info(f"Generating {num_samples} order task samples (target: {self.task_duration_hours}h, exact fill)...")

    # Effective max clips per sample, using the same cap as generate_sample()
    max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10)
    category_count = len(self.dataset.CATEGORIES)
    sample_effective_max_clips = []

    for duration in sample_durations:
        max_clips, _ = get_max_clip_num_to_be_joined(
            duration,
            self.source_clip_duration,
            self.min_silence_ms
        )
        sample_effective_max_clips.append(
            min(max_clips, max_clips_per_sample, category_count)
        )

    question_types = self.task_config['question_types']

    # Tiers by clip requirement
    basic_types = ['first', 'last', 'after', 'before']  # Need >= 2 clips
    advanced_types = ['second', 'second_last']  # Need >= min_clips_for_second

    # Build the pool from the CONFIGURED types only, advanced types first.
    # Sizing a pool of six hard-coded types with len(question_types) left it
    # shorter than num_samples (IndexError) when more types were configured,
    # and put unconfigured types in it when fewer were configured.
    tier_order = advanced_types + basic_types
    pool_types = [t for t in tier_order if t in question_types]
    pool_types += [
        t for t in question_types
        if t not in tier_order and t not in pool_types
    ]
    if not pool_types:
        raise ValueError("Order task: 'question_types' is empty - nothing to generate.")

    # Even split; the first `remainder` types get one extra sample
    samples_per_type, remainder = divmod(num_samples, len(pool_types))
    assignment_pool = []
    for position, qtype in enumerate(pool_types):
        count = samples_per_type + (1 if position < remainder else 0)
        assignment_pool.extend([qtype] * count)

    # Highest-capacity samples first (stable for equal capacities), so they
    # receive the advanced types at the front of the pool.
    capacity_order = sorted(
        range(num_samples),
        key=lambda i: sample_effective_max_clips[i],
        reverse=True
    )

    balanced_assignments = [None] * num_samples

    for target_qtype, sample_idx in zip(assignment_pool, capacity_order):
        valid_types = self._get_valid_question_types(sample_effective_max_clips[sample_idx])

        if target_qtype not in valid_types:
            basic_valid = [t for t in basic_types if t in valid_types]
            if target_qtype in advanced_types and basic_valid:
                # Downgrade to basic type
                target_qtype = random.choice(basic_valid)
            else:
                # Fallback to any valid type
                target_qtype = random.choice(valid_types)

        balanced_assignments[sample_idx] = target_qtype

    # Log the actual distribution after capacity-aware assignment
    from collections import Counter
    type_dist = Counter(balanced_assignments)
    self.logger.info(f"Balanced question type distribution (after capacity-aware assignment): {dict(sorted(type_dist.items()))}")

    all_metadata = []

    for i, target_duration in enumerate(sample_durations):
        metadata = self.generate_sample(i, target_question_type=balanced_assignments[i], target_duration_seconds=target_duration)
        all_metadata.append(metadata)

    # Save MCQ CSV
    mcq_csv_path = self.output_base / 'order_mcq.csv'
    self._save_mcq_csv(all_metadata, mcq_csv_path)

    # Save open-text CSV
    open_text_csv_path = self.output_base / 'order_open_text.csv'
    self._save_open_text_csv(all_metadata, open_text_csv_path)

    # Save sequence CSV
    sequence_csv_path = self.output_base / 'order_sequence.csv'
    self._save_sequence_csv(all_metadata, sequence_csv_path)

    # Save metadata CSV
    metadata_csv_path = self.output_base / 'order_metadata.csv'
    self._save_metadata_csv(all_metadata, metadata_csv_path)

    self.logger.info(f"Order task dataset generation complete!")
    self.logger.info(f"  - MCQ CSV: {mcq_csv_path}")
    self.logger.info(f"  - Open-text CSV: {open_text_csv_path}")
    self.logger.info(f"  - Sequence CSV: {sequence_csv_path}")
    self.logger.info(f"  - Metadata CSV: {metadata_csv_path}")
    self.logger.info(f"  - Audio files: {self.audio_output}")

    return mcq_csv_path, open_text_csv_path, sequence_csv_path
|
| 480 |
+
|
| 481 |
+
def _save_mcq_csv(self, metadata_list: List[Dict], output_path: Path):
|
| 482 |
+
"""Save MCQ format CSV."""
|
| 483 |
+
with open(output_path, 'w', newline='') as f:
|
| 484 |
+
writer = csv.writer(f)
|
| 485 |
+
# Header
|
| 486 |
+
writer.writerow([
|
| 487 |
+
'question', 'id', 'audio_path',
|
| 488 |
+
'optionA', 'optionB', 'optionC', 'optionD',
|
| 489 |
+
'correct', 'question_type', 'audio_sequence'
|
| 490 |
+
])
|
| 491 |
+
|
| 492 |
+
# Data rows
|
| 493 |
+
for meta in metadata_list:
|
| 494 |
+
writer.writerow([
|
| 495 |
+
meta['mcq_question'],
|
| 496 |
+
meta['id'],
|
| 497 |
+
meta['audio_path'],
|
| 498 |
+
meta['mcq_options']['A'],
|
| 499 |
+
meta['mcq_options']['B'],
|
| 500 |
+
meta['mcq_options']['C'],
|
| 501 |
+
meta['mcq_options']['D'],
|
| 502 |
+
meta['mcq_correct_answer'],
|
| 503 |
+
meta['question_type'],
|
| 504 |
+
str(meta['audio_sequence'])
|
| 505 |
+
])
|
| 506 |
+
|
| 507 |
+
def _save_open_text_csv(self, metadata_list: List[Dict], output_path: Path):
|
| 508 |
+
"""Save open-text format CSV."""
|
| 509 |
+
with open(output_path, 'w', newline='') as f:
|
| 510 |
+
writer = csv.writer(f)
|
| 511 |
+
# Header
|
| 512 |
+
writer.writerow([
|
| 513 |
+
'question', 'id', 'audio_path', 'answer',
|
| 514 |
+
'question_type', 'audio_sequence'
|
| 515 |
+
])
|
| 516 |
+
|
| 517 |
+
# Data rows
|
| 518 |
+
for meta in metadata_list:
|
| 519 |
+
writer.writerow([
|
| 520 |
+
meta['open_text_question'],
|
| 521 |
+
meta['id'],
|
| 522 |
+
meta['audio_path'],
|
| 523 |
+
meta['open_text_answer'],
|
| 524 |
+
meta['question_type'],
|
| 525 |
+
str(meta['audio_sequence'])
|
| 526 |
+
])
|
| 527 |
+
|
| 528 |
+
def _save_sequence_csv(self, metadata_list: List[Dict], output_path: Path):
|
| 529 |
+
"""Save sequence question CSV."""
|
| 530 |
+
with open(output_path, 'w', newline='') as f:
|
| 531 |
+
writer = csv.writer(f)
|
| 532 |
+
# Header
|
| 533 |
+
writer.writerow([
|
| 534 |
+
'question', 'id', 'audio_path', 'answer', 'audio_sequence'
|
| 535 |
+
])
|
| 536 |
+
|
| 537 |
+
# Data rows
|
| 538 |
+
for meta in metadata_list:
|
| 539 |
+
writer.writerow([
|
| 540 |
+
meta['sequence_question'],
|
| 541 |
+
meta['id'],
|
| 542 |
+
meta['audio_path'],
|
| 543 |
+
meta['sequence_answer'],
|
| 544 |
+
str(meta['audio_sequence'])
|
| 545 |
+
])
|
| 546 |
+
|
| 547 |
+
def _save_metadata_csv(self, metadata_list: List[Dict], output_path: Path):
|
| 548 |
+
"""Save detailed metadata CSV."""
|
| 549 |
+
with open(output_path, 'w', newline='') as f:
|
| 550 |
+
writer = csv.writer(f)
|
| 551 |
+
# Header
|
| 552 |
+
writer.writerow([
|
| 553 |
+
'id', 'audio_path', 'n_clips', 'question_type',
|
| 554 |
+
'audio_sequence', 'correct_answer', 'source_files'
|
| 555 |
+
])
|
| 556 |
+
|
| 557 |
+
# Data rows
|
| 558 |
+
for meta in metadata_list:
|
| 559 |
+
writer.writerow([
|
| 560 |
+
meta['id'],
|
| 561 |
+
meta['audio_path'],
|
| 562 |
+
meta['n_clips'],
|
| 563 |
+
meta['question_type'],
|
| 564 |
+
str(meta['audio_sequence']),
|
| 565 |
+
meta['correct_answer_category'],
|
| 566 |
+
str(meta['source_files'])
|
| 567 |
+
])
|
| 568 |
+
|
| 569 |
+
|
| 570 |
+
def main(config_path: str = None):
    """
    Main entry point for order task generation.

    Loads the YAML configuration, seeds the random number generator, sets up
    the logger and runs OrderTaskGenerator.generate_dataset().

    Args:
        config_path: Path to the YAML configuration file. When None,
            'config.yaml' in the parent of this file's directory is used.
    """
    import yaml

    # Load configuration
    if config_path is None:
        config_path = Path(__file__).parent.parent / 'config.yaml'

    # Read as UTF-8 explicitly so non-ASCII text in the config does not
    # depend on the platform's default encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # Set random seed
    set_random_seed(config['random_seed'])

    # Setup logger (the log file lives under the output base path)
    logger = setup_logger(
        'order_task',
        log_file=str(Path(config['output']['base_path']) / config['logging']['log_file']),
        level=config['logging']['level'],
        console_output=config['logging']['console_output']
    )

    # Generate dataset
    generator = OrderTaskGenerator(config, logger)
    generator.generate_dataset()


if __name__ == '__main__':
    main()
|
tasks/task_volume.py
ADDED
|
@@ -0,0 +1,732 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Task 4: Volume - Generate volume comparison questions
|
| 3 |
+
|
| 4 |
+
This task joins multiple audio sources with different volume levels
|
| 5 |
+
and asks questions about the loudest or softest sound.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import csv
|
| 9 |
+
import random
|
| 10 |
+
import math
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import Dict, List, Tuple, Optional
|
| 13 |
+
|
| 14 |
+
import sys
|
| 15 |
+
sys.path.append(str(Path(__file__).parent.parent))
|
| 16 |
+
|
| 17 |
+
from utils import (
|
| 18 |
+
AudioProcessor, ESC50Dataset, QuestionGenerator, LLMQuestionGenerator,
|
| 19 |
+
setup_logger, set_random_seed, calculate_num_samples_for_task,
|
| 20 |
+
generate_single_clip_duration, get_max_clip_num_to_be_joined,
|
| 21 |
+
build_clip_sequence_with_silences, generate_sample_durations_for_task,
|
| 22 |
+
get_lufs_loudness, normalize_to_lufs
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class VolumeTaskGenerator:
|
| 27 |
+
"""Generator for volume comparison task dataset."""
|
| 28 |
+
|
| 29 |
+
def __init__(self, config: Dict, logger):
    """
    Set up the volume task generator from the configuration.

    Builds the dataset, audio processor and question generators, reads the
    duration, silence and loudness settings, and creates the output
    directories.

    Args:
        config: Configuration dictionary
        logger: Logger instance
    """
    self.config = config
    self.logger = logger
    self.task_config = config['tasks']['volume']

    # --- Components -----------------------------------------------------
    esc50_cfg = config['esc50']
    self.dataset = ESC50Dataset(
        esc50_cfg['metadata_path'],
        esc50_cfg['audio_path'],
        config  # Pass config for class subset loading
    )

    audio_cfg = config['audio']
    self.audio_processor = AudioProcessor(
        crossfade_duration=audio_cfg['crossfade_duration'],
        silence_duration=audio_cfg['silence_duration'],
        with_silence=audio_cfg['with_silence'],
        normalize=audio_cfg['normalize'],
        normalize_target_dBFS=audio_cfg['normalize_target_dBFS'],
        synthetic_silence_path=config['synthetic_silence']['path']
    )

    mcq_cfg = config['mcq']
    self.question_generator = QuestionGenerator(
        num_options=mcq_cfg['num_options'],
        option_labels=mcq_cfg['option_labels'],
        distractor_strategy=mcq_cfg['distractor_strategy']
    )

    # LLM question generator (disabled unless config['llm']['enabled'] is set)
    self.llm_enabled = config.get('llm', {}).get('enabled', False)
    self.llm_generator = LLMQuestionGenerator(
        enabled=self.llm_enabled,
        template_questions=self.task_config
    )

    # --- Duration settings ----------------------------------------------
    self.min_clip_duration = audio_cfg['min_clip_duration']
    self.max_clip_duration = audio_cfg['max_clip_duration']
    # Duration of individual source clips (ESC-50 default is 5s)
    self.source_clip_duration = audio_cfg.get('source_clip_duration', 5.0)
    self.min_silence_ms = audio_cfg.get('min_silence_duration', 100)
    self.max_extra_silence_per_gap_ms = audio_cfg.get('max_extra_silence_per_gap', 500)
    self.crossfade_ms = audio_cfg.get('crossfade_duration', 0)

    task_cfg = self.task_config
    self.task_duration_hours = task_cfg['task_duration_size']

    # --- Volume task specific settings ----------------------------------
    self.normalize_to_baseline = task_cfg.get('normalize_to_baseline', True)
    self.baseline_dBFS = task_cfg.get('baseline_dBFS', -20.0)
    self.use_same_clip_different_volumes = task_cfg.get('use_same_clip_different_volumes', False)
    # A single integer is accepted and wrapped into a one-element list
    repetitions = task_cfg.get('repetitions_per_source', [2, 3, 4])
    self.repetitions_per_source = [repetitions] if isinstance(repetitions, int) else repetitions

    # Volume gap multipliers (similar to duration task)
    self.multiplier_max_loudness = task_cfg.get('multiplier_max_loudness', 1.5)
    self.multiplier_min_loudness = task_cfg.get('multiplier_min_loudness', 0.5)
    self.reject_if_gap_not_met = task_cfg.get('reject_if_gap_not_met', True)

    # LUFS vs dBFS loudness measurement option:
    # LUFS measures PERCEIVED loudness, dBFS measures RMS amplitude and does
    # NOT account for frequency sensitivity. LUFS is recommended for
    # comparing different sound types.
    self.use_lufs = task_cfg.get('use_lufs', True)
    self.baseline_lufs = task_cfg.get('baseline_lufs', -23.0)  # EBU R128 standard

    # --- Output paths ---------------------------------------------------
    self.output_base = Path(config['output']['base_path']) / 'volume'
    self.audio_output = self.output_base / 'audios'
    for directory in (self.output_base, self.audio_output):
        directory.mkdir(parents=True, exist_ok=True)

    # Balanced sampling pool for num_clips (starts empty)
    self.clips_count_pool = []
|
| 106 |
+
|
| 107 |
+
def _normalize_to_baseline(self, audio: "AudioSegment") -> "AudioSegment":
|
| 108 |
+
"""
|
| 109 |
+
Normalize audio to the baseline loudness level.
|
| 110 |
+
|
| 111 |
+
Uses LUFS (perceived loudness) if use_lufs=True, otherwise dBFS.
|
| 112 |
+
This ensures all clips start from the same perceived loudness before
|
| 113 |
+
applying volume adjustments.
|
| 114 |
+
|
| 115 |
+
Args:
|
| 116 |
+
audio: Input audio segment
|
| 117 |
+
|
| 118 |
+
Returns:
|
| 119 |
+
Normalized audio segment
|
| 120 |
+
"""
|
| 121 |
+
if not self.normalize_to_baseline:
|
| 122 |
+
return audio
|
| 123 |
+
|
| 124 |
+
if self.use_lufs:
|
| 125 |
+
# Use LUFS-based normalization (perceived loudness)
|
| 126 |
+
normalized = normalize_to_lufs(audio, self.baseline_lufs)
|
| 127 |
+
self.logger.debug(
|
| 128 |
+
f"Normalized to baseline LUFS: {get_lufs_loudness(audio):.2f} -> {get_lufs_loudness(normalized):.2f} LUFS"
|
| 129 |
+
)
|
| 130 |
+
return normalized
|
| 131 |
+
else:
|
| 132 |
+
# Use dBFS normalization (RMS amplitude)
|
| 133 |
+
change_in_dBFS = self.baseline_dBFS - audio.dBFS
|
| 134 |
+
normalized = audio.apply_gain(change_in_dBFS)
|
| 135 |
+
self.logger.debug(
|
| 136 |
+
f"Normalized to baseline dBFS: {audio.dBFS:.2f} -> {normalized.dBFS:.2f} dBFS"
|
| 137 |
+
)
|
| 138 |
+
return normalized
|
| 139 |
+
|
| 140 |
+
def _get_amplitude_loudness(self, audio: "AudioSegment") -> float:
|
| 141 |
+
"""
|
| 142 |
+
Get the loudness of an audio clip.
|
| 143 |
+
|
| 144 |
+
Uses LUFS (perceived loudness) if use_lufs=True, otherwise dBFS.
|
| 145 |
+
|
| 146 |
+
Args:
|
| 147 |
+
audio: Input audio segment
|
| 148 |
+
|
| 149 |
+
Returns:
|
| 150 |
+
Loudness in LUFS or dBFS depending on configuration
|
| 151 |
+
"""
|
| 152 |
+
if self.use_lufs:
|
| 153 |
+
return get_lufs_loudness(audio)
|
| 154 |
+
else:
|
| 155 |
+
return audio.dBFS
|
| 156 |
+
|
| 157 |
+
def _verify_loudness_gap(
|
| 158 |
+
self,
|
| 159 |
+
volume_levels: List[float],
|
| 160 |
+
question_type: str
|
| 161 |
+
) -> Tuple[bool, int, Dict]:
|
| 162 |
+
"""
|
| 163 |
+
Verify that loudness gap constraint is satisfied.
|
| 164 |
+
|
| 165 |
+
For MAX_LOUDNESS: max_volume >= second_max × multiplier_max
|
| 166 |
+
For MIN_LOUDNESS: min_volume <= second_min × multiplier_min
|
| 167 |
+
|
| 168 |
+
Since we work with dB (logarithmic), the gap is in dB difference:
|
| 169 |
+
- For max: max_dB - second_max_dB >= required_gap_dB
|
| 170 |
+
- For min: second_min_dB - min_dB >= required_gap_dB
|
| 171 |
+
|
| 172 |
+
The multiplier translates to dB: 1.5x linear = ~3.5dB, 2x = ~6dB
|
| 173 |
+
|
| 174 |
+
Args:
|
| 175 |
+
volume_levels: List of volume adjustments in dB
|
| 176 |
+
question_type: "max_loudness" or "min_loudness"
|
| 177 |
+
|
| 178 |
+
Returns:
|
| 179 |
+
Tuple of (gap_satisfied, answer_idx, metadata)
|
| 180 |
+
"""
|
| 181 |
+
import math
|
| 182 |
+
|
| 183 |
+
sorted_levels = sorted(volume_levels, reverse=True) # Highest first
|
| 184 |
+
|
| 185 |
+
if question_type == "max_loudness":
|
| 186 |
+
max_level = sorted_levels[0]
|
| 187 |
+
second_max = sorted_levels[1] if len(sorted_levels) > 1 else sorted_levels[0]
|
| 188 |
+
|
| 189 |
+
# Convert multiplier to dB difference
|
| 190 |
+
# multiplier 1.5 means 1.5x louder in amplitude = 20*log10(1.5) ≈ 3.5 dB
|
| 191 |
+
required_gap_dB = 20 * math.log10(self.multiplier_max_loudness)
|
| 192 |
+
actual_gap_dB = max_level - second_max
|
| 193 |
+
|
| 194 |
+
gap_satisfied = actual_gap_dB >= required_gap_dB
|
| 195 |
+
answer_idx = volume_levels.index(max_level)
|
| 196 |
+
|
| 197 |
+
metadata = {
|
| 198 |
+
'max_level_dB': max_level,
|
| 199 |
+
'second_max_dB': second_max,
|
| 200 |
+
'required_gap_dB': required_gap_dB,
|
| 201 |
+
'actual_gap_dB': actual_gap_dB,
|
| 202 |
+
'multiplier': self.multiplier_max_loudness
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
else: # min_loudness
|
| 206 |
+
min_level = sorted_levels[-1]
|
| 207 |
+
second_min = sorted_levels[-2] if len(sorted_levels) > 1 else sorted_levels[-1]
|
| 208 |
+
|
| 209 |
+
# For min, we want min to be multiplier times softer
|
| 210 |
+
# multiplier 0.5 means 0.5x amplitude = 20*log10(0.5) ≈ -6 dB
|
| 211 |
+
# So second_min - min_level should be >= 6 dB
|
| 212 |
+
required_gap_dB = abs(20 * math.log10(self.multiplier_min_loudness))
|
| 213 |
+
actual_gap_dB = second_min - min_level
|
| 214 |
+
|
| 215 |
+
gap_satisfied = actual_gap_dB >= required_gap_dB
|
| 216 |
+
answer_idx = volume_levels.index(min_level)
|
| 217 |
+
|
| 218 |
+
metadata = {
|
| 219 |
+
'min_level_dB': min_level,
|
| 220 |
+
'second_min_dB': second_min,
|
| 221 |
+
'required_gap_dB': required_gap_dB,
|
| 222 |
+
'actual_gap_dB': actual_gap_dB,
|
| 223 |
+
'multiplier': self.multiplier_min_loudness
|
| 224 |
+
}
|
| 225 |
+
|
| 226 |
+
return gap_satisfied, answer_idx, metadata
|
| 227 |
+
|
| 228 |
+
def generate_volume_levels(self, n_clips: int, question_type: str = None) -> List[float]:
|
| 229 |
+
"""
|
| 230 |
+
Generate volume levels dynamically based on multiplier constraints.
|
| 231 |
+
|
| 232 |
+
The levels are generated to ensure proper gap for the question type:
|
| 233 |
+
- For max_loudness: the loudest is clearly distinguishable (gap = multiplier_max)
|
| 234 |
+
- For min_loudness: the softest is clearly distinguishable (gap = multiplier_min)
|
| 235 |
+
|
| 236 |
+
Args:
|
| 237 |
+
n_clips: Number of clips
|
| 238 |
+
question_type: "max_loudness" or "min_loudness" to ensure proper gap
|
| 239 |
+
|
| 240 |
+
Returns:
|
| 241 |
+
List of volume adjustments in dB (integers)
|
| 242 |
+
"""
|
| 243 |
+
# Base spacing between adjacent volume levels (minimum audible difference)
|
| 244 |
+
# 6 dB = 2x amplitude, 12 dB = 4x amplitude (clearly distinguishable)
|
| 245 |
+
min_diff = 12 # 12 dB is a VERY noticeable difference (4x perceived loudness)
|
| 246 |
+
|
| 247 |
+
# Calculate required gap based on multiplier (round up to nearest int)
|
| 248 |
+
if question_type == "max_loudness":
|
| 249 |
+
required_gap = int(math.ceil(20 * math.log10(self.multiplier_max_loudness)))
|
| 250 |
+
elif question_type == "min_loudness":
|
| 251 |
+
required_gap = int(math.ceil(abs(20 * math.log10(self.multiplier_min_loudness))))
|
| 252 |
+
else:
|
| 253 |
+
required_gap = min_diff
|
| 254 |
+
|
| 255 |
+
# Ensure gap is at least min_diff
|
| 256 |
+
required_gap = max(required_gap, min_diff)
|
| 257 |
+
|
| 258 |
+
if question_type == "max_loudness":
|
| 259 |
+
# Generate levels where max has clear gap from others
|
| 260 |
+
# Max level (answer) at a high value - MUCH louder
|
| 261 |
+
max_level = 18 # dB adjustment = ~8x louder than baseline
|
| 262 |
+
|
| 263 |
+
# Other levels should be at least required_gap below max
|
| 264 |
+
# Spread them out with min_diff spacing
|
| 265 |
+
other_levels = []
|
| 266 |
+
current_level = max_level - required_gap
|
| 267 |
+
for i in range(n_clips - 1):
|
| 268 |
+
other_levels.append(current_level)
|
| 269 |
+
current_level -= min_diff
|
| 270 |
+
|
| 271 |
+
selected_levels = other_levels + [max_level]
|
| 272 |
+
|
| 273 |
+
elif question_type == "min_loudness":
|
| 274 |
+
# Generate levels where min has clear gap from others
|
| 275 |
+
# Min level (answer) at a low value - MUCH quieter
|
| 276 |
+
min_level = -24 # dB adjustment = ~1/16th of baseline volume
|
| 277 |
+
|
| 278 |
+
# Other levels should be at least required_gap above min
|
| 279 |
+
# Spread them out with min_diff spacing
|
| 280 |
+
other_levels = []
|
| 281 |
+
current_level = min_level + required_gap
|
| 282 |
+
for i in range(n_clips - 1):
|
| 283 |
+
other_levels.append(current_level)
|
| 284 |
+
current_level += min_diff
|
| 285 |
+
|
| 286 |
+
selected_levels = [min_level] + other_levels
|
| 287 |
+
|
| 288 |
+
else:
|
| 289 |
+
# Default: evenly spaced levels centered around 0
|
| 290 |
+
total_range = (n_clips - 1) * min_diff
|
| 291 |
+
start_level = -total_range // 2
|
| 292 |
+
selected_levels = [start_level + i * min_diff for i in range(n_clips)]
|
| 293 |
+
|
| 294 |
+
# Shuffle to randomize order in the audio
|
| 295 |
+
random.shuffle(selected_levels)
|
| 296 |
+
|
| 297 |
+
return selected_levels
|
| 298 |
+
|
| 299 |
+
def generate_sample(self, sample_id: int, target_question_type: str = None, target_duration_seconds: float = None) -> Dict:
    """
    Build one volume-comparison sample: the audio file plus its questions.

    Steps, in order:
        1. Resolve the sample duration and how many source clips fit in it.
        2. Draw the clip count and question type, then generate volume
           offsets until the loudness-gap check passes (at most 10 tries).
        3. Pick categories, placing the answer category at the position of
           the extreme (loudest or quietest) volume offset.
        4. Load every clip, normalize it to the baseline, apply its offset.
        5. Join the clips with silences, export the WAV, and assemble the
           MCQ / open-text questions and the metadata record.

    If ``use_same_clip_different_volumes`` is set, a single source file is
    repeated at every volume level instead of using one file per category.

    Args:
        sample_id: Sample ID number (also used as the WAV file name)
        target_question_type: Question type to use; drawn at random from
            the task config when None
        target_duration_seconds: Pre-generated target duration; drawn from
            the configured clip-duration range when None

    Returns:
        Dictionary with sample metadata, or None when the loudness gap was
        not met after all attempts and ``reject_if_gap_not_met`` is set

    Raises:
        ValueError: If fewer than 2 clips fit, or the lower bound of the
            clip-count range exceeds its upper bound
    """
    # Caller-supplied duration wins; otherwise draw one (backward compatibility).
    if target_duration_seconds is None:
        clip_duration_seconds = generate_single_clip_duration(
            self.min_clip_duration,
            self.max_clip_duration
        )
    else:
        clip_duration_seconds = target_duration_seconds

    # How many source clips (with minimum silences) fit into the duration.
    max_clips, _unused_remainder = get_max_clip_num_to_be_joined(
        clip_duration_seconds,
        self.source_clip_duration,
        self.min_silence_ms
    )
    max_clips_per_sample = self.task_config.get('max_clips_per_sample', 10)

    # Stay close to the number of clips that fit so little time is left as
    # silence: lower bound max_clips-3 (never below 2), upper bound capped by
    # the config limit and the number of available categories.
    min_clips_for_sample = max(2, max_clips - 3)
    max_clips_for_sample = min(max_clips, max_clips_per_sample, len(self.dataset.CATEGORIES))

    if max_clips_for_sample < 2:
        raise ValueError(
            f"Sample {sample_id}: Cannot generate volume task - need at least 2 clips. "
            f"max_clips={max_clips}, max_clips_per_sample={max_clips_per_sample}, "
            f"duration={clip_duration_seconds:.1f}s. Increase min_clip_duration."
        )

    if min_clips_for_sample > max_clips_for_sample:
        raise ValueError(
            f"Sample {sample_id}: Invalid clip range - min_clips ({min_clips_for_sample}) > max_clips ({max_clips_for_sample}). "
            f"max_clips={max_clips}, max_clips_per_sample={max_clips_per_sample}, duration={clip_duration_seconds:.1f}s"
        )

    # Uniform draw from the valid range (the volume task uses no balanced pool).
    n_clips = max(2, random.randint(min_clips_for_sample, max_clips_for_sample))

    # Question type: honour the caller's choice, otherwise pick at random.
    if target_question_type is None:
        question_type = random.choice(self.task_config['question_types'])
    else:
        question_type = target_question_type

    # Generate volume offsets until the gap constraint holds.
    max_attempts = 10
    gap_satisfied = False
    volume_levels = None
    gap_metadata = None

    for attempt in range(max_attempts):
        volume_levels = self.generate_volume_levels(n_clips, question_type)
        gap_satisfied, answer_idx, gap_metadata = self._verify_loudness_gap(
            volume_levels, question_type
        )
        if gap_satisfied:
            break

        self.logger.debug(
            f"Sample {sample_id} attempt {attempt+1}: gap not satisfied, "
            f"required={gap_metadata['required_gap_dB']:.1f}dB, "
            f"actual={gap_metadata['actual_gap_dB']:.1f}dB"
        )

    if self.reject_if_gap_not_met and not gap_satisfied:
        self.logger.warning(
            f"Sample {sample_id} rejected: loudness gap not satisfied after {max_attempts} attempts"
        )
        return None

    # The answer is the clip carrying the extreme volume offset.
    pick_extreme = max if question_type == 'max_loudness' else min
    answer_idx = volume_levels.index(pick_extreme(volume_levels))

    # Answer category comes from the least-used categories.
    answer_category = self.dataset.get_least_used_categories(1)[0]

    if self.use_same_clip_different_volumes:
        # One source clip repeated at every volume level.
        selected_categories = [answer_category] * n_clips
        self.dataset.category_usage_counts[answer_category] += 1
        correct_category = answer_category
    else:
        # One category per clip, preferring the least-used ones.
        if n_clips > len(self.dataset.CATEGORIES):
            # More clips than unique categories: take what exists, then
            # top up with random repetitions.
            other_categories = self.dataset.get_least_used_categories(
                min(n_clips - 1, len(self.dataset.CATEGORIES) - 1),
                exclude=[answer_category]
            )
            while len(other_categories) < n_clips - 1:
                other_categories.append(random.choice(self.dataset.CATEGORIES))
        else:
            other_categories = self.dataset.get_least_used_categories(
                n_clips - 1,
                exclude=[answer_category]
            )

        # Slot the answer in at answer_idx; the other categories keep their
        # order (positions after the answer shift down by one).
        selected_categories = [
            answer_category if i == answer_idx else other_categories[i - (i > answer_idx)]
            for i in range(n_clips)
        ]

        self.dataset.category_usage_counts[answer_category] += 1

        # Sanity check: the answer category must sit at answer_idx.
        correct_category = answer_category
        placed_category = selected_categories[answer_idx]
        if placed_category != answer_category:
            self.logger.error(f"Sample {sample_id}: Answer mismatch! Expected {answer_category} at index {answer_idx}, got {selected_categories[answer_idx]}")
            correct_category = placed_category

    # Load, normalize and scale each clip.
    audio_segments = []
    filenames_list = []
    original_loudness = []
    final_loudness = []

    if self.use_same_clip_different_volumes:
        # A single file, normalized once, then scaled per level.
        filename, filepath = self.dataset.sample_file_from_category(answer_category)
        base_audio = self.audio_processor.load_audio(filepath)
        original_loudness_val = self._get_amplitude_loudness(base_audio)
        base_audio_normalized = self._normalize_to_baseline(base_audio)

        for level_db in volume_levels:
            audio_adjusted = self.audio_processor.adjust_volume(
                base_audio_normalized,
                level_db
            )
            audio_segments.append(audio_adjusted)
            filenames_list.append(filename)
            original_loudness.append(original_loudness_val)
            final_loudness.append(self._get_amplitude_loudness(audio_adjusted))
    else:
        # A different file per clip.
        for category, level_db in zip(selected_categories, volume_levels):
            filename, filepath = self.dataset.sample_file_from_category(category)
            audio = self.audio_processor.load_audio(filepath)

            # Loudness before any processing.
            original_loudness.append(self._get_amplitude_loudness(audio))

            # Normalize to the baseline, then apply the offset relative to it.
            audio_normalized = self._normalize_to_baseline(audio)
            audio_adjusted = self.audio_processor.adjust_volume(
                audio_normalized,
                level_db
            )

            audio_segments.append(audio_adjusted)
            filenames_list.append(filename)
            final_loudness.append(self._get_amplitude_loudness(audio_adjusted))

    # Join the clips with silences in between and write the WAV.
    output_audio_path = self.audio_output / f"{sample_id}.wav"
    final_audio = build_clip_sequence_with_silences(
        audio_segments,
        clip_duration_seconds,
        min_silence_ms=self.min_silence_ms,
        max_extra_silence_per_gap_ms=self.max_extra_silence_per_gap_ms,
        crossfade_ms=self.crossfade_ms
    )
    final_audio.export(str(output_audio_path), format="wav")

    # Multiple-choice question.
    mcq_data = self.question_generator.generate_category_mcq(
        self.task_config['mcq_questions'][question_type],
        correct_category,
        selected_categories,
        self.dataset.CATEGORIES
    )

    # Open-text question.
    open_text_data = self.question_generator.generate_category_open_text(
        self.task_config['open_text_questions'][question_type],
        correct_category
    )

    # Category -> volume offset; a repeated category keeps its last offset.
    category_volumes = dict(zip(selected_categories, volume_levels))

    metadata = {
        'id': sample_id,
        'audio_path': str(output_audio_path.relative_to(self.output_base.parent)),
        'n_clips': n_clips,
        'question_type': question_type,
        'audio_sequence': selected_categories,
        'volume_levels_db': volume_levels,
        'category_volumes': category_volumes,
        'correct_answer_category': correct_category,
        'correct_volume_db': volume_levels[answer_idx],
        'source_files': filenames_list,
        'use_same_clip': self.use_same_clip_different_volumes,
        'baseline_dBFS': self.baseline_dBFS if self.normalize_to_baseline else None,
        'original_loudness_dBFS': original_loudness,
        'final_loudness_dBFS': final_loudness,
        'gap_satisfied': gap_satisfied,
        'gap_metadata': gap_metadata,
        'mcq_question': mcq_data['question'],
        'mcq_options': mcq_data['options'],
        'mcq_correct_answer': mcq_data['correct_answer'],
        'open_text_question': open_text_data['question'],
        'open_text_answer': open_text_data['correct_answer']
    }

    self.logger.info(
        f"Generated volume sample {sample_id}: {question_type}, {n_clips} clips, "
        f"volumes={volume_levels}, gap_satisfied={gap_satisfied}, "
        f"gap={gap_metadata['actual_gap_dB']:.1f}dB (required={gap_metadata['required_gap_dB']:.1f}dB)"
    )

    return metadata
|
| 565 |
+
|
| 566 |
+
def generate_dataset(self) -> tuple:
    """
    Generate the complete volume task dataset.

    Sample durations are pre-generated with
    generate_sample_durations_for_task() from the task duration and the
    configured clip-duration range, and one sample is generated per duration.
    Question types are spread as evenly as possible over the samples and
    then shuffled.

    Samples rejected by generate_sample() (it returns None when the loudness
    gap cannot be met) are left out of the CSV files instead of being passed
    on to the writers, which would fail on a None entry.

    Returns:
        Tuple of (mcq_csv_path, open_text_csv_path)
    """
    from collections import Counter

    # Generate sample durations upfront.
    sample_durations = generate_sample_durations_for_task(
        self.task_duration_hours,
        self.min_clip_duration,
        self.max_clip_duration
    )
    num_samples = len(sample_durations)

    self.logger.info(f"Generating {num_samples} volume task samples (target: {self.task_duration_hours}h, exact fill)...")

    # Balanced question type distribution (no clip-count balancing for the
    # volume task): every type gets the same share and the first `remainder`
    # types get one extra sample.
    question_types = self.task_config['question_types']
    samples_per_type, remainder = divmod(num_samples, len(question_types))

    balanced_question_types = []
    for position, qtype in enumerate(question_types):
        count = samples_per_type + (1 if position < remainder else 0)
        balanced_question_types.extend([qtype] * count)

    random.shuffle(balanced_question_types)
    type_dist = Counter(balanced_question_types)
    self.logger.info(f"Balanced question type distribution: {dict(sorted(type_dist.items()))}")

    all_metadata = []
    rejected_count = 0

    for i, target_duration in enumerate(sample_durations):
        metadata = self.generate_sample(
            i,
            target_question_type=balanced_question_types[i],
            target_duration_seconds=target_duration
        )
        if metadata is None:
            # Rejected sample: nothing to write for it.
            rejected_count += 1
            continue
        all_metadata.append(metadata)

    if rejected_count:
        self.logger.warning(
            f"{rejected_count} of {num_samples} volume samples were rejected and are not included in the CSV files"
        )

    # Save MCQ CSV
    mcq_csv_path = self.output_base / 'volume_mcq.csv'
    self._save_mcq_csv(all_metadata, mcq_csv_path)

    # Save open-text CSV
    open_text_csv_path = self.output_base / 'volume_open_text.csv'
    self._save_open_text_csv(all_metadata, open_text_csv_path)

    # Save metadata CSV
    metadata_csv_path = self.output_base / 'volume_metadata.csv'
    self._save_metadata_csv(all_metadata, metadata_csv_path)

    self.logger.info("Volume task dataset generation complete!")
    self.logger.info(f" - MCQ CSV: {mcq_csv_path}")
    self.logger.info(f" - Open-text CSV: {open_text_csv_path}")
    self.logger.info(f" - Metadata CSV: {metadata_csv_path}")
    self.logger.info(f" - Audio files: {self.audio_output}")

    return mcq_csv_path, open_text_csv_path
|
| 627 |
+
|
| 628 |
+
def _save_mcq_csv(self, metadata_list: List[Dict], output_path: Path):
|
| 629 |
+
"""Save MCQ format CSV."""
|
| 630 |
+
with open(output_path, 'w', newline='') as f:
|
| 631 |
+
writer = csv.writer(f)
|
| 632 |
+
# Header
|
| 633 |
+
writer.writerow([
|
| 634 |
+
'question', 'id', 'audio_path',
|
| 635 |
+
'optionA', 'optionB', 'optionC', 'optionD',
|
| 636 |
+
'correct', 'question_type', 'audio_sequence',
|
| 637 |
+
'category_volumes'
|
| 638 |
+
])
|
| 639 |
+
|
| 640 |
+
# Data rows
|
| 641 |
+
for meta in metadata_list:
|
| 642 |
+
writer.writerow([
|
| 643 |
+
meta['mcq_question'],
|
| 644 |
+
meta['id'],
|
| 645 |
+
meta['audio_path'],
|
| 646 |
+
meta['mcq_options']['A'],
|
| 647 |
+
meta['mcq_options']['B'],
|
| 648 |
+
meta['mcq_options']['C'],
|
| 649 |
+
meta['mcq_options']['D'],
|
| 650 |
+
meta['mcq_correct_answer'],
|
| 651 |
+
meta['question_type'],
|
| 652 |
+
str(meta['audio_sequence']),
|
| 653 |
+
str(meta['category_volumes'])
|
| 654 |
+
])
|
| 655 |
+
|
| 656 |
+
def _save_open_text_csv(self, metadata_list: List[Dict], output_path: Path):
|
| 657 |
+
"""Save open-text format CSV."""
|
| 658 |
+
with open(output_path, 'w', newline='') as f:
|
| 659 |
+
writer = csv.writer(f)
|
| 660 |
+
# Header
|
| 661 |
+
writer.writerow([
|
| 662 |
+
'question', 'id', 'audio_path', 'answer',
|
| 663 |
+
'question_type', 'audio_sequence', 'category_volumes'
|
| 664 |
+
])
|
| 665 |
+
|
| 666 |
+
# Data rows
|
| 667 |
+
for meta in metadata_list:
|
| 668 |
+
writer.writerow([
|
| 669 |
+
meta['open_text_question'],
|
| 670 |
+
meta['id'],
|
| 671 |
+
meta['audio_path'],
|
| 672 |
+
meta['open_text_answer'],
|
| 673 |
+
meta['question_type'],
|
| 674 |
+
str(meta['audio_sequence']),
|
| 675 |
+
str(meta['category_volumes'])
|
| 676 |
+
])
|
| 677 |
+
|
| 678 |
+
def _save_metadata_csv(self, metadata_list: List[Dict], output_path: Path):
|
| 679 |
+
"""Save detailed metadata CSV."""
|
| 680 |
+
with open(output_path, 'w', newline='') as f:
|
| 681 |
+
writer = csv.writer(f)
|
| 682 |
+
# Header
|
| 683 |
+
writer.writerow([
|
| 684 |
+
'id', 'audio_path', 'n_clips', 'question_type',
|
| 685 |
+
'audio_sequence', 'volume_levels_db', 'correct_answer',
|
| 686 |
+
'correct_volume_db', 'source_files'
|
| 687 |
+
])
|
| 688 |
+
|
| 689 |
+
# Data rows
|
| 690 |
+
for meta in metadata_list:
|
| 691 |
+
writer.writerow([
|
| 692 |
+
meta['id'],
|
| 693 |
+
meta['audio_path'],
|
| 694 |
+
meta['n_clips'],
|
| 695 |
+
meta['question_type'],
|
| 696 |
+
str(meta['audio_sequence']),
|
| 697 |
+
str(meta['volume_levels_db']),
|
| 698 |
+
meta['correct_answer_category'],
|
| 699 |
+
meta['correct_volume_db'],
|
| 700 |
+
str(meta['source_files'])
|
| 701 |
+
])
|
| 702 |
+
|
| 703 |
+
|
| 704 |
+
def main(config_path: str = None):
    """
    Main entry point for volume task generation.

    Loads the YAML configuration, seeds the random generators, sets up the
    logger and runs the full dataset generation.

    Args:
        config_path: Path to the YAML config file. When None, 'config.yaml'
            in the parent of this file's directory is used.
    """
    import yaml

    # Load configuration
    if config_path is None:
        config_path = Path(__file__).parent.parent / 'config.yaml'

    # Read the config as UTF-8 explicitly so parsing does not depend on the
    # platform's default encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # Set random seed
    set_random_seed(config['random_seed'])

    # Setup logger; the log file lives under the configured output base path.
    logger = setup_logger(
        'volume_task',
        log_file=str(Path(config['output']['base_path']) / config['logging']['log_file']),
        level=config['logging']['level'],
        console_output=config['logging']['console_output']
    )

    # Generate dataset
    generator = VolumeTaskGenerator(config, logger)
    generator.generate_dataset()


if __name__ == '__main__':
    main()
|
utils/__init__.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utility module initialization.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from .audio_utils import (
|
| 6 |
+
AudioProcessor, set_random_seed,
|
| 7 |
+
calculate_num_samples_for_task, generate_sample_durations_for_task,
|
| 8 |
+
generate_single_clip_duration,
|
| 9 |
+
concatenate_to_target_duration,
|
| 10 |
+
get_max_clip_num_to_be_joined,
|
| 11 |
+
build_clip_sequence_with_silences,
|
| 12 |
+
distribute_remainder_as_silences,
|
| 13 |
+
repeat_clips_to_fill_duration,
|
| 14 |
+
build_consecutive_sources_for_count_task,
|
| 15 |
+
build_random_order_for_count_task,
|
| 16 |
+
build_count_task_audio,
|
| 17 |
+
calculate_duration_slot_distribution,
|
| 18 |
+
build_duration_task_audio,
|
| 19 |
+
get_lufs_loudness,
|
| 20 |
+
normalize_to_lufs
|
| 21 |
+
)
|
| 22 |
+
from .dataset_utils import ESC50Dataset, PreprocessedESC50Dataset
|
| 23 |
+
from .logger import setup_logger
|
| 24 |
+
from .question_utils import QuestionGenerator
|
| 25 |
+
from .llm_utils import LLMQuestionGenerator
|
| 26 |
+
|
| 27 |
+
__all__ = [
|
| 28 |
+
'AudioProcessor',
|
| 29 |
+
'ESC50Dataset',
|
| 30 |
+
'PreprocessedESC50Dataset',
|
| 31 |
+
'QuestionGenerator',
|
| 32 |
+
'LLMQuestionGenerator',
|
| 33 |
+
'setup_logger',
|
| 34 |
+
'set_random_seed',
|
| 35 |
+
'calculate_num_samples_for_task',
|
| 36 |
+
'generate_sample_durations_for_task',
|
| 37 |
+
'generate_single_clip_duration',
|
| 38 |
+
'concatenate_to_target_duration',
|
| 39 |
+
'get_max_clip_num_to_be_joined',
|
| 40 |
+
'build_clip_sequence_with_silences',
|
| 41 |
+
'distribute_remainder_as_silences',
|
| 42 |
+
'repeat_clips_to_fill_duration',
|
| 43 |
+
'build_consecutive_sources_for_count_task',
|
| 44 |
+
'build_random_order_for_count_task',
|
| 45 |
+
'build_count_task_audio',
|
| 46 |
+
'calculate_duration_slot_distribution',
|
| 47 |
+
'build_duration_task_audio',
|
| 48 |
+
'get_lufs_loudness',
|
| 49 |
+
'normalize_to_lufs'
|
| 50 |
+
]
|
utils/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (1.28 kB). View file
|
|
|
utils/__pycache__/__init__.cpython-314.pyc
ADDED
|
Binary file (1.16 kB). View file
|
|
|
utils/__pycache__/audio_utils.cpython-312.pyc
ADDED
|
Binary file (48 kB). View file
|
|
|
utils/__pycache__/audio_utils.cpython-314.pyc
ADDED
|
Binary file (45.1 kB). View file
|
|
|
utils/__pycache__/dataset_utils.cpython-312.pyc
ADDED
|
Binary file (26.2 kB). View file
|
|
|
utils/__pycache__/llm_utils.cpython-312.pyc
ADDED
|
Binary file (5.87 kB). View file
|
|
|
utils/__pycache__/logger.cpython-312.pyc
ADDED
|
Binary file (2.33 kB). View file
|
|
|
utils/__pycache__/question_utils.cpython-312.pyc
ADDED
|
Binary file (9.7 kB). View file
|
|
|
utils/audio_utils.py
ADDED
|
@@ -0,0 +1,1388 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Audio processing utilities for temporal reasoning dataset generation.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import random
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Dict, List, Optional, Tuple, Union
|
| 9 |
+
|
| 10 |
+
import numpy as np
|
| 11 |
+
from pydub import AudioSegment
|
| 12 |
+
|
| 13 |
+
try:
|
| 14 |
+
import pyloudnorm as pyln
|
| 15 |
+
PYLOUDNORM_AVAILABLE = True
|
| 16 |
+
except ImportError:
|
| 17 |
+
PYLOUDNORM_AVAILABLE = False
|
| 18 |
+
|
| 19 |
+
from .logger import setup_logger
|
| 20 |
+
|
| 21 |
+
logger = setup_logger(__name__)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def get_lufs_loudness(audio: AudioSegment) -> float:
    """
    Calculate integrated LUFS loudness (perceived loudness) of an audio segment.

    LUFS (Loudness Units Full Scale) is the broadcast standard for measuring
    perceived loudness. It accounts for human hearing sensitivity to different
    frequencies using K-weighting.

    Args:
        audio: Input audio segment (pydub AudioSegment)

    Returns:
        Loudness in LUFS (negative values, typically -70 to 0). A measurement
        of -inf (silent audio) is reported as -70.0.
        Returns ``audio.dBFS`` instead if pyloudnorm is not available or the
        measurement raises (fallback); that value is not floored.
    """
    if not PYLOUDNORM_AVAILABLE:
        logger.warning("pyloudnorm not available, falling back to dBFS")
        return audio.dBFS

    # Convert pydub AudioSegment to numpy array
    samples = np.array(audio.get_array_of_samples())

    # Samples are interleaved per frame; reshape to (frames, channels) for
    # any multi-channel layout, not just stereo.
    if audio.channels > 1:
        samples = samples.reshape((-1, audio.channels))

    # Normalize to float [-1, 1). pydub exposes samples as *signed* integers
    # for every sample width (including 8-bit), so dividing by half the
    # integer range is correct and no offset must be subtracted.
    full_scale = float(1 << (8 * audio.sample_width - 1))
    samples = samples.astype(np.float64) / full_scale

    # Create meter with sample rate
    meter = pyln.Meter(audio.frame_rate)

    # Measure integrated loudness
    try:
        loudness = meter.integrated_loudness(samples)
    except Exception as e:
        # Best-effort: e.g. clips too short for the meter fall back to dBFS.
        logger.warning(f"LUFS measurement failed: {e}, falling back to dBFS")
        return audio.dBFS

    # Handle -inf for silent audio
    if np.isinf(loudness):
        loudness = -70.0  # Return very quiet value instead of -inf
    return loudness
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def normalize_to_lufs(audio: AudioSegment, target_lufs: float = -23.0) -> AudioSegment:
    """
    Normalize audio to a target LUFS level (perceived loudness normalization).

    This is superior to dBFS normalization for comparing different sound types
    because it accounts for human hearing sensitivity.

    Args:
        audio: Input audio segment
        target_lufs: Target loudness level in LUFS (default: -23 LUFS, EBU R128 standard)

    Returns:
        Loudness-normalized audio segment. If the measured level is not finite
        (digitally silent or empty audio) the input is returned unchanged,
        because no finite gain can reach the target.
    """
    if not PYLOUDNORM_AVAILABLE:
        logger.warning("pyloudnorm not available, falling back to dBFS normalization")
        current_dbfs = audio.dBFS
        if not np.isfinite(current_dbfs):
            # dBFS is -inf for silence; target - (-inf) would be an infinite gain.
            return audio
        return audio.apply_gain(target_lufs - current_dbfs)

    current_lufs = get_lufs_loudness(audio)
    if not np.isfinite(current_lufs):
        # get_lufs_loudness can fall back to dBFS, which is -inf for silence.
        logger.warning("Measured loudness is not finite; returning audio unchanged")
        return audio

    # Calculate required gain change
    gain_db = target_lufs - current_lufs

    # Apply gain
    normalized = audio.apply_gain(gain_db)

    logger.debug(f"Normalized LUFS: {current_lufs:.2f} -> {get_lufs_loudness(normalized):.2f} LUFS")

    return normalized
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
class AudioProcessor:
    """Handles audio loading, processing, and concatenation."""

    def __init__(
        self,
        crossfade_duration: int = 500,
        silence_duration: int = 1000,
        with_silence: bool = True,
        normalize: bool = False,
        normalize_target_dBFS: float = -20.0,
        synthetic_silence_path: Optional[str] = None
    ):
        """
        Initialize the audio processor.

        Args:
            crossfade_duration: Duration of crossfade in milliseconds
            silence_duration: Duration of silence between clips in milliseconds
            with_silence: Whether to add silence between clips
            normalize: Whether to normalize audio levels
            normalize_target_dBFS: Target dBFS level for normalization
            synthetic_silence_path: Path to synthetic silence audio files
        """
        self.crossfade_duration = crossfade_duration
        self.silence_duration = silence_duration
        self.with_silence = with_silence
        self.normalize = normalize
        self.normalize_target_dBFS = normalize_target_dBFS
        self.synthetic_silence_path = synthetic_silence_path
        # Maps duration (ms) -> silence segment, so each duration is built once.
        self._silence_cache = {}

    def load_audio(self, audio_path: str) -> AudioSegment:
        """
        Load an audio file (read as WAV).

        Args:
            audio_path: Path to the audio file

        Returns:
            Loaded audio segment

        Raises:
            Exception: Whatever the loader raised; it is logged and re-raised.
        """
        try:
            audio = AudioSegment.from_file(audio_path, format="wav")
            logger.debug(f"Loaded audio: {audio_path}, duration: {len(audio)}ms")
            return audio
        except Exception as e:
            logger.error(f"Error loading audio {audio_path}: {e}")
            raise

    def normalize_audio(self, audio: AudioSegment, target_dBFS: Optional[float] = None) -> AudioSegment:
        """
        Normalize audio to a target dBFS level.

        Args:
            audio: Input audio segment
            target_dBFS: Target dBFS level (uses default if None)

        Returns:
            Normalized audio segment. Audio whose dBFS is not finite (silent
            or empty) is returned unchanged, since no finite gain applies.
        """
        if target_dBFS is None:
            target_dBFS = self.normalize_target_dBFS

        current_dBFS = audio.dBFS
        if not np.isfinite(current_dBFS):
            # Avoid computing an infinite gain for silence.
            logger.debug("Skipping normalization: audio level is not finite (silent audio)")
            return audio

        change_in_dBFS = target_dBFS - current_dBFS
        normalized = audio.apply_gain(change_in_dBFS)
        logger.debug(f"Normalized audio: {current_dBFS:.2f} dBFS -> {normalized.dBFS:.2f} dBFS")
        return normalized

    def adjust_volume(self, audio: AudioSegment, volume_db: float) -> AudioSegment:
        """
        Adjust audio volume by a specific dB amount.

        Args:
            audio: Input audio segment
            volume_db: Volume adjustment in dB (positive = louder, negative = quieter)

        Returns:
            Volume-adjusted audio segment
        """
        adjusted = audio.apply_gain(volume_db)
        logger.debug(f"Adjusted volume by {volume_db} dB: {audio.dBFS:.2f} -> {adjusted.dBFS:.2f} dBFS")
        return adjusted

    def get_silence(self, duration: Optional[int] = None) -> AudioSegment:
        """
        Get a silence audio segment, using synthetic silence if available.

        The result is cached per duration, so the synthetic file picked for a
        given duration is reused on later calls.

        Args:
            duration: Duration in milliseconds (uses default if None)

        Returns:
            Silence audio segment
        """
        if duration is None:
            duration = self.silence_duration

        # Check cache first
        if duration in self._silence_cache:
            return self._silence_cache[duration]

        # Try to load synthetic silence
        if self.synthetic_silence_path and os.path.exists(self.synthetic_silence_path):
            silence_files = list(Path(self.synthetic_silence_path).glob("*.wav"))
            if silence_files:
                silence = self.load_audio(str(random.choice(silence_files)))
                if len(silence) > 0:
                    # Adjust duration if needed
                    if len(silence) < duration:
                        # Repeat the silence
                        repetitions = (duration // len(silence)) + 1
                        silence = silence * repetitions
                    silence = silence[:duration]
                    self._silence_cache[duration] = silence
                    logger.debug(f"Using synthetic silence: {duration}ms")
                    return silence
                # A zero-length file cannot be tiled (division by zero);
                # fall through to generated silence instead.
                logger.warning("Synthetic silence file is empty, using pure silence")

        # Fall back to pure silence
        silence = AudioSegment.silent(duration=duration)
        self._silence_cache[duration] = silence
        logger.debug(f"Using pure silence: {duration}ms")
        return silence

    def _prepare_segment(
        self,
        audio: AudioSegment,
        index: int,
        normalize_each: bool,
        volume_adjustments: Optional[List[float]]
    ) -> AudioSegment:
        """Apply optional normalization and the index-th volume adjustment to one clip."""
        if normalize_each and self.normalize:
            audio = self.normalize_audio(audio)
        if volume_adjustments and len(volume_adjustments) > index:
            audio = self.adjust_volume(audio, volume_adjustments[index])
        return audio

    def concatenate_audios(
        self,
        audio_list: List[AudioSegment],
        normalize_each: bool = False,
        volume_adjustments: Optional[List[float]] = None
    ) -> AudioSegment:
        """
        Concatenate multiple audio segments with crossfade and optional silence.

        Normalization is applied only when both ``normalize_each`` and the
        processor's ``normalize`` flag are set.

        Args:
            audio_list: List of audio segments to concatenate
            normalize_each: Whether to normalize each audio before concatenation
            volume_adjustments: Optional list of volume adjustments (in dB) for each audio

        Returns:
            Concatenated audio segment

        Raises:
            ValueError: If audio_list is empty
        """
        if not audio_list:
            raise ValueError("audio_list cannot be empty")

        # Process first audio
        merged = self._prepare_segment(audio_list[0], 0, normalize_each, volume_adjustments)
        if len(audio_list) == 1:
            return merged

        # Concatenate remaining audios
        for i, audio in enumerate(audio_list[1:], start=1):
            current = self._prepare_segment(audio, i, normalize_each, volume_adjustments)

            # Add silence if configured
            if self.with_silence:
                silence = self.get_silence()
                # Crossfade between audio and silence for smooth transition.
                # pydub rejects a crossfade longer than either segment, so
                # clamp it for short clips / short silences.
                crossfade = max(0, min(self.crossfade_duration, len(merged), len(silence)))
                merged = merged.append(silence, crossfade=crossfade)

            # Append current audio WITHOUT crossfade to avoid cutting it
            # The crossfade with silence already provides smooth transition
            merged = merged.append(current, crossfade=0)

        logger.debug(f"Concatenated {len(audio_list)} audio segments, total duration: {len(merged)}ms")
        return merged

    def concatenate_audio_files(
        self,
        audio_paths: List[str],
        output_path: str,
        normalize_each: bool = False,
        volume_adjustments: Optional[List[float]] = None,
        target_durations: Optional[List[float]] = None
    ) -> Tuple[AudioSegment, dict]:
        """
        Load, concatenate, and save multiple audio files.

        Args:
            audio_paths: List of paths to audio files
            output_path: Path to save the concatenated audio (parent
                directories are created if missing)
            normalize_each: Whether to normalize each audio before concatenation
            volume_adjustments: Optional list of volume adjustments (in dB) for each audio
            target_durations: Optional list of target durations (in seconds) for each clip

        Returns:
            Tuple of (concatenated audio segment, metadata dict)
        """
        # Load all audio files
        audio_segments = []
        for i, path in enumerate(audio_paths):
            audio = self.load_audio(path)

            # Adjust duration if specified
            if target_durations and i < len(target_durations):
                target_ms = int(target_durations[i] * 1000)
                # NOTE(review): trim_or_repeat_audio is not defined in the visible
                # part of this module -- confirm it exists at module level.
                audio = trim_or_repeat_audio(audio, target_ms)
                logger.debug(f"Adjusted clip {i} to {len(audio)}ms (target: {target_ms}ms)")

            audio_segments.append(audio)

        # Concatenate
        merged = self.concatenate_audios(audio_segments, normalize_each, volume_adjustments)

        # Save
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        merged.export(str(output_path), format="wav")
        logger.info(f"Saved concatenated audio: {output_path}")

        # Create metadata
        metadata = {
            "output_path": str(output_path),
            "source_files": audio_paths,
            "num_sources": len(audio_paths),
            "total_duration_ms": len(merged),
            "total_duration_s": len(merged) / 1000.0,
            "individual_durations_ms": [len(a) for a in audio_segments],
            "individual_durations_s": [len(a) / 1000.0 for a in audio_segments],
            "target_durations_s": target_durations if target_durations else [],
            "volume_adjustments_db": volume_adjustments if volume_adjustments else []
        }

        return merged, metadata
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
def generate_sample_durations_for_task(
    task_duration_hours: float,
    min_clip_duration: float,
    max_clip_duration: float
) -> list:
    """
    Generate sample durations that fill the target task duration.

    Algorithm:
    1. Start with remaining = total_seconds
    2. While remaining >= min_clip_duration:
       - Sample d ~ Uniform(min, min(max, remaining))
       - Append d to durations list
       - Subtract d from remaining
    3. Return shuffled list of durations

    This ensures:
    - Total of all durations ≈ task_duration (within min_clip_duration tolerance)
    - Each duration is uniformly sampled within valid range
    - No overshoot of target duration

    Args:
        task_duration_hours: Total duration for the task in hours
        min_clip_duration: Minimum duration per clip in seconds (must be > 0)
        max_clip_duration: Maximum duration per clip in seconds (must be >= min)

    Returns:
        List of sample durations in seconds (shuffled). Empty when the task
        duration is shorter than min_clip_duration.

    Raises:
        ValueError: If min_clip_duration is not positive or exceeds
            max_clip_duration.
    """
    # A non-positive minimum would let the loop sample zero-length clips
    # forever; an inverted range can never produce a valid sample.
    if min_clip_duration <= 0:
        raise ValueError(f"min_clip_duration must be positive, got {min_clip_duration}")
    if max_clip_duration < min_clip_duration:
        raise ValueError(
            f"max_clip_duration ({max_clip_duration}) must be >= "
            f"min_clip_duration ({min_clip_duration})"
        )

    task_duration_seconds = task_duration_hours * 3600
    remaining = task_duration_seconds
    durations = []

    while remaining >= min_clip_duration:
        # Cap max at remaining to avoid overshoot; the loop condition
        # guarantees the capped value is still >= min_clip_duration.
        effective_max = min(max_clip_duration, remaining)

        # Sample uniformly within valid range
        d = random.uniform(min_clip_duration, effective_max)
        durations.append(d)
        remaining -= d

    # Shuffle to randomize order (durations were generated sequentially)
    random.shuffle(durations)

    total_duration = sum(durations)
    logger.info(f"Task duration target: {task_duration_hours}h ({task_duration_seconds:.1f}s)")
    logger.info(f"Generated {len(durations)} sample durations, total: {total_duration:.1f}s")
    if durations:
        logger.info(f"Duration range: [{min(durations):.1f}s, {max(durations):.1f}s], "
                    f"mean: {total_duration/len(durations):.1f}s")
    else:
        # min()/max()/mean are undefined for an empty list.
        logger.warning("Task duration is shorter than min_clip_duration; no samples generated")
    if task_duration_seconds > 0:
        logger.info(f"Unused remainder: {remaining:.1f}s ({remaining/task_duration_seconds*100:.2f}%)")

    return durations
|
| 401 |
+
|
| 402 |
+
|
| 403 |
+
def calculate_num_samples_for_task(
    task_duration_hours: float,
    min_clip_duration: float,
    max_clip_duration: float
) -> int:
    """
    Calculate number of samples needed to fill the task duration.

    DEPRECATED: Use generate_sample_durations_for_task() instead for exact duration filling.
    This function is kept for backward compatibility but uses average-based estimation.

    Args:
        task_duration_hours: Total duration for the task in hours
        min_clip_duration: Minimum duration per clip in seconds
        max_clip_duration: Maximum duration per clip in seconds

    Returns:
        Number of samples to generate (estimate), never less than 1

    Raises:
        ValueError: If the average of min and max clip duration is not positive.
    """
    task_duration_seconds = task_duration_hours * 3600
    avg_clip_duration = (min_clip_duration + max_clip_duration) / 2
    if avg_clip_duration <= 0:
        raise ValueError(
            f"Average clip duration must be positive, got {avg_clip_duration}s"
        )

    # Clamp before logging so the logged count is the one actually returned.
    num_samples = max(1, int(task_duration_seconds / avg_clip_duration))

    logger.info(f"Task duration: {task_duration_hours}h ({task_duration_seconds}s)")
    logger.info(f"Avg clip duration: {avg_clip_duration}s (min: {min_clip_duration}s, max: {max_clip_duration}s)")
    logger.info(f"Calculated number of samples: {num_samples}")

    return num_samples
|
| 431 |
+
|
| 432 |
+
|
| 433 |
+
def generate_single_clip_duration(
    min_duration: float,
    max_duration: float
) -> float:
    """
    Draw one clip duration uniformly at random from the given range.

    Uses the module-level ``random`` generator, so results are reproducible
    after seeding it.

    Args:
        min_duration: Lower bound of the range in seconds
        max_duration: Upper bound of the range in seconds

    Returns:
        A duration in seconds drawn from [min_duration, max_duration]
    """
    lower_bound, upper_bound = min_duration, max_duration
    clip_duration = random.uniform(lower_bound, upper_bound)
    return clip_duration
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
def concatenate_to_target_duration(
    base_audio: AudioSegment,
    target_duration_seconds: float,
    crossfade_ms: int = 0
) -> AudioSegment:
    """
    Repeat (or trim) a base audio clip to reach the target duration.

    This takes a short clip (e.g. a 5-second ESC-50 clip) and repeats it to
    create a longer clip.

    Args:
        base_audio: Original audio segment
        target_duration_seconds: Target duration in seconds
        crossfade_ms: Crossfade between repetitions in milliseconds. Values
            <= 0 mean plain concatenation. The crossfade is capped just below
            the clip length so every repetition still adds audio.

    Returns:
        Audio segment of target duration

    Raises:
        ValueError: If base_audio is empty but a positive duration is requested.
    """
    target_duration_ms = int(target_duration_seconds * 1000)
    base_duration_ms = len(base_audio)

    if target_duration_ms <= base_duration_ms:
        # Just trim if target is shorter
        return base_audio[:target_duration_ms]

    if base_duration_ms == 0:
        # Repeating an empty clip can never reach the target.
        raise ValueError("base_audio is empty; cannot extend it to the target duration")

    if crossfade_ms <= 0:
        # Plain repetition: build once instead of growing the clip piecewise.
        num_repetitions = -(-target_duration_ms // base_duration_ms)  # ceil division
        return (base_audio * num_repetitions)[:target_duration_ms]

    # Each crossfaded append only adds (base - crossfade) ms, so a fixed
    # repetition count based on the base length can fall short of the target.
    # Keep appending until the target is actually covered.
    effective_crossfade = min(crossfade_ms, base_duration_ms - 1)
    result = base_audio
    while len(result) < target_duration_ms:
        result = result.append(base_audio, crossfade=effective_crossfade)

    # Trim to exact duration
    return result[:target_duration_ms]
|
| 492 |
+
|
| 493 |
+
|
| 494 |
+
def set_random_seed(seed: int):
    """Seed both Python's ``random`` module and NumPy's global RNG, then log the seed."""
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
    logger.info(f"Random seed set to: {seed}")
|
| 499 |
+
|
| 500 |
+
|
| 501 |
+
def get_max_clip_num_to_be_joined(
    target_duration_seconds: float,
    source_clip_duration_seconds: float,
    min_silence_ms: int = 100
) -> Tuple[int, float]:
    """
    Calculate the maximum number of source clips that fit in the target duration.

    Pipeline: pick dataset -> pick class -> pick audio clip -> get duration ->
    concatenate clips to reach target duration -> modulo to get num clips ->
    inserting silences randomly based on remainder.

    N clips leave (N-1) gaps, and each gap is budgeted at least min_silence_ms,
    so the clip count is reduced until clips plus mandatory silences fit.
    At least one clip is always returned, even if it is longer than the target.

    Args:
        target_duration_seconds: Target total duration in seconds
        source_clip_duration_seconds: Duration of each source clip (e.g., 5s for ESC-50)
        min_silence_ms: Minimum silence between clips in milliseconds

    Returns:
        Tuple of (num_clips_needed, remainder_seconds_for_silences)
        - num_clips_needed: How many source clips to concatenate
        - remainder_seconds_for_silences: Extra time (beyond the mandatory
          minimum gaps) to distribute as random silences; never negative

    Raises:
        ValueError: If source_clip_duration_seconds is not positive.

    Example (default min_silence_ms=100):
        target=30s, source=5s -> (5, 4.6) - 6 clips + 5 gaps would need 30.5s
        target=32s, source=5s -> (6, 1.5) - 30s of clips + 0.5s mandatory gaps
        target=30s, source=5s, min_silence_ms=0 -> (6, 0.0)
    """
    if source_clip_duration_seconds <= 0:
        raise ValueError(
            f"source_clip_duration_seconds must be positive, got {source_clip_duration_seconds}"
        )

    target_ms = target_duration_seconds * 1000
    source_ms = source_clip_duration_seconds * 1000

    # Start from the raw number of clips (floor division), at least 1 clip
    num_clips = max(1, int(target_ms // source_ms))

    # Reduce clips until clips + mandatory (N-1) gap silences fit the target
    while num_clips > 1 and (num_clips * source_ms + (num_clips - 1) * min_silence_ms) > target_ms:
        num_clips -= 1

    clips_duration_ms = num_clips * source_ms
    min_total_silence_ms = (num_clips - 1) * min_silence_ms

    # Calculate remainder for extra silences; always a float, clamped at zero
    # (a single clip longer than the target would otherwise go negative).
    remainder_ms = target_ms - clips_duration_ms - min_total_silence_ms
    remainder_seconds = max(0.0, remainder_ms / 1000.0)

    logger.debug(
        f"get_max_clip_num: target={target_duration_seconds}s, source={source_clip_duration_seconds}s "
        f"-> {num_clips} clips, {remainder_seconds:.3f}s remainder for extra silences"
    )

    return num_clips, remainder_seconds
|
| 562 |
+
|
| 563 |
+
|
| 564 |
+
def build_clip_sequence_with_silences(
    audio_segments: List[AudioSegment],
    target_duration_seconds: float,
    min_silence_ms: int = 100,
    max_extra_silence_per_gap_ms: int = 500,
    crossfade_ms: int = 0
) -> AudioSegment:
    """
    Join clips into a single track, separating neighbouring clips with silence.

    With two or more clips:
        * every gap gets ``min_silence_ms`` of silence plus a random share of
          the leftover time (via ``distribute_remainder_as_silences``), capped
          at ``max_extra_silence_per_gap_ms`` per gap;
        * the joined track is finally cut, or padded with trailing silence,
          so its length is ``int(target_duration_seconds * 1000)`` ms.

    With exactly one clip, the clip is cut to the target length when it is
    long enough; otherwise the work is delegated to
    ``concatenate_to_target_duration``.

    Args:
        audio_segments: Clips to join, in playback order.
        target_duration_seconds: Desired total duration in seconds.
        min_silence_ms: Silence always inserted between consecutive clips.
        max_extra_silence_per_gap_ms: Upper bound on the extra silence a
            single gap may receive on top of ``min_silence_ms``.
        crossfade_ms: Crossfade used at the clip -> silence boundary. It is
            applied only when ``0 < crossfade_ms < gap length``; the
            silence -> clip boundary is never crossfaded.

    Returns:
        The joined audio segment.

    Raises:
        ValueError: If ``audio_segments`` is empty.
    """
    if not audio_segments:
        raise ValueError("audio_segments cannot be empty")

    target_ms = int(target_duration_seconds * 1000)
    head = audio_segments[0]

    # A lone clip has no gaps: cut it down, or let the helper extend it.
    if len(audio_segments) == 1:
        if len(head) < target_ms:
            return concatenate_to_target_duration(head, target_duration_seconds, crossfade_ms)
        return head[:target_ms]

    tail = audio_segments[1:]
    num_gaps = len(tail)
    total_audio_ms = sum(map(len, audio_segments))

    # Time left once the clips and the mandatory silences are accounted for.
    available_extra_ms = target_ms - total_audio_ms - num_gaps * min_silence_ms
    if available_extra_ms < 0:
        # Over budget: no extra silence, and the result is trimmed below.
        logger.warning(
            f"Clips too long for target duration. Total audio: {total_audio_ms}ms, "
            f"target: {target_ms}ms. Will trim final result."
        )
        available_extra_ms = 0

    extra_silences_ms = distribute_remainder_as_silences(
        available_extra_ms,
        num_gaps,
        max_extra_silence_per_gap_ms
    )

    result = head
    for clip, extra_ms in zip(tail, extra_silences_ms):
        gap_silence_ms = min_silence_ms + extra_ms
        silence = AudioSegment.silent(duration=gap_silence_ms)

        if 0 < crossfade_ms < gap_silence_ms:
            # Fade the outgoing clip into the silence, but start the next
            # clip with a hard cut so none of its audio is lost.
            result = result.append(silence, crossfade=crossfade_ms)
            result = result.append(clip, crossfade=0)
        else:
            result = result + silence + clip

    # Force the exact target length: cut the excess or pad the shortfall.
    shortfall_ms = target_ms - len(result)
    if shortfall_ms < 0:
        result = result[:target_ms]
    elif shortfall_ms > 0:
        result = result + AudioSegment.silent(duration=shortfall_ms)

    logger.debug(
        f"Built clip sequence: {len(audio_segments)} segments, "
        f"final duration: {len(result)}ms (target: {target_ms}ms)"
    )

    return result
|
| 659 |
+
|
| 660 |
+
|
| 661 |
+
def distribute_remainder_as_silences(
    remainder_ms: float,
    num_gaps: int,
    max_per_gap_ms: int = 500
) -> List[int]:
    """
    Split leftover time into random extra-silence amounts, one per gap.

    Each gap draws a random weight and receives the matching share of
    ``remainder_ms`` (truncated to whole milliseconds), limited by
    ``max_per_gap_ms``. The last gap takes whatever is still unassigned,
    subject to the same limit. Time that could not be placed because some
    gaps hit the limit is then handed to gaps that still have headroom,
    front to back.

    The returned amounts sum to ``min(int(remainder_ms), num_gaps * cap)``,
    where ``cap`` is ``max_per_gap_ms`` clamped to be non-negative.

    Args:
        remainder_ms: Total extra time to distribute, in ms. Negative values
            are treated as 0; fractional values are truncated.
        num_gaps: Number of gaps between clips.
        max_per_gap_ms: Maximum extra silence a single gap may receive.
            Negative values are treated as 0.

    Returns:
        List with ``num_gaps`` entries giving the extra silence (ms) for each
        gap; an empty list when ``num_gaps <= 0``.
    """
    if num_gaps <= 0:
        return []

    total_ms = int(max(0, remainder_ms))
    if total_ms == 0:
        return [0] * num_gaps

    # A negative cap would otherwise produce negative silence durations.
    cap_ms = max(0, max_per_gap_ms)

    weights = [random.random() for _ in range(num_gaps)]
    total_weight = sum(weights)
    if total_weight == 0:
        # Every draw was exactly 0.0: fall back to a uniform split.
        weights = [1.0] * num_gaps
        total_weight = float(num_gaps)

    extra_silences = []
    remaining = total_ms
    last_index = num_gaps - 1

    for i, weight in enumerate(weights):
        if i == last_index:
            # The last gap absorbs the rounding leftovers.
            share = remaining
        else:
            # Share of the FULL remainder against the FULL weight sum. The
            # denominator must stay fixed, otherwise later gaps are sized
            # against a shrinking total and the split stops being
            # proportional.
            share = int(total_ms * weight / total_weight)

        extra = min(share, cap_ms, remaining)
        extra_silences.append(extra)
        remaining -= extra

    # Capped gaps may leave time unassigned; top up gaps with headroom.
    # A single pass is enough: afterwards either nothing remains or every
    # gap sits at the cap.
    for i, current in enumerate(extra_silences):
        if remaining <= 0:
            break
        add = min(remaining, cap_ms - current)
        extra_silences[i] = current + add
        remaining -= add

    logger.debug(f"Distributed {total_ms}ms across {num_gaps} gaps: {extra_silences}")

    return extra_silences
|
| 725 |
+
|
| 726 |
+
|
| 727 |
+
def repeat_clips_to_fill_duration(
    source_audios: List[AudioSegment],
    source_categories: List[str],
    target_duration_seconds: float,
    source_clip_duration_seconds: float = 5.0,
    min_silence_ms: int = 100
) -> Tuple[List[AudioSegment], List[str], int]:
    """
    Expand the source clips round-robin until enough clips exist for the target.

    The number of clips comes from ``get_max_clip_num_to_be_joined``. Sources
    are then taken in order, wrapping around to the first source as often as
    needed (A B C A B C ...). When that clip count is smaller than the number
    of sources, only the leading sources are used.

    Args:
        source_audios: Unique source audio segments.
        source_categories: Category names, index-aligned with ``source_audios``.
        target_duration_seconds: Target total duration in seconds.
        source_clip_duration_seconds: Duration assumed for each source clip.
        min_silence_ms: Minimum silence reserved between clips when computing
            how many clips fit.

    Returns:
        Tuple of (expanded_audio_list, expanded_categories, num_clips).

    Raises:
        ValueError: If ``source_audios`` is empty.
    """
    # Only the clip count is needed here; the leftover time is ignored.
    num_clips, _ = get_max_clip_num_to_be_joined(
        target_duration_seconds,
        source_clip_duration_seconds,
        min_silence_ms
    )

    num_sources = len(source_audios)
    if not num_sources:
        raise ValueError("source_audios cannot be empty")

    # Position k in the output is served by source k modulo num_sources.
    cycle = [slot % num_sources for slot in range(num_clips)]
    expanded_audios = [source_audios[k] for k in cycle]
    expanded_categories = [source_categories[k] for k in cycle]

    logger.debug(
        f"Repeated {num_sources} sources to {num_clips} clips for "
        f"{target_duration_seconds}s target duration"
    )

    return expanded_audios, expanded_categories, num_clips
|
| 775 |
+
|
| 776 |
+
|
| 777 |
+
def build_consecutive_sources_for_count_task(
    source_audios: List[AudioSegment],
    source_categories: List[str],
    target_duration_seconds: float,
    source_clip_duration_seconds: float = 5.0,
    min_silence_between_sources_ms: int = 100,
    max_extra_silence_per_gap_ms: int = 500,
    crossfade_within_source_ms: int = 50
) -> Tuple[AudioSegment, List[str], dict]:
    """
    Build COUNT-task audio in which each source plays as one contiguous block.

    Layout is AAA BBB CCC: repetitions of the same source are joined back to
    back (optionally crossfaded) so they read as a single sound source, and
    silence is inserted only between blocks of different sources.

    Steps:
        1. Ask ``get_max_clip_num_to_be_joined`` how many clips fit.
        2. Spread that clip count over the sources as evenly as possible and
           shuffle which source receives which repetition count.
        3. Build one block per source.
        4. Join the blocks with the minimum silence plus a random share of any
           leftover time.
        5. Trim or pad the result to the target length.

    Args:
        source_audios: Unique source audio segments (one per class).
        source_categories: Category names, index-aligned with ``source_audios``.
        target_duration_seconds: Target total duration in seconds.
        source_clip_duration_seconds: Duration assumed for each source clip.
        min_silence_between_sources_ms: Minimum silence between two blocks.
        max_extra_silence_per_gap_ms: Cap on the extra silence added to a gap.
        crossfade_within_source_ms: Crossfade between repetitions inside a
            block; values <= 0 join the repetitions with a plain concatenation.

    Returns:
        Tuple of (final_audio, category_sequence, metadata_dict), where
        ``category_sequence`` lists one category per block in playback order.

    Raises:
        ValueError: If ``source_audios`` is empty.
    """
    num_sources = len(source_audios)
    if num_sources == 0:
        raise ValueError("source_audios cannot be empty")

    target_ms = int(target_duration_seconds * 1000)

    # Only the clip count is used; leftover time is recomputed from the
    # actual block lengths further down.
    num_clips, _ = get_max_clip_num_to_be_joined(
        target_duration_seconds,
        source_clip_duration_seconds,
        min_silence_between_sources_ms
    )

    if num_sources > num_clips:
        logger.warning(
            f"More sources ({num_sources}) than clips that fit ({num_clips}). "
            f"Each source needs at least 1 clip, so output may exceed target duration. "
            f"Consider capping n_unique_audios <= max_clips in task_count.py"
        )
        # Give every source exactly one clip even though that overshoots.
        num_clips = num_sources

    # Even split: the first `extra_reps` slots carry one additional clip,
    # then the slots are shuffled so the longer blocks land on random sources.
    base_reps, extra_reps = divmod(num_clips, num_sources)
    repetitions_per_source = (
        [base_reps + 1] * extra_reps + [base_reps] * (num_sources - extra_reps)
    )
    random.shuffle(repetitions_per_source)

    use_crossfade = crossfade_within_source_ms > 0

    source_blocks = []
    category_sequence = []
    for audio, category, reps in zip(source_audios, source_categories, repetitions_per_source):
        if reps == 0:
            continue

        # Start with one copy and attach the remaining repetitions to it.
        block = audio
        for _ in range(reps - 1):
            if use_crossfade:
                block = block.append(audio, crossfade=crossfade_within_source_ms)
            else:
                block = block + audio

        source_blocks.append(block)
        category_sequence.append(category)

    num_gaps = len(source_blocks) - 1
    final_audio = source_blocks[0]

    if num_gaps > 0:
        total_blocks_ms = sum(map(len, source_blocks))

        # Time left after the blocks and the mandatory silences, never negative.
        available_extra_ms = max(
            0,
            target_ms - total_blocks_ms - num_gaps * min_silence_between_sources_ms
        )

        extra_silences = distribute_remainder_as_silences(
            available_extra_ms,
            num_gaps,
            max_extra_silence_per_gap_ms
        )

        for block, extra_ms in zip(source_blocks[1:], extra_silences):
            silence = AudioSegment.silent(
                duration=min_silence_between_sources_ms + extra_ms
            )
            final_audio = final_audio + silence + block

    # Force the exact target length: cut the excess or pad the shortfall.
    shortfall_ms = target_ms - len(final_audio)
    if shortfall_ms < 0:
        final_audio = final_audio[:target_ms]
    elif shortfall_ms > 0:
        final_audio = final_audio + AudioSegment.silent(duration=shortfall_ms)

    metadata = {
        'num_unique_sources': num_sources,
        'total_clips': num_clips,
        'ordering_mode': 'consecutive',
        'repetitions_per_source': dict(zip(source_categories, repetitions_per_source)),
        'target_duration_ms': target_ms,
        'actual_duration_ms': len(final_audio),
        'num_gaps_between_sources': num_gaps
    }

    logger.debug(
        f"Count task (consecutive): {num_sources} sources, {num_clips} total clips, "
        f"reps={repetitions_per_source}, duration={len(final_audio)}ms"
    )

    return final_audio, category_sequence, metadata
|
| 918 |
+
|
| 919 |
+
|
| 920 |
+
def build_random_order_for_count_task(
    source_audios: List[AudioSegment],
    source_categories: List[str],
    target_duration_seconds: float,
    source_clip_duration_seconds: float = 5.0,
    min_silence_ms: int = 100,
    max_extra_silence_per_gap_ms: int = 500
) -> Tuple[AudioSegment, List[str], dict]:
    """
    Build COUNT-task audio with the clips of all sources in shuffled order.

    Clips from different sources are interleaved at random (A B A C B ...),
    so a listener has to recognise recurring sounds as the same source.
    Silence is placed between every pair of neighbouring clips, whether or
    not they come from the same source.

    Steps:
        1. Ask ``get_max_clip_num_to_be_joined`` how many clips fit.
        2. Spread that clip count over the sources as evenly as possible.
        3. Expand to one (audio, category) entry per clip instance.
        4. Shuffle the entries.
        5. Join them with ``build_clip_sequence_with_silences`` (no crossfade).

    Args:
        source_audios: Unique source audio segments (one per class).
        source_categories: Category names, index-aligned with ``source_audios``.
        target_duration_seconds: Target total duration in seconds.
        source_clip_duration_seconds: Duration assumed for each source clip.
        min_silence_ms: Minimum silence between all neighbouring clips.
        max_extra_silence_per_gap_ms: Cap on the extra silence added to a gap.

    Returns:
        Tuple of (final_audio, clip_sequence, metadata_dict), where
        ``clip_sequence`` lists the category of every clip in playback order.

    Raises:
        ValueError: If ``source_audios`` is empty.
    """
    num_sources = len(source_audios)
    if num_sources == 0:
        raise ValueError("source_audios cannot be empty")

    target_ms = int(target_duration_seconds * 1000)

    # Only the clip count is used; the joining helper handles leftover time.
    num_clips, _ = get_max_clip_num_to_be_joined(
        target_duration_seconds,
        source_clip_duration_seconds,
        min_silence_ms
    )

    if num_sources > num_clips:
        logger.warning(
            f"More sources ({num_sources}) than clips that fit ({num_clips}). "
            f"Each source needs at least 1 clip, so output may exceed target duration. "
            f"Consider capping n_unique_audios <= max_clips in task_count.py"
        )
        # Give every source exactly one clip even though that overshoots.
        num_clips = num_sources

    # Even split: the first `extra_reps` sources carry one additional clip.
    base_reps, extra_reps = divmod(num_clips, num_sources)
    repetitions_per_source = (
        [base_reps + 1] * extra_reps + [base_reps] * (num_sources - extra_reps)
    )

    # One entry per clip instance, grouped by source before shuffling.
    expanded_clips = [
        (audio, category)
        for audio, category, reps in zip(source_audios, source_categories, repetitions_per_source)
        for _ in range(reps)
    ]
    random.shuffle(expanded_clips)

    shuffled_audios = [audio for audio, _ in expanded_clips]
    clip_sequence = [category for _, category in expanded_clips]

    final_audio = build_clip_sequence_with_silences(
        shuffled_audios,
        target_duration_seconds,
        min_silence_ms=min_silence_ms,
        max_extra_silence_per_gap_ms=max_extra_silence_per_gap_ms,
        crossfade_ms=0  # No crossfade for random ordering
    )

    total_clips = len(expanded_clips)
    metadata = {
        'num_unique_sources': num_sources,
        'total_clips': total_clips,
        'ordering_mode': 'random',
        'repetitions_per_source': dict(zip(source_categories, repetitions_per_source)),
        'clip_sequence': clip_sequence,
        'target_duration_ms': target_ms,
        'actual_duration_ms': len(final_audio),
        'num_gaps': total_clips - 1
    }

    logger.debug(
        f"Count task (random): {num_sources} sources, {len(expanded_clips)} clips, "
        f"sequence={clip_sequence[:5]}..., duration={len(final_audio)}ms"
    )

    return final_audio, clip_sequence, metadata
|
| 1027 |
+
|
| 1028 |
+
|
| 1029 |
+
def build_count_task_audio(
    source_audios: List[AudioSegment],
    source_categories: List[str],
    target_duration_seconds: float,
    ordering_mode: str = "random",
    source_clip_duration_seconds: float = 5.0,
    min_silence_ms: int = 100,
    max_extra_silence_per_gap_ms: int = 500,
    crossfade_within_source_ms: int = 50
) -> Tuple[AudioSegment, List[str], dict]:
    """
    Build COUNT-task audio using the requested clip ordering.

    Thin dispatcher over the two builders:
        - ``"consecutive"`` -> ``build_consecutive_sources_for_count_task``
          (same-source clips grouped: AAA BBB CCC)
        - anything else     -> ``build_random_order_for_count_task``
          (clips shuffled: A B A C B A C)

    Args:
        source_audios: Unique source audio segments (one per class).
        source_categories: Category names, index-aligned with ``source_audios``.
        target_duration_seconds: Target total duration in seconds.
        ordering_mode: ``"consecutive"`` or ``"random"``. Any value other than
            ``"consecutive"`` selects the random builder.
        source_clip_duration_seconds: Duration assumed for each source clip.
        min_silence_ms: Minimum silence between clips (between source blocks
            in consecutive mode).
        max_extra_silence_per_gap_ms: Cap on the extra silence added to a gap.
        crossfade_within_source_ms: Crossfade between same-source repetitions;
            only forwarded in consecutive mode.

    Returns:
        Tuple of (final_audio, clip_sequence, metadata_dict) exactly as
        produced by the selected builder.
    """
    if ordering_mode != "consecutive":
        # Default path: every non-"consecutive" mode is treated as random.
        return build_random_order_for_count_task(
            source_audios=source_audios,
            source_categories=source_categories,
            target_duration_seconds=target_duration_seconds,
            source_clip_duration_seconds=source_clip_duration_seconds,
            min_silence_ms=min_silence_ms,
            max_extra_silence_per_gap_ms=max_extra_silence_per_gap_ms
        )

    return build_consecutive_sources_for_count_task(
        source_audios=source_audios,
        source_categories=source_categories,
        target_duration_seconds=target_duration_seconds,
        source_clip_duration_seconds=source_clip_duration_seconds,
        min_silence_between_sources_ms=min_silence_ms,
        max_extra_silence_per_gap_ms=max_extra_silence_per_gap_ms,
        crossfade_within_source_ms=crossfade_within_source_ms
    )
|
| 1076 |
+
|
| 1077 |
+
|
| 1078 |
+
# =============================================================================
|
| 1079 |
+
# DURATION TASK FUNCTIONS
|
| 1080 |
+
# =============================================================================
|
| 1081 |
+
|
| 1082 |
+
def calculate_duration_slot_distribution(
    target_total_duration_s: float,
    effective_durations: Dict[str, float],
    target_category: str,
    question_type: str,
    multiplier_longest: float = 1.5,
    multiplier_shortest: float = 0.5,
    min_silence_between_sources_ms: int = 100
) -> Tuple[Dict[str, int], bool, Dict]:
    """
    Decide how many repetitions each source gets for the DURATION task.

    "longest":  every background source gets 1 repetition and the target
                source gets as many repetitions as fit in the remaining time
                (at least 1).
    otherwise ("shortest"): the target source gets 1 repetition; every
                background source gets enough repetitions to exceed
                ``target_duration / multiplier_shortest``, and leftover time
                is handed out round-robin as additional background
                repetitions.

    Args:
        target_total_duration_s: Target total audio duration in seconds.
        effective_durations: Maps category -> effective duration (s) of one
            repetition. Must contain ``target_category``.
        target_category: Category that should end up longest / shortest.
        question_type: ``"longest"``; any other value is handled as
            "shortest".
        multiplier_longest: Required ratio, target >= max_background * this.
        multiplier_shortest: Required ratio, target <= min_background * this.
        min_silence_between_sources_ms: Silence reserved between consecutive
            sources when computing the time available for audio.

    Returns:
        Tuple of (slot_distribution, gap_satisfied, metadata):
            slot_distribution: Maps category -> number of repetitions.
            gap_satisfied: For "longest", whether the target's total duration
                reaches the required multiple of the longest background clip.
                For "shortest", whether the minimum background repetitions
                fit in the available time AND the ratio constraint holds.
            metadata: Intermediate values of the calculation.

    Raises:
        KeyError: If ``target_category`` is missing from
            ``effective_durations``.
    """
    categories = list(effective_durations)
    n_sources = len(categories)

    if n_sources < 2:
        # A single source has nothing to be compared against.
        reps = max(1, int(target_total_duration_s / effective_durations[target_category]))
        return {target_category: reps}, True, {'note': 'single_source'}

    # Reserve one minimum silence per boundary between consecutive sources.
    total_silence_s = (n_sources - 1) * min_silence_between_sources_ms / 1000.0
    available_for_audio_s = target_total_duration_s - total_silence_s

    background_categories = [c for c in categories if c != target_category]

    if question_type == "longest":
        # Each background plays once; the target takes whatever time is left.
        background_duration_s = sum(effective_durations[c] for c in background_categories)
        remaining_for_target_s = available_for_audio_s - background_duration_s
        target_duration_per_rep = effective_durations[target_category]

        target_reps = max(1, int(remaining_for_target_s / target_duration_per_rep))
        actual_target_duration = target_reps * target_duration_per_rep

        max_background_duration = max(effective_durations[c] for c in background_categories)
        required_target_duration = max_background_duration * multiplier_longest
        gap_satisfied = actual_target_duration >= required_target_duration

        slot_distribution = {c: 1 for c in background_categories}
        slot_distribution[target_category] = target_reps

        metadata = {
            'available_for_audio_s': available_for_audio_s,
            'background_duration_s': background_duration_s,
            'remaining_for_target_s': remaining_for_target_s,
            'target_reps': target_reps,
            'actual_target_duration_s': actual_target_duration,
            'max_background_duration_s': max_background_duration,
            'required_target_duration_s': required_target_duration,
            'multiplier_used': multiplier_longest
        }
        return slot_distribution, gap_satisfied, metadata

    # "shortest": the target plays once, the backgrounds must be much longer.
    target_duration_s = effective_durations[target_category]
    remaining_for_backgrounds_s = available_for_audio_s - target_duration_s

    # Total duration every background must exceed for the ratio to hold.
    min_background_required = target_duration_s / multiplier_shortest

    # int(...) + 1 guarantees the background strictly exceeds the requirement.
    background_reps = {
        cat: max(1, int(min_background_required / effective_durations[cat]) + 1)
        for cat in background_categories
    }

    total_background_needed = sum(
        background_reps[c] * effective_durations[c]
        for c in background_categories
    )

    fits_in_target = total_background_needed <= remaining_for_backgrounds_s

    if fits_in_target:
        # Hand out spare time round-robin, one repetition at a time, until
        # no background clip fits into what is left.
        extra_available = remaining_for_backgrounds_s - total_background_needed
        while extra_available > 0:
            added_any = False
            for cat in background_categories:
                eff_dur = effective_durations[cat]
                if extra_available >= eff_dur:
                    background_reps[cat] += 1
                    extra_available -= eff_dur
                    added_any = True
            if not added_any:
                break
    # When it does not fit, the minimum repetitions are used anyway and the
    # failure is reported through gap_satisfied below.

    slot_distribution = {target_category: 1}
    slot_distribution.update(background_reps)

    actual_durations = {
        cat: slot_distribution[cat] * effective_durations[cat]
        for cat in categories
    }
    min_background_actual = min(actual_durations[c] for c in background_categories)

    ratio_satisfied = (
        actual_durations[target_category] <= min_background_actual * multiplier_shortest
    )

    # The ratio check alone always passes with the minimum repetitions, so it
    # must not override the "does not fit" verdict: an over-full distribution
    # would overflow the target duration.
    gap_satisfied = fits_in_target and ratio_satisfied

    metadata = {
        'available_for_audio_s': available_for_audio_s,
        'target_duration_s': target_duration_s,
        'remaining_for_backgrounds_s': remaining_for_backgrounds_s,
        'min_background_required_s': min_background_required,
        'actual_durations_s': actual_durations,
        'min_background_actual_s': min_background_actual,
        'multiplier_used': multiplier_shortest,
        'fits_in_target_duration': fits_in_target
    }

    return slot_distribution, gap_satisfied, metadata
|
| 1230 |
+
|
| 1231 |
+
|
| 1232 |
+
def build_duration_task_audio(
    source_audio_lists: Dict[str, List[AudioSegment]],
    slot_distribution: Dict[str, int],
    effective_durations: Dict[str, float],
    target_total_duration_s: float,
    min_silence_between_sources_ms: int = 100,
    max_extra_silence_per_gap_ms: int = 500,
    crossfade_within_source_ms: int = 50
) -> Tuple[AudioSegment, List[str], Dict]:
    """
    Build audio for DURATION task with consecutive ordering per source.

    Structure: [SourceA × n] + silence + [SourceB × m] + silence + ...
    Order of sources is randomized to avoid patterns.

    Args:
        source_audio_lists: Dict mapping category -> list of audio segments
        slot_distribution: Dict mapping category -> number of repetitions
            (categories with 0 repetitions are skipped)
        effective_durations: Dict mapping category -> effective duration per clip
            (kept for interface compatibility; not read by this function)
        target_total_duration_s: Target total duration
        min_silence_between_sources_ms: Min silence between different sources
        max_extra_silence_per_gap_ms: Max extra silence per gap
        crossfade_within_source_ms: Crossfade between same-source repetitions

    Returns:
        Tuple of (final_audio, category_sequence, metadata). The
        'source_timestamps' entries in metadata are (category, start_ms, end_ms)
        positions measured on the assembled audio before it is trimmed or
        padded to the target duration.

    Raises:
        ValueError: If a category with repetitions has no audio clips, or if
            no category has any repetitions at all.
    """
    categories = list(slot_distribution.keys())

    # Randomize source order
    random.shuffle(categories)

    # Build one contiguous audio block per source
    source_blocks = []
    category_sequence = []
    actual_durations = {}

    for category in categories:
        reps = slot_distribution[category]
        audio_list = source_audio_lists[category]

        if reps == 0:
            continue

        if not audio_list:
            raise ValueError(f"No audio clips available for category: {category}")

        block = audio_list[0]
        for i in range(1, reps):
            # Use same clip or cycle through available clips
            next_clip = audio_list[i % len(audio_list)]

            # Crossfade within the same source only when both sides are
            # longer than the crossfade window; otherwise plain concatenation.
            can_crossfade = (
                crossfade_within_source_ms > 0
                and len(block) > crossfade_within_source_ms
                and len(next_clip) > crossfade_within_source_ms
            )
            if can_crossfade:
                block = block.append(next_clip, crossfade=crossfade_within_source_ms)
            else:
                block = block + next_clip

        source_blocks.append((category, block))
        category_sequence.extend([category] * reps)
        actual_durations[category] = len(block) / 1000.0

    if not source_blocks:
        raise ValueError("slot_distribution contains no category with repetitions")

    # Calculate total audio duration and available extra silence
    total_audio_ms = sum(len(block) for _, block in source_blocks)
    num_gaps = len(source_blocks) - 1
    min_total_silence_ms = num_gaps * min_silence_between_sources_ms

    target_ms = int(target_total_duration_s * 1000)
    available_extra_ms = target_ms - total_audio_ms - min_total_silence_ms

    # Distribute extra silence
    if available_extra_ms > 0 and num_gaps > 0:
        extra_silences = distribute_remainder_as_silences(
            available_extra_ms,
            num_gaps,
            max_extra_silence_per_gap_ms
        )
    else:
        extra_silences = [0] * max(num_gaps, 1)

    # Concatenate with silences and track timestamps
    first_cat, first_block = source_blocks[0]
    final_audio = first_block
    source_timestamps = [(first_cat, 0, len(first_block))]  # (category, start_ms, end_ms)

    for i, (cat, block) in enumerate(source_blocks[1:]):
        gap_silence_ms = min_silence_between_sources_ms + extra_silences[i]
        silence = AudioSegment.silent(duration=gap_silence_ms)

        # Prefer crossfading from audio -> silence for a smooth transition,
        # but avoid crossfading silence -> audio (it cuts the start of the next clip).
        # Conditions for safe crossfade:
        # - crossfade length should be less than gap silence
        # - both segments must be longer than crossfade
        crossfade_ms = min(500, gap_silence_ms)
        use_crossfade = (
            crossfade_ms > 0
            and crossfade_ms < gap_silence_ms
            and len(final_audio) > crossfade_ms
            and len(block) > crossfade_ms
        )
        if use_crossfade:
            final_audio = final_audio.append(silence, crossfade=crossfade_ms)
        else:
            # Fall back to simple concatenation
            final_audio = final_audio + silence

        # A crossfaded append overlaps the two segments, so the assembled
        # audio is shorter than the sum of its parts. Read the block start
        # from the real length instead of summing nominal gap lengths, which
        # would drift by crossfade_ms for every crossfaded gap.
        start_ms = len(final_audio)

        # Append next block without crossfade to avoid trimming its start
        final_audio = final_audio.append(block, crossfade=0)
        end_ms = start_ms + len(block)
        source_timestamps.append((cat, start_ms, end_ms))

    # Adjust to target duration
    if len(final_audio) > target_ms:
        final_audio = final_audio[:target_ms]
    elif len(final_audio) < target_ms:
        padding = AudioSegment.silent(duration=target_ms - len(final_audio))
        final_audio = final_audio + padding

    # Build timestamp string: "category1 start-end, category2 start-end, ..."
    timestamp_parts = []
    for cat, start_ms, end_ms in source_timestamps:
        start_s = round(start_ms / 1000.0, 2)
        end_s = round(end_ms / 1000.0, 2)
        duration_s = round((end_ms - start_ms) / 1000.0, 2)
        timestamp_parts.append(f"{cat} {start_s}s-{end_s}s ({duration_s}s)")
    timestamp_string = ", ".join(timestamp_parts)

    metadata = {
        'source_order': [cat for cat, _ in source_blocks],
        'slot_distribution': slot_distribution,
        'actual_durations_s': actual_durations,
        'total_audio_ms': total_audio_ms,
        'num_gaps': num_gaps,
        'final_duration_ms': len(final_audio),
        'source_timestamps': source_timestamps,  # List of (category, start_ms, end_ms)
        'timestamp_string': timestamp_string  # Human-readable format
    }

    logger.debug(
        f"Duration task audio: {len(source_blocks)} sources, "
        f"order={metadata['source_order']}, duration={len(final_audio)}ms"
    )

    return final_audio, category_sequence, metadata
|
utils/dataset_utils.py
ADDED
|
@@ -0,0 +1,536 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ESC-50 dataset utilities for loading and sampling audio data.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import csv
|
| 6 |
+
import json
|
| 7 |
+
import random
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Dict, List, Optional, Tuple
|
| 10 |
+
|
| 11 |
+
import pandas as pd
|
| 12 |
+
|
| 13 |
+
from .logger import setup_logger
|
| 14 |
+
|
| 15 |
+
logger = setup_logger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def load_or_create_class_subset(config: dict, all_categories: List[str]) -> List[str]:
    """
    Load persisted class subset or create a new one.

    When subsetting is enabled, a previously persisted subset is reused if it
    has the expected size and only contains known categories. Otherwise a new
    subset is drawn with a seeded RNG, sorted, written to disk and returned.

    Args:
        config: Configuration dictionary. Keys read from config['dataset']:
            use_class_subset, num_classes_subset, subset_persist_path,
            subset_seed.
        all_categories: List of all available categories

    Returns:
        List of category names to use (either subset or all). When subsetting
        is disabled, the all_categories list itself is returned.
    """
    dataset_config = config.get('dataset', {})

    if not dataset_config.get('use_class_subset', False):
        logger.info(f"Using all {len(all_categories)} classes")
        return all_categories

    num_classes = dataset_config.get('num_classes_subset', len(all_categories))
    persist_path = Path(dataset_config.get('subset_persist_path', 'class_subset.json'))
    subset_seed = dataset_config.get('subset_seed', 42)

    # A request for more classes than exist is clamped. Validate the persisted
    # file against the clamped size, otherwise an oversized request would be
    # judged invalid and regenerated on every run.
    subset_size = min(num_classes, len(all_categories))

    # Try to load existing subset
    if persist_path.exists():
        try:
            with open(persist_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            subset = data.get('classes', [])

            # Validate subset
            if len(subset) == subset_size and all(c in all_categories for c in subset):
                logger.info(f"Loaded persisted class subset from {persist_path}: {len(subset)} classes")
                return subset
            logger.warning("Invalid persisted subset, regenerating...")
        except Exception as e:
            # Best effort: any unreadable or malformed file falls through to regeneration.
            logger.warning(f"Failed to load persisted subset: {e}, regenerating...")

    # Create new subset with a private RNG. Seeding the module-level RNG here
    # would reset global random state only on runs that create the file, so
    # the first run and later runs would sample differently afterwards.
    # random.Random(seed) draws the same subset as random.seed(seed) did.
    rng = random.Random(subset_seed)
    subset = sorted(rng.sample(all_categories, subset_size))  # Sort for consistency

    # Persist subset
    persist_path.parent.mkdir(parents=True, exist_ok=True)
    with open(persist_path, 'w', encoding='utf-8') as f:
        json.dump({
            'classes': subset,
            'num_classes': len(subset),
            'seed': subset_seed,
            'total_available': len(all_categories)
        }, f, indent=2)

    logger.info(f"Created and persisted new class subset: {len(subset)} classes to {persist_path}")
    return subset
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class ESC50Dataset:
    """
    Handler for ESC-50 dataset.

    Loads the metadata CSV into a DataFrame, builds category <-> target
    lookups from its 'target' and 'category' columns, and offers random
    sampling of categories, targets and files restricted to the active
    category subset (self.CATEGORIES).
    """

    # All 50 ESC-50 sound categories
    ALL_CATEGORIES = [
        'dog', 'chirping_birds', 'vacuum_cleaner', 'thunderstorm', 'door_wood_knock',
        'can_opening', 'crow', 'clapping', 'fireworks', 'chainsaw', 'airplane',
        'mouse_click', 'pouring_water', 'train', 'sheep', 'water_drops', 'church_bells',
        'clock_alarm', 'keyboard_typing', 'wind', 'footsteps', 'frog', 'cow',
        'brushing_teeth', 'car_horn', 'crackling_fire', 'helicopter', 'drinking_sipping',
        'rain', 'insects', 'laughing', 'hen', 'engine', 'breathing', 'crying_baby',
        'hand_saw', 'coughing', 'glass_breaking', 'snoring', 'toilet_flush', 'pig',
        'washing_machine', 'clock_tick', 'sneezing', 'rooster', 'sea_waves', 'siren',
        'cat', 'door_wood_creaks', 'crickets'
    ]

    def __init__(self, metadata_path: str, audio_path: str, config: Optional[dict] = None):
        """
        Initialize ESC-50 dataset handler.

        Args:
            metadata_path: Path to esc50.csv metadata file
            audio_path: Path to audio directory
            config: Optional configuration dict with dataset.use_class_subset settings
        """
        self.metadata_path = Path(metadata_path)
        self.audio_path = Path(audio_path)
        self.config = config or {}
        self.df = None
        self.category_to_target = {}
        self.target_to_category = {}

        # Load class subset if configured
        self.CATEGORIES = load_or_create_class_subset(self.config, self.ALL_CATEGORIES)
        # How many times each active category has been recorded as an answer
        self.category_usage_counts = {cat: 0 for cat in self.CATEGORIES}

        self.load_metadata()

    def load_metadata(self):
        """
        Load ESC-50 metadata CSV.

        Populates self.df and the category/target lookup dictionaries.
        Any failure is logged and re-raised.
        """
        try:
            self.df = pd.read_csv(self.metadata_path)
            logger.info(f"Loaded ESC-50 metadata: {len(self.df)} files")

            # Create category mappings
            for target, category in zip(self.df['target'], self.df['category']):
                self.category_to_target[category] = target
                self.target_to_category[target] = category

            logger.info(f"Found {len(self.category_to_target)} unique categories")
        except Exception as e:
            logger.error(f"Error loading metadata: {e}")
            raise

    def _available_categories(self, exclude: Optional[List[str]] = None) -> List[str]:
        """Return the active categories minus those in exclude, in subset order."""
        excluded = exclude or []
        return [c for c in self.CATEGORIES if c not in excluded]

    def get_files_by_category(self, category: str) -> List[str]:
        """
        Get all audio files for a specific category.

        Args:
            category: Sound category name

        Returns:
            List of filenames for the category

        Raises:
            ValueError: If the category does not appear in the metadata
        """
        if category not in self.category_to_target:
            raise ValueError(f"Unknown category: {category}")

        target = self.category_to_target[category]
        files = self.df[self.df['target'] == target]['filename'].tolist()
        return files

    def get_files_by_target(self, target: int) -> List[str]:
        """
        Get all audio files for a specific target ID.

        Args:
            target: Target class ID (0-49)

        Returns:
            List of filenames for the target (empty if no row matches)
        """
        files = self.df[self.df['target'] == target]['filename'].tolist()
        return files

    def sample_categories(self, n: int, exclude: Optional[List[str]] = None) -> List[str]:
        """
        Sample n unique random categories from the active subset.

        Args:
            n: Number of categories to sample
            exclude: Optional list of categories to exclude

        Returns:
            List of sampled category names

        Raises:
            ValueError: If fewer than n categories remain after exclusion
        """
        available = self._available_categories(exclude)
        if n > len(available):
            raise ValueError(f"Cannot sample {n} categories from subset, only {len(available)} available (subset size: {len(self.CATEGORIES)})")
        return random.sample(available, n)

    def sample_targets(self, n: int, exclude: Optional[List[int]] = None) -> List[int]:
        """
        Sample n unique random targets from the active subset.

        Args:
            n: Number of targets to sample
            exclude: Optional list of targets to exclude

        Returns:
            List of sampled target IDs corresponding to categories in the subset

        Raises:
            ValueError: If fewer than n targets remain after exclusion
        """
        # Get targets corresponding to categories in the subset
        excluded = exclude or []
        available_targets = [self.category_to_target[cat] for cat in self.CATEGORIES]
        available = [t for t in available_targets if t not in excluded]
        if n > len(available):
            raise ValueError(f"Cannot sample {n} targets from subset, only {len(available)} available (subset size: {len(self.CATEGORIES)})")
        return random.sample(available, n)

    def sample_file_from_category(self, category: str) -> Tuple[str, str]:
        """
        Sample a random audio file from a category.

        Args:
            category: Sound category name

        Returns:
            Tuple of (filename, full_path)
        """
        files = self.get_files_by_category(category)
        filename = random.choice(files)
        full_path = str(self.audio_path / filename)
        return filename, full_path

    def sample_file_from_target(self, target: int) -> Tuple[str, str, str]:
        """
        Sample a random audio file from a target.

        Args:
            target: Target class ID

        Returns:
            Tuple of (filename, category, full_path)
        """
        files = self.get_files_by_target(target)
        filename = random.choice(files)
        category = self.target_to_category[target]
        full_path = str(self.audio_path / filename)
        return filename, category, full_path

    def get_category_from_filename(self, filename: str) -> str:
        """
        Get category name from filename.

        Raises:
            ValueError: If the filename is not present in the metadata
        """
        row = self.df[self.df['filename'] == filename]
        if len(row) == 0:
            raise ValueError(f"Unknown filename: {filename}")
        return row.iloc[0]['category']

    def get_file_path(self, filename: str) -> str:
        """Get full path for a filename."""
        return str(self.audio_path / filename)

    def sample_categories_balanced(self, n: int, exclude: Optional[List[str]] = None,
                                   answer_category: Optional[str] = None) -> List[str]:
        """
        Sample n unique categories and record answer-category usage.

        The sampling itself is uniform. When answer_category is given, its
        usage counter is incremented so that get_least_used_categories() can
        later favour categories that have been answers less often.

        Args:
            n: Number of categories to sample
            exclude: Optional list of categories to exclude
            answer_category: If provided, ensures this category is included and tracks it

        Returns:
            List of sampled category names with answer_category first if provided

        Raises:
            ValueError: If the request cannot be satisfied
            KeyError: If answer_category is not a tracked category
        """
        available = self._available_categories(exclude)

        if not answer_category:
            # Sample without specific answer category
            if n > len(available):
                raise ValueError(f"Cannot sample {n} categories, only {len(available)} available")
            return random.sample(available, n)

        # The answer takes one slot; the rest come from the other categories.
        # Checking against that pool (rather than `available`) keeps a request
        # feasible when the answer category itself is listed in `exclude`.
        others = [c for c in available if c != answer_category]
        others_needed = n - 1
        if others_needed < 0:
            raise ValueError(f"Cannot sample {n} categories including an answer category")
        if others_needed > len(others):
            raise ValueError(f"Cannot sample {n} categories, only {len(others) + 1} available")

        # Track answer category usage only once the call is known to succeed,
        # so a failed request never inflates the counter.
        self.category_usage_counts[answer_category] += 1

        return [answer_category] + random.sample(others, others_needed)

    def get_least_used_categories(self, n: int, exclude: Optional[List[str]] = None) -> List[str]:
        """
        Get n categories that have been used least as answers.

        Categories tied at the minimum usage count are chosen at random. If
        there are fewer than n of them, all are taken and the remainder is
        filled with the next least-used categories in sorted order.

        Args:
            n: Number of categories to get
            exclude: Optional list of categories to exclude

        Returns:
            List of least-used category names

        Raises:
            ValueError: If fewer than n categories remain after exclusion
        """
        available = self._available_categories(exclude)
        if n > len(available):
            raise ValueError(f"Cannot get {n} categories, only {len(available)} available")

        if n == 0:
            # Nothing requested; also avoids indexing into an empty list below.
            return []

        # Sort by usage count (ascending); sorted() is stable, so ties keep subset order
        sorted_categories = sorted(available, key=lambda c: self.category_usage_counts[c])

        # Among least used, get all with same minimum count
        min_count = self.category_usage_counts[sorted_categories[0]]
        candidates = [c for c in sorted_categories if self.category_usage_counts[c] == min_count]

        if len(candidates) >= n:
            # Randomly sample from least used
            return random.sample(candidates, n)

        # Take all minimum and fill with next tier. The minimum-count entries
        # form the prefix of the sorted list, so the next tier follows them.
        return candidates + sorted_categories[len(candidates):n]

    def get_category_usage_stats(self) -> Dict[str, int]:
        """Get a copy of the current category usage statistics."""
        return self.category_usage_counts.copy()

    def reset_category_usage(self):
        """Reset category usage tracking to zero for every active category."""
        self.category_usage_counts = {cat: 0 for cat in self.CATEGORIES}
        logger.info("Reset category usage tracking")
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
class PreprocessedESC50Dataset(ESC50Dataset):
|
| 311 |
+
"""
|
| 312 |
+
Handler for preprocessed ESC-50 dataset with effective durations.
|
| 313 |
+
|
| 314 |
+
Extends ESC50Dataset to use trimmed audio files and effective duration
|
| 315 |
+
metadata from amplitude-based preprocessing.
|
| 316 |
+
"""
|
| 317 |
+
|
| 318 |
+
def __init__(
|
| 319 |
+
self,
|
| 320 |
+
metadata_path: str,
|
| 321 |
+
audio_path: str,
|
| 322 |
+
preprocessed_path: str,
|
| 323 |
+
config: Optional[dict] = None
|
| 324 |
+
):
|
| 325 |
+
"""
|
| 326 |
+
Initialize preprocessed ESC-50 dataset handler.
|
| 327 |
+
|
| 328 |
+
Args:
|
| 329 |
+
metadata_path: Path to original esc50.csv metadata file
|
| 330 |
+
audio_path: Path to original audio directory (fallback)
|
| 331 |
+
preprocessed_path: Path to preprocessed data directory
|
| 332 |
+
config: Optional configuration dict with dataset.use_class_subset settings
|
| 333 |
+
"""
|
| 334 |
+
super().__init__(metadata_path, audio_path, config)
|
| 335 |
+
|
| 336 |
+
self.preprocessed_path = Path(preprocessed_path)
|
| 337 |
+
self.trimmed_audio_path = self.preprocessed_path / "trimmed_audio"
|
| 338 |
+
self.effective_durations_path = self.preprocessed_path / "effective_durations.csv"
|
| 339 |
+
|
| 340 |
+
# Load effective durations
|
| 341 |
+
self.effective_df = None
|
| 342 |
+
self.load_effective_durations()
|
| 343 |
+
|
| 344 |
+
def load_effective_durations(self):
|
| 345 |
+
"""Load effective durations from preprocessed CSV."""
|
| 346 |
+
try:
|
| 347 |
+
self.effective_df = pd.read_csv(self.effective_durations_path)
|
| 348 |
+
logger.info(f"Loaded effective durations for {len(self.effective_df)} clips")
|
| 349 |
+
|
| 350 |
+
# Create quick lookup dictionaries
|
| 351 |
+
self.filename_to_effective = dict(
|
| 352 |
+
zip(self.effective_df['filename'], self.effective_df['effective_duration_s'])
|
| 353 |
+
)
|
| 354 |
+
self.filename_to_category = dict(
|
| 355 |
+
zip(self.effective_df['filename'], self.effective_df['category'])
|
| 356 |
+
)
|
| 357 |
+
|
| 358 |
+
# Category-level statistics
|
| 359 |
+
self.category_effective_stats = self.effective_df.groupby('category').agg({
|
| 360 |
+
'effective_duration_s': ['mean', 'std', 'min', 'max', 'count']
|
| 361 |
+
}).round(4)
|
| 362 |
+
self.category_effective_stats.columns = ['mean', 'std', 'min', 'max', 'count']
|
| 363 |
+
|
| 364 |
+
logger.info("Created effective duration lookup tables")
|
| 365 |
+
|
| 366 |
+
except Exception as e:
|
| 367 |
+
logger.error(f"Error loading effective durations: {e}")
|
| 368 |
+
raise
|
| 369 |
+
|
| 370 |
+
def get_effective_duration(self, filename: str) -> float:
|
| 371 |
+
"""
|
| 372 |
+
Get effective duration for a specific file.
|
| 373 |
+
|
| 374 |
+
Args:
|
| 375 |
+
filename: Audio filename
|
| 376 |
+
|
| 377 |
+
Returns:
|
| 378 |
+
Effective duration in seconds
|
| 379 |
+
"""
|
| 380 |
+
if filename not in self.filename_to_effective:
|
| 381 |
+
logger.warning(f"No effective duration for {filename}, using default 5.0s")
|
| 382 |
+
return 5.0
|
| 383 |
+
return self.filename_to_effective[filename]
|
| 384 |
+
|
| 385 |
+
def get_category_effective_stats(self, category: str) -> Dict:
|
| 386 |
+
"""
|
| 387 |
+
Get effective duration statistics for a category.
|
| 388 |
+
|
| 389 |
+
Args:
|
| 390 |
+
category: Category name
|
| 391 |
+
|
| 392 |
+
Returns:
|
| 393 |
+
Dict with mean, std, min, max, count
|
| 394 |
+
"""
|
| 395 |
+
if category not in self.category_effective_stats.index:
|
| 396 |
+
return {'mean': 5.0, 'std': 0.0, 'min': 5.0, 'max': 5.0, 'count': 0}
|
| 397 |
+
|
| 398 |
+
stats = self.category_effective_stats.loc[category]
|
| 399 |
+
return {
|
| 400 |
+
'mean': stats['mean'],
|
| 401 |
+
'std': stats['std'],
|
| 402 |
+
'min': stats['min'],
|
| 403 |
+
'max': stats['max'],
|
| 404 |
+
'count': int(stats['count'])
|
| 405 |
+
}
|
| 406 |
+
|
| 407 |
+
def get_files_by_category_with_durations(self, category: str) -> List[Dict]:
|
| 408 |
+
"""
|
| 409 |
+
Get all files for a category with their effective durations.
|
| 410 |
+
|
| 411 |
+
Args:
|
| 412 |
+
category: Category name
|
| 413 |
+
|
| 414 |
+
Returns:
|
| 415 |
+
List of dicts with filename, effective_duration_s, filepath
|
| 416 |
+
"""
|
| 417 |
+
cat_df = self.effective_df[self.effective_df['category'] == category]
|
| 418 |
+
|
| 419 |
+
results = []
|
| 420 |
+
for _, row in cat_df.iterrows():
|
| 421 |
+
results.append({
|
| 422 |
+
'filename': row['filename'],
|
| 423 |
+
'effective_duration_s': row['effective_duration_s'],
|
| 424 |
+
'filepath': str(self.trimmed_audio_path / row['filename']),
|
| 425 |
+
'raw_duration_s': row['raw_duration_s'],
|
| 426 |
+
'peak_amplitude_db': row['peak_amplitude_db']
|
| 427 |
+
})
|
| 428 |
+
|
| 429 |
+
return results
|
| 430 |
+
|
| 431 |
+
def sample_file_from_category_with_duration(
|
| 432 |
+
self,
|
| 433 |
+
category: str,
|
| 434 |
+
min_effective_duration: float = None,
|
| 435 |
+
max_effective_duration: float = None
|
| 436 |
+
) -> Tuple[str, str, float]:
|
| 437 |
+
"""
|
| 438 |
+
Sample a file from category with optional duration constraints.
|
| 439 |
+
|
| 440 |
+
Args:
|
| 441 |
+
category: Category name
|
| 442 |
+
min_effective_duration: Minimum effective duration (optional)
|
| 443 |
+
max_effective_duration: Maximum effective duration (optional)
|
| 444 |
+
|
| 445 |
+
Returns:
|
| 446 |
+
Tuple of (filename, filepath, effective_duration_s)
|
| 447 |
+
"""
|
| 448 |
+
files = self.get_files_by_category_with_durations(category)
|
| 449 |
+
|
| 450 |
+
# Filter by duration if constraints provided
|
| 451 |
+
if min_effective_duration is not None:
|
| 452 |
+
files = [f for f in files if f['effective_duration_s'] >= min_effective_duration]
|
| 453 |
+
if max_effective_duration is not None:
|
| 454 |
+
files = [f for f in files if f['effective_duration_s'] <= max_effective_duration]
|
| 455 |
+
|
| 456 |
+
if not files:
|
| 457 |
+
# Fallback to any file from category
|
| 458 |
+
logger.warning(f"No files match duration constraints for {category}, using any file")
|
| 459 |
+
files = self.get_files_by_category_with_durations(category)
|
| 460 |
+
|
| 461 |
+
selected = random.choice(files)
|
| 462 |
+
return selected['filename'], selected['filepath'], selected['effective_duration_s']
|
| 463 |
+
|
| 464 |
+
def sample_files_from_category_to_reach_duration(
|
| 465 |
+
self,
|
| 466 |
+
category: str,
|
| 467 |
+
target_duration_s: float,
|
| 468 |
+
prefer_same_file: bool = True
|
| 469 |
+
) -> Tuple[List[str], List[str], float]:
|
| 470 |
+
"""
|
| 471 |
+
Sample files from a category to reach a target total effective duration.
|
| 472 |
+
|
| 473 |
+
Args:
|
| 474 |
+
category: Category name
|
| 475 |
+
target_duration_s: Target total effective duration
|
| 476 |
+
prefer_same_file: If True, try repeating same file first
|
| 477 |
+
|
| 478 |
+
Returns:
|
| 479 |
+
Tuple of (filenames_list, filepaths_list, actual_total_duration_s)
|
| 480 |
+
"""
|
| 481 |
+
files = self.get_files_by_category_with_durations(category)
|
| 482 |
+
|
| 483 |
+
if not files:
|
| 484 |
+
raise ValueError(f"No files found for category: {category}")
|
| 485 |
+
|
| 486 |
+
selected_filenames = []
|
| 487 |
+
selected_filepaths = []
|
| 488 |
+
total_duration = 0.0
|
| 489 |
+
|
| 490 |
+
if prefer_same_file:
|
| 491 |
+
# Sort by effective duration descending (prefer longer clips)
|
| 492 |
+
files_sorted = sorted(files, key=lambda x: x['effective_duration_s'], reverse=True)
|
| 493 |
+
selected_file = files_sorted[0]
|
| 494 |
+
|
| 495 |
+
# Calculate how many repetitions needed
|
| 496 |
+
reps_needed = max(1, int(target_duration_s / selected_file['effective_duration_s']) + 1)
|
| 497 |
+
|
| 498 |
+
for _ in range(reps_needed):
|
| 499 |
+
selected_filenames.append(selected_file['filename'])
|
| 500 |
+
selected_filepaths.append(selected_file['filepath'])
|
| 501 |
+
total_duration += selected_file['effective_duration_s']
|
| 502 |
+
|
| 503 |
+
if total_duration >= target_duration_s:
|
| 504 |
+
break
|
| 505 |
+
else:
|
| 506 |
+
# Use different files
|
| 507 |
+
random.shuffle(files)
|
| 508 |
+
file_idx = 0
|
| 509 |
+
|
| 510 |
+
while total_duration < target_duration_s:
|
| 511 |
+
selected_file = files[file_idx % len(files)]
|
| 512 |
+
selected_filenames.append(selected_file['filename'])
|
| 513 |
+
selected_filepaths.append(selected_file['filepath'])
|
| 514 |
+
total_duration += selected_file['effective_duration_s']
|
| 515 |
+
file_idx += 1
|
| 516 |
+
|
| 517 |
+
# Safety limit
|
| 518 |
+
if file_idx > 100:
|
| 519 |
+
logger.warning(f"Hit safety limit when sampling files for {category}")
|
| 520 |
+
break
|
| 521 |
+
|
| 522 |
+
return selected_filenames, selected_filepaths, total_duration
|
| 523 |
+
|
| 524 |
+
def get_categories_sorted_by_effective_duration(self, ascending: bool = True) -> List[str]:
    """
    Rank category names by the 'mean' column of the effective-duration stats.

    Args:
        ascending: True orders from the smallest mean to the largest;
            False reverses that order.

    Returns:
        Index labels of ``self.category_effective_stats`` as a list,
        arranged in the requested sort order.
    """
    # Sort the per-category stats table, then hand back only its index labels.
    return self.category_effective_stats.sort_values('mean', ascending=ascending).index.tolist()
|
| 536 |
+
|
utils/llm_utils.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LLM-based question generation utilities.
|
| 3 |
+
|
| 4 |
+
Supports multiple LLM providers for generating natural, lexically consistent questions.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import random
|
| 9 |
+
from typing import Dict, List, Optional, Tuple
|
| 10 |
+
import json
|
| 11 |
+
|
| 12 |
+
from .logger import setup_logger
|
| 13 |
+
|
| 14 |
+
logger = setup_logger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class LLMQuestionGenerator:
    """
    Generate question text for the audio tasks.

    The class is intended to front a local Llama 3.1 8B Instruct model, but
    that integration is not implemented here: every public method currently
    produces its questions from templates, regardless of ``enabled``.
    """

    # Built-in count templates, used whenever the supplied configuration has
    # no usable (missing, null or empty) entry for the corresponding key.
    _DEFAULT_COUNT_MCQ = (
        "What is the number of distinct sound sources in the audio file?",
        "How many different types of sounds can be identified in this recording?",
    )
    _DEFAULT_COUNT_OPEN_TEXT = (
        "How many distinct sound sources are present in the audio?",
        "Count the number of unique sounds in this recording.",
    )

    def __init__(
        self,
        enabled: bool = False,
        template_questions: Optional[Dict] = None
    ):
        """
        Initialize LLM question generator.

        Args:
            enabled: Whether LLM generation is enabled
            template_questions: Template questions for fallback. Only the
                ``["count"]["mcq"]`` and ``["count"]["open_text"]`` entries
                are read by this class.
        """
        self.enabled = enabled
        self.template_questions = template_questions or {}

        if not self.enabled:
            logger.info("LLM generation disabled, using templates")
            return

        # TODO: Initialize local Llama 3.1 8B model connection
        # This will be implemented based on your local LLM setup
        logger.info("LLM generation enabled (local Llama 3.1 8B)")
        logger.warning("Local LLM integration not yet implemented, falling back to templates")

    def generate_count_questions(
        self,
        correct_count: int,
        categories_present: List[str],
        generate_both: bool = True
    ) -> Dict:
        """
        Generate count task questions.

        Args:
            correct_count: Correct number of unique sounds
            categories_present: List of sound categories in the audio
                (currently unused)
            generate_both: Whether to generate both MCQ and open-text
                (currently unused; both questions are always returned)

        Returns:
            Dictionary with mcq_question and open_text_question
        """
        # TODO: Implement LLM generation when enabled
        # For now, always use templates
        return self._generate_count_template(correct_count)

    def generate_category_questions(
        self,
        task_type: str,
        correct_category: str,
        categories_present: List[str],
        context: Optional[Dict] = None
    ) -> Dict:
        """
        Generate questions where the answer is a sound category.

        Args:
            task_type: Type of task (duration, order, volume)
            correct_category: Correct answer category
            categories_present: All categories in the audio (currently unused)
            context: Additional context (e.g., question_type, reference_sound)

        Returns:
            Dictionary with mcq_question and open_text_question
        """
        # TODO: Implement LLM generation when enabled
        # For now, always use templates
        return self._generate_category_template(task_type, correct_category, context)

    def _count_templates(self, kind: str, defaults):
        """Return the configured count templates for ``kind``, or ``defaults`` if unusable."""
        # ``or`` (rather than a .get default) also covers keys that are present
        # but null or empty, which would otherwise break random.choice().
        count_config = self.template_questions.get("count") or {}
        return count_config.get(kind) or defaults

    def _generate_count_template(self, correct_count: int) -> Dict:
        """
        Generate count questions from templates.

        One MCQ and one open-text question are drawn at random. Configured
        templates are preferred; the built-in defaults are used when a list
        is missing, null or empty. ``correct_count`` does not influence the
        wording.
        """
        mcq_templates = self._count_templates("mcq", self._DEFAULT_COUNT_MCQ)
        open_templates = self._count_templates("open_text", self._DEFAULT_COUNT_OPEN_TEXT)

        return {
            "mcq_question": random.choice(mcq_templates),
            "open_text_question": random.choice(open_templates)
        }

    def _generate_category_template(
        self,
        task_type: str,
        correct_category: str,
        context: Optional[Dict]
    ) -> Dict:
        """
        Generate category questions from templates.

        Args:
            task_type: "duration" or "order"; any other value is handled as
                a volume question.
            correct_category: Not used when building the question text.
            context: Optional settings. Reads "question_type" for duration
                and volume tasks, and "question_subtype" plus
                "reference_sound" for order tasks.

        Returns:
            Dictionary with mcq_question and open_text_question
        """
        context = context or {}

        if task_type == "duration":
            q_type = context.get("question_type", "shortest")
            mcq_q = f"Which of the following sounds is heard for the {q_type} duration?"
            open_q = f"Which sound is heard for the {q_type} duration in the audio?"

        elif task_type == "order":
            q_subtype = context.get("question_subtype", "first")
            if q_subtype == "first":
                mcq_q = "Which sound appears first in the audio clip?"
                open_q = "What is the first sound you hear in the audio?"
            elif q_subtype == "last":
                mcq_q = "Which sound appears last in the audio clip?"
                open_q = "What is the last sound you hear in the audio?"
            elif q_subtype == "after":
                ref = context.get("reference_sound", "")
                mcq_q = f"Which sound comes after {ref}?"
                open_q = f"What sound comes after {ref}?"
            else:
                # Any subtype other than first/last/after gets the "before" wording.
                ref = context.get("reference_sound", "")
                mcq_q = f"Which sound comes before {ref}?"
                open_q = f"What sound comes before {ref}?"

        else:  # volume
            q_type = context.get("question_type", "loudest")
            mcq_q = f"Which sound is the {q_type} in the audio?"
            open_q = f"Identify the {q_type} sound in the audio clip."

        return {
            "mcq_question": mcq_q,
            "open_text_question": open_q
        }
|