PSELDNets (code, models, paper)
Browse files- .gitattributes +1 -0
- PSELDNets. Pre-trained Neural Networks on a Large-scale Synthetic Dataset for Sound Event Localization and Detection.pdf +3 -0
- code/PSELDNets.zip +3 -0
- code/checkpoints/audioset-training.zip +3 -0
- code/checkpoints/synthetic-dataset-training.zip +3 -0
- models/PSELDNets/.gitattributes +102 -0
- models/PSELDNets/README.md +31 -0
- models/PSELDNets/model/ACCDOA-HTSAT-0.566.ckpt +3 -0
- models/PSELDNets/model/Cnn14_mAP%3D0.431.pth +3 -0
- models/PSELDNets/model/EINV2-HTSAT-0.597.ckpt +3 -0
- models/PSELDNets/model/EINV2-HTSAT-AGG1-0.514.ckpt +3 -0
- models/PSELDNets/model/HTSAT-fullset-imagenet-768d-32000hz.ckpt +3 -0
- models/PSELDNets/model/SEDDOA-HTSAT-AGG1-0.531.ckpt +3 -0
- models/PSELDNets/model/mACCDOA-CNN14-Conformer-0.582.ckpt +3 -0
- models/PSELDNets/model/mACCDOA-HTSAT-0.567.ckpt +3 -0
- models/PSELDNets/model/mACCDOA-PaSST-0.562.ckpt +3 -0
- models/PSELDNets/model/passt-l-kd-ap.47.ckpt +3 -0
- models/PSELDNets/source.txt +1 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
PSELDNets.[[:space:]]Pre-trained[[:space:]]Neural[[:space:]]Networks[[:space:]]on[[:space:]]a[[:space:]]Large-scale[[:space:]]Synthetic[[:space:]]Dataset[[:space:]]for[[:space:]]Sound[[:space:]]Event[[:space:]]Localization[[:space:]]and[[:space:]]Detection.pdf filter=lfs diff=lfs merge=lfs -text
|
PSELDNets. Pre-trained Neural Networks on a Large-scale Synthetic Dataset for Sound Event Localization and Detection.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:780bcc78cf9ea10816b2880707733c66509f419fbbc4bf6ef74a8fa3afb2a1ab
|
| 3 |
+
size 7962353
|
code/PSELDNets.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b982275726e2d50b0a9a312192338037c8bc8616a3532d3418c1e6810ae561d4
|
| 3 |
+
size 4825510
|
code/checkpoints/audioset-training.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d6d98ebeb4ef5c7e903b961e541b6a43212d9af25a5ee8957322f869af3ec1bb
|
| 3 |
+
size 1998284114
|
code/checkpoints/synthetic-dataset-training.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dab868640298a4e9a34b88e35d775f84bb2a9b970e6cb243988428f15e789f84
|
| 3 |
+
size 1998284276
|
models/PSELDNets/.gitattributes
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.lz4 filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.mds filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
# Audio files - uncompressed
|
| 39 |
+
*.pcm filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
*.sam filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
*.raw filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
# Audio files - compressed
|
| 43 |
+
*.aac filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
*.flac filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
*.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
*.ogg filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
*.wav filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
# Image files - uncompressed
|
| 49 |
+
*.bmp filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
*.gif filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
*.tiff filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
# Image files - compressed
|
| 54 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
*.webp filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
# Video files - compressed
|
| 58 |
+
*.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
*.webm filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
dataset/train20000_ov1_1.z02 filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
dataset/train20000_ov1_1.z03 filter=lfs diff=lfs merge=lfs -text
|
| 62 |
+
dataset/train10000_ov2_1.z01 filter=lfs diff=lfs merge=lfs -text
|
| 63 |
+
dataset/train10000_ov2_2.z08 filter=lfs diff=lfs merge=lfs -text
|
| 64 |
+
dataset/train3500_ov3_2.z03 filter=lfs diff=lfs merge=lfs -text
|
| 65 |
+
dataset/train10000_ov2_2.z07 filter=lfs diff=lfs merge=lfs -text
|
| 66 |
+
dataset/train20000_ov1_2.z07 filter=lfs diff=lfs merge=lfs -text
|
| 67 |
+
dataset/train20000_ov1_2.z03 filter=lfs diff=lfs merge=lfs -text
|
| 68 |
+
dataset/train20000_ov1_2.z10 filter=lfs diff=lfs merge=lfs -text
|
| 69 |
+
dataset/train20000_ov1_1.z08 filter=lfs diff=lfs merge=lfs -text
|
| 70 |
+
dataset/train20000_ov1_2.z05 filter=lfs diff=lfs merge=lfs -text
|
| 71 |
+
dataset/train3500_ov3_2.z02 filter=lfs diff=lfs merge=lfs -text
|
| 72 |
+
dataset/train10000_ov2_1.z07 filter=lfs diff=lfs merge=lfs -text
|
| 73 |
+
dataset/train10000_ov2_2.z05 filter=lfs diff=lfs merge=lfs -text
|
| 74 |
+
dataset/test1800_ov1.z01 filter=lfs diff=lfs merge=lfs -text
|
| 75 |
+
dataset/train3500_ov3_2.z01 filter=lfs diff=lfs merge=lfs -text
|
| 76 |
+
dataset/train20000_ov1_2.z09 filter=lfs diff=lfs merge=lfs -text
|
| 77 |
+
dataset/train10000_ov2_1.z05 filter=lfs diff=lfs merge=lfs -text
|
| 78 |
+
dataset/train20000_ov1_1.z10 filter=lfs diff=lfs merge=lfs -text
|
| 79 |
+
dataset/train20000_ov1_2.z08 filter=lfs diff=lfs merge=lfs -text
|
| 80 |
+
dataset/train10000_ov2_1.z03 filter=lfs diff=lfs merge=lfs -text
|
| 81 |
+
dataset/train10000_ov2_2.z01 filter=lfs diff=lfs merge=lfs -text
|
| 82 |
+
dataset/train20000_ov1_1.z06 filter=lfs diff=lfs merge=lfs -text
|
| 83 |
+
dataset/train10000_ov2_2.z04 filter=lfs diff=lfs merge=lfs -text
|
| 84 |
+
dataset/train20000_ov1_2.z02 filter=lfs diff=lfs merge=lfs -text
|
| 85 |
+
dataset/train10000_ov2_1.z02 filter=lfs diff=lfs merge=lfs -text
|
| 86 |
+
dataset/train10000_ov2_2.z06 filter=lfs diff=lfs merge=lfs -text
|
| 87 |
+
dataset/train20000_ov1_2.z04 filter=lfs diff=lfs merge=lfs -text
|
| 88 |
+
dataset/train20000_ov1_1.z05 filter=lfs diff=lfs merge=lfs -text
|
| 89 |
+
dataset/train10000_ov2_2.z02 filter=lfs diff=lfs merge=lfs -text
|
| 90 |
+
dataset/train3500_ov3_1.z03 filter=lfs diff=lfs merge=lfs -text
|
| 91 |
+
dataset/train20000_ov1_1.z01 filter=lfs diff=lfs merge=lfs -text
|
| 92 |
+
dataset/train3500_ov3_1.z01 filter=lfs diff=lfs merge=lfs -text
|
| 93 |
+
dataset/train20000_ov1_1.z04 filter=lfs diff=lfs merge=lfs -text
|
| 94 |
+
dataset/train10000_ov2_2.z03 filter=lfs diff=lfs merge=lfs -text
|
| 95 |
+
dataset/train10000_ov2_1.z04 filter=lfs diff=lfs merge=lfs -text
|
| 96 |
+
dataset/train10000_ov2_1.z08 filter=lfs diff=lfs merge=lfs -text
|
| 97 |
+
dataset/train20000_ov1_1.z07 filter=lfs diff=lfs merge=lfs -text
|
| 98 |
+
dataset/train10000_ov2_1.z06 filter=lfs diff=lfs merge=lfs -text
|
| 99 |
+
dataset/train20000_ov1_2.z06 filter=lfs diff=lfs merge=lfs -text
|
| 100 |
+
dataset/train3500_ov3_1.z02 filter=lfs diff=lfs merge=lfs -text
|
| 101 |
+
dataset/train20000_ov1_1.z09 filter=lfs diff=lfs merge=lfs -text
|
| 102 |
+
dataset/train20000_ov1_2.z01 filter=lfs diff=lfs merge=lfs -text
|
models/PSELDNets/README.md
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
pretty_name: DataSynthSELD
|
| 3 |
+
size_categories:
|
| 4 |
+
- 100B<n<1T
|
| 5 |
+
task_categories:
|
| 6 |
+
- audio-classification
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
# PSELDNets: Pre-trained Neural Networks on a Large-scale Synthetic Dataset for Sound Event Localization and Detection
|
| 10 |
+
|
| 11 |
+
- [Paper](https://arxiv.org/abs/2411.06399)
|
| 12 |
+
- [GitHub](https://github.com/Jinbo-Hu/PSELDNets)
|
| 13 |
+
|
| 14 |
+
1. This repo contains 67,000 1-minute clips, amounting to approximately 1,117 hours for training, and 3,060 1-minute clips, amounting to roughly 51 hours for testing.
|
| 15 |
+
2. The dataset features an ontology of 170 sound classes and is generated by convolving sound event clips from [FSD50K](https://zenodo.org/records/4060432) with spatial room impulse responses (SRIRs): simulated SRIRs for training, and collected SRIRs from [TAU-SRIR DB](https://zenodo.org/records/6408611) for testing.
|
| 16 |
+
3. The datasets are generated with this [tool](https://github.com/Jinbo-Hu/SELD-Data-Generator).
|
| 17 |
+
4. The pre-trained SELD checkpoints on the large-scale synthetic dataset are also publicly available.
|
| 18 |
+
|
| 19 |
+
## New Updates
|
| 20 |
+
- (2025-05-22) We release `EINV2-HTSAT-AGG1-0.514.ckpt` and `SEDDOA-HTSAT-AGG1-0.531.ckpt`. The corresponding method is described [here](https://github.com/Jinbo-Hu/PSELDNets/blob/main/AGG_LOSS.md).
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
## Download
|
| 24 |
+
- [Synthetic Datasets](https://huggingface.co/datasets/Jinbo-HU/PSELDNets/tree/main/dataset)
|
| 25 |
+
- [Pre-trained checkpoints](https://huggingface.co/datasets/Jinbo-HU/PSELDNets/tree/main/model)
|
| 26 |
+
|
| 27 |
+
## Citation
|
| 28 |
+
Please cite our paper as shown below if you use the datasets, code, or models of PSELDNets.
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
[1] Jinbo Hu, Yin Cao, Ming Wu, Fang Kang, Feiran Yang, Wenwu Wang, Mark D. Plumbley, Jun Yang, "PSELDNets: Pre-trained Neural Networks on Large-scale Synthetic Datasets for Sound Event Localization and Detection" [arXiv:2411.06399](https://arxiv.org/abs/2411.06399), 2024. [URL](https://arxiv.org/abs/2411.06399)
|
models/PSELDNets/model/ACCDOA-HTSAT-0.566.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c54509137cff95361290cb5578c02cb28a0858bd862f3447da4e46fcb8a5178d
|
| 3 |
+
size 121711961
|
models/PSELDNets/model/Cnn14_mAP%3D0.431.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0dc499e40e9761ef5ea061ffc77697697f277f6a960894903df3ada000e34b31
|
| 3 |
+
size 327428481
|
models/PSELDNets/model/EINV2-HTSAT-0.597.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c3a62db1d672c2c499a207cc62920908c3aa0f563a473d7cea84861d0b663557
|
| 3 |
+
size 234040898
|
models/PSELDNets/model/EINV2-HTSAT-AGG1-0.514.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7b1cb776992e54933b7e1daefbbc6a535e694ccf3946e1d1d288278c7262e2bc
|
| 3 |
+
size 234009780
|
models/PSELDNets/model/HTSAT-fullset-imagenet-768d-32000hz.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6700e6480188ae00b7e133fdd127abf636941cd5bba31077867cb85c92cd0549
|
| 3 |
+
size 127072759
|
models/PSELDNets/model/SEDDOA-HTSAT-AGG1-0.531.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0fe02816d1ce29581c89268518084f355acc6d29256ff1a531332d2db26ef8cf
|
| 3 |
+
size 121878584
|
models/PSELDNets/model/mACCDOA-CNN14-Conformer-0.582.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6c516aac7b88ad90c4bf4f86f3d5d892300ca51164da3b7a9dd68e458a60a768
|
| 3 |
+
size 799827990
|
models/PSELDNets/model/mACCDOA-HTSAT-0.567.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:813083ac938c5974a6f36ceca29ea66c0382091db5df1d6d47ece9572d5ac71b
|
| 3 |
+
size 140516864
|
models/PSELDNets/model/mACCDOA-PaSST-0.562.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c3dcf9a2bc39fa055b899f5c62f3269bbb1db4ac75c21410a8ca294659541bcf
|
| 3 |
+
size 209219358
|
models/PSELDNets/model/passt-l-kd-ap.47.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:14ae3e192bf8e2df3c0960d67a031bac900fc35bee824decef1bf0f9549f4f9b
|
| 3 |
+
size 202889066
|
models/PSELDNets/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/datasets/Jinbo-HU/PSELDNets
|