niobures committed on
Commit
0ec3bdf
·
verified ·
1 Parent(s): 3bcb5e7

PSELDNets (code, models, paper)

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ PSELDNets.[[:space:]]Pre-trained[[:space:]]Neural[[:space:]]Networks[[:space:]]on[[:space:]]a[[:space:]]Large-scale[[:space:]]Synthetic[[:space:]]Dataset[[:space:]]for[[:space:]]Sound[[:space:]]Event[[:space:]]Localization[[:space:]]and[[:space:]]Detection.pdf filter=lfs diff=lfs merge=lfs -text
PSELDNets. Pre-trained Neural Networks on a Large-scale Synthetic Dataset for Sound Event Localization and Detection.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:780bcc78cf9ea10816b2880707733c66509f419fbbc4bf6ef74a8fa3afb2a1ab
3
+ size 7962353
code/PSELDNets.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b982275726e2d50b0a9a312192338037c8bc8616a3532d3418c1e6810ae561d4
3
+ size 4825510
code/checkpoints/audioset-training.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6d98ebeb4ef5c7e903b961e541b6a43212d9af25a5ee8957322f869af3ec1bb
3
+ size 1998284114
code/checkpoints/synthetic-dataset-training.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dab868640298a4e9a34b88e35d775f84bb2a9b970e6cb243988428f15e789f84
3
+ size 1998284276
models/PSELDNets/.gitattributes ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
12
+ *.mds filter=lfs diff=lfs merge=lfs -text
13
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
14
+ *.model filter=lfs diff=lfs merge=lfs -text
15
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
16
+ *.npy filter=lfs diff=lfs merge=lfs -text
17
+ *.npz filter=lfs diff=lfs merge=lfs -text
18
+ *.onnx filter=lfs diff=lfs merge=lfs -text
19
+ *.ot filter=lfs diff=lfs merge=lfs -text
20
+ *.parquet filter=lfs diff=lfs merge=lfs -text
21
+ *.pb filter=lfs diff=lfs merge=lfs -text
22
+ *.pickle filter=lfs diff=lfs merge=lfs -text
23
+ *.pkl filter=lfs diff=lfs merge=lfs -text
24
+ *.pt filter=lfs diff=lfs merge=lfs -text
25
+ *.pth filter=lfs diff=lfs merge=lfs -text
26
+ *.rar filter=lfs diff=lfs merge=lfs -text
27
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
28
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
30
+ *.tar filter=lfs diff=lfs merge=lfs -text
31
+ *.tflite filter=lfs diff=lfs merge=lfs -text
32
+ *.tgz filter=lfs diff=lfs merge=lfs -text
33
+ *.wasm filter=lfs diff=lfs merge=lfs -text
34
+ *.xz filter=lfs diff=lfs merge=lfs -text
35
+ *.zip filter=lfs diff=lfs merge=lfs -text
36
+ *.zst filter=lfs diff=lfs merge=lfs -text
37
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
38
+ # Audio files - uncompressed
39
+ *.pcm filter=lfs diff=lfs merge=lfs -text
40
+ *.sam filter=lfs diff=lfs merge=lfs -text
41
+ *.raw filter=lfs diff=lfs merge=lfs -text
42
+ # Audio files - compressed
43
+ *.aac filter=lfs diff=lfs merge=lfs -text
44
+ *.flac filter=lfs diff=lfs merge=lfs -text
45
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
46
+ *.ogg filter=lfs diff=lfs merge=lfs -text
47
+ *.wav filter=lfs diff=lfs merge=lfs -text
48
+ # Image files - uncompressed
49
+ *.bmp filter=lfs diff=lfs merge=lfs -text
50
+ *.gif filter=lfs diff=lfs merge=lfs -text
51
+ *.png filter=lfs diff=lfs merge=lfs -text
52
+ *.tiff filter=lfs diff=lfs merge=lfs -text
53
+ # Image files - compressed
54
+ *.jpg filter=lfs diff=lfs merge=lfs -text
55
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
56
+ *.webp filter=lfs diff=lfs merge=lfs -text
57
+ # Video files - compressed
58
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
59
+ *.webm filter=lfs diff=lfs merge=lfs -text
60
+ dataset/train20000_ov1_1.z02 filter=lfs diff=lfs merge=lfs -text
61
+ dataset/train20000_ov1_1.z03 filter=lfs diff=lfs merge=lfs -text
62
+ dataset/train10000_ov2_1.z01 filter=lfs diff=lfs merge=lfs -text
63
+ dataset/train10000_ov2_2.z08 filter=lfs diff=lfs merge=lfs -text
64
+ dataset/train3500_ov3_2.z03 filter=lfs diff=lfs merge=lfs -text
65
+ dataset/train10000_ov2_2.z07 filter=lfs diff=lfs merge=lfs -text
66
+ dataset/train20000_ov1_2.z07 filter=lfs diff=lfs merge=lfs -text
67
+ dataset/train20000_ov1_2.z03 filter=lfs diff=lfs merge=lfs -text
68
+ dataset/train20000_ov1_2.z10 filter=lfs diff=lfs merge=lfs -text
69
+ dataset/train20000_ov1_1.z08 filter=lfs diff=lfs merge=lfs -text
70
+ dataset/train20000_ov1_2.z05 filter=lfs diff=lfs merge=lfs -text
71
+ dataset/train3500_ov3_2.z02 filter=lfs diff=lfs merge=lfs -text
72
+ dataset/train10000_ov2_1.z07 filter=lfs diff=lfs merge=lfs -text
73
+ dataset/train10000_ov2_2.z05 filter=lfs diff=lfs merge=lfs -text
74
+ dataset/test1800_ov1.z01 filter=lfs diff=lfs merge=lfs -text
75
+ dataset/train3500_ov3_2.z01 filter=lfs diff=lfs merge=lfs -text
76
+ dataset/train20000_ov1_2.z09 filter=lfs diff=lfs merge=lfs -text
77
+ dataset/train10000_ov2_1.z05 filter=lfs diff=lfs merge=lfs -text
78
+ dataset/train20000_ov1_1.z10 filter=lfs diff=lfs merge=lfs -text
79
+ dataset/train20000_ov1_2.z08 filter=lfs diff=lfs merge=lfs -text
80
+ dataset/train10000_ov2_1.z03 filter=lfs diff=lfs merge=lfs -text
81
+ dataset/train10000_ov2_2.z01 filter=lfs diff=lfs merge=lfs -text
82
+ dataset/train20000_ov1_1.z06 filter=lfs diff=lfs merge=lfs -text
83
+ dataset/train10000_ov2_2.z04 filter=lfs diff=lfs merge=lfs -text
84
+ dataset/train20000_ov1_2.z02 filter=lfs diff=lfs merge=lfs -text
85
+ dataset/train10000_ov2_1.z02 filter=lfs diff=lfs merge=lfs -text
86
+ dataset/train10000_ov2_2.z06 filter=lfs diff=lfs merge=lfs -text
87
+ dataset/train20000_ov1_2.z04 filter=lfs diff=lfs merge=lfs -text
88
+ dataset/train20000_ov1_1.z05 filter=lfs diff=lfs merge=lfs -text
89
+ dataset/train10000_ov2_2.z02 filter=lfs diff=lfs merge=lfs -text
90
+ dataset/train3500_ov3_1.z03 filter=lfs diff=lfs merge=lfs -text
91
+ dataset/train20000_ov1_1.z01 filter=lfs diff=lfs merge=lfs -text
92
+ dataset/train3500_ov3_1.z01 filter=lfs diff=lfs merge=lfs -text
93
+ dataset/train20000_ov1_1.z04 filter=lfs diff=lfs merge=lfs -text
94
+ dataset/train10000_ov2_2.z03 filter=lfs diff=lfs merge=lfs -text
95
+ dataset/train10000_ov2_1.z04 filter=lfs diff=lfs merge=lfs -text
96
+ dataset/train10000_ov2_1.z08 filter=lfs diff=lfs merge=lfs -text
97
+ dataset/train20000_ov1_1.z07 filter=lfs diff=lfs merge=lfs -text
98
+ dataset/train10000_ov2_1.z06 filter=lfs diff=lfs merge=lfs -text
99
+ dataset/train20000_ov1_2.z06 filter=lfs diff=lfs merge=lfs -text
100
+ dataset/train3500_ov3_1.z02 filter=lfs diff=lfs merge=lfs -text
101
+ dataset/train20000_ov1_1.z09 filter=lfs diff=lfs merge=lfs -text
102
+ dataset/train20000_ov1_2.z01 filter=lfs diff=lfs merge=lfs -text
models/PSELDNets/README.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ pretty_name: DataSynthSELD
3
+ size_categories:
4
+ - 100B<n<1T
5
+ task_categories:
6
+ - audio-classification
7
+ ---
8
+
9
+ # PSELDNets: Pre-trained Neural Networks on a Large-scale Synthetic Dataset for Sound Event Localization and Detection
10
+
11
+ - [Paper](https://arxiv.org/abs/2411.06399)
12
+ - [GitHub](https://github.com/Jinbo-Hu/PSELDNets)
13
+
14
+ 1. This repo contains 67,000 1-minute clips, amounting to approximately 1,117 hours for training, and 3,060 1-minute clips, amounting to roughly 51 hours for testing.
15
+ 2. The dataset features an ontology of 170 sound classes and is generated by convolving sound event clips from [FSD50K](https://zenodo.org/records/4060432) with simulated SRIRs (for training) or collected SRIRs from [TAU-SRIR DB](https://zenodo.org/records/6408611) (for testing).
16
+ 3. The datasets are generated with this [tool](https://github.com/Jinbo-Hu/SELD-Data-Generator).
17
+ 4. The pre-trained SELD checkpoints on the large-scale synthetic dataset are also publicly available.
18
+
19
+ ## New Updates
20
+ - (2025-05-22) We release `EINV2-HTSAT-AGG1-0.514.ckpt` and `SEDDOA-HTSAT-AGG1-0.531.ckpt`. The corresponding method is described [here](https://github.com/Jinbo-Hu/PSELDNets/blob/main/AGG_LOSS.md).
21
+
22
+
23
+ ## Download
24
+ - [Synthetic Datasets](https://huggingface.co/datasets/Jinbo-HU/PSELDNets/tree/main/dataset)
25
+ - [Pre-trained checkpoints](https://huggingface.co/datasets/Jinbo-HU/PSELDNets/tree/main/model)
26
+
27
+ ## Citation
28
+ Please cite our paper as below if you use the datasets, code, or models of PSELDNets.
29
+
30
+
31
+ [1] Jinbo Hu, Yin Cao, Ming Wu, Fang Kang, Feiran Yang, Wenwu Wang, Mark D. Plumbley, Jun Yang, "PSELDNets: Pre-trained Neural Networks on Large-scale Synthetic Datasets for Sound Event Localization and Detection," [arXiv:2411.06399](https://arxiv.org/abs/2411.06399), 2024. [URL](https://arxiv.org/abs/2411.06399)
models/PSELDNets/model/ACCDOA-HTSAT-0.566.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c54509137cff95361290cb5578c02cb28a0858bd862f3447da4e46fcb8a5178d
3
+ size 121711961
models/PSELDNets/model/Cnn14_mAP%3D0.431.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dc499e40e9761ef5ea061ffc77697697f277f6a960894903df3ada000e34b31
3
+ size 327428481
models/PSELDNets/model/EINV2-HTSAT-0.597.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3a62db1d672c2c499a207cc62920908c3aa0f563a473d7cea84861d0b663557
3
+ size 234040898
models/PSELDNets/model/EINV2-HTSAT-AGG1-0.514.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b1cb776992e54933b7e1daefbbc6a535e694ccf3946e1d1d288278c7262e2bc
3
+ size 234009780
models/PSELDNets/model/HTSAT-fullset-imagenet-768d-32000hz.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6700e6480188ae00b7e133fdd127abf636941cd5bba31077867cb85c92cd0549
3
+ size 127072759
models/PSELDNets/model/SEDDOA-HTSAT-AGG1-0.531.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fe02816d1ce29581c89268518084f355acc6d29256ff1a531332d2db26ef8cf
3
+ size 121878584
models/PSELDNets/model/mACCDOA-CNN14-Conformer-0.582.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c516aac7b88ad90c4bf4f86f3d5d892300ca51164da3b7a9dd68e458a60a768
3
+ size 799827990
models/PSELDNets/model/mACCDOA-HTSAT-0.567.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:813083ac938c5974a6f36ceca29ea66c0382091db5df1d6d47ece9572d5ac71b
3
+ size 140516864
models/PSELDNets/model/mACCDOA-PaSST-0.562.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3dcf9a2bc39fa055b899f5c62f3269bbb1db4ac75c21410a8ca294659541bcf
3
+ size 209219358
models/PSELDNets/model/passt-l-kd-ap.47.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14ae3e192bf8e2df3c0960d67a031bac900fc35bee824decef1bf0f9549f4f9b
3
+ size 202889066
models/PSELDNets/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/datasets/Jinbo-HU/PSELDNets