ooshyun commited on
Commit
494a55b
·
verified ·
1 Parent(s): 1893f13

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ tags:
4
+ - audio
5
+ - sound-event-detection
6
+ - audio-spectrogram-transformer
7
+ - yamnet
8
+ datasets:
9
+ - audioset
10
+ language:
11
+ - en
12
+ pipeline_tag: audio-classification
13
+ ---
14
+
15
+ # Sound Event Detection — Pretrained Models
16
+
17
+ Pretrained models for Sound Event Detection (SED) used in **MobiSys 2026 #198 "Aurchestra"**.
18
+
19
+ ## Models
20
+
21
+ ### 1. YAMNet (Pretrained Baseline)
22
+
23
+ - **Source**: [google/yamnet](https://huggingface.co/google/yamnet) (TensorFlow) / PyTorch reimplementation
24
+ - **Classes**: 521 AudioSet classes
25
+ - **Usage**: Loaded directly from the Hugging Face Hub — no checkpoint is stored in this repo
26
+
27
+ ### 2. AST (Pretrained Baseline)
28
+
29
+ - **Source**: [MIT/ast-finetuned-audioset-10-10-0.4593](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593)
30
+ - **Architecture**: Audio Spectrogram Transformer
31
+ - **Classes**: 527 AudioSet classes
32
+ - **Usage**: Loaded directly from the Hugging Face Hub — no checkpoint is stored in this repo
33
+
34
+ ### 3. Fine-tuned AST (`sed_ast_snr_ctl_v2_16k`)
35
+
36
+ - **Base model**: `MIT/ast-finetuned-audioset-10-10-0.4593`
37
+ - **Fine-tuned on**: On-the-fly synthesized binaural audio mixtures (SNR-controlled, 16 kHz)
38
+ - **Classes**: 20 target sound classes
39
+ - **Training**: AdamW, OneCycleLR with group-wise learning rates (backbone 1e-5, head 1e-3), 80 epochs
40
+ - **Checkpoint**: `sed_ast_snr_ctl_v2_16k/checkpoints/best.pt`
41
+
42
+ ## File Structure
43
+
44
+ ```
45
+ .
46
+ ├── README.md
47
+ └── sed_ast_snr_ctl_v2_16k/
48
+ ├── config.json # Training configuration
49
+ └── checkpoints/
50
+ └── best.pt # Fine-tuned model weights (~1 GB)
51
+ ```
52
+
53
+ ## Usage
54
+
55
+ ```python
56
+ # Fine-tuned AST
57
+ from huggingface_hub import hf_hub_download
58
+
59
+ checkpoint_path = hf_hub_download(
60
+ repo_id="ooshyun/sound_event_detection",
61
+ filename="sed_ast_snr_ctl_v2_16k/checkpoints/best.pt",
62
+ )
63
+
64
+ config_path = hf_hub_download(
65
+ repo_id="ooshyun/sound_event_detection",
66
+ filename="sed_ast_snr_ctl_v2_16k/config.json",
67
+ )
68
+ ```
69
+
70
+ For training and evaluation code, see [ooshyun/sound_event_detection](https://github.com/ooshyun/sound_event_detection).
71
+
72
+ ## Citation
73
+
74
+ If you use these models, please cite:
75
+
76
+ ```
77
+ MobiSys 2026 #198 "Aurchestra"
78
+ ```
sed_ast_snr_ctl_v2_16k/checkpoints/best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a640a4e87be91870e9b19b991297e598b36608d2cc7626132fa122dc3b0815d
3
+ size 1035160839
sed_ast_snr_ctl_v2_16k/config.json ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "pl_module": "src.hl_modules.sed.Module",
3
+ "pl_module_args": {
4
+ "metrics": [
5
+ "accuracy"
6
+ ],
7
+ "model": "src.models.SED.ast_hf.ASTHuggingFace",
8
+ "model_params": {
9
+ "model_name": "MIT/ast-finetuned-audioset-10-10-0.4593",
10
+ "num_labels": 20,
11
+ "unfreeze_layers": [
12
+ "classifier"
13
+ ]
14
+ },
15
+ "samples_per_speaker_number": 20,
16
+ "optimizer": "torch.optim.AdamW",
17
+ "optimizer_params": {
18
+ "lr": 0.001
19
+ },
20
+ "loss": "src.losses.SEDLoss.MultiLabelBCELoss",
21
+ "loss_params": {},
22
+ "scheduler": "onecycle_with_groups",
23
+ "scheduler_params": {
24
+ "backbone_lr": 1e-05,
25
+ "head_lr": 0.001,
26
+ "backbone_max_lr": 1e-05,
27
+ "head_max_lr": 0.001,
28
+ "epochs": 80,
29
+ "steps_per_epoch": 2500,
30
+ "pct_start": 0.1,
31
+ "div_factor": 25.0,
32
+ "final_div_factor": 10000.0
33
+ },
34
+ "sr": 16000,
35
+ "grad_clip": 1,
36
+ "early_stopping": {
37
+ "enabled": true,
38
+ "monitor": "val/loss",
39
+ "mode": "min",
40
+ "patience": 20,
41
+ "min_delta": 0.0,
42
+ "verbose": true
43
+ }
44
+ },
45
+ "root_dataset_dir": "/scr",
46
+ "train_dataset": "src.frozen_dataset.frozen_dataset.FrozenMisophoniaDataset",
47
+ "train_data_args": {
48
+ "split_dir": "frozen-10c-40000/train"
49
+ },
50
+ "val_dataset": "src.frozen_dataset.frozen_dataset.FrozenMisophoniaDataset",
51
+ "val_data_args": {
52
+ "split_dir": "frozen-10c-40000/val"
53
+ },
54
+ "test_dataset": "src.frozen_dataset.frozen_dataset.FrozenMisophoniaDataset",
55
+ "test_data_args": {
56
+ "split_dir": "frozen-10c-40000/test"
57
+ },
58
+ "onflight_mode": 1,
59
+ "onflight_train_dataset": "src.datasets.MisophoniaDataset.MisophoniaDataset",
60
+ "onflight_train_data_args": {
61
+ "fg_sounds_dir": "BinauralCuratedDataset/scaper_fmt/train",
62
+ "bg_sounds_dir": "BinauralCuratedDataset/bg_scaper_fmt/train",
63
+ "noise_sounds_dir": "BinauralCuratedDataset/noise_scaper_fmt/train",
64
+ "hrtf_list": "BinauralCuratedDataset/hrtf/CIPIC/train_hrtf.txt",
65
+ "samples_per_epoch": 20000,
66
+ "duration": 5,
67
+ "sr": 16000,
68
+ "hrtf_type": "CIPIC",
69
+ "augmentations": [],
70
+ "num_total_labels": 20,
71
+ "num_fg_sounds_min": 1,
72
+ "num_fg_sounds_max": 5,
73
+ "num_bg_sounds_min": 1,
74
+ "num_bg_sounds_max": 3,
75
+ "num_noise_sounds_min": 1,
76
+ "num_noise_sounds_max": 1,
77
+ "num_output_channels": 5,
78
+ "snr_range_fg": [
79
+ 5,
80
+ 15
81
+ ],
82
+ "snr_range_bg": [
83
+ 0,
84
+ 10
85
+ ],
86
+ "ref_db": -50,
87
+ "onflight_mode": 1
88
+ },
89
+ "onflight_val_dataset": "src.datasets.MisophoniaDataset.MisophoniaDataset",
90
+ "onflight_val_data_args": {
91
+ "fg_sounds_dir": "BinauralCuratedDataset/scaper_fmt/val",
92
+ "bg_sounds_dir": "BinauralCuratedDataset/bg_scaper_fmt/val",
93
+ "noise_sounds_dir": "BinauralCuratedDataset/noise_scaper_fmt/val",
94
+ "hrtf_list": "BinauralCuratedDataset/hrtf/CIPIC/val_hrtf.txt",
95
+ "samples_per_epoch": 2000,
96
+ "duration": 5,
97
+ "sr": 16000,
98
+ "hrtf_type": "CIPIC",
99
+ "augmentations": [],
100
+ "num_total_labels": 20,
101
+ "num_fg_sounds_min": 1,
102
+ "num_fg_sounds_max": 5,
103
+ "num_bg_sounds_min": 1,
104
+ "num_bg_sounds_max": 3,
105
+ "num_noise_sounds_min": 1,
106
+ "num_noise_sounds_max": 1,
107
+ "num_output_channels": 5,
108
+ "snr_range_fg": [
109
+ 5,
110
+ 15
111
+ ],
112
+ "snr_range_bg": [
113
+ 0,
114
+ 10
115
+ ],
116
+ "ref_db": -50,
117
+ "onflight_mode": 1
118
+ },
119
+ "onflight_test_dataset": "src.datasets.MisophoniaDataset.MisophoniaDataset",
120
+ "onflight_test_data_args": {
121
+ "fg_sounds_dir": "BinauralCuratedDataset/scaper_fmt/test",
122
+ "bg_sounds_dir": "BinauralCuratedDataset/bg_scaper_fmt/test",
123
+ "noise_sounds_dir": "BinauralCuratedDataset/noise_scaper_fmt/test",
124
+ "hrtf_list": "BinauralCuratedDataset/hrtf/CIPIC/test_hrtf.txt",
125
+ "samples_per_epoch": 2000,
126
+ "duration": 2,
127
+ "sr": 16000,
128
+ "hrtf_type": "CIPIC",
129
+ "augmentations": [],
130
+ "num_total_labels": 20,
131
+ "num_fg_sounds_min": 5,
132
+ "num_fg_sounds_max": 5,
133
+ "num_bg_sounds_min": 1,
134
+ "num_bg_sounds_max": 1,
135
+ "num_noise_sounds_min": 1,
136
+ "num_noise_sounds_max": 1,
137
+ "num_output_channels": 5,
138
+ "snr_range_fg": [
139
+ 5,
140
+ 15
141
+ ],
142
+ "snr_range_bg": [
143
+ 0,
144
+ 10
145
+ ],
146
+ "ref_db": -50,
147
+ "onflight_mode": 1
148
+ },
149
+ "epochs": 80,
150
+ "batch_size": 8,
151
+ "eval_batch_size": 8,
152
+ "num_workers": 16,
153
+ "logging": {
154
+ "module_levels": {
155
+ "src.train": "INFO",
156
+ "src.training.train_val": "INFO",
157
+ "src.metrics.metrics": "INFO",
158
+ "src.hl_modules.sed": "INFO",
159
+ "src.datasets.compile.tau_label_collector": "DEBUG",
160
+ "src.datasets.MisophoniaDataset": "INFO",
161
+ "src.datasets.multi_ch_simulator": "INFO",
162
+ "src.models.GuidedTFNetwork.multiflim_guided_tfnet": "INFO",
163
+ "src.models.GuidedTFNetwork.guided_tfnet": "INFO",
164
+ "src.cuda.cuda": "DEBUG"
165
+ }
166
+ }
167
+ }