Loie committed on
Commit
4053b0f
·
verified ·
1 Parent(s): 607c7e1

Upload 8 files

Browse files
README.md ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # SpotSound: Enhancing Large Audio-Language Models with Fine-Grained Temporal Grounding
3
+
4
+ [![GitHub](https://img.shields.io/badge/GitHub-Repo-black?logo=github)](https://github.com/LoieSun/SpotSound)
5
+ [![Paper](https://img.shields.io/badge/arXiv-Paper-red?logo=arxiv)](#)
6
+ [![Benchmark](https://img.shields.io/badge/🤗_HuggingFace-Benchmark-yellow)](https://huggingface.co/datasets/Loie/SpotSound-Bench)
7
+
8
+ ## Model Summary
9
+
10
+ **SpotSound** is a model designed to enhance Large Audio-Language Models (ALMs) with fine-grained temporal grounding capabilities. Built on top of [Audio Flamingo 3](https://huggingface.co/nvidia/audio-flamingo-3), SpotSound is capable of accurately pinpointing the exact start and end timestamps of specific acoustic events within long, untrimmed audio recordings based on natural language queries.
11
+
12
+ This model is particularly effective for "needle-in-a-haystack" audio retrieval tasks, where short target sounds are embedded within complex background noise.
13
+
14
+
15
+ ## Usage / Quick Start
16
+
17
+ To use SpotSound for inference, you need to download both the base **Audio Flamingo 3** model and the **SpotSound** checkpoint.
18
+
19
+ ### 1. Installation
20
+
21
+ First, clone the official [SpotSound GitHub repository](https://github.com/LoieSun/SpotSound) and set up the environment:
22
+
23
+ ```bash
24
+ conda create -n SpotSound python=3.10
25
+ conda activate SpotSound
26
+ pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu121
27
+ pip install -r requirements.txt
28
+ ```
29
+
30
+ ### 2. Run Inference
31
+
32
+ You can run inference directly from the command line using the provided script in the GitHub repository. Specify the path to the downloaded Audio Flamingo 3 model, your SpotSound checkpoint, the target audio file, and your text query.
33
+
34
+ ```bash
35
+ export CUDA_VISIBLE_DEVICES=0
36
+ python inference.py \
37
+ --pretrain_model path_to_audioflamingo3 \
38
+ --checkpoint ckpt/spotsound \
39
+ --audio_path data/audio.wav \
40
+ --query "dog barking"
41
+ ```
42
+
43
+ ## Citation
44
+
45
+ If you use SpotSound or our benchmark in your research, please cite our paper:
46
+
47
+ ```bibtex
48
+ @inproceedings{sun2026spotsound,
49
+ title={SpotSound: Enhancing Large Audio-Language Models with Fine-Grained Temporal Grounding},
50
+ author={Sun, Luoyi and Zhou, Xiao and Li, Zeqian and Zhang, Ya and Wang, Yanfeng and Xie, Weidi},
51
+ year={2026}
52
+ }
53
+ ```
54
+
55
+ ## Acknowledgements
56
+
57
+ This project builds upon several excellent open-source efforts, notably:
58
+ - **[Audio Flamingo 3](https://github.com/NVIDIA/audio-flamingo/tree/audio_flamingo_3)** by NVIDIA.
59
+ - **[UniTime](https://github.com/Lzq5/UniTime)** for temporal grounding insights.
60
+
61
+ ## Contact
62
+
63
+ For any questions or issues, please contact: loiesun411@gmail.com or open an issue on our GitHub repository.
adapter_config.json ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "audio-flamingo-3-hf",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.1,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "model.layers.0.self_attn.k_proj",
33
+ "model.layers.17.self_attn.k_proj",
34
+ "model.layers.24.self_attn.q_proj",
35
+ "model.layers.2.self_attn.q_proj",
36
+ "model.layers.0.self_attn.v_proj",
37
+ "model.layers.7.self_attn.k_proj",
38
+ "model.layers.15.self_attn.v_proj",
39
+ "model.layers.19.self_attn.k_proj",
40
+ "down_proj",
41
+ "model.layers.12.self_attn.v_proj",
42
+ "model.layers.21.self_attn.v_proj",
43
+ "model.layers.23.self_attn.v_proj",
44
+ "model.layers.26.self_attn.k_proj",
45
+ "model.layers.19.self_attn.v_proj",
46
+ "model.layers.15.self_attn.k_proj",
47
+ "model.layers.24.self_attn.k_proj",
48
+ "model.layers.18.self_attn.q_proj",
49
+ "model.layers.20.self_attn.k_proj",
50
+ "model.layers.5.self_attn.v_proj",
51
+ "model.layers.10.self_attn.v_proj",
52
+ "model.layers.22.self_attn.k_proj",
53
+ "model.layers.3.self_attn.k_proj",
54
+ "model.layers.6.self_attn.k_proj",
55
+ "model.layers.9.self_attn.k_proj",
56
+ "model.layers.22.self_attn.q_proj",
57
+ "model.layers.5.self_attn.k_proj",
58
+ "model.layers.25.self_attn.v_proj",
59
+ "model.layers.23.self_attn.q_proj",
60
+ "model.layers.13.self_attn.v_proj",
61
+ "model.layers.3.self_attn.q_proj",
62
+ "model.layers.10.self_attn.q_proj",
63
+ "model.layers.13.self_attn.k_proj",
64
+ "model.layers.23.self_attn.k_proj",
65
+ "model.layers.15.self_attn.q_proj",
66
+ "model.layers.1.self_attn.k_proj",
67
+ "model.layers.2.self_attn.v_proj",
68
+ "gate_proj",
69
+ "model.layers.17.self_attn.v_proj",
70
+ "model.layers.21.self_attn.q_proj",
71
+ "model.layers.27.self_attn.q_proj",
72
+ "model.layers.7.self_attn.v_proj",
73
+ "model.layers.11.self_attn.q_proj",
74
+ "model.layers.16.self_attn.v_proj",
75
+ "model.layers.11.self_attn.v_proj",
76
+ "model.layers.12.self_attn.q_proj",
77
+ "model.layers.18.self_attn.v_proj",
78
+ "model.layers.3.self_attn.v_proj",
79
+ "model.layers.27.self_attn.v_proj",
80
+ "model.layers.26.self_attn.v_proj",
81
+ "model.layers.21.self_attn.k_proj",
82
+ "model.layers.8.self_attn.q_proj",
83
+ "model.layers.25.self_attn.q_proj",
84
+ "model.layers.5.self_attn.q_proj",
85
+ "model.layers.9.self_attn.q_proj",
86
+ "model.layers.18.self_attn.k_proj",
87
+ "model.layers.4.self_attn.k_proj",
88
+ "model.layers.0.self_attn.q_proj",
89
+ "model.layers.11.self_attn.k_proj",
90
+ "model.layers.4.self_attn.v_proj",
91
+ "model.layers.20.self_attn.q_proj",
92
+ "model.layers.10.self_attn.k_proj",
93
+ "model.layers.22.self_attn.v_proj",
94
+ "model.layers.14.self_attn.k_proj",
95
+ "model.layers.1.self_attn.q_proj",
96
+ "model.layers.12.self_attn.k_proj",
97
+ "model.layers.13.self_attn.q_proj",
98
+ "model.layers.20.self_attn.v_proj",
99
+ "up_proj",
100
+ "model.layers.25.self_attn.k_proj",
101
+ "model.layers.17.self_attn.q_proj",
102
+ "model.layers.1.self_attn.v_proj",
103
+ "model.layers.4.self_attn.q_proj",
104
+ "model.layers.19.self_attn.q_proj",
105
+ "model.layers.14.self_attn.v_proj",
106
+ "model.layers.27.self_attn.k_proj",
107
+ "model.layers.14.self_attn.q_proj",
108
+ "model.layers.7.self_attn.q_proj",
109
+ "model.layers.16.self_attn.q_proj",
110
+ "o_proj",
111
+ "model.layers.16.self_attn.k_proj",
112
+ "model.layers.6.self_attn.v_proj",
113
+ "model.layers.8.self_attn.k_proj",
114
+ "model.layers.2.self_attn.k_proj",
115
+ "model.layers.26.self_attn.q_proj",
116
+ "model.layers.24.self_attn.v_proj",
117
+ "model.layers.8.self_attn.v_proj",
118
+ "model.layers.9.self_attn.v_proj",
119
+ "model.layers.6.self_attn.q_proj"
120
+ ],
121
+ "target_parameters": null,
122
+ "task_type": "CAUSAL_LM",
123
+ "trainable_token_indices": null,
124
+ "use_dora": false,
125
+ "use_qalora": false,
126
+ "use_rslora": false
127
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab87de32fc6ae5acfaccc22c5f9792230a4ac8f11ca15f968d21b9c26075cf5e
3
+ size 80797976
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6b92c23c4174f5dabf3c48a10081edd82868344e2cf75016708d02fa257ac2b
3
+ size 161810282
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ab158b3630065e0379f440b006083e3cb588fe2f21ffcb5a3a4b57b941537c1
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1996f8e7b4f0cb933e459b73d89902fdf29339d53a3bf845c48f923c01b102c0
3
+ size 1064
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bc632d0ab1a2384fc16b375a26abd5fb27276c54e503f67ab4fa0533fc768cb
3
+ size 4856