happyme531 committed
Commit 34ebdd6 · verified · 1 Parent(s): 2dc558a

Upload 15 files

.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ librkllmrt.so filter=lfs diff=lfs merge=lfs -text
+ long_test.wav filter=lfs diff=lfs merge=lfs -text
+ rknn/audio_encoder.rknn filter=lfs diff=lfs merge=lfs -text
+ rknn/language_model.rkllm filter=lfs diff=lfs merge=lfs -text
chat_template.json ADDED
@@ -0,0 +1 @@
+ {"chat_template": "{%- set ns = namespace(system_text=\"\") -%}\n{%- for m in messages -%}\n {%- if m.role == 'system' -%}\n {%- if m.content is string -%}\n {%- set ns.system_text = ns.system_text + m.content -%}\n {%- else -%}\n {%- for c in m.content -%}\n {%- if c.type == 'text' and (c.text is defined) -%}\n {%- set ns.system_text = ns.system_text + c.text -%}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n\n{%- set ns2 = namespace(audio_tokens=\"\") -%}\n{%- for m in messages -%}\n {%- if m.content is not string -%}\n {%- for c in m.content -%}\n {%- if c.type == 'audio' or ('audio' in c) or ('audio_url' in c) -%}\n {%- set ns2.audio_tokens = ns2.audio_tokens + \"<|audio_start|><|audio_pad|><|audio_end|>\" -%}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n{%- endfor -%}\n\n{{- '<|im_start|>system\\n' + (ns.system_text if ns.system_text is string else '') + '<|im_end|>\\n' -}}\n{{- '<|im_start|>user\\n' + ns2.audio_tokens + '<|im_end|>\\n' -}}\n{%- if add_generation_prompt -%}\n{{- '<|im_start|>assistant\\n' -}}\n{%- endif -%}"}
config.json ADDED
@@ -0,0 +1,221 @@
+ {
+   "architectures": [
+     "Qwen3ASRForConditionalGeneration"
+   ],
+   "model_type": "qwen3_asr",
+   "support_languages": [
+     "Chinese",
+     "English",
+     "Cantonese",
+     "Arabic",
+     "German",
+     "French",
+     "Spanish",
+     "Portuguese",
+     "Indonesian",
+     "Italian",
+     "Korean",
+     "Russian",
+     "Thai",
+     "Vietnamese",
+     "Japanese",
+     "Turkish",
+     "Hindi",
+     "Malay",
+     "Dutch",
+     "Swedish",
+     "Danish",
+     "Finnish",
+     "Polish",
+     "Czech",
+     "Filipino",
+     "Persian",
+     "Greek",
+     "Romanian",
+     "Hungarian",
+     "Macedonian"
+   ],
+   "thinker_config": {
+     "model_type": "qwen3_asr",
+     "architectures": [
+       "Qwen3ASRForConditionalGeneration"
+     ],
+     "audio_config": {
+       "_name_or_path": "",
+       "activation_dropout": 0,
+       "activation_function": "gelu",
+       "add_cross_attention": false,
+       "architectures": null,
+       "attention_dropout": 0,
+       "bad_words_ids": null,
+       "begin_suppress_tokens": null,
+       "bos_token_id": null,
+       "chunk_size_feed_forward": 0,
+       "conv_chunksize": 500,
+       "cross_attention_hidden_size": null,
+       "d_model": 1024,
+       "decoder_start_token_id": null,
+       "diversity_penalty": 0.0,
+       "do_sample": false,
+       "downsample_hidden_size": 480,
+       "dropout": 0,
+       "dtype": null,
+       "early_stopping": false,
+       "encoder_attention_heads": 16,
+       "encoder_ffn_dim": 4096,
+       "encoder_layers": 24,
+       "encoder_no_repeat_ngram_size": 0,
+       "eos_token_id": null,
+       "exponential_decay_length_penalty": null,
+       "finetuning_task": null,
+       "forced_bos_token_id": null,
+       "forced_eos_token_id": null,
+       "id2label": {
+         "0": "LABEL_0",
+         "1": "LABEL_1"
+       },
+       "initializer_range": 0.02,
+       "is_decoder": false,
+       "is_encoder_decoder": false,
+       "label2id": {
+         "LABEL_0": 0,
+         "LABEL_1": 1
+       },
+       "length_penalty": 1.0,
+       "max_length": 20,
+       "max_source_positions": 1500,
+       "min_length": 0,
+       "model_type": "qwen3_asr_audio_encoder",
+       "n_window": 50,
+       "n_window_infer": 800,
+       "no_repeat_ngram_size": 0,
+       "num_beam_groups": 1,
+       "num_beams": 1,
+       "num_hidden_layers": 24,
+       "num_mel_bins": 128,
+       "num_return_sequences": 1,
+       "output_attentions": false,
+       "output_dim": 2048,
+       "output_hidden_states": false,
+       "output_scores": false,
+       "pad_token_id": null,
+       "prefix": null,
+       "problem_type": null,
+       "pruned_heads": {},
+       "remove_invalid_values": false,
+       "repetition_penalty": 1.0,
+       "return_dict": true,
+       "return_dict_in_generate": false,
+       "scale_embedding": false,
+       "sep_token_id": null,
+       "suppress_tokens": null,
+       "task_specific_params": null,
+       "temperature": 1.0,
+       "tf_legacy_loss": false,
+       "tie_encoder_decoder": false,
+       "tie_word_embeddings": true,
+       "tokenizer_class": null,
+       "top_k": 50,
+       "top_p": 1.0,
+       "torchscript": false,
+       "typical_p": 1.0,
+       "use_bfloat16": false
+     },
+     "audio_end_token_id": 151670,
+     "audio_start_token_id": 151669,
+     "audio_token_id": 151676,
+     "dtype": "bfloat16",
+     "initializer_range": 0.02,
+     "text_config": {
+       "_name_or_path": "",
+       "add_cross_attention": false,
+       "architectures": null,
+       "attention_bias": false,
+       "attention_dropout": 0.0,
+       "bad_words_ids": null,
+       "begin_suppress_tokens": null,
+       "bos_token_id": null,
+       "chunk_size_feed_forward": 0,
+       "cross_attention_hidden_size": null,
+       "decoder_start_token_id": null,
+       "diversity_penalty": 0.0,
+       "do_sample": false,
+       "dtype": null,
+       "early_stopping": false,
+       "encoder_no_repeat_ngram_size": 0,
+       "eos_token_id": null,
+       "exponential_decay_length_penalty": null,
+       "finetuning_task": null,
+       "forced_bos_token_id": null,
+       "forced_eos_token_id": null,
+       "head_dim": 128,
+       "hidden_act": "silu",
+       "hidden_size": 2048,
+       "id2label": {
+         "0": "LABEL_0",
+         "1": "LABEL_1"
+       },
+       "initializer_range": 0.02,
+       "intermediate_size": 6144,
+       "is_decoder": false,
+       "is_encoder_decoder": false,
+       "label2id": {
+         "LABEL_0": 0,
+         "LABEL_1": 1
+       },
+       "length_penalty": 1.0,
+       "max_length": 20,
+       "max_position_embeddings": 65536,
+       "min_length": 0,
+       "model_type": "qwen3",
+       "no_repeat_ngram_size": 0,
+       "num_attention_heads": 16,
+       "num_beam_groups": 1,
+       "num_beams": 1,
+       "num_hidden_layers": 28,
+       "num_key_value_heads": 8,
+       "num_return_sequences": 1,
+       "output_attentions": false,
+       "output_hidden_states": false,
+       "output_scores": false,
+       "pad_token_id": null,
+       "prefix": null,
+       "problem_type": null,
+       "pruned_heads": {},
+       "remove_invalid_values": false,
+       "repetition_penalty": 1.0,
+       "return_dict": true,
+       "return_dict_in_generate": false,
+       "rms_norm_eps": 1e-06,
+       "rope_scaling": {
+         "interleaved": true,
+         "mrope_interleaved": true,
+         "mrope_section": [
+           24,
+           20,
+           20
+         ],
+         "rope_type": "default",
+         "type": "default"
+       },
+       "rope_theta": 1000000,
+       "sep_token_id": null,
+       "suppress_tokens": null,
+       "task_specific_params": null,
+       "temperature": 1.0,
+       "tf_legacy_loss": false,
+       "tie_encoder_decoder": false,
+       "tie_word_embeddings": true,
+       "tokenizer_class": null,
+       "top_k": 50,
+       "top_p": 1.0,
+       "torchscript": false,
+       "typical_p": 1.0,
+       "use_bfloat16": false,
+       "use_cache": true,
+       "vocab_size": 151936
+     }
+   },
+   "transformers_version": "4.57.6"
+ }
+
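The two sub-configs above describe the two halves shipped in this repo, presumably exported as rknn/audio_encoder.rknn (the audio tower) and rknn/language_model.rkllm (the Qwen3 decoder). A quick stdlib-only sketch that pulls out the dimensions that matter:

# Sketch: print the encoder/decoder dimensions from config.json.
import json

cfg = json.load(open("config.json"))["thinker_config"]
audio, text = cfg["audio_config"], cfg["text_config"]

print(f"audio encoder: {audio['encoder_layers']} layers, "
      f"d_model {audio['d_model']}, {audio['num_mel_bins']} mel bins, "
      f"output_dim {audio['output_dim']}")
print(f"text decoder : {text['num_hidden_layers']} layers, "
      f"hidden {text['hidden_size']}, "
      f"{text['num_attention_heads']}q/{text['num_key_value_heads']}kv heads")

The encoder's output_dim (2048) equals the decoder's hidden_size, so encoder frames can be injected directly at the <|audio_pad|> positions of the prompt.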
configuration.json ADDED
@@ -0,0 +1 @@
+ {"framework":"Pytorch","task":"auto-speech-recognition"}
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "eos_token_id": [151643,151645],
+   "pad_token_id": 151643,
+   "do_sample": false,
+   "temperature": 0.000001
+ }
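Since do_sample is false, decoding is greedy and the temperature value is effectively unused (the near-zero setting is belt-and-braces). A sketch of reading these defaults back, assuming transformers is installed; the on-device RKLLM runtime carries its own copy of such parameters:

# Sketch: load the decoding defaults from generation_config.json.
from transformers import GenerationConfig

gen = GenerationConfig.from_pretrained(".")  # directory containing the file
print(gen.do_sample)     # False -> greedy decoding
print(gen.eos_token_id)  # [151643, 151645], Qwen's <|endoftext|> and <|im_end|>
print(gen.pad_token_id)  # 151643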
librkllmrt.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bbcf28a8666b9fbf7361d6aad892b957920f6ea92400c074899b48f4c5b2c96f
+ size 7543744
long_test.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4da15e0244edc3ca23fbcb4fe93669e8ce4d59002a72177906123a1a91f17c17
+ size 7110734
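Both librkllmrt.so and long_test.wav are stored as the three-line Git LFS pointers shown above; git lfs pull (or the Hub's resolve URLs) fetches the real blobs. A sketch that verifies a fetched file against its pointer:

# Sketch: check a downloaded blob against a Git LFS pointer file
# ("version <spec-url>" / "oid sha256:<hex>" / "size <bytes>").
import hashlib, os

def matches_pointer(blob_path: str, pointer_path: str) -> bool:
    fields = dict(line.split(" ", 1)
                  for line in open(pointer_path, encoding="utf-8").read().splitlines())
    h = hashlib.sha256()
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            h.update(chunk)
    return (fields["oid"] == "sha256:" + h.hexdigest()
            and int(fields["size"]) == os.path.getsize(blob_path))

(Once LFS has smudged the working-tree copy, the pointer text itself can still be read from git with e.g. git cat-file -p HEAD:librkllmrt.so.)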
merges.txt ADDED
The diff for this file is too large to render. See raw diff
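The index below maps every tensor in the sharded checkpoint to the shard file that stores it. A sketch of the usual lookup, assuming the safetensors package, torch, and the shard files are present:

# Sketch: resolve one tensor from the sharded checkpoint via the index.
import json
from safetensors import safe_open

index = json.load(open("model.safetensors.index.json"))
name = "thinker.audio_tower.conv2d1.weight"
shard = index["weight_map"][name]  # -> "model-00001-of-00002.safetensors"
with safe_open(shard, framework="pt") as f:
    tensor = f.get_tensor(name)
print(name, tuple(tensor.shape))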
 
model.safetensors.index.json ADDED
@@ -0,0 +1,715 @@
+ {
+   "metadata": {
+     "format": "pt"
+   },
+   "weight_map": {
+     "thinker.audio_tower.conv2d1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.conv2d1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.conv2d2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.conv2d2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.conv2d3.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.conv2d3.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.conv_out.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.0.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.0.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.0.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.0.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.0.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.0.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.0.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.0.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.0.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.1.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.1.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.1.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.1.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.1.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.1.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.1.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.1.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.1.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.10.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.10.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.10.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.10.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.10.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.10.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.10.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.10.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.10.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.10.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.11.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.11.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.11.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.11.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.11.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.11.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.11.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.11.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.11.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.12.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.12.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.12.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.12.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.12.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.12.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.12.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.12.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.12.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.13.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.13.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.13.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.13.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.13.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.13.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.13.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.13.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.14.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.14.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.14.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.14.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.14.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.14.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.14.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.14.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.14.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.14.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.15.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.15.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.15.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.15.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.15.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.15.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.15.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.15.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.15.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.16.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.16.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.16.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.16.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.16.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.16.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.16.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.16.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.16.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.16.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.17.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.17.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.17.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.17.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.17.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.17.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.17.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.17.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.17.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.18.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.18.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.18.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.18.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.18.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.18.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.18.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.18.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.18.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.18.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.19.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.19.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.19.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.19.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.19.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.19.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.19.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.19.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.19.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.2.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.2.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.2.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.2.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.2.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.2.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.2.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.2.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.2.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.2.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.20.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.20.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.20.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.20.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.20.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.20.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.20.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.20.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.20.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.21.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.21.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.21.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.21.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.21.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.21.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.21.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.21.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.22.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.22.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.22.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.22.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.22.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.22.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.22.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.22.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.22.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.23.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.23.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.23.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.23.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.23.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.23.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.23.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.23.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.23.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.3.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.3.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.3.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.3.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.3.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.3.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.3.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.3.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.3.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.4.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.4.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.4.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.4.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.4.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.4.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.4.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.4.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.4.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.4.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.5.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.5.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.5.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.5.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.5.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.5.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.5.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.5.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.5.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.6.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.6.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.6.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.6.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.6.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.6.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.6.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.6.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.6.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.6.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.7.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.7.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.7.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.7.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.7.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.7.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.7.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.7.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.7.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.8.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.8.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.8.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.8.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.8.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.8.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.8.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.8.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.8.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.9.fc1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.9.fc1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.9.fc2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.9.fc2.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.9.final_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.9.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.9.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.layers.9.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.ln_post.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.ln_post.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.proj1.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.proj1.weight": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.proj2.bias": "model-00001-of-00002.safetensors",
+     "thinker.audio_tower.proj2.weight": "model-00001-of-00002.safetensors",
+     "thinker.lm_head.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.embed_tokens.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "thinker.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
540
+ "thinker.model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
541
+ "thinker.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
542
+ "thinker.model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
543
+ "thinker.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
544
+ "thinker.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
545
+ "thinker.model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
546
+ "thinker.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
547
+ "thinker.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
548
+ "thinker.model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
549
+ "thinker.model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
550
+ "thinker.model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
551
+ "thinker.model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
552
+ "thinker.model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
553
+ "thinker.model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
554
+ "thinker.model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
555
+ "thinker.model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
556
+ "thinker.model.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
557
+ "thinker.model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
558
+ "thinker.model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
559
+ "thinker.model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
560
+ "thinker.model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
561
+ "thinker.model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
562
+ "thinker.model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
563
+ "thinker.model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
564
+ "thinker.model.layers.21.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
565
+ "thinker.model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
566
+ "thinker.model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
567
+ "thinker.model.layers.21.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
568
+ "thinker.model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
569
+ "thinker.model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
570
+ "thinker.model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
571
+ "thinker.model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
572
+ "thinker.model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
573
+ "thinker.model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
574
+ "thinker.model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
575
+ "thinker.model.layers.22.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
576
+ "thinker.model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
577
+ "thinker.model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
578
+ "thinker.model.layers.22.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
579
+ "thinker.model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
580
+ "thinker.model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
581
+ "thinker.model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors",
582
+ "thinker.model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
583
+ "thinker.model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
584
+ "thinker.model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
585
+ "thinker.model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
586
+ "thinker.model.layers.23.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
587
+ "thinker.model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
588
+ "thinker.model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
589
+ "thinker.model.layers.23.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
590
+ "thinker.model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
591
+ "thinker.model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
592
+ "thinker.model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors",
593
+ "thinker.model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
594
+ "thinker.model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
595
+ "thinker.model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
596
+ "thinker.model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
597
+ "thinker.model.layers.24.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
598
+ "thinker.model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
599
+ "thinker.model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
600
+ "thinker.model.layers.24.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
601
+ "thinker.model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
602
+ "thinker.model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
603
+ "thinker.model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors",
604
+ "thinker.model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
605
+ "thinker.model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
606
+ "thinker.model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
607
+ "thinker.model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
608
+ "thinker.model.layers.25.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
609
+ "thinker.model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
610
+ "thinker.model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
611
+ "thinker.model.layers.25.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
612
+ "thinker.model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
613
+ "thinker.model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
614
+ "thinker.model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors",
615
+ "thinker.model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
616
+ "thinker.model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
617
+ "thinker.model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
618
+ "thinker.model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
619
+ "thinker.model.layers.26.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
620
+ "thinker.model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
621
+ "thinker.model.layers.26.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
622
+ "thinker.model.layers.26.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
623
+ "thinker.model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
624
+ "thinker.model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
625
+ "thinker.model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors",
626
+ "thinker.model.layers.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
627
+ "thinker.model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
628
+ "thinker.model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
629
+ "thinker.model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
630
+ "thinker.model.layers.27.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
631
+ "thinker.model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
632
+ "thinker.model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
633
+ "thinker.model.layers.27.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
634
+ "thinker.model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
635
+ "thinker.model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
636
+ "thinker.model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
637
+ "thinker.model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
638
+ "thinker.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
639
+ "thinker.model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
640
+ "thinker.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
641
+ "thinker.model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
642
+ "thinker.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
643
+ "thinker.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
644
+ "thinker.model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
645
+ "thinker.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
646
+ "thinker.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
647
+ "thinker.model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
648
+ "thinker.model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
649
+ "thinker.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
650
+ "thinker.model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
651
+ "thinker.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
652
+ "thinker.model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
653
+ "thinker.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
654
+ "thinker.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
655
+ "thinker.model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
656
+ "thinker.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
657
+ "thinker.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
658
+ "thinker.model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
659
+ "thinker.model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
660
+ "thinker.model.layers.5.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
661
+ "thinker.model.layers.5.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
662
+ "thinker.model.layers.5.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
663
+ "thinker.model.layers.5.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
664
+ "thinker.model.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
665
+ "thinker.model.layers.5.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
666
+ "thinker.model.layers.5.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
667
+ "thinker.model.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
668
+ "thinker.model.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
669
+ "thinker.model.layers.6.input_layernorm.weight": "model-00002-of-00002.safetensors",
670
+ "thinker.model.layers.6.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
671
+ "thinker.model.layers.6.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
672
+ "thinker.model.layers.6.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
673
+ "thinker.model.layers.6.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
674
+ "thinker.model.layers.6.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
675
+ "thinker.model.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
676
+ "thinker.model.layers.6.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
677
+ "thinker.model.layers.6.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
678
+ "thinker.model.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
679
+ "thinker.model.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
680
+ "thinker.model.layers.7.input_layernorm.weight": "model-00002-of-00002.safetensors",
681
+ "thinker.model.layers.7.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
682
+ "thinker.model.layers.7.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
683
+ "thinker.model.layers.7.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
684
+ "thinker.model.layers.7.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
685
+ "thinker.model.layers.7.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
686
+ "thinker.model.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
687
+ "thinker.model.layers.7.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
688
+ "thinker.model.layers.7.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
689
+ "thinker.model.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
690
+ "thinker.model.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
691
+ "thinker.model.layers.8.input_layernorm.weight": "model-00002-of-00002.safetensors",
692
+ "thinker.model.layers.8.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
693
+ "thinker.model.layers.8.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
694
+ "thinker.model.layers.8.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
695
+ "thinker.model.layers.8.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
696
+ "thinker.model.layers.8.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
697
+ "thinker.model.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
698
+ "thinker.model.layers.8.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
699
+ "thinker.model.layers.8.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
700
+ "thinker.model.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
701
+ "thinker.model.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
702
+ "thinker.model.layers.9.input_layernorm.weight": "model-00002-of-00002.safetensors",
703
+ "thinker.model.layers.9.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
704
+ "thinker.model.layers.9.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
705
+ "thinker.model.layers.9.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
706
+ "thinker.model.layers.9.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
707
+ "thinker.model.layers.9.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
708
+ "thinker.model.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
709
+ "thinker.model.layers.9.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
710
+ "thinker.model.layers.9.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
711
+ "thinker.model.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
712
+ "thinker.model.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
713
+ "thinker.model.norm.weight": "model-00002-of-00002.safetensors"
714
+ }
715
+ }
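
Editor's note: the weight_map above maps each tensor name to the shard file that stores it, so a loader only has to open the shard it needs. A minimal sketch of resolving one tensor through the index, assuming the `safetensors` package is installed and the index uses the standard model.safetensors.index.json filename:

    import json
    from safetensors import safe_open

    # Read the shard index and look up which file holds a given tensor.
    with open("model.safetensors.index.json") as f:
        index = json.load(f)
    shard = index["weight_map"]["thinker.model.norm.weight"]  # "model-00002-of-00002.safetensors" per the map above

    # Open only that shard and fetch the tensor lazily.
    with safe_open(shard, framework="np") as f:
        norm_weight = f.get_tensor("thinker.model.norm.weight")
    print(norm_weight.shape)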
preprocessor_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+ "chunk_length": 30,
+ "dither": 0.0,
+ "feature_extractor_type": "WhisperFeatureExtractor",
+ "feature_size": 128,
+ "hop_length": 160,
+ "n_fft": 400,
+ "n_samples": 480000,
+ "nb_max_frames": 3000,
+ "padding_side": "right",
+ "padding_value": 0.0,
+ "processor_class": "Qwen3ASRProcessor",
+ "return_attention_mask": true
+ }
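
Editor's note: these are standard Whisper-style log-mel settings (128 mel bins, 25 ms window / 10 ms hop at 16 kHz, 30 s chunks, so 480000 samples pad out to 3000 frames). A minimal sketch of reproducing the features with transformers' WhisperFeatureExtractor, assuming `transformers` is available and the input is 16 kHz mono float audio; the actual on-device pipeline feeding the RKNN audio encoder may differ:

    import numpy as np
    from transformers import WhisperFeatureExtractor

    # Instantiate the extractor directly from the values in preprocessor_config.json.
    fe = WhisperFeatureExtractor(
        feature_size=128, hop_length=160, n_fft=400, chunk_length=30, padding_value=0.0
    )
    audio = np.zeros(16000, dtype=np.float32)  # one second of silence as a stand-in
    features = fe(audio, sampling_rate=16000, return_tensors="np").input_features
    print(features.shape)  # (1, 128, 3000): the 30 s chunk yields 480000 / 160 = 3000 frames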
rkllm_binding.py ADDED
@@ -0,0 +1,1324 @@
+ import argparse
+ import ctypes
+ import enum
+ import os
+ import threading
+ from typing import Optional, Sequence, Tuple
+
+ import numpy as np
+
+ # Define constants from the header
+ CPU0 = (1 << 0)  # 0x01
+ CPU1 = (1 << 1)  # 0x02
+ CPU2 = (1 << 2)  # 0x04
+ CPU3 = (1 << 3)  # 0x08
+ CPU4 = (1 << 4)  # 0x10
+ CPU5 = (1 << 5)  # 0x20
+ CPU6 = (1 << 6)  # 0x40
+ CPU7 = (1 << 7)  # 0x80
+
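# ---- Editor's note (example, not part of the committed file) ----
# The CPU constants above are single-bit masks, so a core set is composed
# with bitwise OR. A minimal sketch for pinning inference to four big cores
# (the CPU4-CPU7 numbering here is an assumption about the target SoC):
#
#     big_cores = CPU4 | CPU5 | CPU6 | CPU7   # 0xF0
#     extend_param.enabled_cpus_mask = big_cores
#     extend_param.enabled_cpus_num = 4
# -----------------------------------------------------------------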
+ # --- Enums ---
+ class LLMCallState(enum.IntEnum):
+     RKLLM_RUN_NORMAL = 0
+     RKLLM_RUN_WAITING = 1
+     RKLLM_RUN_FINISH = 2
+     RKLLM_RUN_ERROR = 3
+
+ class RKLLMInputType(enum.IntEnum):
+     RKLLM_INPUT_PROMPT = 0
+     RKLLM_INPUT_TOKEN = 1
+     RKLLM_INPUT_EMBED = 2
+     RKLLM_INPUT_MULTIMODAL = 3
+
+ class RKLLMInferMode(enum.IntEnum):
+     RKLLM_INFER_GENERATE = 0
+     RKLLM_INFER_GET_LAST_HIDDEN_LAYER = 1
+     RKLLM_INFER_GET_LOGITS = 2
+
+ # --- Structures ---
+ class RKLLMExtendParam(ctypes.Structure):
+     base_domain_id: ctypes.c_int32
+     embed_flash: ctypes.c_int8
+     enabled_cpus_num: ctypes.c_int8
+     enabled_cpus_mask: ctypes.c_uint32
+     n_batch: ctypes.c_uint8
+     use_cross_attn: ctypes.c_int8
+     reserved: ctypes.c_uint8 * 104
+
+     _fields_ = [
+         ("base_domain_id", ctypes.c_int32),      # Base domain ID
+         ("embed_flash", ctypes.c_int8),          # Whether to read word embeddings from flash (1 enable, 0 disable)
+         ("enabled_cpus_num", ctypes.c_int8),     # Number of CPUs enabled for inference
+         ("enabled_cpus_mask", ctypes.c_uint32),  # Bitmask indicating which CPUs are enabled
+         ("n_batch", ctypes.c_uint8),             # Input samples processed concurrently per forward pass; >1 enables batch inference, default 1
+         ("use_cross_attn", ctypes.c_int8),       # Whether to enable cross attention (non-zero enable, 0 disable)
+         ("reserved", ctypes.c_uint8 * 104)       # Reserved field
+     ]
+
+ class RKLLMParam(ctypes.Structure):
+     model_path: ctypes.c_char_p
+     max_context_len: ctypes.c_int32
+     max_new_tokens: ctypes.c_int32
+     top_k: ctypes.c_int32
+     n_keep: ctypes.c_int32
+     top_p: ctypes.c_float
+     temperature: ctypes.c_float
+     repeat_penalty: ctypes.c_float
+     frequency_penalty: ctypes.c_float
+     presence_penalty: ctypes.c_float
+     mirostat: ctypes.c_int32
+     mirostat_tau: ctypes.c_float
+     mirostat_eta: ctypes.c_float
+     skip_special_token: ctypes.c_bool
+     is_async: ctypes.c_bool
+     img_start: ctypes.c_char_p
+     img_end: ctypes.c_char_p
+     img_content: ctypes.c_char_p
+     extend_param: RKLLMExtendParam
+
+     _fields_ = [
+         ("model_path", ctypes.c_char_p),        # Path to the model file
+         ("max_context_len", ctypes.c_int32),    # Maximum number of tokens in the context window
+         ("max_new_tokens", ctypes.c_int32),     # Maximum number of new tokens to generate
+         ("top_k", ctypes.c_int32),              # Top-K sampling parameter
+         ("n_keep", ctypes.c_int32),             # Number of KV cache entries kept when the context window slides
+         ("top_p", ctypes.c_float),              # Top-P (nucleus) sampling parameter
+         ("temperature", ctypes.c_float),        # Sampling temperature; controls randomness of token selection
+         ("repeat_penalty", ctypes.c_float),     # Penalty for repeated tokens
+         ("frequency_penalty", ctypes.c_float),  # Penalty for frequent tokens
+         ("presence_penalty", ctypes.c_float),   # Penalty for tokens already present in the input
+         ("mirostat", ctypes.c_int32),           # Mirostat sampling strategy flag (0 disables)
+         ("mirostat_tau", ctypes.c_float),       # Mirostat sampling tau parameter
+         ("mirostat_eta", ctypes.c_float),       # Mirostat sampling eta parameter
+         ("skip_special_token", ctypes.c_bool),  # Whether to skip special tokens
+         ("is_async", ctypes.c_bool),            # Whether inference runs asynchronously
+         ("img_start", ctypes.c_char_p),         # Start marker of an image in multimodal input
+         ("img_end", ctypes.c_char_p),           # End marker of an image in multimodal input
+         ("img_content", ctypes.c_char_p),       # Pointer to the image content
+         ("extend_param", RKLLMExtendParam)      # Extended parameters
+     ]
+
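# ---- Editor's note (example, not part of the committed file) ----
# A minimal sketch of filling RKLLMParam by hand. In practice the runtime's
# rkllm_createDefaultParam() (wrapped below as create_default_param()) is the
# safer starting point; "model.rkllm" is a placeholder path:
#
#     param = RKLLMParam()
#     param.model_path = "model.rkllm".encode("utf-8")  # c_char_p fields take bytes
#     param.max_context_len = 4096
#     param.max_new_tokens = 512
#     param.top_k = 1                  # greedy decoding
#     param.temperature = 0.7
#     param.skip_special_token = True
#     param.extend_param.n_batch = 1
# -----------------------------------------------------------------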
+ class RKLLMLoraAdapter(ctypes.Structure):
+     lora_adapter_path: ctypes.c_char_p
+     lora_adapter_name: ctypes.c_char_p
+     scale: ctypes.c_float
+
+     _fields_ = [
+         ("lora_adapter_path", ctypes.c_char_p),
+         ("lora_adapter_name", ctypes.c_char_p),
+         ("scale", ctypes.c_float)
+     ]
+
+ class RKLLMEmbedInput(ctypes.Structure):
+     embed: ctypes.POINTER(ctypes.c_float)
+     n_tokens: ctypes.c_size_t
+
+     _fields_ = [
+         ("embed", ctypes.POINTER(ctypes.c_float)),
+         ("n_tokens", ctypes.c_size_t)
+     ]
+
+ class RKLLMTokenInput(ctypes.Structure):
+     input_ids: ctypes.POINTER(ctypes.c_int32)
+     n_tokens: ctypes.c_size_t
+
+     _fields_ = [
+         ("input_ids", ctypes.POINTER(ctypes.c_int32)),
+         ("n_tokens", ctypes.c_size_t)
+     ]
+
+ class RKLLMMultiModelInput(ctypes.Structure):
+     prompt: ctypes.c_char_p
+     image_embed: ctypes.POINTER(ctypes.c_float)
+     n_image_tokens: ctypes.c_size_t
+     n_image: ctypes.c_size_t
+     image_width: ctypes.c_size_t
+     image_height: ctypes.c_size_t
+
+     _fields_ = [
+         ("prompt", ctypes.c_char_p),
+         ("image_embed", ctypes.POINTER(ctypes.c_float)),
+         ("n_image_tokens", ctypes.c_size_t),
+         ("n_image", ctypes.c_size_t),
+         ("image_width", ctypes.c_size_t),
+         ("image_height", ctypes.c_size_t)
+     ]
+
+ class RKLLMCrossAttnParam(ctypes.Structure):
+     """
+     Cross-attention parameter structure.
+
+     Used when the decoder performs cross attention. It provides the encoder
+     outputs (key/value caches), position indices, and attention mask.
+
+     - encoder_k_cache must be stored in contiguous memory with layout:
+       [num_layers][num_tokens][num_kv_heads][head_dim]
+     - encoder_v_cache must be stored in contiguous memory with layout:
+       [num_layers][num_kv_heads][head_dim][num_tokens]
+     """
+     encoder_k_cache: ctypes.POINTER(ctypes.c_float)
+     encoder_v_cache: ctypes.POINTER(ctypes.c_float)
+     encoder_mask: ctypes.POINTER(ctypes.c_float)
+     encoder_pos: ctypes.POINTER(ctypes.c_int32)
+     num_tokens: ctypes.c_int
+
+     _fields_ = [
+         ("encoder_k_cache", ctypes.POINTER(ctypes.c_float)),  # Encoder key cache pointer (size: num_layers * num_tokens * num_kv_heads * head_dim)
+         ("encoder_v_cache", ctypes.POINTER(ctypes.c_float)),  # Encoder value cache pointer (size: num_layers * num_kv_heads * head_dim * num_tokens)
+         ("encoder_mask", ctypes.POINTER(ctypes.c_float)),     # Encoder attention mask pointer (array of size num_tokens)
+         ("encoder_pos", ctypes.POINTER(ctypes.c_int32)),      # Encoder token position pointer (array of size num_tokens)
+         ("num_tokens", ctypes.c_int)                          # Number of tokens in the encoder sequence
+     ]
+
+ class RKLLMPerfStat(ctypes.Structure):
+     """
+     Performance statistics structure.
+
+     Holds performance statistics for the prefill and generation stages.
+     """
+     prefill_time_ms: ctypes.c_float
+     prefill_tokens: ctypes.c_int
+     generate_time_ms: ctypes.c_float
+     generate_tokens: ctypes.c_int
+     memory_usage_mb: ctypes.c_float
+
+     _fields_ = [
+         ("prefill_time_ms", ctypes.c_float),   # Total prefill stage time (ms)
+         ("prefill_tokens", ctypes.c_int),      # Number of tokens processed during prefill
+         ("generate_time_ms", ctypes.c_float),  # Total generation stage time (ms)
+         ("generate_tokens", ctypes.c_int),     # Number of tokens processed during generation
+         ("memory_usage_mb", ctypes.c_float)    # VmHWM resident memory usage during inference (MB)
+     ]
+
+ class _RKLLMInputUnion(ctypes.Union):
+     prompt_input: ctypes.c_char_p
+     embed_input: RKLLMEmbedInput
+     token_input: RKLLMTokenInput
+     multimodal_input: RKLLMMultiModelInput
+
+     _fields_ = [
+         ("prompt_input", ctypes.c_char_p),
+         ("embed_input", RKLLMEmbedInput),
+         ("token_input", RKLLMTokenInput),
+         ("multimodal_input", RKLLMMultiModelInput)
+     ]
+
+ class RKLLMInput(ctypes.Structure):
+     """
+     LLM input structure.
+
+     Represents the different kinds of LLM input through a union.
+     """
+     role: ctypes.c_char_p
+     enable_thinking: ctypes.c_bool
+     input_type: ctypes.c_int
+     _union_data: _RKLLMInputUnion
+
+     _fields_ = [
+         ("role", ctypes.c_char_p),           # Message role: "user" (user input) or "tool" (function result)
+         ("enable_thinking", ctypes.c_bool),  # Controls whether "thinking mode" is enabled for Qwen3 models
+         ("input_type", ctypes.c_int),        # Enum specifying the input type (prompt, token, embed, multimodal)
+         ("_union_data", _RKLLMInputUnion)    # Union data
+     ]
+
+     # Properties to make accessing union members easier
+     @property
+     def prompt_input(self) -> bytes:  # Assuming c_char_p maps to bytes
+         if self.input_type == RKLLMInputType.RKLLM_INPUT_PROMPT:
+             return self._union_data.prompt_input
+         raise AttributeError("Not a prompt input")
+
+     @prompt_input.setter
+     def prompt_input(self, value: bytes):  # Assuming c_char_p maps to bytes
+         if self.input_type == RKLLMInputType.RKLLM_INPUT_PROMPT:
+             self._union_data.prompt_input = value
+         else:
+             raise AttributeError("Not a prompt input")
+
+     @property
+     def embed_input(self) -> RKLLMEmbedInput:
+         if self.input_type == RKLLMInputType.RKLLM_INPUT_EMBED:
+             return self._union_data.embed_input
+         raise AttributeError("Not an embed input")
+
+     @embed_input.setter
+     def embed_input(self, value: RKLLMEmbedInput):
+         if self.input_type == RKLLMInputType.RKLLM_INPUT_EMBED:
+             self._union_data.embed_input = value
+         else:
+             raise AttributeError("Not an embed input")
+
+     @property
+     def token_input(self) -> RKLLMTokenInput:
+         if self.input_type == RKLLMInputType.RKLLM_INPUT_TOKEN:
+             return self._union_data.token_input
+         raise AttributeError("Not a token input")
+
+     @token_input.setter
+     def token_input(self, value: RKLLMTokenInput):
+         if self.input_type == RKLLMInputType.RKLLM_INPUT_TOKEN:
+             self._union_data.token_input = value
+         else:
+             raise AttributeError("Not a token input")
+
+     @property
+     def multimodal_input(self) -> RKLLMMultiModelInput:
+         if self.input_type == RKLLMInputType.RKLLM_INPUT_MULTIMODAL:
+             return self._union_data.multimodal_input
+         raise AttributeError("Not a multimodal input")
+
+     @multimodal_input.setter
+     def multimodal_input(self, value: RKLLMMultiModelInput):
+         if self.input_type == RKLLMInputType.RKLLM_INPUT_MULTIMODAL:
+             self._union_data.multimodal_input = value
+         else:
+             raise AttributeError("Not a multimodal input")
+
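# ---- Editor's note (example, not part of the committed file) ----
# A minimal sketch of building a prompt-type RKLLMInput with the properties
# above. input_type must be assigned first, because each setter validates it
# before touching the union:
#
#     rk_input = RKLLMInput()
#     rk_input.role = b"user"
#     rk_input.input_type = RKLLMInputType.RKLLM_INPUT_PROMPT
#     rk_input.prompt_input = "Hello!".encode("utf-8")
# -----------------------------------------------------------------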
+ class RKLLMLoraParam(ctypes.Structure):  # For inference
+     lora_adapter_name: ctypes.c_char_p
+
+     _fields_ = [
+         ("lora_adapter_name", ctypes.c_char_p)
+     ]
+
+ class RKLLMPromptCacheParam(ctypes.Structure):  # For inference
+     save_prompt_cache: ctypes.c_int  # bool-like
+     prompt_cache_path: ctypes.c_char_p
+
+     _fields_ = [
+         ("save_prompt_cache", ctypes.c_int),  # bool-like
+         ("prompt_cache_path", ctypes.c_char_p)
+     ]
+
+ class RKLLMInferParam(ctypes.Structure):
+     mode: ctypes.c_int
+     lora_params: ctypes.POINTER(RKLLMLoraParam)
+     prompt_cache_params: ctypes.POINTER(RKLLMPromptCacheParam)
+     keep_history: ctypes.c_int  # bool-like
+
+     _fields_ = [
+         ("mode", ctypes.c_int),  # RKLLMInferMode enum passed as a plain int
+         ("lora_params", ctypes.POINTER(RKLLMLoraParam)),
+         ("prompt_cache_params", ctypes.POINTER(RKLLMPromptCacheParam)),
+         ("keep_history", ctypes.c_int)  # bool-like
+     ]
+
+ class RKLLMResultLastHiddenLayer(ctypes.Structure):
+     hidden_states: ctypes.POINTER(ctypes.c_float)
+     embd_size: ctypes.c_int
+     num_tokens: ctypes.c_int
+
+     _fields_ = [
+         ("hidden_states", ctypes.POINTER(ctypes.c_float)),
+         ("embd_size", ctypes.c_int),
+         ("num_tokens", ctypes.c_int)
+     ]
+
+ class RKLLMResultLogits(ctypes.Structure):
+     logits: ctypes.POINTER(ctypes.c_float)
+     vocab_size: ctypes.c_int
+     num_tokens: ctypes.c_int
+
+     _fields_ = [
+         ("logits", ctypes.POINTER(ctypes.c_float)),
+         ("vocab_size", ctypes.c_int),
+         ("num_tokens", ctypes.c_int)
+     ]
+
+ class RKLLMResult(ctypes.Structure):
+     """
+     LLM inference result structure.
+
+     Represents the result of LLM inference, including the generated text,
+     token ID, last hidden layer states, logits, and performance statistics.
+     """
+     text: ctypes.c_char_p
+     token_id: ctypes.c_int32
+     last_hidden_layer: RKLLMResultLastHiddenLayer
+     logits: RKLLMResultLogits
+     perf: RKLLMPerfStat
+
+     _fields_ = [
+         ("text", ctypes.c_char_p),                          # Generated text result
+         ("token_id", ctypes.c_int32),                       # Generated token ID
+         ("last_hidden_layer", RKLLMResultLastHiddenLayer),  # Last-layer hidden states (if requested)
+         ("logits", RKLLMResultLogits),                      # Logits output by the model
+         ("perf", RKLLMPerfStat)                             # Performance statistics (prefill and generation)
+     ]
+
+ # --- Typedefs ---
+ LLMHandle = ctypes.c_void_p
+
+ # --- Callback Function Type ---
+ LLMResultCallback = ctypes.CFUNCTYPE(
+     ctypes.c_int,                 # Return type: int indicating the handling status
+     ctypes.POINTER(RKLLMResult),  # Pointer to the LLM result
+     ctypes.c_void_p,              # User data pointer
+     ctypes.c_int                  # LLM call state (LLMCallState enum value)
+ )
+ """
+ Callback function type definition.
+
+ Callback used to handle LLM results.
+
+ Parameters:
+ - result: pointer to the LLM result
+ - userdata: user data pointer for the callback
+ - state: LLM call state (e.g., finished, error)
+
+ Return value:
+ - 0: continue inference normally
+ - 1: pause inference. If the user wants to modify or intervene in the result
+   (e.g., edit the output, inject a new prompt), return 1 to pause the current
+   inference. Later, call rkllm_run with the updated content to resume.
+ """
+
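# ---- Editor's note (example, not part of the committed file) ----
# A minimal streaming callback matching the CFUNCTYPE signature above. The
# callback runs on the runtime's thread and must return an int:
#
#     def print_tokens(result_ptr, userdata_ptr, state_enum):
#         state = LLMCallState(state_enum)
#         if state == LLMCallState.RKLLM_RUN_NORMAL and result_ptr:
#             text = result_ptr.contents.text
#             if text:
#                 print(text.decode("utf-8", errors="replace"), end="", flush=True)
#         elif state == LLMCallState.RKLLM_RUN_FINISH:
#             print()
#         return 0  # 0 = continue; 1 would pause generation
# -----------------------------------------------------------------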
+ def _iter_library_candidates(library_path: str) -> Tuple[Sequence[str], Sequence[str]]:
+     default_name = "librkllmrt.so"
+     user_path = library_path or default_name
+
+     lib_name = default_name
+     user_dir = None
+     if os.path.isdir(user_path):
+         user_dir = user_path
+     else:
+         lib_name = os.path.basename(user_path) or default_name
+         user_dir = os.path.dirname(user_path) or None
+         if os.path.isfile(user_path):
+             return [user_path], [f"user file: {user_path}"]
+
+     module_dir = os.path.dirname(os.path.abspath(__file__))
+     search_dirs = []
+     if user_dir:
+         search_dirs.append(user_dir)
+     search_dirs.extend([module_dir, os.getcwd()])
+
+     seen = set()
+     candidates = []
+     labels = []
+     for base_dir in search_dirs:
+         norm = os.path.abspath(base_dir)
+         if norm in seen:
+             continue
+         seen.add(norm)
+         candidate = os.path.join(base_dir, lib_name)
+         candidates.append(candidate)
+         labels.append(f"dir: {base_dir}")
+
+     # System path lookup comes last via the loader's default search.
+     candidates.append(lib_name)
+     labels.append("system path")
+     return candidates, labels
+
+
+ class RKLLMRuntime:
+     def __init__(self, library_path="./librkllmrt.so"):
+         candidates, labels = _iter_library_candidates(library_path)
+         self.lib = None
+         errors = []
+         for candidate, label in zip(candidates, labels):
+             try:
+                 self.lib = ctypes.CDLL(candidate)
+                 break
+             except OSError as e:
+                 errors.append((candidate, label, str(e)))
+         if self.lib is None:
+             lines = ["Failed to load RKLLM library. Tried:"]
+             for candidate, label, err in errors:
+                 lines.append(f"- {candidate} ({label}): {err}")
+             raise OSError("\n".join(lines))
+         self._setup_functions()
+         self.llm_handle = LLMHandle()
+         self._c_callback = None  # To keep the callback object alive
+         self._user_callback = None
+
+     def _setup_functions(self):
+         # RKLLMParam rkllm_createDefaultParam();
+         self.lib.rkllm_createDefaultParam.restype = RKLLMParam
+         self.lib.rkllm_createDefaultParam.argtypes = []
+
+         # int rkllm_init(LLMHandle* handle, RKLLMParam* param, LLMResultCallback callback);
+         self.lib.rkllm_init.restype = ctypes.c_int
+         self.lib.rkllm_init.argtypes = [
+             ctypes.POINTER(LLMHandle),
+             ctypes.POINTER(RKLLMParam),
+             LLMResultCallback
+         ]
+
+         # int rkllm_load_lora(LLMHandle handle, RKLLMLoraAdapter* lora_adapter);
+         self.lib.rkllm_load_lora.restype = ctypes.c_int
+         self.lib.rkllm_load_lora.argtypes = [LLMHandle, ctypes.POINTER(RKLLMLoraAdapter)]
+
+         # int rkllm_load_prompt_cache(LLMHandle handle, const char* prompt_cache_path);
+         self.lib.rkllm_load_prompt_cache.restype = ctypes.c_int
+         self.lib.rkllm_load_prompt_cache.argtypes = [LLMHandle, ctypes.c_char_p]
+
+         # int rkllm_release_prompt_cache(LLMHandle handle);
+         self.lib.rkllm_release_prompt_cache.restype = ctypes.c_int
+         self.lib.rkllm_release_prompt_cache.argtypes = [LLMHandle]
+
+         # int rkllm_destroy(LLMHandle handle);
+         self.lib.rkllm_destroy.restype = ctypes.c_int
+         self.lib.rkllm_destroy.argtypes = [LLMHandle]
+
+         # int rkllm_run(LLMHandle handle, RKLLMInput* rkllm_input, RKLLMInferParam* rkllm_infer_params, void* userdata);
+         self.lib.rkllm_run.restype = ctypes.c_int
+         self.lib.rkllm_run.argtypes = [
+             LLMHandle,
+             ctypes.POINTER(RKLLMInput),
+             ctypes.POINTER(RKLLMInferParam),
+             ctypes.c_void_p  # userdata
+         ]
+
+         # int rkllm_run_async(LLMHandle handle, RKLLMInput* rkllm_input, RKLLMInferParam* rkllm_infer_params, void* userdata);
+         # Assuming async also takes userdata for the callback context
+         self.lib.rkllm_run_async.restype = ctypes.c_int
+         self.lib.rkllm_run_async.argtypes = [
+             LLMHandle,
+             ctypes.POINTER(RKLLMInput),
+             ctypes.POINTER(RKLLMInferParam),
+             ctypes.c_void_p  # userdata
+         ]
+
+         # int rkllm_abort(LLMHandle handle);
+         self.lib.rkllm_abort.restype = ctypes.c_int
+         self.lib.rkllm_abort.argtypes = [LLMHandle]
+
+         # int rkllm_is_running(LLMHandle handle);
+         self.lib.rkllm_is_running.restype = ctypes.c_int  # 0 if running, non-zero otherwise
+         self.lib.rkllm_is_running.argtypes = [LLMHandle]
+
+         # int rkllm_clear_kv_cache(LLMHandle handle, int keep_system_prompt, int* start_pos, int* end_pos);
+         self.lib.rkllm_clear_kv_cache.restype = ctypes.c_int
+         self.lib.rkllm_clear_kv_cache.argtypes = [
+             LLMHandle,
+             ctypes.c_int,
+             ctypes.POINTER(ctypes.c_int),  # start_pos
+             ctypes.POINTER(ctypes.c_int)   # end_pos
+         ]
+
+         # int rkllm_get_kv_cache_size(LLMHandle handle, int* cache_sizes);
+         self.lib.rkllm_get_kv_cache_size.restype = ctypes.c_int
+         self.lib.rkllm_get_kv_cache_size.argtypes = [LLMHandle, ctypes.POINTER(ctypes.c_int)]
+
+         # int rkllm_set_chat_template(LLMHandle handle, const char* system_prompt, const char* prompt_prefix, const char* prompt_postfix);
+         self.lib.rkllm_set_chat_template.restype = ctypes.c_int
+         self.lib.rkllm_set_chat_template.argtypes = [
+             LLMHandle,
+             ctypes.c_char_p,
+             ctypes.c_char_p,
+             ctypes.c_char_p
+         ]
+
+         # int rkllm_set_function_tools(LLMHandle handle, const char* system_prompt, const char* tools, const char* tool_response_str);
+         self.lib.rkllm_set_function_tools.restype = ctypes.c_int
+         self.lib.rkllm_set_function_tools.argtypes = [
+             LLMHandle,
+             ctypes.c_char_p,  # system_prompt
+             ctypes.c_char_p,  # tools
+             ctypes.c_char_p   # tool_response_str
+         ]
+
+         # int rkllm_set_cross_attn_params(LLMHandle handle, RKLLMCrossAttnParam* cross_attn_params);
+         self.lib.rkllm_set_cross_attn_params.restype = ctypes.c_int
+         self.lib.rkllm_set_cross_attn_params.argtypes = [LLMHandle, ctypes.POINTER(RKLLMCrossAttnParam)]
+
+     def create_default_param(self) -> RKLLMParam:
+         """Creates a default RKLLMParam structure."""
+         return self.lib.rkllm_createDefaultParam()
+
+     def init(self, param: RKLLMParam, callback_func) -> int:
+         """
+         Initializes the LLM.
+         :param param: RKLLMParam structure.
+         :param callback_func: A Python function that matches the signature:
+             def my_callback(result_ptr, userdata_ptr, state_enum):
+                 result = result_ptr.contents  # RKLLMResult
+                 # Process result
+                 # userdata can be retrieved if passed during run, or ignored
+                 # state = LLMCallState(state_enum)
+         :return: 0 for success, non-zero for failure.
+         """
+         if not callable(callback_func):
+             raise ValueError("callback_func must be a callable Python function.")
+
+         self._user_callback = callback_func
+
+         # Keep a reference to the ctypes callback object to prevent it from being garbage collected.
+         # Always register a trampoline so we can swap the Python-level handler when needed.
+         self._c_callback = LLMResultCallback(self._callback_trampoline)
+
+         ret = self.lib.rkllm_init(ctypes.byref(self.llm_handle), ctypes.byref(param), self._c_callback)
+         if ret != 0:
+             raise RuntimeError(f"rkllm_init failed with error code {ret}")
+         return ret
+
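# ---- Editor's note (example, not part of the committed file) ----
# A minimal end-to-end sketch of init + run, reusing the print_tokens callback
# sketched earlier; "model.rkllm" is a placeholder path:
#
#     rt = RKLLMRuntime("./librkllmrt.so")
#     param = rt.create_default_param()
#     param.model_path = "model.rkllm".encode("utf-8")
#     rt.init(param, print_tokens)
#
#     rk_input = RKLLMInput()
#     rk_input.role = b"user"
#     rk_input.input_type = RKLLMInputType.RKLLM_INPUT_PROMPT
#     rk_input.prompt_input = b"Hello!"
#     infer = RKLLMInferParam()
#     infer.mode = RKLLMInferMode.RKLLM_INFER_GENERATE
#     infer.keep_history = 0
#     rt.run(rk_input, infer)   # tokens arrive through the callback
#     rt.destroy()
# -----------------------------------------------------------------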
+     def load_lora(self, lora_adapter: RKLLMLoraAdapter) -> int:
+         """Loads a LoRA adapter."""
+         ret = self.lib.rkllm_load_lora(self.llm_handle, ctypes.byref(lora_adapter))
+         if ret != 0:
+             raise RuntimeError(f"rkllm_load_lora failed with error code {ret}")
+         return ret
+
+     def load_prompt_cache(self, prompt_cache_path: str) -> int:
+         """Loads a prompt cache from a file."""
+         c_path = prompt_cache_path.encode('utf-8')
+         ret = self.lib.rkllm_load_prompt_cache(self.llm_handle, c_path)
+         if ret != 0:
+             raise RuntimeError(f"rkllm_load_prompt_cache failed for {prompt_cache_path} with error code {ret}")
+         return ret
+
+     def release_prompt_cache(self) -> int:
+         """Releases the prompt cache from memory."""
+         ret = self.lib.rkllm_release_prompt_cache(self.llm_handle)
+         if ret != 0:
+             raise RuntimeError(f"rkllm_release_prompt_cache failed with error code {ret}")
+         return ret
+
+     def destroy(self) -> int:
+         """Destroys the LLM instance and releases resources."""
+         if self.llm_handle and self.llm_handle.value:  # Check if handle is not NULL
+             ret = self.lib.rkllm_destroy(self.llm_handle)
+             self.llm_handle = LLMHandle()  # Reset handle
+             if ret != 0:
+                 # Don't raise here as it might be called in __del__
+                 print(f"Warning: rkllm_destroy failed with error code {ret}")
+             return ret
+         return 0  # Already destroyed or not initialized
+
+     def run(self, rkllm_input: RKLLMInput, rkllm_infer_params: RKLLMInferParam, userdata=None) -> int:
+         """Runs an LLM inference task synchronously."""
+         # userdata can be a ctypes.py_object if you want to pass Python objects,
+         # then cast to c_void_p. Or simply None.
+         if userdata is not None:
+             # Store the userdata object to keep it alive during the call
+             self._userdata_ref = userdata
+             c_userdata = ctypes.cast(ctypes.pointer(ctypes.py_object(userdata)), ctypes.c_void_p)
+         else:
+             c_userdata = None
+         ret = self.lib.rkllm_run(self.llm_handle, ctypes.byref(rkllm_input), ctypes.byref(rkllm_infer_params), c_userdata)
+         if ret != 0:
+             raise RuntimeError(f"rkllm_run failed with error code {ret}")
+         return ret
+
+     def run_async(self, rkllm_input: RKLLMInput, rkllm_infer_params: RKLLMInferParam, userdata=None) -> int:
+         """Runs an LLM inference task asynchronously."""
+         if userdata is not None:
+             # Store the userdata object to keep it alive during the call
+             self._userdata_ref = userdata
+             c_userdata = ctypes.cast(ctypes.pointer(ctypes.py_object(userdata)), ctypes.c_void_p)
+         else:
+             c_userdata = None
+         ret = self.lib.rkllm_run_async(self.llm_handle, ctypes.byref(rkllm_input), ctypes.byref(rkllm_infer_params), c_userdata)
+         if ret != 0:
+             raise RuntimeError(f"rkllm_run_async failed with error code {ret}")
+         return ret
+
+     def abort(self) -> int:
+         """Aborts an ongoing LLM task."""
+         ret = self.lib.rkllm_abort(self.llm_handle)
+         if ret != 0:
+             raise RuntimeError(f"rkllm_abort failed with error code {ret}")
+         return ret
+
+     def is_running(self) -> bool:
+         """Checks if an LLM task is currently running. Returns True if running."""
+         # The C API returns 0 if running, non-zero otherwise.
+         # This is a bit counter-intuitive for a boolean "is_running".
+         return self.lib.rkllm_is_running(self.llm_handle) == 0
+
+     def clear_kv_cache(self, keep_system_prompt: bool, start_pos: list = None, end_pos: list = None) -> int:
+         """
+         Clears the key-value cache.
+
+         This function clears part or all of the KV cache.
+
+         Parameters:
+         - keep_system_prompt: whether to keep the system prompt in the cache
+           (True keeps it, False clears it). This flag is ignored when an explicit
+           range [start_pos, end_pos) is provided.
+         - start_pos: array of start positions (inclusive) of the KV cache ranges
+           to clear, one per batch.
+         - end_pos: array of end positions (exclusive) of the KV cache ranges to
+           clear, one per batch.
+           If both start_pos and end_pos are None, the entire cache is cleared and
+           keep_system_prompt takes effect. If start_pos[i] < end_pos[i], only the
+           specified range is cleared and keep_system_prompt is ignored.
+
+         Note: start_pos/end_pos are only valid when keep_history == 0 and
+         generation has been paused by returning 1 from the callback.
+
+         Returns: 0 if the cache was cleared successfully, non-zero on failure.
+         """
+         # Prepare the C array arguments
+         c_start_pos = None
+         c_end_pos = None
+
+         if start_pos is not None and end_pos is not None:
+             if len(start_pos) != len(end_pos):
+                 raise ValueError("start_pos and end_pos arrays must have the same length")
+
+             # Create C arrays
+             c_start_pos = (ctypes.c_int * len(start_pos))(*start_pos)
+             c_end_pos = (ctypes.c_int * len(end_pos))(*end_pos)
+
+         ret = self.lib.rkllm_clear_kv_cache(
+             self.llm_handle,
+             ctypes.c_int(1 if keep_system_prompt else 0),
+             c_start_pos,
+             c_end_pos
+         )
+         if ret != 0:
+             raise RuntimeError(f"rkllm_clear_kv_cache failed with error code {ret}")
+         return ret
+
+     def set_chat_template(self, system_prompt: str, prompt_prefix: str, prompt_postfix: str) -> int:
+         """Sets the chat template for the LLM."""
+         c_system = system_prompt.encode('utf-8') if system_prompt else b""
+         c_prefix = prompt_prefix.encode('utf-8') if prompt_prefix else b""
+         c_postfix = prompt_postfix.encode('utf-8') if prompt_postfix else b""
+
+         ret = self.lib.rkllm_set_chat_template(self.llm_handle, c_system, c_prefix, c_postfix)
+         if ret != 0:
+             raise RuntimeError(f"rkllm_set_chat_template failed with error code {ret}")
+         return ret
+
+     def get_kv_cache_size(self, n_batch: int) -> list:
+         """
+         Gets the current size of the key-value cache for the given LLM handle.
+
+         Returns the total number of positions currently stored in the model's
+         KV cache.
+
+         Parameters:
+         - n_batch: number of batches, used to size the returned array.
+
+         Returns:
+         - list: per-batch cache sizes.
+         """
+         # Pre-allocate an array to hold the per-batch cache sizes
+         cache_sizes = (ctypes.c_int * n_batch)()
+
+         ret = self.lib.rkllm_get_kv_cache_size(self.llm_handle, cache_sizes)
+         if ret != 0:
+             raise RuntimeError(f"rkllm_get_kv_cache_size failed with error code {ret}")
+
+         # Convert to a Python list
+         return [cache_sizes[i] for i in range(n_batch)]
+
+     def set_function_tools(self, system_prompt: str, tools: str, tool_response_str: str) -> int:
+         """
+         Sets the function-calling configuration for the LLM, including the system
+         prompt, tool definitions, and the tool-response token.
+
+         Parameters:
+         - system_prompt: system prompt defining the language model's context or behavior.
+         - tools: JSON-formatted string defining the available functions, including
+           their names, descriptions, and parameters.
+         - tool_response_str: unique tag identifying function-call results in the
+           conversation. It acts as a marker tag that lets the tokenizer recognize
+           tool output separately from normal conversation turns.
+
+         Returns: 0 if the configuration was set successfully, non-zero on error.
+         """
+         c_system = system_prompt.encode('utf-8') if system_prompt else b""
+         c_tools = tools.encode('utf-8') if tools else b""
+         c_tool_response = tool_response_str.encode('utf-8') if tool_response_str else b""
+
+         ret = self.lib.rkllm_set_function_tools(self.llm_handle, c_system, c_tools, c_tool_response)
+         if ret != 0:
+             raise RuntimeError(f"rkllm_set_function_tools failed with error code {ret}")
+         return ret
+
+     def set_cross_attn_params(self, cross_attn_params: RKLLMCrossAttnParam) -> int:
+         """
+         Sets the cross-attention parameters for the LLM decoder.
+
+         Parameters:
+         - cross_attn_params: structure containing the encoder-side input data used
+           for cross attention (see RKLLMCrossAttnParam for details).
+
+         Returns: 0 if the parameters were set successfully, non-zero on error.
+         """
+         ret = self.lib.rkllm_set_cross_attn_params(self.llm_handle, ctypes.byref(cross_attn_params))
+         if ret != 0:
+             raise RuntimeError(f"rkllm_set_cross_attn_params failed with error code {ret}")
+         return ret
+
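# ---- Editor's note (example, not part of the committed file) ----
# A minimal sketch of populating RKLLMCrossAttnParam from contiguous float32
# numpy buffers. The shapes follow the layouts documented on the struct; the
# dimension values are placeholders, and the numpy arrays must stay alive for
# as long as the runtime may read them:
#
#     num_layers, num_tokens, num_kv_heads, head_dim = 28, 750, 8, 128
#     k = np.zeros((num_layers, num_tokens, num_kv_heads, head_dim), np.float32)
#     v = np.zeros((num_layers, num_kv_heads, head_dim, num_tokens), np.float32)
#     mask = np.ones(num_tokens, np.float32)
#     pos = np.arange(num_tokens, dtype=np.int32)
#
#     p = RKLLMCrossAttnParam()
#     p.encoder_k_cache = k.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
#     p.encoder_v_cache = v.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
#     p.encoder_mask = mask.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
#     p.encoder_pos = pos.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
#     p.num_tokens = num_tokens
#     rt.set_cross_attn_params(p)  # rt: an initialized RKLLMRuntime with use_cross_attn set
# -----------------------------------------------------------------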
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         self.destroy()
+
+     def __del__(self):
+         self.destroy()  # Ensure resources are freed if object is garbage collected
+
+     def _callback_trampoline(self, result_ptr, userdata_ptr, state_enum):
+         """
+         Bridge callback that forwards to the currently active Python handler.
+         This keeps the C callback pointer stable while allowing per-call overrides.
+         """
+         handler = self._user_callback
+         if handler is None:
+             return 0
+         try:
+             return handler(result_ptr, userdata_ptr, state_enum)
+         except Exception as exc:
+             # Avoid propagating exceptions through the C callback boundary.
+             print(f"[rkllm_binding] Callback raised an exception: {exc}")
+             return 0
+
+     def forward_embed(
+         self,
+         embeds: np.ndarray,
+         *,
+         keep_history: bool = False,
+         timeout: Optional[float] = None,
+         return_last_only: bool = False,
+     ) -> np.ndarray:
+         """
+         Run a single forward pass with embedding input and return the last hidden layer.
+
+         Args:
+             embeds: Float32 embeddings shaped (T, H) or (1, T, H). Batch>1 is not supported.
+             keep_history: When False, KV cache will be cleared after the call. When True,
+                 cache is kept; call clear_kv_cache() manually if needed.
+             timeout: Optional timeout (seconds) for waiting on the callback.
+             return_last_only: If True, return the last token vector shape (H,).
+
+         Returns:
+             np.ndarray containing hidden states (T, H) or the last token (H,).
+         """
+         if embeds is None:
+             raise ValueError("embeds must not be None.")
+
+         np_embeds = np.asarray(embeds, dtype=np.float32)
+         if np_embeds.ndim == 3:
+             if np_embeds.shape[0] != 1:
+                 raise ValueError("Only batch size 1 is supported for forward_embed.")
+             num_tokens = np_embeds.shape[1]
+             flat = np_embeds.reshape(-1)
+         elif np_embeds.ndim == 2:
+             num_tokens = np_embeds.shape[0]
+             flat = np_embeds.reshape(-1)
+         else:
+             raise ValueError("embeds must have shape (T, H) or (1, T, H).")
+
+         flat = np.ascontiguousarray(flat, dtype=np.float32)
+         embed_buffer = (ctypes.c_float * flat.size)(*flat)
+
+         rk_input = RKLLMInput()
+         rk_input.input_type = RKLLMInputType.RKLLM_INPUT_EMBED
+         embed_input = RKLLMEmbedInput()
+         # Note: ctypes arrays don't auto-convert when assigned to POINTER struct
+         # fields (only in function calls), so cast explicitly.
+         embed_input.embed = ctypes.cast(embed_buffer, ctypes.POINTER(ctypes.c_float))
+         embed_input.n_tokens = num_tokens
+         rk_input._union_data.embed_input = embed_input
+
+         infer_params = RKLLMInferParam()
+         infer_params.mode = RKLLMInferMode.RKLLM_INFER_GET_LAST_HIDDEN_LAYER
+         infer_params.keep_history = 1 if keep_history else 0
+         infer_params.lora_params = None
+         infer_params.prompt_cache_params = None
+
+         done = threading.Event()
+         result_holder = {"hidden": None, "error": None}
+
+         def _capture_hidden(result_ptr, userdata_ptr, state_enum):
+             state = LLMCallState(state_enum)
+             if state == LLMCallState.RKLLM_RUN_ERROR:
+                 result_holder["error"] = "RKLLM reported an error state."
+                 done.set()
+                 return 0
+
+             if not result_ptr:
+                 result_holder["error"] = "Empty result pointer received."
+                 done.set()
+                 return 0
+
+             result = result_ptr.contents
+             if result.last_hidden_layer.hidden_states and result.last_hidden_layer.embd_size > 0:
+                 hidden = np.ctypeslib.as_array(
+                     result.last_hidden_layer.hidden_states,
+                     shape=(result.last_hidden_layer.num_tokens, result.last_hidden_layer.embd_size),
+                 ).copy()
+                 result_holder["hidden"] = hidden[-1].copy() if return_last_only else hidden
+                 done.set()
+                 return 1  # Pause further work; we already have the hidden states.
+
+             if state == LLMCallState.RKLLM_RUN_FINISH:
+                 done.set()
+             return 0
+
+         previous_callback = self._user_callback
+         self._user_callback = _capture_hidden
+         try:
+             self.run(rk_input, infer_params)
+             if not done.wait(timeout):
+                 raise TimeoutError("forward_embed timed out waiting for hidden states.")
+         finally:
+             self._user_callback = previous_callback
+
+         if result_holder["error"]:
+             raise RuntimeError(result_holder["error"])
+         if result_holder["hidden"] is None:
+             raise RuntimeError("forward_embed did not receive hidden states.")
+
+         try:
+             if not keep_history:
+                 self.clear_kv_cache(True)
+         except Exception:
+             # Cache clearing is best-effort; keep the forward result usable even if clearing fails.
+             pass
+
+         return result_holder["hidden"]
+
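# ---- Editor's note (example, not part of the committed file) ----
# A minimal sketch of forward_embed on an initialized runtime "rt". The hidden
# size 1024 is a placeholder, and random embeddings stand in for the audio
# encoder output:
#
#     embeds = np.random.randn(10, 1024).astype(np.float32)   # (T, H)
#     hidden = rt.forward_embed(embeds, timeout=30.0)
#     print(hidden.shape)                                      # (10, llm_hidden_size)
#     last = rt.forward_embed(embeds, return_last_only=True)   # shape (llm_hidden_size,)
# -----------------------------------------------------------------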
859
+ # --- Demo CLI ---
860
+ def _cli_parse_arguments() -> argparse.Namespace:
861
+ parser = argparse.ArgumentParser(
862
+ description="Demo application showcasing rkllm_binding usage."
863
+ )
864
+ parser.add_argument(
865
+ "model",
866
+ help="Path to the .rkllm model file used for inference."
867
+ )
868
+ parser.add_argument(
869
+ "--lib",
870
+ default="./librkllmrt.so",
871
+ help="Path to librkllmrt.so. Defaults to ./librkllmrt.so."
872
+ )
873
+
874
+ # Core generation parameters
875
+ parser.add_argument("--max-context-len", type=int, default=512, help="Maximum context length.")
876
+ parser.add_argument("--max-new-tokens", type=int, default=128, help="Maximum number of new tokens to generate.")
877
+ parser.add_argument("--top-k", type=int, default=1, help="Top-K sampling parameter.")
878
+ parser.add_argument("--top-p", type=float, default=0.0, help="Top-P (nucleus) sampling parameter.")
879
+ parser.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature.")
880
+ parser.add_argument("--repeat-penalty", type=float, default=1.1, help="Penalty applied to repeated tokens.")
881
+ parser.add_argument("--n-keep", type=int, default=0, help="Number of tokens to keep when context slides.")
882
+ parser.add_argument("--mirostat", type=int, default=0, help="Enable Mirostat sampling (0 disables).")
883
+ parser.add_argument("--mirostat-tau", type=float, default=5.0, help="Mirostat tau parameter.")
884
+ parser.add_argument("--mirostat-eta", type=float, default=0.1, help="Mirostat eta parameter.")
885
+ parser.add_argument(
886
+ "--skip-special-token",
887
+ action="store_true",
888
+ help="Skip special tokens when generating output."
889
+ )
890
+
891
+ # Input management
892
+ parser.add_argument(
893
+ "--input-type",
894
+ choices=("prompt", "token", "multimodal"),
895
+ default="prompt",
896
+ help="Select prompt, raw token, or multimodal (image + prompt) input."
897
+ )
898
+ parser.add_argument("--prompt", help="Prompt text to send to the model.")
899
+ parser.add_argument("--prompt-file", help="Path to a UTF-8 text file containing the prompt.")
900
+ parser.add_argument(
901
+ "--token-ids",
902
+ type=int,
903
+ nargs="+",
904
+ help="Raw token IDs (space separated). Only valid when --input-type token."
905
+ )
906
+ parser.add_argument("--role", default="user", help="Role metadata for the input message (e.g., user/system).")
907
+ parser.add_argument(
908
+ "--enable-thinking",
909
+ action="store_true",
910
+ help="Enable thinking mode for supported models."
911
+ )
912
+ parser.add_argument("--image", help="Path to an image file used when --input-type multimodal.")
913
+ parser.add_argument("--vision-encoder", help="Path to the ONNX vision encoder model.")
914
+ parser.add_argument(
915
+ "--encoder-provider",
916
+ help="Comma-separated ONNX Runtime providers (e.g., 'CPUExecutionProvider')."
917
+ )
918
+ parser.add_argument(
919
+ "--encoder-threads",
920
+ type=int,
921
+ help="Thread count hint for ONNX Runtime session."
922
+ )
923
+ parser.add_argument(
924
+ "--encoder-input-shape",
925
+ help="Override encoder input spatial size as HxW or H,W (e.g., 392x392)."
926
+ )
927
+ parser.add_argument(
928
+ "--norm",
929
+ choices=("imagenet", "divide_255", "divide_128_sub_1"),
930
+ default="imagenet",
931
+ help="Image normalization preset."
932
+ )
933
+ parser.add_argument(
934
+ "--norm-mean",
935
+ type=float,
936
+ nargs=3,
937
+ metavar=("R", "G", "B"),
938
+ help="Override normalization mean (RGB order)."
939
+ )
940
+ parser.add_argument(
941
+ "--norm-std",
942
+ type=float,
943
+ nargs=3,
944
+ metavar=("R", "G", "B"),
945
+ help="Override normalization std (RGB order)."
946
+ )
947
+ parser.add_argument(
948
+ "--image-background",
949
+ type=int,
950
+ nargs=3,
951
+ metavar=("R", "G", "B"),
952
+ default=(128, 128, 128),
953
+ help="Background color used when padding image to target size."
954
+ )
955
+ parser.add_argument("--img-start-token", help="Override image start token string passed to the model.")
956
+ parser.add_argument("--img-end-token", help="Override image end token string passed to the model.")
957
+ parser.add_argument("--img-content-token", help="Override image content token string passed to the model.")
958
+
959
+ # Inference options
960
+ parser.add_argument(
961
+ "--mode",
962
+ choices=("generate", "hidden", "logits"),
963
+ default="generate",
964
+ help="Inference mode: generate tokens, return last hidden layer, or logits."
965
+ )
966
+ parser.add_argument(
967
+ "--no-keep-history",
968
+ action="store_true",
969
+ help="Do not keep dialogue history on the device."
970
+ )
971
+
972
+ # Output options
973
+ parser.add_argument(
974
+ "--stream",
975
+ action=argparse.BooleanOptionalAction,
+ default=True,
+ help="Stream tokens to stdout as they arrive from the callback (use --no-stream to disable)."
978
+ )
979
+ parser.add_argument(
980
+ "--hide-stats",
981
+ action="store_true",
982
+ help="Suppress performance statistics after inference."
983
+ )
984
+
985
+ args = parser.parse_args()
986
+
987
+ if args.prompt and args.prompt_file:
988
+ parser.error("Arguments --prompt and --prompt-file cannot be used together.")
989
+
990
+ if args.input_type == "prompt":
991
+ if not args.prompt and not args.prompt_file:
992
+ parser.error("Provide --prompt or --prompt-file when --input-type is prompt.")
993
+ if args.token_ids:
994
+ parser.error("--token-ids is only valid when --input-type token.")
995
+ elif args.input_type == "token":
996
+ if not args.token_ids:
997
+ parser.error("--token-ids is required when --input-type token.")
998
+ if args.prompt or args.prompt_file:
999
+ parser.error("--prompt/--prompt-file cannot be combined with --input-type token.")
1000
+ else: # multimodal
1001
+ if args.token_ids:
1002
+ parser.error("--token-ids cannot be used with --input-type multimodal.")
1003
+ if not args.prompt and not args.prompt_file:
1004
+ parser.error("Provide --prompt or --prompt-file when --input-type is multimodal.")
1005
+ if not args.image:
1006
+ parser.error("--image is required when --input-type multimodal.")
1007
+ if not args.vision_encoder:
1008
+ parser.error("--vision-encoder is required when --input-type multimodal.")
1009
+
1010
+ if args.image_background:
1011
+ for component in args.image_background:
1012
+ if component < 0 or component > 255:
1013
+ parser.error("--image-background values must be in the range [0, 255].")
1014
+
1015
+ return args
1016
+
1017
+
1018
+ def _load_prompt_from_args(args: argparse.Namespace) -> str:
1019
+ if args.prompt:
1020
+ return args.prompt
1021
+ if args.prompt_file:
1022
+ try:
1023
+ with open(args.prompt_file, "r", encoding="utf-8") as fp:
1024
+ return fp.read()
1025
+ except OSError as exc:
1026
+ raise RuntimeError(f"Failed to read prompt file '{args.prompt_file}': {exc}") from exc
1027
+ raise RuntimeError("Prompt text is required but not provided.")
1028
+
1029
+
1030
+ def _mode_to_enum(mode: str) -> int:
1031
+ mapping = {
1032
+ "generate": RKLLMInferMode.RKLLM_INFER_GENERATE,
1033
+ "hidden": RKLLMInferMode.RKLLM_INFER_GET_LAST_HIDDEN_LAYER,
1034
+ "logits": RKLLMInferMode.RKLLM_INFER_GET_LOGITS,
1035
+ }
1036
+ return mapping[mode]
1037
+
1038
+
1039
+ def _parse_hw_string(value: str) -> Tuple[int, int]:
1040
+ separators = ("x", "X", ",", " ")
1041
+ token = value.strip()
1042
+ for sep in separators:
1043
+ if sep in token:
1044
+ parts = [p for p in token.split(sep) if p]
1045
+ break
1046
+ else:
1047
+ parts = [token]
1048
+ if len(parts) != 2:
1049
+ raise ValueError(f"Unable to parse height/width from '{value}'. Expected format like 392x392.")
1050
+ try:
1051
+ height = int(parts[0])
1052
+ width = int(parts[1])
1053
+ except ValueError as exc:
1054
+ raise ValueError(f"Height/width must be integers, got '{value}'.") from exc
1055
+ if height <= 0 or width <= 0:
1056
+ raise ValueError("Height and width must be positive integers.")
1057
+ return height, width
1058
+
1059
+
1060
+ def _infer_hw_from_onnx_shape(shape: Sequence) -> Tuple[Optional[int], Optional[int]]:
1061
+ if shape is None or len(shape) < 4:
1062
+ return None, None
1063
+ height = shape[-2]
1064
+ width = shape[-1]
1065
+ if isinstance(height, str) or height is None:
1066
+ height = None
1067
+ if isinstance(width, str) or width is None:
1068
+ width = None
1069
+ return height, width
1070
+
1071
+
1072
+ def _parse_providers(provider_str: Optional[str]) -> Optional[list]:
1073
+ if not provider_str:
1074
+ return None
1075
+ providers = [item.strip() for item in provider_str.split(",") if item.strip()]
1076
+ return providers or None
1077
+
1078
+
1079
+ def _load_vision_encoder_session(encoder_path: str, providers: Optional[list], threads: Optional[int]):
1080
+ try:
1081
+ import onnxruntime as ort
1082
+ except ImportError as exc:
1083
+ raise RuntimeError("onnxruntime is required for multimodal inference. Please install onnxruntime.") from exc
1084
+
1085
+ sess_options = ort.SessionOptions()
1086
+ if threads and threads > 0:
1087
+ sess_options.intra_op_num_threads = threads
1088
+ try:
1089
+ if providers:
1090
+ session = ort.InferenceSession(encoder_path, sess_options=sess_options, providers=providers)
1091
+ else:
1092
+ session = ort.InferenceSession(encoder_path, sess_options=sess_options)
1093
+ except Exception as exc:
1094
+ raise RuntimeError(f"Failed to load vision encoder '{encoder_path}': {exc}") from exc
1095
+ return session
1096
+
1097
+
1098
+ def _letterbox_resize(image, target_hw: Tuple[int, int], background_color: Sequence[int]):
1099
+ try:
1100
+ import cv2
1101
+ import numpy as np
1102
+ except ImportError as exc:
1103
+ raise RuntimeError("OpenCV (cv2) and numpy are required for multimodal preprocessing.") from exc
1104
+
1105
+ target_h, target_w = target_hw
1106
+ if image.ndim != 3 or image.shape[2] != 3:
1107
+ raise RuntimeError("Expected RGB image with 3 channels.")
1108
+
1109
+ src_h, src_w = image.shape[:2]
1110
+ if src_h == 0 or src_w == 0:
1111
+ raise RuntimeError("Loaded image has invalid dimensions.")
1112
+
1113
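+ # Fit the image inside the target while preserving aspect ratio, then center it on a padded canvas (letterboxing).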
+ scale = min(target_w / src_w, target_h / src_h)
1114
+ resized_w = max(1, int(round(src_w * scale)))
1115
+ resized_h = max(1, int(round(src_h * scale)))
1116
+ resized = cv2.resize(image, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR)
1117
+
1118
+ canvas = np.full((target_h, target_w, 3), background_color, dtype=resized.dtype)
1119
+ top = (target_h - resized_h) // 2
1120
+ left = (target_w - resized_w) // 2
1121
+ canvas[top:top + resized_h, left:left + resized_w] = resized
1122
+ return canvas, resized_h, resized_w
1123
+
1124
+
1125
+ def _normalize_image(image, method: str, mean: Optional[Sequence[float]], std: Optional[Sequence[float]]):
1126
+ import numpy as np
1127
+
1128
+ img = image.astype(np.float32)
1129
+ mean_arr = np.array(mean, dtype=np.float32) if mean else None
1130
+ std_arr = np.array(std, dtype=np.float32) if std else None
1131
+
1132
+ if method == "imagenet":
1133
+ img = img / 255.0
1134
+ if mean_arr is None:
1135
+ mean_arr = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
1136
+ if std_arr is None:
1137
+ std_arr = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)
1138
+ img = (img - mean_arr) / std_arr
1139
+ elif method == "divide_255":
1140
+ img = img / 255.0
1141
+ if mean_arr is not None:
1142
+ img = img - mean_arr
1143
+ if std_arr is not None:
1144
+ img = img / std_arr
1145
+ elif method == "divide_128_sub_1":
1146
+ img = img / 128.0 - 1.0
1147
+ if mean_arr is not None:
1148
+ img = img - mean_arr
1149
+ if std_arr is not None:
1150
+ img = img / std_arr
1151
+ else:
1152
+ raise RuntimeError(f"Unsupported normalization method '{method}'.")
1153
+
1154
+ return img
1155
+
1156
+
1157
+ def _encode_image_to_embedding(
1158
+ session,
1159
+ image_path: str,
1160
+ input_name: str,
1161
+ output_name: str,
1162
+ target_hw: Tuple[int, int],
1163
+ background_color: Sequence[int],
1164
+ norm_method: str,
1165
+ norm_mean: Optional[Sequence[float]],
1166
+ norm_std: Optional[Sequence[float]]
1167
+ ):
1168
+ try:
1169
+ import cv2
1170
+ import numpy as np
1171
+ except ImportError as exc:
1172
+ raise RuntimeError("OpenCV (cv2) and numpy are required for multimodal preprocessing.") from exc
1173
+
1174
+ image = cv2.imread(image_path, cv2.IMREAD_COLOR)
1175
+ if image is None:
1176
+ raise RuntimeError(f"Failed to read image from '{image_path}'.")
1177
+
1178
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
1179
+ padded, resized_h, resized_w = _letterbox_resize(image, target_hw, background_color)
1180
+
1181
+ normalized = _normalize_image(padded, norm_method, norm_mean, norm_std)
1182
+ tensor = np.transpose(normalized, (2, 0, 1)) # HWC -> CHW
1183
+ tensor = np.expand_dims(tensor, axis=0) # Add batch dimension
1184
+ tensor = np.ascontiguousarray(tensor, dtype=np.float32)
1185
+
1186
+ try:
1187
+ output_list = session.run([output_name], {input_name: tensor})
1188
+ except Exception as exc:
1189
+ raise RuntimeError(f"Vision encoder inference failed: {exc}") from exc
1190
+
1191
+ if not output_list:
1192
+ raise RuntimeError("Vision encoder returned no outputs.")
1193
+
1194
+ embedding = output_list[0]
1195
+ if embedding.ndim == 3:
1196
+ if embedding.shape[0] != 1:
1197
+ raise RuntimeError("Vision encoder output batch dimension must be 1 for a single image.")
1198
+ n_tokens = embedding.shape[1]
1199
+ elif embedding.ndim == 2:
1200
+ n_tokens = embedding.shape[0]
1201
+ else:
1202
+ raise RuntimeError(f"Unsupported vision encoder output shape {embedding.shape}.")
1203
+
1204
+ flat_embedding = embedding.reshape(-1).astype(np.float32, copy=False)
1205
+ flat_embedding = np.ascontiguousarray(flat_embedding)
1206
+
1207
+ return flat_embedding, n_tokens, target_hw
1208
+
1209
+ if __name__ == "__main__":
1210
+ import os
1211
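+ # Enable RKLLM runtime logging before the library is loaded.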
+ os.environ["RKLLM_LOG_LEVEL"] = "1"
1212
+ args = _cli_parse_arguments()
1213
+
1214
+ prompt_text = None
1215
+ if args.input_type in ("prompt", "multimodal"):
1216
+ prompt_text = _load_prompt_from_args(args)
1217
+
1218
+ token_id_array = None
1219
+ token_input_struct = None
1220
+
1221
+ generated_chunks = []
1222
+ perf_snapshot = {
1223
+ "prefill_tokens": 0,
1224
+ "prefill_time_ms": 0.0,
1225
+ "generate_tokens": 0,
1226
+ "generate_time_ms": 0.0,
1227
+ "memory_usage_mb": 0.0,
1228
+ }
1229
+
1230
+ def demo_callback(result_ptr, userdata_ptr, state_enum):
1231
+ state = LLMCallState(state_enum)
1232
+ result = result_ptr.contents
1233
+
1234
+ current_text = ""
1235
+ if result.text:
1236
+ current_text = result.text.decode("utf-8", errors="ignore")
1237
+ generated_chunks.append(current_text)
1238
+ if args.stream and current_text:
1239
+ print(current_text, end="", flush=True)
1240
+
1241
+ perf_snapshot.update(
1242
+ prefill_tokens=result.perf.prefill_tokens,
1243
+ prefill_time_ms=result.perf.prefill_time_ms,
1244
+ generate_tokens=result.perf.generate_tokens,
1245
+ generate_time_ms=result.perf.generate_time_ms,
1246
+ memory_usage_mb=result.perf.memory_usage_mb,
1247
+ )
1248
+
1249
+ if state == LLMCallState.RKLLM_RUN_ERROR:
1250
+ print("\n[Callback] An error occurred during inference.")
1251
+
1252
+ return 0
1253
+
1254
+ try:
1255
+ with RKLLMRuntime(library_path=args.lib) as rk_llm:
1256
+ params = rk_llm.create_default_param()
1257
+ params.model_path = os.path.abspath(args.model).encode("utf-8")
1258
+ params.max_context_len = args.max_context_len
1259
+ params.max_new_tokens = args.max_new_tokens
1260
+ params.top_k = args.top_k
1261
+ params.top_p = float(args.top_p)
1262
+ params.temperature = float(args.temperature)
1263
+ params.repeat_penalty = float(args.repeat_penalty)
1264
+ params.n_keep = args.n_keep
1265
+ params.mirostat = args.mirostat
1266
+ params.mirostat_tau = float(args.mirostat_tau)
1267
+ params.mirostat_eta = float(args.mirostat_eta)
1268
+ params.skip_special_token = bool(args.skip_special_token)
1269
+ params.is_async = False
1270
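+ # Wire the optional image-token overrides into the runtime params. These are
+ # the same RKLLMParam fields (img_start/img_end/img_content) that
+ # run_qwen3_asr_e2e.py sets for its audio tokens.
+ if args.img_start_token:
+ params.img_start = args.img_start_token.encode("utf-8")
+ if args.img_end_token:
+ params.img_end = args.img_end_token.encode("utf-8")
+ if args.img_content_token:
+ params.img_content = args.img_content_token.encode("utf-8")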
+
1271
+ rk_llm.init(params, demo_callback)
1272
+
1273
+ rk_input = RKLLMInput()
1274
+ rk_input.role = args.role.encode("utf-8")
1275
+ rk_input.enable_thinking = bool(args.enable_thinking)
1276
+
1277
+ if args.input_type == "prompt":
1278
+ rk_input.input_type = RKLLMInputType.RKLLM_INPUT_PROMPT
1279
+ rk_input._union_data.prompt_input = prompt_text.encode("utf-8")
1280
+ elif args.input_type == "token":
+ rk_input.input_type = RKLLMInputType.RKLLM_INPUT_TOKEN
+ token_id_array = (ctypes.c_int32 * len(args.token_ids))(*args.token_ids)
+ token_input_struct = RKLLMTokenInput()
+ token_input_struct.input_ids = token_id_array
+ token_input_struct.n_tokens = len(args.token_ids)
+ rk_input._union_data.token_input = token_input_struct
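+ else: # multimodal
+ # Sketch of the multimodal path built from the helpers above. It assumes
+ # the binding exposes the same multimodal_input fields used by
+ # run_qwen3_asr_e2e.py (prompt, image_embed, n_image_tokens, n_image,
+ # image_height, image_width) through this file's _union_data accessor.
+ session = _load_vision_encoder_session(
+ args.vision_encoder,
+ _parse_providers(args.encoder_provider),
+ args.encoder_threads,
+ )
+ encoder_input = session.get_inputs()[0]
+ encoder_output = session.get_outputs()[0]
+ if args.encoder_input_shape:
+ target_hw = _parse_hw_string(args.encoder_input_shape)
+ else:
+ height, width = _infer_hw_from_onnx_shape(encoder_input.shape)
+ if height is None or width is None:
+ raise RuntimeError("Encoder input size is dynamic; pass --encoder-input-shape.")
+ target_hw = (height, width)
+ embedding, n_image_tokens, _ = _encode_image_to_embedding(
+ session,
+ args.image,
+ encoder_input.name,
+ encoder_output.name,
+ target_hw,
+ args.image_background,
+ args.norm,
+ args.norm_mean,
+ args.norm_std,
+ )
+ # RKLLM multimodal prompts must contain the literal "<image>" placeholder.
+ if "<image>" not in prompt_text:
+ prompt_text = "<image>" + prompt_text
+ rk_input.input_type = RKLLMInputType.RKLLM_INPUT_MULTIMODAL
+ mm_input = rk_input._union_data.multimodal_input
+ mm_input.prompt = prompt_text.encode("utf-8")
+ mm_input.image_embed = embedding.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
+ mm_input.n_image_tokens = n_image_tokens
+ mm_input.n_image = 1
+ mm_input.image_height, mm_input.image_width = target_hw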
1287
+
1288
+ infer_params = RKLLMInferParam()
1289
+ infer_params.mode = _mode_to_enum(args.mode)
1290
+ infer_params.keep_history = 0 if args.no_keep_history else 1
1291
+ infer_params.lora_params = None
1292
+ infer_params.prompt_cache_params = None
1293
+
1294
+ if args.stream:
1295
+ print("=== Streaming Output ===")
1296
+
1297
+ rk_llm.run(rk_input, infer_params)
1298
+
1299
+ except OSError as exc:
1300
+ print(f"Failed to load the RKLLM runtime library: {exc}")
1301
+ except RuntimeError as exc:
1302
+ print(f"Inference failed: {exc}")
1303
+ except Exception as exc:
1304
+ print(f"Unexpected error: {exc}")
1305
+ else:
1306
+ if args.stream:
1307
+ print() # Ensure newline after streaming output
1308
+
1309
+ final_text = "".join(generated_chunks)
1310
+ if final_text:
1311
+ print("=== Generation Result ===")
1312
+ print(final_text)
1313
+ else:
1314
+ print("No generated text was received.")
1315
+
1316
+ if not args.hide_stats:
1317
+ print("=== Performance Statistics ===")
1318
+ print(
1319
+ f"Prefill: {perf_snapshot['prefill_tokens']} tokens / {perf_snapshot['prefill_time_ms']:.2f} ms"
1320
+ )
1321
+ print(
1322
+ f"Generate: {perf_snapshot['generate_tokens']} tokens / {perf_snapshot['generate_time_ms']:.2f} ms"
1323
+ )
1324
+ print(f"Peak resident memory: {perf_snapshot['memory_usage_mb']:.2f} MB")
rknn/audio_encoder.rknn ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdf3c0ae8f3921061115747cc0bff82011c6444083d8e48fcea5b219a3af62a0
3
+ size 643043907
rknn/language_model.rkllm ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ce12ddf59600513439d8ec597c5f83631a872ba457c3ca967c9e36992532a8d
3
+ size 4092852588
run_qwen3_asr_e2e.py ADDED
@@ -0,0 +1,333 @@
1
+ import argparse
2
+ import ctypes
3
+ import math
4
+ import os
5
+ import sys
6
+ import time
7
+ from pathlib import Path
8
+
9
+ import faulthandler
10
+ import numpy as np
11
+ import soundfile as sf
12
+ from scipy.signal import resample_poly
13
+ from transformers import WhisperFeatureExtractor
14
+
15
+ faulthandler.enable()
16
+ os.environ.setdefault("RKLLM_LOG_LEVEL", "1")
17
+
18
+
19
+ REPO_ROOT = Path(__file__).resolve().parents[1]
20
+ if str(REPO_ROOT) not in sys.path:
21
+ sys.path.insert(0, str(REPO_ROOT))
22
+
23
+ from rkllm_binding import ( # noqa: E402
24
+ LLMCallState,
25
+ RKLLMInferMode,
26
+ RKLLMInferParam,
27
+ RKLLMInput,
28
+ RKLLMInputType,
29
+ RKLLMResult,
30
+ RKLLMRuntime,
31
+ )
32
+
33
+
34
+ import ztu_somemodelruntime_ez_rknn_async as ort
35
+
36
+
37
+
38
+ DEFAULT_ENCODER_PATH = "rknn/audio_encoder.rknn"
39
+ DEFAULT_LLM_PATH = "rknn/language_model.rkllm"
40
+
41
+
42
+ def now() -> float:
43
+ return time.perf_counter()
44
+
45
+
46
+ class StreamingTextCollector:
47
+ def __init__(self, stream_output: bool = True):
48
+ self.stream_output = stream_output
49
+ self.parts: list[str] = []
50
+ self.error = False
51
+
52
+ def __call__(self, result_ptr, userdata_ptr, state_enum):
53
+ state = LLMCallState(state_enum)
54
+ result: RKLLMResult = result_ptr.contents
55
+
56
+ if state == LLMCallState.RKLLM_RUN_NORMAL and result.text:
57
+ chunk = result.text.decode("utf-8", errors="ignore")
58
+ self.parts.append(chunk)
59
+ if self.stream_output:
60
+ print(chunk, end="", flush=True)
61
+ elif state == LLMCallState.RKLLM_RUN_FINISH and self.stream_output:
62
+ print("(finish)", flush=True)
63
+ elif state == LLMCallState.RKLLM_RUN_ERROR:
64
+ self.error = True
65
+ if self.stream_output:
66
+ print("\nrun error", flush=True)
67
+ return 0
68
+
69
+ @property
70
+ def text(self) -> str:
71
+ return "".join(self.parts)
72
+
73
+
74
+ def load_waveform(audio_path: str, target_sr: int = 16000) -> np.ndarray:
75
+ audio, sr = sf.read(audio_path, dtype="float32", always_2d=False)
76
+ audio = np.asarray(audio, dtype=np.float32)
77
+ if audio.ndim == 2:
78
+ audio = audio.mean(axis=-1)
79
+ if sr != target_sr:
80
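+ # resample_poly needs an integer up/down ratio; reduce both rates by their gcd.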
+ divisor = math.gcd(int(sr), int(target_sr))
81
+ up = int(target_sr // divisor)
82
+ down = int(sr // divisor)
83
+ audio = resample_poly(audio, up=up, down=down).astype(np.float32)
84
+ return audio
85
+
86
+
87
+ def configure_feature_extractor_for_audio(feature_extractor: WhisperFeatureExtractor, waveform: np.ndarray) -> None:
88
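+ # WhisperFeatureExtractor pads or truncates audio to chunk_length seconds
+ # (30 s by default); grow it so longer recordings are not cut off.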
+ required_seconds = max(1, math.ceil(waveform.shape[0] / float(feature_extractor.sampling_rate)))
89
+ if required_seconds <= feature_extractor.chunk_length:
90
+ return
91
+ feature_extractor.chunk_length = required_seconds
92
+ feature_extractor.n_samples = int(required_seconds * feature_extractor.sampling_rate)
93
+ feature_extractor.nb_max_frames = feature_extractor.n_samples // feature_extractor.hop_length
94
+
95
+
96
+ def extract_mel_features(model_path: str, audio_path: str) -> tuple[np.ndarray, int]:
97
+ feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path)
98
+ waveform = load_waveform(audio_path)
99
+ configure_feature_extractor_for_audio(feature_extractor, waveform)
100
+ outputs = feature_extractor(
101
+ waveform,
102
+ sampling_rate=16000,
103
+ return_attention_mask=True,
104
+ return_tensors="np",
105
+ )
106
+ input_features = outputs["input_features"][0].astype(np.float32)
107
+ feature_len = int(outputs["attention_mask"][0].sum())
108
+ return input_features, feature_len
109
+
110
+
111
+ def split_mel_features(input_features: np.ndarray, feature_len: int, chunk_frames: int) -> list[tuple[np.ndarray, int]]:
112
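+ # Slice the mel spectrogram into fixed-size chunks for the static-shape RKNN
+ # encoder; the final chunk is zero-padded up to chunk_frames.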
+ chunks = []
113
+ start = 0
114
+ while start < feature_len:
115
+ cur_len = min(chunk_frames, feature_len - start)
116
+ chunk = np.zeros((input_features.shape[0], chunk_frames), dtype=np.float32)
117
+ chunk[:, :cur_len] = input_features[:, start : start + cur_len]
118
+ chunks.append((chunk, cur_len))
119
+ start += cur_len
120
+ return chunks
121
+
122
+
123
+ def get_chunk_output_length_value(length: int) -> int:
124
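+ # Three successive (value + 1) // 2 steps mirror the encoder's three 2x
+ # temporal downsamplings, so a chunk of N mel frames yields ~ceil(N / 8) tokens.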
+ value = int(length)
125
+ value = (value + 1) // 2
126
+ value = (value + 1) // 2
127
+ value = (value + 1) // 2
128
+ return value
129
+
130
+
131
+ def parse_args():
132
+ parser = argparse.ArgumentParser(description="Run end-to-end Qwen3-ASR with RKNN audio encoder and RKLLM decoder.")
133
+ parser.add_argument("--model-path", type=str, default=".", help="Path to the original Qwen3-ASR model directory.")
134
+ parser.add_argument("--audio-path", type=str, required=True, help="Path to the input audio file.")
135
+ parser.add_argument(
136
+ "--encoder-model-path",
137
+ type=str,
138
+ default=DEFAULT_ENCODER_PATH,
139
+ help="Path to the audio encoder model (.rknn).",
140
+ )
141
+ parser.add_argument(
142
+ "--llm-model-path",
143
+ type=str,
144
+ default=DEFAULT_LLM_PATH,
145
+ help="Path to the exported .rkllm text model.",
146
+ )
147
+ parser.add_argument("--chunk-frames", type=int, default=100, help="Fixed mel chunk length in frames fed to the encoder per call.")
148
+ parser.add_argument("--max-new-tokens", type=int, default=1024, help="Maximum number of new tokens to generate.")
149
+ parser.add_argument("--max-context-len", type=int, default=4096, help="Maximum context length for RKLLM.")
150
+ parser.add_argument("--top-k", type=int, default=5, help="Top-k used by RKLLM decoding.")
151
+ parser.add_argument("--system-prompt", type=str, default="", help="Optional system prompt.")
152
+ parser.add_argument(
153
+ "--force-language",
154
+ type=str,
155
+ default=None,
156
+ help="Optional language suffix, for example 'Chinese'. Appends 'language X<asr_text>' after the assistant prompt.",
157
+ )
158
+ parser.add_argument("--save-audio-features", type=str, default=None, help="Optional path to save concatenated audio features.")
159
+ parser.add_argument("--save-text", type=str, default=None, help="Optional path to save the final decoded text.")
160
+ parser.add_argument("--no-stream", action="store_true", help="Disable streaming stdout from the RKLLM callback.")
161
+ return parser.parse_args()
162
+
163
+
164
+ def build_chat_template(system_prompt: str, force_language) -> tuple[str, str, str]:
165
+ assistant_prefix = ""
166
+ if force_language:
167
+ assistant_prefix = f"language {force_language}<asr_text>"
168
+ return (
169
+ f"<|im_start|>system\n{system_prompt or ''}<|im_end|>\n",
170
+ "<|im_start|>user\n",
171
+ f"<|im_end|>\n<|im_start|>assistant\n{assistant_prefix}",
172
+ )
173
+
174
+
175
+ def run_audio_encoder(
176
+ session,
177
+ input_features: np.ndarray,
178
+ feature_len: int,
179
+ chunk_frames: int,
180
+ ) -> np.ndarray:
181
+ outputs = []
182
+ for chunk, chunk_len in split_mel_features(input_features, feature_len, chunk_frames):
183
+ session_outputs = session.run(
184
+ None,
185
+ {
186
+ "input_features": np.ascontiguousarray(chunk[None, ...], dtype=np.float32),
187
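+ # The fixed-shape RKNN export takes only input_features; the valid output
+ # length for each chunk is recovered below instead of being passed in.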
+ #"feature_len": np.asarray([chunk_len], dtype=np.int32),
188
+ },
189
+ )
190
+ audio_features = np.asarray(session_outputs[0], dtype=np.float32)
191
+ if len(session_outputs) >= 2:
192
+ valid_len = int(np.asarray(session_outputs[1]).reshape(-1)[0])
193
+ else:
194
+ valid_len = get_chunk_output_length_value(chunk_len)
195
+ outputs.append(audio_features[0, :valid_len])
196
+ return np.concatenate(outputs, axis=0) if outputs else np.zeros((0, 2048), dtype=np.float32)
197
+
198
+ def load_rkllm(
199
+ llm_model_path: str,
200
+ max_new_tokens: int,
201
+ max_context_len: int,
202
+ top_k: int,
203
+ system_prompt: str,
204
+ force_language,
205
+ stream_output: bool,
206
+ ):
207
+ collector = StreamingTextCollector(stream_output=stream_output)
208
+ rk_llm = RKLLMRuntime()
209
+ param = rk_llm.create_default_param()
210
+
211
+ param.model_path = llm_model_path.encode("utf-8")
212
+ param.top_k = top_k
213
+ param.max_new_tokens = max_new_tokens
214
+ param.max_context_len = max_context_len
215
+ param.skip_special_token = True
216
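+ # RKLLM only has image-token plumbing, so map it onto the audio special
+ # tokens; the injected embedding is then wrapped the way the chat template expects.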
+ param.img_start = b"<|audio_start|>"
217
+ param.img_end = b"<|audio_end|>"
218
+ param.img_content = b"<|audio_pad|>"
219
+ param.extend_param.base_domain_id = 1 # the ~4 GB model does not fit in the default memory domain
220
+
221
+ rk_llm.init(param, collector)
222
+
223
+ system_text, prompt_prefix, prompt_postfix = build_chat_template(
224
+ system_prompt=system_prompt,
225
+ force_language=force_language,
226
+ )
227
+ rk_llm.set_chat_template(
228
+ system_prompt=system_text,
229
+ prompt_prefix=prompt_prefix,
230
+ prompt_postfix=prompt_postfix,
231
+ )
232
+ return rk_llm, collector
233
+
234
+
235
+ def run_rkllm(
236
+ rk_llm: RKLLMRuntime,
237
+ audio_features: np.ndarray,
238
+ ) -> None:
239
+ rkllm_input = RKLLMInput()
240
+ rkllm_input.role = b"user"
241
+ rkllm_input.input_type = RKLLMInputType.RKLLM_INPUT_MULTIMODAL
242
+
243
+ # RKLLM multimodal prompt must contain the literal "<image>" placeholder.
244
+ flattened = np.ascontiguousarray(audio_features.reshape(-1), dtype=np.float32)
245
+ rkllm_input.multimodal_input.prompt = b"<image>"
246
+ rkllm_input.multimodal_input.image_embed = flattened.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
247
+ rkllm_input.multimodal_input.n_image_tokens = audio_features.shape[0]
248
+ rkllm_input.multimodal_input.n_image = 1
249
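+ # The audio embedding is 1-D in time, so present it as a 1 x n_tokens "image".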
+ rkllm_input.multimodal_input.image_height = 1
250
+ rkllm_input.multimodal_input.image_width = max(audio_features.shape[0], 1)
251
+
252
+ infer_param = RKLLMInferParam()
253
+ infer_param.mode = RKLLMInferMode.RKLLM_INFER_GENERATE
254
+ infer_param.keep_history = 0
255
+
256
+ rk_llm.run(rkllm_input, infer_param)
257
+
258
+
259
+ def main():
260
+ args = parse_args()
261
+ total_t0 = now()
262
+ encoder_session = None
263
+ rk_llm = None
264
+ collector = None
265
+
266
+ load_t0 = now()
267
+ mel_t0 = now()
268
+ input_features, feature_len = extract_mel_features(args.model_path, args.audio_path)
269
+ mel_elapsed = now() - mel_t0
270
+
271
+ encoder_session = ort.InferenceSession(args.encoder_model_path)
272
+ rkllm_init_t0 = now()
273
+ rk_llm, collector = load_rkllm(
274
+ llm_model_path=args.llm_model_path,
275
+ max_new_tokens=args.max_new_tokens,
276
+ max_context_len=args.max_context_len,
277
+ top_k=args.top_k,
278
+ system_prompt=args.system_prompt,
279
+ force_language=args.force_language,
280
+ stream_output=not args.no_stream,
281
+ )
282
+ rkllm_init_elapsed = now() - rkllm_init_t0
283
+ load_elapsed = now() - load_t0
284
+
285
+ infer_t0 = now()
286
+ encoder_t0 = now()
287
+ audio_features = run_audio_encoder(
288
+ session=encoder_session,
289
+ input_features=input_features,
290
+ feature_len=feature_len,
291
+ chunk_frames=args.chunk_frames,
292
+ )
293
+ encoder_elapsed = now() - encoder_t0
294
+
295
+ print(f"input_feature_len: {feature_len}")
296
+ print(f"audio_features: {audio_features.shape}")
297
+ print(f"time_mel_sec: {mel_elapsed:.3f}")
298
+ print(f"time_rkllm_init_sec: {rkllm_init_elapsed:.3f}")
299
+ print(f"time_load_total_sec: {load_elapsed:.3f}")
300
+ print(f"time_audio_encoder_sec: {encoder_elapsed:.3f}")
301
+
302
+ if args.save_audio_features:
303
+ savepath = Path(args.save_audio_features)
304
+ savepath.parent.mkdir(parents=True, exist_ok=True)
305
+ np.save(savepath, audio_features)
306
+ print(f"saved_audio_features: {savepath}")
307
+
308
+ generate_t0 = now()
309
+ run_rkllm(rk_llm=rk_llm, audio_features=audio_features)
310
+ generate_elapsed = now() - generate_t0
311
+ infer_elapsed = now() - infer_t0
312
+ total_elapsed = now() - total_t0
313
+
314
+ if collector and collector.error:
315
+ raise RuntimeError("RKLLM generation failed.")
316
+ text = collector.text if collector else ""
317
+
318
+ print(f"time_generate_sec: {generate_elapsed:.3f}")
319
+ print(f"time_infer_total_sec: {infer_elapsed:.3f}")
320
+ print(f"time_total_sec: {total_elapsed:.3f}")
321
+
322
+ if args.save_text:
323
+ savepath = Path(args.save_text)
324
+ savepath.parent.mkdir(parents=True, exist_ok=True)
325
+ savepath.write_text(text, encoding="utf-8")
326
+ print(f"saved_text: {savepath}")
327
+
328
+ if args.no_stream:
329
+ print(text)
330
+
331
+
332
+ if __name__ == "__main__":
333
+ main()
tokenizer_config.json ADDED
@@ -0,0 +1,549 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "151669": {
214
+ "content": "<|audio_start|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "151670": {
222
+ "content": "<|audio_end|>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "151671": {
230
+ "content": "<tts_pad>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "151672": {
238
+ "content": "<tts_text_bos>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "151673": {
246
+ "content": "<tts_text_eod>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "151674": {
254
+ "content": "<tts_text_bos_single>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "151675": {
262
+ "content": "<non_speech>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": false
268
+ },
269
+ "151676": {
270
+ "content": "<|audio_pad|>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": true
276
+ },
277
+ "151677": {
278
+ "content": "<blank1>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": true
284
+ },
285
+ "151678": {
286
+ "content": "<blank2>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": true
292
+ },
293
+ "151679": {
294
+ "content": "<blank3>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": true
300
+ },
301
+ "151680": {
302
+ "content": "<blank4>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": true
308
+ },
309
+ "151681": {
310
+ "content": "<blank5>",
311
+ "lstrip": false,
312
+ "normalized": false,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": true
316
+ },
317
+ "151682": {
318
+ "content": "<blank6>",
319
+ "lstrip": false,
320
+ "normalized": false,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": true
324
+ },
325
+ "151683": {
326
+ "content": "<blank7>",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": true
332
+ },
333
+ "151684": {
334
+ "content": "<blank8>",
335
+ "lstrip": false,
336
+ "normalized": false,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": true
340
+ },
341
+ "151685": {
342
+ "content": "<blank9>",
343
+ "lstrip": false,
344
+ "normalized": false,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": true
348
+ },
349
+ "151686": {
350
+ "content": "<blank10>",
351
+ "lstrip": false,
352
+ "normalized": false,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": true
356
+ },
357
+ "151687": {
358
+ "content": "<blank11>",
359
+ "lstrip": false,
360
+ "normalized": false,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": true
364
+ },
365
+ "151688": {
366
+ "content": "<blank12>",
367
+ "lstrip": false,
368
+ "normalized": false,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": true
372
+ },
373
+ "151689": {
374
+ "content": "<blank13>",
375
+ "lstrip": false,
376
+ "normalized": false,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": true
380
+ },
381
+ "151690": {
382
+ "content": "<blank14>",
383
+ "lstrip": false,
384
+ "normalized": false,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": true
388
+ },
389
+ "151691": {
390
+ "content": "<blank15>",
391
+ "lstrip": false,
392
+ "normalized": false,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": true
396
+ },
397
+ "151692": {
398
+ "content": "<blank16>",
399
+ "lstrip": false,
400
+ "normalized": false,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": true
404
+ },
405
+ "151693": {
406
+ "content": "<blank17>",
407
+ "lstrip": false,
408
+ "normalized": false,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": true
412
+ },
413
+ "151694": {
414
+ "content": "<blank18>",
415
+ "lstrip": false,
416
+ "normalized": false,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": true
420
+ },
421
+ "151695": {
422
+ "content": "<blank19>",
423
+ "lstrip": false,
424
+ "normalized": false,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": true
428
+ },
429
+ "151696": {
430
+ "content": "<blank20>",
431
+ "lstrip": false,
432
+ "normalized": false,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": true
436
+ },
437
+ "151697": {
438
+ "content": "<blank21>",
439
+ "lstrip": false,
440
+ "normalized": false,
441
+ "rstrip": false,
442
+ "single_word": false,
443
+ "special": true
444
+ },
445
+ "151698": {
446
+ "content": "<blank22>",
447
+ "lstrip": false,
448
+ "normalized": false,
449
+ "rstrip": false,
450
+ "single_word": false,
451
+ "special": true
452
+ },
453
+ "151699": {
454
+ "content": "<blank23>",
455
+ "lstrip": false,
456
+ "normalized": false,
457
+ "rstrip": false,
458
+ "single_word": false,
459
+ "special": true
460
+ },
461
+ "151700": {
462
+ "content": "<blank24>",
463
+ "lstrip": false,
464
+ "normalized": false,
465
+ "rstrip": false,
466
+ "single_word": false,
467
+ "special": true
468
+ },
469
+ "151701": {
470
+ "content": "<blank25>",
471
+ "lstrip": false,
472
+ "normalized": false,
473
+ "rstrip": false,
474
+ "single_word": false,
475
+ "special": true
476
+ },
477
+ "151702": {
478
+ "content": "<blank26>",
479
+ "lstrip": false,
480
+ "normalized": false,
481
+ "rstrip": false,
482
+ "single_word": false,
483
+ "special": true
484
+ },
485
+ "151703": {
486
+ "content": "<blank27>",
487
+ "lstrip": false,
488
+ "normalized": false,
489
+ "rstrip": false,
490
+ "single_word": false,
491
+ "special": true
492
+ },
493
+ "151704": {
494
+ "content": "<asr_text>",
495
+ "lstrip": false,
496
+ "normalized": false,
497
+ "rstrip": false,
498
+ "single_word": false,
499
+ "special": false
500
+ }
501
+ },
502
+ "additional_special_tokens": [
503
+ "<|im_start|>",
504
+ "<|im_end|>",
505
+ "<|object_ref_start|>",
506
+ "<|object_ref_end|>",
507
+ "<|box_start|>",
508
+ "<|box_end|>",
509
+ "<|quad_start|>",
510
+ "<|quad_end|>",
511
+ "<|vision_start|>",
512
+ "<|vision_end|>",
513
+ "<|vision_pad|>",
514
+ "<|image_pad|>",
515
+ "<|video_pad|>",
516
+ "<|audio_start|>",
517
+ "<|audio_end|>",
518
+ "<tts_pad>",
519
+ "<tts_text_bos>",
520
+ "<tts_text_bos_single>",
521
+ "<|audio_pad|>"
522
+ ],
523
+ "audio_bos_token": "<|audio_start|>",
524
+ "audio_eos_token": "<|audio_end|>",
525
+ "audio_token": "<|audio_pad|>",
526
+ "bos_token": null,
527
+ "clean_up_tokenization_spaces": false,
528
+ "eos_token": "<|im_end|>",
529
+ "errors": "replace",
530
+ "extra_special_tokens": {
531
+ "audio_bos_token": "<|audio_start|>",
532
+ "audio_eos_token": "<|audio_end|>",
533
+ "audio_token": "<|audio_pad|>",
534
+ "image_token": "<|image_pad|>",
535
+ "video_token": "<|video_pad|>",
536
+ "vision_bos_token": "<|vision_start|>",
537
+ "vision_eos_token": "<|vision_end|>"
538
+ },
539
+ "image_token": "<|image_pad|>",
540
+ "model_max_length": 131072,
541
+ "pad_token": "<|endoftext|>",
542
+ "processor_class": "Qwen3ASRProcessor",
543
+ "split_special_tokens": false,
544
+ "tokenizer_class": "Qwen2Tokenizer",
545
+ "unk_token": null,
546
+ "video_token": "<|video_pad|>",
547
+ "vision_bos_token": "<|vision_start|>",
548
+ "vision_eos_token": "<|vision_end|>"
549
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff